* [PATCH RFC 1/5] fs: use all available ids
2024-07-19 11:41 [PATCH RFC 0/5] nsfs: iterate through mount namespaces Christian Brauner
@ 2024-07-19 11:41 ` Christian Brauner
2024-07-19 11:41 ` [PATCH RFC 2/5] fs: allow mount namespace fd Christian Brauner
` (5 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Christian Brauner @ 2024-07-19 11:41 UTC (permalink / raw)
To: linux-fsdevel
Cc: Josef Bacik, Jeff Layton, Karel Zak, Stephane Graber,
Christian Brauner, Alexander Mikhalitsyn
The counter is unconditionally incremented for each mount allocation.
If we set it to 1ULL << 32 we're losing 4294967296 as the first valid
non-32 bit mount id.
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
fs/namespace.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/namespace.c b/fs/namespace.c
index 221db9de4729..328087a4df8a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -70,7 +70,7 @@ static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);
/* Don't allow confusion with old 32bit mount ID */
-#define MNT_UNIQUE_ID_OFFSET (1ULL << 32)
+#define MNT_UNIQUE_ID_OFFSET (1ULL << 31)
static atomic64_t mnt_id_ctr = ATOMIC64_INIT(MNT_UNIQUE_ID_OFFSET);
static struct hlist_head *mount_hashtable __ro_after_init;
--
2.43.0
^ permalink raw reply related [flat|nested] 8+ messages in thread* [PATCH RFC 2/5] fs: allow mount namespace fd
2024-07-19 11:41 [PATCH RFC 0/5] nsfs: iterate through mount namespaces Christian Brauner
2024-07-19 11:41 ` [PATCH RFC 1/5] fs: use all available ids Christian Brauner
@ 2024-07-19 11:41 ` Christian Brauner
2024-07-19 11:41 ` [PATCH RFC 3/5] fs: add put_mnt_ns() cleanup helper Christian Brauner
` (4 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Christian Brauner @ 2024-07-19 11:41 UTC (permalink / raw)
To: linux-fsdevel
Cc: Josef Bacik, Jeff Layton, Karel Zak, Stephane Graber,
Christian Brauner, Alexander Mikhalitsyn
We already allow a mount namespace id, enable mount namespace file
descriptors as well.
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
fs/namespace.c | 39 ++++++++++++++++++++++++++++++++-------
1 file changed, 32 insertions(+), 7 deletions(-)
diff --git a/fs/namespace.c b/fs/namespace.c
index 328087a4df8a..3ee8adb7f215 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5243,12 +5243,37 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req,
* that, or if not simply grab a passive reference on our mount namespace and
* return that.
*/
-static struct mnt_namespace *grab_requested_mnt_ns(u64 mnt_ns_id)
+static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq)
{
- if (mnt_ns_id)
- return lookup_mnt_ns(mnt_ns_id);
- refcount_inc(&current->nsproxy->mnt_ns->passive);
- return current->nsproxy->mnt_ns;
+ struct mnt_namespace *mnt_ns;
+
+ if (kreq->mnt_ns_id && kreq->spare)
+ return ERR_PTR(-EINVAL);
+
+ if (kreq->mnt_ns_id)
+ return lookup_mnt_ns(kreq->mnt_ns_id);
+
+ if (kreq->spare) {
+ struct ns_common *ns;
+
+ CLASS(fd, f)(kreq->spare);
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ if (!proc_ns_file(f.file))
+ return ERR_PTR(-EINVAL);
+
+ ns = get_proc_ns(file_inode(f.file));
+ if (ns->ops->type != CLONE_NEWNS)
+ return ERR_PTR(-EINVAL);
+
+ mnt_ns = to_mnt_ns(ns);
+ } else {
+ mnt_ns = current->nsproxy->mnt_ns;
+ }
+
+ refcount_inc(&mnt_ns->passive);
+ return mnt_ns;
}
SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
@@ -5269,7 +5294,7 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
if (ret)
return ret;
- ns = grab_requested_mnt_ns(kreq.mnt_ns_id);
+ ns = grab_requested_mnt_ns(&kreq);
if (!ns)
return -ENOENT;
@@ -5396,7 +5421,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
if (!kmnt_ids)
return -ENOMEM;
- ns = grab_requested_mnt_ns(kreq.mnt_ns_id);
+ ns = grab_requested_mnt_ns(&kreq);
if (!ns)
return -ENOENT;
--
2.43.0
^ permalink raw reply related [flat|nested] 8+ messages in thread* [PATCH RFC 3/5] fs: add put_mnt_ns() cleanup helper
2024-07-19 11:41 [PATCH RFC 0/5] nsfs: iterate through mount namespaces Christian Brauner
2024-07-19 11:41 ` [PATCH RFC 1/5] fs: use all available ids Christian Brauner
2024-07-19 11:41 ` [PATCH RFC 2/5] fs: allow mount namespace fd Christian Brauner
@ 2024-07-19 11:41 ` Christian Brauner
2024-07-19 11:41 ` [PATCH RFC 4/5] file: add fput() " Christian Brauner
` (3 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Christian Brauner @ 2024-07-19 11:41 UTC (permalink / raw)
To: linux-fsdevel
Cc: Josef Bacik, Jeff Layton, Karel Zak, Stephane Graber,
Christian Brauner, Alexander Mikhalitsyn
Add a simple helper to put a mount namespace reference.
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
include/linux/mnt_namespace.h | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
index 8f882f5881e8..70b366b64816 100644
--- a/include/linux/mnt_namespace.h
+++ b/include/linux/mnt_namespace.h
@@ -3,6 +3,9 @@
#define _NAMESPACE_H_
#ifdef __KERNEL__
+#include <linux/cleanup.h>
+#include <linux/err.h>
+
struct mnt_namespace;
struct fs_struct;
struct user_namespace;
@@ -11,6 +14,7 @@ struct ns_common;
extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *,
struct user_namespace *, struct fs_struct *);
extern void put_mnt_ns(struct mnt_namespace *ns);
+DEFINE_FREE(put_mnt_ns, struct mnt_namespace *, if (!IS_ERR_OR_NULL(_T)) put_mnt_ns(_T))
extern struct ns_common *from_mnt_ns(struct mnt_namespace *);
extern const struct file_operations proc_mounts_operations;
--
2.43.0
^ permalink raw reply related [flat|nested] 8+ messages in thread* [PATCH RFC 4/5] file: add fput() cleanup helper
2024-07-19 11:41 [PATCH RFC 0/5] nsfs: iterate through mount namespaces Christian Brauner
` (2 preceding siblings ...)
2024-07-19 11:41 ` [PATCH RFC 3/5] fs: add put_mnt_ns() cleanup helper Christian Brauner
@ 2024-07-19 11:41 ` Christian Brauner
2024-07-19 11:41 ` [PATCH RFC 5/5] nsfs: iterate through mount namespaces Christian Brauner
` (2 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Christian Brauner @ 2024-07-19 11:41 UTC (permalink / raw)
To: linux-fsdevel
Cc: Josef Bacik, Jeff Layton, Karel Zak, Stephane Graber,
Christian Brauner, Alexander Mikhalitsyn
Add a simple helper to put a file reference.
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
include/linux/file.h | 2 ++
1 file changed, 2 insertions(+)
diff --git a/include/linux/file.h b/include/linux/file.h
index 237931f20739..d1e768b06069 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -11,6 +11,7 @@
#include <linux/posix_types.h>
#include <linux/errno.h>
#include <linux/cleanup.h>
+#include <linux/err.h>
struct file;
@@ -96,6 +97,7 @@ extern void put_unused_fd(unsigned int fd);
DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T),
get_unused_fd_flags(flags), unsigned flags)
+DEFINE_FREE(fput, struct file *, if (!IS_ERR_OR_NULL(_T)) fput(_T))
/*
* take_fd() will take care to set @fd to -EBADF ensuring that
--
2.43.0
^ permalink raw reply related [flat|nested] 8+ messages in thread* [PATCH RFC 5/5] nsfs: iterate through mount namespaces
2024-07-19 11:41 [PATCH RFC 0/5] nsfs: iterate through mount namespaces Christian Brauner
` (3 preceding siblings ...)
2024-07-19 11:41 ` [PATCH RFC 4/5] file: add fput() " Christian Brauner
@ 2024-07-19 11:41 ` Christian Brauner
2024-07-19 14:53 ` [PATCH RFC 0/5] " Josef Bacik
2024-07-22 14:42 ` Jeff Layton
6 siblings, 0 replies; 8+ messages in thread
From: Christian Brauner @ 2024-07-19 11:41 UTC (permalink / raw)
To: linux-fsdevel
Cc: Josef Bacik, Jeff Layton, Karel Zak, Stephane Graber,
Christian Brauner, Alexander Mikhalitsyn
It is already possible to list mounts in other mount namespaces and to
retrieve namespace file descriptors without having to go through procfs
by deriving them from pidfds.
Augment these abilities by adding the ability to retrieve information
about a mount namespace via NS_MNT_GET_INFO. This will return the mount
namespace id and the number of mounts currently in the mount namespace.
The number of mounts can be used to size the buffer that needs to be
used for listmount() and is in general useful without having to actually
iterate through all the mounts. The structure is extensible.
And add the ability to iterate through all mount namespaces over which
the caller holds privilege returning the file descriptor for the next or
previous mount namespace.
To retrieve a mount namespace the caller must be privileged wrt its
owning user namespace. This means that PID 1 on the host can list all
mounts in all mount namespaces or that a container can list all mounts
of its nested containers.
Optionally pass a structure for NS_MNT_GET_INFO with
NS_MNT_GET_{PREV,NEXT} to retrieve information about the mount namespace
in one go. Both ioctls can be implemented for other namespace types
easily.
Together with recent api additions this means one can iterate through
all mounts in all mount namespaces without ever touching procfs.
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
fs/mount.h | 13 ++++++
fs/namespace.c | 35 ++++++++++++++--
fs/nsfs.c | 102 +++++++++++++++++++++++++++++++++++++++++++++-
include/uapi/linux/nsfs.h | 15 +++++++
4 files changed, 159 insertions(+), 6 deletions(-)
diff --git a/fs/mount.h b/fs/mount.h
index ad4b1ddebb54..c1db0c709c6a 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -155,3 +155,16 @@ static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);
bool has_locked_children(struct mount *mnt, struct dentry *dentry);
+struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mnt_ns, bool previous);
+static inline struct mnt_namespace *lookup_next_mnt_ns(struct mnt_namespace *mntns)
+{
+ return __lookup_next_mnt_ns(mntns, false);
+}
+static inline struct mnt_namespace *lookup_prev_mnt_ns(struct mnt_namespace *mntns)
+{
+ return __lookup_next_mnt_ns(mntns, true);
+}
+static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct mnt_namespace, ns);
+}
diff --git a/fs/namespace.c b/fs/namespace.c
index 3ee8adb7f215..60e20f15e87e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2060,14 +2060,41 @@ static bool is_mnt_ns_file(struct dentry *dentry)
dentry->d_fsdata == &mntns_operations;
}
-static struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
+struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
{
- return container_of(ns, struct mnt_namespace, ns);
+ return &mnt->ns;
}
-struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
+struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mntns, bool previous)
{
- return &mnt->ns;
+ guard(read_lock)(&mnt_ns_tree_lock);
+ for (;;) {
+ struct rb_node *node;
+
+ if (previous)
+ node = rb_prev(&mntns->mnt_ns_tree_node);
+ else
+ node = rb_next(&mntns->mnt_ns_tree_node);
+ if (!node)
+ return ERR_PTR(-ENOENT);
+
+ mntns = node_to_mnt_ns(node);
+ node = &mntns->mnt_ns_tree_node;
+
+ if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN))
+ continue;
+
+ /*
+ * Holding mnt_ns_tree_lock prevents the mount namespace from
+ * being freed but it may well be on its deathbed. We want an
+ * active reference, not just a passive one here as we're
+ * persisting the mount namespace.
+ */
+ if (!refcount_inc_not_zero(&mntns->ns.count))
+ continue;
+
+ return mntns;
+ }
}
static bool mnt_ns_loop(struct dentry *dentry)
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 97c37a9631e5..67ee176b8824 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -12,6 +12,7 @@
#include <linux/user_namespace.h>
#include <linux/nsfs.h>
#include <linux/uaccess.h>
+#include <linux/mnt_namespace.h>
#include "mount.h"
#include "internal.h"
@@ -128,6 +129,30 @@ int open_related_ns(struct ns_common *ns,
}
EXPORT_SYMBOL_GPL(open_related_ns);
+static int copy_ns_info_to_user(const struct mnt_namespace *mnt_ns,
+ struct mnt_ns_info __user *uinfo, size_t usize,
+ struct mnt_ns_info *kinfo)
+{
+ /*
+ * If userspace and the kernel have the same struct size it can just
+ * be copied. If userspace provides an older struct, only the bits that
+ * userspace knows about will be copied. If userspace provides a new
+ * struct, only the bits that the kernel knows about will be copied and
+ * the size value will be set to the size the kernel knows about.
+ */
+ kinfo->size = min(usize, sizeof(*kinfo));
+ kinfo->mnt_ns_id = mnt_ns->seq;
+ kinfo->nr_mounts = READ_ONCE(mnt_ns->nr_mounts);
+ /* Subtract the root mount of the mount namespace. */
+ if (kinfo->nr_mounts)
+ kinfo->nr_mounts--;
+
+ if (copy_to_user(uinfo, kinfo, kinfo->size))
+ return -EFAULT;
+
+ return 0;
+}
+
static long ns_ioctl(struct file *filp, unsigned int ioctl,
unsigned long arg)
{
@@ -135,6 +160,8 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
struct pid_namespace *pid_ns;
struct task_struct *tsk;
struct ns_common *ns = get_proc_ns(file_inode(filp));
+ struct mnt_namespace *mnt_ns;
+ bool previous = false;
uid_t __user *argp;
uid_t uid;
int ret;
@@ -156,7 +183,6 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
uid = from_kuid_munged(current_user_ns(), user_ns->owner);
return put_user(uid, argp);
case NS_GET_MNTNS_ID: {
- struct mnt_namespace *mnt_ns;
__u64 __user *idp;
__u64 id;
@@ -211,7 +237,79 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
if (!ret)
ret = -ESRCH;
- break;
+ return ret;
+ }
+ }
+
+ /* extensible ioctls */
+ switch (_IOC_NR(ioctl)) {
+ case _IOC_NR(NS_MNT_GET_INFO): {
+ struct mnt_ns_info kinfo = {};
+ struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg;
+ size_t usize = _IOC_SIZE(ioctl);
+
+ if (ns->ops->type != CLONE_NEWNS)
+ return -EINVAL;
+
+ if (!uinfo)
+ return -EINVAL;
+
+ if (usize < MNT_NS_INFO_SIZE_VER0)
+ return -EINVAL;
+
+ return copy_ns_info_to_user(to_mnt_ns(ns), uinfo, usize, &kinfo);
+ }
+ case _IOC_NR(NS_MNT_GET_PREV):
+ previous = true;
+ fallthrough;
+ case _IOC_NR(NS_MNT_GET_NEXT): {
+ struct mnt_ns_info kinfo = {};
+ struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg;
+ struct path path __free(path_put) = {};
+ struct file *f __free(fput) = NULL;
+ size_t usize = _IOC_SIZE(ioctl);
+
+ if (ns->ops->type != CLONE_NEWNS)
+ return -EINVAL;
+
+ if (usize < MNT_NS_INFO_SIZE_VER0)
+ return -EINVAL;
+
+ if (previous)
+ mnt_ns = lookup_prev_mnt_ns(to_mnt_ns(ns));
+ else
+ mnt_ns = lookup_next_mnt_ns(to_mnt_ns(ns));
+ if (IS_ERR(mnt_ns))
+ return PTR_ERR(mnt_ns);
+
+ ns = to_ns_common(mnt_ns);
+ /* Transfer ownership of @mnt_ns reference to @path. */
+ ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path);
+ if (ret)
+ return ret;
+
+ CLASS(get_unused_fd, fd)(O_CLOEXEC);
+ if (fd < 0)
+ return fd;
+
+ f = dentry_open(&path, O_RDONLY, current_cred());
+ if (IS_ERR(f))
+ return PTR_ERR(f);
+
+ if (uinfo) {
+ /*
+ * If @uinfo is passed return all information about the
+ * mount namespace as well.
+ */
+ ret = copy_ns_info_to_user(to_mnt_ns(ns), uinfo, usize, &kinfo);
+ if (ret)
+ return ret;
+ }
+
+ /* Transfer reference of @f to caller's fdtable. */
+ fd_install(fd, no_free_ptr(f));
+ /* File descriptor is live so hand it off to the caller. */
+ return take_fd(fd);
}
default:
ret = -ENOTTY;
diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
index b133211331f6..bfb9666860a1 100644
--- a/include/uapi/linux/nsfs.h
+++ b/include/uapi/linux/nsfs.h
@@ -26,4 +26,19 @@
/* Return thread-group leader id of pid in the target pid namespace. */
#define NS_GET_TGID_IN_PIDNS _IOR(NSIO, 0x9, int)
+struct mnt_ns_info {
+ __u32 size;
+ __u32 nr_mounts;
+ __u64 mnt_ns_id;
+};
+
+#define MNT_NS_INFO_SIZE_VER0 16 /* size of first published struct */
+
+/* Get information about namespace. */
+#define NS_MNT_GET_INFO _IOR(NSIO, 10, struct mnt_ns_info)
+/* Get next namespace. */
+#define NS_MNT_GET_NEXT _IOR(NSIO, 11, struct mnt_ns_info)
+/* Get previous namespace. */
+#define NS_MNT_GET_PREV _IOR(NSIO, 12, struct mnt_ns_info)
+
#endif /* __LINUX_NSFS_H */
--
2.43.0
^ permalink raw reply related [flat|nested] 8+ messages in thread* Re: [PATCH RFC 0/5] nsfs: iterate through mount namespaces
2024-07-19 11:41 [PATCH RFC 0/5] nsfs: iterate through mount namespaces Christian Brauner
` (4 preceding siblings ...)
2024-07-19 11:41 ` [PATCH RFC 5/5] nsfs: iterate through mount namespaces Christian Brauner
@ 2024-07-19 14:53 ` Josef Bacik
2024-07-22 14:42 ` Jeff Layton
6 siblings, 0 replies; 8+ messages in thread
From: Josef Bacik @ 2024-07-19 14:53 UTC (permalink / raw)
To: Christian Brauner
Cc: linux-fsdevel, Jeff Layton, Karel Zak, Stephane Graber,
Alexander Mikhalitsyn
On Fri, Jul 19, 2024 at 01:41:47PM +0200, Christian Brauner wrote:
> Hey,
>
> Recently, we added the ability to list mounts in other mount namespaces
> and the ability to retrieve namespace file descriptors without having to
> go through procfs by deriving them from pidfds.
>
> This extends nsfs in two ways:
>
> (1) Add the ability to retrieve information about a mount namespace via
> NS_MNT_GET_INFO. This will return the mount namespace id and the
> number of mounts currently in the mount namespace. The number of
> mounts can be used to size the buffer that needs to be used for
> listmount() and is in general useful without having to actually
> iterate through all the mounts.
>
> The structure is extensible.
>
> (2) Add the ability to iterate through all mount namespaces over which
> the caller holds privilege returning the file descriptor for the
> next or previous mount namespace.
>
> To retrieve a mount namespace the caller must be privileged wrt
> its owning user namespace. This means that PID 1 on the host can
> list all mounts in all mount namespaces or that a container can list
> all mounts of its nested containers.
>
> Optionally pass a structure for NS_MNT_GET_INFO with
> NS_MNT_GET_{PREV,NEXT} to retrieve information about the mount
> namespace in one go.
>
> (1) and (2) can be implemented for other namespace types easily.
>
Love this, I think the only thing is a comment in include/uapi/linux/mount.h to
indicate what spare is used for with the new stuff. I'll update the man page
when this stuff lands but it would be good to document it somewhere. Other than
that you can add
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Thanks,
Josef
^ permalink raw reply [flat|nested] 8+ messages in thread* Re: [PATCH RFC 0/5] nsfs: iterate through mount namespaces
2024-07-19 11:41 [PATCH RFC 0/5] nsfs: iterate through mount namespaces Christian Brauner
` (5 preceding siblings ...)
2024-07-19 14:53 ` [PATCH RFC 0/5] " Josef Bacik
@ 2024-07-22 14:42 ` Jeff Layton
6 siblings, 0 replies; 8+ messages in thread
From: Jeff Layton @ 2024-07-22 14:42 UTC (permalink / raw)
To: Christian Brauner, linux-fsdevel
Cc: Josef Bacik, Karel Zak, Stephane Graber, Alexander Mikhalitsyn
On Fri, 2024-07-19 at 13:41 +0200, Christian Brauner wrote:
> Hey,
>
> Recently, we added the ability to list mounts in other mount
> namespaces
> and the ability to retrieve namespace file descriptors without having
> to
> go through procfs by deriving them from pidfds.
>
> This extends nsfs in two ways:
>
> (1) Add the ability to retrieve information about a mount namespace
> via
> NS_MNT_GET_INFO. This will return the mount namespace id and the
> number of mounts currently in the mount namespace. The number of
> mounts can be used to size the buffer that needs to be used for
> listmount() and is in general useful without having to actually
> iterate through all the mounts.
>
> The structure is extensible.
>
> (2) Add the ability to iterate through all mount namespaces over
> which
> the caller holds privilege returning the file descriptor for the
> next or previous mount namespace.
>
> To retrieve a mount namespace the caller must be privileged wrt
> to
> it's owning user namespace. This means that PID 1 on the host can
> list all mounts in all mount namespaces or that a container can
> list
> all mounts of its nested containers.
>
> Optionally pass a structure for NS_MNT_GET_INFO with
> NS_MNT_GET_{PREV,NEXT} to retrieve information about the mount
> namespace in one go.
>
> (1) and (2) can be implemented for other namespace types easily.
>
> Together with recent api additions this means one can iterate through
> all mounts in all mount namespaces without ever touching procfs.
> Here's
> a sample program list_all_mounts_everywhere.c:
>
> // SPDX-License-Identifier: GPL-2.0-or-later
>
> #define _GNU_SOURCE
> #include <asm/unistd.h>
> #include <assert.h>
> #include <errno.h>
> #include <fcntl.h>
> #include <getopt.h>
> #include <linux/stat.h>
> #include <sched.h>
> #include <stddef.h>
> #include <stdint.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #include <sys/ioctl.h>
> #include <sys/param.h>
> #include <sys/pidfd.h>
> #include <sys/stat.h>
> #include <sys/statfs.h>
>
> #define die_errno(format,
> ...) \
> do
> { \
> fprintf(stderr, "%m | %s: %d: %s: " format "\n",
> __FILE__, \
> __LINE__, __func__,
> ##__VA_ARGS__); \
> exit(EXIT_FAILURE);
> \
> } while (0)
>
> /* Get the id for a mount namespace */
> #define NS_GET_MNTNS_ID _IO(0xb7, 0x5)
> /* Get next mount namespace. */
>
> struct mnt_ns_info {
> __u32 size;
> __u32 nr_mounts;
> __u64 mnt_ns_id;
> };
>
> #define MNT_NS_INFO_SIZE_VER0 16 /* size of first published struct
> */
>
> /* Get information about namespace. */
> #define NS_MNT_GET_INFO _IOR(0xb7, 10, struct
> mnt_ns_info)
> /* Get next namespace. */
> #define NS_MNT_GET_NEXT _IOR(0xb7, 11, struct
> mnt_ns_info)
> /* Get previous namespace. */
> #define NS_MNT_GET_PREV _IOR(0xb7, 12, struct
> mnt_ns_info)
>
> #define PIDFD_GET_MNT_NAMESPACE _IO(0xFF, 3)
>
> #define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended
> stx_mount_id */
>
> #define __NR_listmount 458
> #define __NR_statmount 457
>
> /*
> * @mask bits for statmount(2)
> */
> #define STATMOUNT_SB_BASIC 0x00000001U /* Want/got
> sb_... */
> #define STATMOUNT_MNT_BASIC 0x00000002U /* Want/got
> mnt_... */
> #define STATMOUNT_PROPAGATE_FROM 0x00000004U /* Want/got
> propagate_from */
> #define STATMOUNT_MNT_ROOT 0x00000008U /* Want/got
> mnt_root */
> #define STATMOUNT_MNT_POINT 0x00000010U /* Want/got
> mnt_point */
> #define STATMOUNT_FS_TYPE 0x00000020U /* Want/got
> fs_type */
> #define STATMOUNT_MNT_NS_ID 0x00000040U /* Want/got
> mnt_ns_id */
> #define STATMOUNT_MNT_OPTS 0x00000080U /* Want/got
> mnt_opts */
>
> struct statmount {
> __u32 size; /* Total size, including strings */
> __u32 mnt_opts;
> __u64 mask; /* What results were written */
> __u32 sb_dev_major; /* Device ID */
> __u32 sb_dev_minor;
> __u64 sb_magic; /* ..._SUPER_MAGIC */
> __u32 sb_flags; /*
> SB_{RDONLY,SYNCHRONOUS,DIRSYNC,LAZYTIME} */
> __u32 fs_type; /* [str] Filesystem type */
> __u64 mnt_id; /* Unique ID of mount */
> __u64 mnt_parent_id; /* Unique ID of parent (for root ==
> mnt_id) */
> __u32 mnt_id_old; /* Reused IDs used in
> proc/.../mountinfo */
> __u32 mnt_parent_id_old;
> __u64 mnt_attr; /* MOUNT_ATTR_... */
> __u64 mnt_propagation; /*
> MS_{SHARED,SLAVE,PRIVATE,UNBINDABLE} */
> __u64 mnt_peer_group; /* ID of shared peer group */
> __u64 mnt_master; /* Mount receives propagation from
> this ID */
> __u64 propagate_from; /* Propagation from in current
> namespace */
> __u32 mnt_root; /* [str] Root of mount
> relative to root of fs */
> __u32 mnt_point; /* [str] Mountpoint relative to
> current root */
> __u64 mnt_ns_id;
> __u64 __spare2[49];
> char str[]; /* Variable size part containing
> strings */
> };
>
> struct mnt_id_req {
> __u32 size;
> __u32 spare;
> __u64 mnt_id;
> __u64 param;
> __u64 mnt_ns_id;
> };
>
> #define MNT_ID_REQ_SIZE_VER1 32 /* sizeof second published struct
> */
>
> #define LSMT_ROOT 0xffffffffffffffff /* root
> mount */
>
> static int __statmount(__u64 mnt_id, __u64 mnt_ns_id, __u64 mask,
> struct statmount *stmnt, size_t bufsize,
> unsigned int flags)
> {
> struct mnt_id_req req = {
> .size = MNT_ID_REQ_SIZE_VER1,
> .mnt_id = mnt_id,
> .param = mask,
> .mnt_ns_id = mnt_ns_id,
> };
>
> return syscall(__NR_statmount, &req, stmnt, bufsize, flags);
> }
>
> static struct statmount *sys_statmount(__u64 mnt_id, __u64
> mnt_ns_id,
> __u64 mask, unsigned int
> flags)
> {
> size_t bufsize = 1 << 15;
> struct statmount *stmnt = NULL, *tmp = NULL;
> int ret;
>
> for (;;) {
> tmp = realloc(stmnt, bufsize);
> if (!tmp)
> goto out;
>
> stmnt = tmp;
> ret = __statmount(mnt_id, mnt_ns_id, mask, stmnt,
> bufsize, flags);
> if (!ret)
> return stmnt;
>
> if (errno != EOVERFLOW)
> goto out;
>
> bufsize <<= 1;
> if (bufsize >= UINT_MAX / 2)
> goto out;
>
> }
>
> out:
> free(stmnt);
> printf("statmount failed");
> return NULL;
> }
>
> static ssize_t sys_listmount(__u64 mnt_id, __u64 last_mnt_id, __u64
> mnt_ns_id,
> __u64 list[], size_t num, unsigned int
> flags)
> {
> struct mnt_id_req req = {
> .size = MNT_ID_REQ_SIZE_VER1,
> .mnt_id = mnt_id,
> .param = last_mnt_id,
> .mnt_ns_id = mnt_ns_id,
> };
>
> return syscall(__NR_listmount, &req, list, num, flags);
> }
>
> int main(int argc, char *argv[])
> {
> #define LISTMNT_BUFFER 10
> __u64 list[LISTMNT_BUFFER], last_mnt_id = 0;
> int ret, pidfd, fd_mntns;
> struct mnt_ns_info info = {};
>
> pidfd = pidfd_open(getpid(), 0);
> if (pidfd < 0)
> die_errno("pidfd_open failed");
>
> fd_mntns = ioctl(pidfd, PIDFD_GET_MNT_NAMESPACE, 0);
> if (fd_mntns < 0)
> die_errno("ioctl(PIDFD_GET_MNT_NAMESPACE) failed");
>
> ret = ioctl(fd_mntns, NS_MNT_GET_INFO, &info);
> if (ret < 0)
> die_errno("ioctl(NS_GET_MNTNS_ID) failed");
>
> printf("Listing %u mounts for mount namespace %d:%llu\n",
> info.nr_mounts, fd_mntns, info.mnt_ns_id);
> for (;;) {
> ssize_t nr_mounts;
> next:
> nr_mounts = sys_listmount(LSMT_ROOT, last_mnt_id,
> info.mnt_ns_id, list, LISTMNT_BUFFER, 0);
> if (nr_mounts <= 0) {
> printf("Finished listing mounts for mount
> namespace %d:%llu\n\n", fd_mntns, info.mnt_ns_id);
> ret = ioctl(fd_mntns, NS_MNT_GET_NEXT, 0);
> if (ret < 0)
> die_errno("ioctl(NS_MNT_GET_NEXT)
> failed");
> close(ret);
> ret = ioctl(fd_mntns, NS_MNT_GET_NEXT,
> &info);
> if (ret < 0) {
> if (errno == ENOENT) {
> printf("Finished listing all
> mount namespaces\n");
> exit(0);
> }
> die_errno("ioctl(NS_MNT_GET_NEXT)
> failed");
> }
> close(fd_mntns);
> fd_mntns = ret;
> last_mnt_id = 0;
> printf("Listing %u mounts for mount
> namespace %d:%llu\n", info.nr_mounts, fd_mntns, info.mnt_ns_id);
> goto next;
> }
>
> for (size_t cur = 0; cur < nr_mounts; cur++) {
> struct statmount *stmnt;
>
> last_mnt_id = list[cur];
>
> stmnt = sys_statmount(last_mnt_id,
> info.mnt_ns_id,
> STATMOUNT_SB_BASIC |
> STATMOUNT_MNT_BASIC |
> STATMOUNT_MNT_ROOT |
> STATMOUNT_MNT_POINT |
> STATMOUNT_MNT_NS_ID |
> STATMOUNT_MNT_OPTS |
> STATMOUNT_FS_TYPE,
> 0);
> if (!stmnt) {
> printf("Failed to statmount(%llu) in
> mount namespace(%llu)\n", last_mnt_id, info.mnt_ns_id);
> continue;
> }
>
> printf("mnt_id(%u/%llu) |
> mnt_parent_id(%u/%llu): %s @ %s ==> %s with options: %s\n",
> stmnt->mnt_id_old, stmnt->mnt_id,
> stmnt->mnt_parent_id_old, stmnt-
> >mnt_parent_id,
> stmnt->str + stmnt->fs_type,
> stmnt->str + stmnt->mnt_root,
> stmnt->str + stmnt->mnt_point,
> stmnt->str + stmnt->mnt_opts);
> free(stmnt);
> }
> }
>
> exit(0);
> }
>
> Thanks!
> Christian
>
> Signed-off-by: Christian Brauner <brauner@kernel.org>
> ---
> ---
> base-commit: 720261cfc7329406a50c2a8536e0039b9dd9a4e5
> change-id: 20240705-work-mount-namespace-126b73a11f5c
>
This all looks pretty straightforward to me. I do wish that we had
proper libc bindings for this...or maybe even a new userland library?
I just get the feeling that all of this syscall() and ioctl() usage is
eventually going to bite us in the ass. I don't have any concrete
proposal for that however, and we do have some immediate need for this
functionality, so, you can add
Reviewed-by: Jeff Layton <jlayton@kernel.org>
^ permalink raw reply [flat|nested] 8+ messages in thread