* [RFC PATCH iproute2-next 1/5] ip: Mount netns in child process instead of from inside the new namespace
2023-10-09 18:27 [RFC PATCH iproute2-next 0/5] Persisting of mount namespaces along with network namespaces Toke Høiland-Jørgensen
@ 2023-10-09 18:27 ` Toke Høiland-Jørgensen
2023-10-09 18:27 ` [RFC PATCH iproute2-next 2/5] ip: Split out code creating namespace mount dir so it can be reused Toke Høiland-Jørgensen
` (4 subsequent siblings)
5 siblings, 0 replies; 17+ messages in thread
From: Toke Høiland-Jørgensen @ 2023-10-09 18:27 UTC (permalink / raw)
To: David Ahern, Stephen Hemminger
Cc: netdev, Toke Høiland-Jørgensen, Nicolas Dichtel,
Christian Brauner, Eric W . Biederman, David Laight
Refactor the netns creation code so that we offload the mounting of the
namespace file to a child process instead of bind mounting from inside the newly
created namespace.
This is done in preparation for also persisting a mount namespace; the mount
namespace reference cannot be bind-mounted from inside that namespace itself, so
we need to mount that from a child process anyway. The child process
approach (as well as some of the helper functions used for it) is adapted from
the code in the unshare(1) utility that is part of util-linux.
No functional change is intended with this patch.
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
---
ip/ipnetns.c | 184 ++++++++++++++++++++++++++++++++++++---------------
1 file changed, 130 insertions(+), 54 deletions(-)
diff --git a/ip/ipnetns.c b/ip/ipnetns.c
index 9d996832aef8..a05d84514326 100644
--- a/ip/ipnetns.c
+++ b/ip/ipnetns.c
@@ -1,4 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#include <stdlib.h>
#define _ATFILE_SOURCE
#include <sys/file.h>
#include <sys/types.h>
@@ -7,6 +8,7 @@
#include <sys/inotify.h>
#include <sys/mount.h>
#include <sys/syscall.h>
+#include <sys/eventfd.h>
#include <stdio.h>
#include <string.h>
#include <sched.h>
@@ -25,6 +27,9 @@
#include "namespace.h"
#include "json_print.h"
+/* synchronize parent and child by pipe */
+#define PIPE_SYNC_BYTE 0x06
+
static int usage(void)
{
fprintf(stderr,
@@ -46,7 +51,6 @@ static int usage(void)
static struct rtnl_handle rtnsh = { .fd = -1 };
static int have_rtnl_getnsid = -1;
-static int saved_netns = -1;
static struct link_filter filter;
static int ipnetns_accept_msg(struct rtnl_ctrl_data *ctrl,
@@ -768,31 +772,131 @@ static int create_netns_dir(void)
return 0;
}
-/* Obtain a FD for the current namespace, so we can reenter it later */
-static void netns_save(void)
+/**
+ * waitchild() - Wait for a process to exit successfully
+ * @pid: PID of the process to wait for
+ *
+ * Wait for a process to exit successfully and return its exit status.
+ */
+static int waitchild(int pid)
{
- if (saved_netns != -1)
- return;
+ int rc, status;
+
+ do {
+ rc = waitpid(pid, &status, 0);
+ if (rc < 0) {
+ if (errno == EINTR)
+ continue;
+ return -errno;
+ }
+ if (WIFEXITED(status) &&
+ WEXITSTATUS(status) != EXIT_SUCCESS)
+ return WEXITSTATUS(status);
+ } while (rc < 0);
- saved_netns = open("/proc/self/ns/net", O_RDONLY | O_CLOEXEC);
- if (saved_netns == -1) {
- perror("Cannot open init namespace");
- exit(1);
+ return 0;
+}
+
+/**
+ * sync_with_child() - Tell our child we're ready and wait for it to exit
+ * @pid: The pid of our child
+ * @fd: A file descriptor created with eventfd()
+ *
+ * This tells a child created with fork_and_wait() that we are ready for it to
+ * continue. Once we have done that, wait for our child to exit.
+ */
+static int sync_with_child(pid_t pid, int fd)
+{
+ uint64_t ch = PIPE_SYNC_BYTE;
+
+ write(fd, &ch, sizeof(ch));
+ close(fd);
+
+ return waitchild(pid);
+}
+
+/**
+ * fork_and_wait() - Fork and wait to be sync'd with
+ * @fd - A file descriptor created with eventfd() which should be passed to
+ * sync_with_child()
+ *
+ * This creates an eventfd and forks. The parent process returns immediately,
+ * but the child waits for a %PIPE_SYNC_BYTE on the eventfd before returning.
+ * This allows the parent to perform some tasks before the child starts its
+ * work. The parent should call sync_with_child() once it is ready for the
+ * child to continue.
+ *
+ * Return: The pid from fork()
+ */
+static pid_t fork_and_wait(int *fd)
+{
+ uint64_t ch;
+ pid_t pid;
+
+ *fd = eventfd(0, 0);
+ if (*fd < 0) {
+ fprintf(stderr, "eventfd failed: %s\n", strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ pid = fork();
+ if (pid < 0) {
+ fprintf(stderr, "fork failed: %s\n", strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ if (!pid) {
+ /* wait for the our parent to tell us to continue */
+ if (read(*fd, (char *)&ch, sizeof(ch)) != sizeof(ch) ||
+ ch != PIPE_SYNC_BYTE) {
+ fprintf(stderr, "failed to read eventfd\n");
+ exit(EXIT_FAILURE);
+ }
+ close(*fd);
}
+
+ return pid;
}
-static void netns_restore(void)
+static int bind_ns_file(const char *parent, const char *nsfile,
+ const char *ns_name, pid_t target_pid)
{
- if (saved_netns == -1)
- return;
+ char ns_path[PATH_MAX], proc_path[PATH_MAX];
+ int fd;
- if (setns(saved_netns, CLONE_NEWNET)) {
- perror("setns");
- exit(1);
+ snprintf(ns_path, sizeof(ns_path), "%s/%s", parent, ns_name);
+ snprintf(proc_path, sizeof(proc_path), "/proc/%d/ns/%s", target_pid, nsfile);
+
+ /* Create the filesystem state */
+ fd = open(ns_path, O_RDONLY|O_CREAT|O_EXCL, 0);
+ if (fd < 0) {
+ fprintf(stderr, "Cannot create namespace file \"%s\": %s\n",
+ ns_path, strerror(errno));
+ return -1;
}
+ close(fd);
- close(saved_netns);
- saved_netns = -1;
+ if (mount(proc_path, ns_path, "none", MS_BIND, NULL) < 0) {
+ fprintf(stderr, "Bind %s -> %s failed: %s\n", proc_path,
+ ns_path, strerror(errno));
+ unlink(ns_path);
+ return -1;
+ }
+ return 0;
+}
+
+static pid_t bind_ns_files_from_child(const char *ns_name, pid_t target_pid,
+ int *fd)
+{
+ pid_t child;
+
+ child = fork_and_wait(fd);
+ if (child)
+ return child;
+
+ if (bind_ns_file(NETNS_RUN_DIR, "net", ns_name, target_pid))
+ exit(EXIT_FAILURE);
+ exit(EXIT_SUCCESS);
}
static int netns_add(int argc, char **argv, bool create)
@@ -808,10 +912,9 @@ static int netns_add(int argc, char **argv, bool create)
* userspace tweaks like remounting /sys, or bind mounting
* a new /etc/resolv.conf can be shared between users.
*/
- char netns_path[PATH_MAX], proc_path[PATH_MAX];
const char *name;
- pid_t pid;
- int fd;
+ pid_t pid, child;
+ int event_fd;
int lock;
int made_netns_run_dir_mount = 0;
@@ -820,6 +923,7 @@ static int netns_add(int argc, char **argv, bool create)
fprintf(stderr, "No netns name specified\n");
return -1;
}
+ pid = getpid();
} else {
if (argc < 2) {
fprintf(stderr, "No netns name and PID specified\n");
@@ -833,8 +937,6 @@ static int netns_add(int argc, char **argv, bool create)
}
name = argv[0];
- snprintf(netns_path, sizeof(netns_path), "%s/%s", NETNS_RUN_DIR, name);
-
if (create_netns_dir())
return -1;
@@ -894,46 +996,20 @@ static int netns_add(int argc, char **argv, bool create)
close(lock);
}
- /* Create the filesystem state */
- fd = open(netns_path, O_RDONLY|O_CREAT|O_EXCL, 0);
- if (fd < 0) {
- fprintf(stderr, "Cannot create namespace file \"%s\": %s\n",
- netns_path, strerror(errno));
- return -1;
- }
- close(fd);
+ child = bind_ns_files_from_child(name, pid, &event_fd);
+ if (child < 0)
+ exit(EXIT_FAILURE);
if (create) {
- netns_save();
if (unshare(CLONE_NEWNET) < 0) {
fprintf(stderr, "Failed to create a new network namespace \"%s\": %s\n",
name, strerror(errno));
- goto out_delete;
+ close(event_fd);
+ exit(EXIT_FAILURE);
}
-
- strcpy(proc_path, "/proc/self/ns/net");
- } else {
- snprintf(proc_path, sizeof(proc_path), "/proc/%d/ns/net", pid);
- }
-
- /* Bind the netns last so I can watch for it */
- if (mount(proc_path, netns_path, "none", MS_BIND, NULL) < 0) {
- fprintf(stderr, "Bind %s -> %s failed: %s\n",
- proc_path, netns_path, strerror(errno));
- goto out_delete;
}
- netns_restore();
- return 0;
-out_delete:
- if (create) {
- netns_restore();
- netns_delete(argc, argv);
- } else if (unlink(netns_path) < 0) {
- fprintf(stderr, "Cannot remove namespace file \"%s\": %s\n",
- netns_path, strerror(errno));
- }
- return -1;
+ return sync_with_child(child, event_fd);
}
int set_netnsid_from_name(const char *name, int nsid)
--
2.42.0
^ permalink raw reply related [flat|nested] 17+ messages in thread* [RFC PATCH iproute2-next 2/5] ip: Split out code creating namespace mount dir so it can be reused
2023-10-09 18:27 [RFC PATCH iproute2-next 0/5] Persisting of mount namespaces along with network namespaces Toke Høiland-Jørgensen
2023-10-09 18:27 ` [RFC PATCH iproute2-next 1/5] ip: Mount netns in child process instead of from inside the new namespace Toke Høiland-Jørgensen
@ 2023-10-09 18:27 ` Toke Høiland-Jørgensen
2023-10-09 18:27 ` [RFC PATCH iproute2-next 3/5] lib/namespace: Factor out code for reuse Toke Høiland-Jørgensen
` (3 subsequent siblings)
5 siblings, 0 replies; 17+ messages in thread
From: Toke Høiland-Jørgensen @ 2023-10-09 18:27 UTC (permalink / raw)
To: David Ahern, Stephen Hemminger
Cc: netdev, Toke Høiland-Jørgensen, Nicolas Dichtel,
Christian Brauner, Eric W . Biederman, David Laight
Move the code creating the parent directory for namespace references into its
own function, so it can be reused for creating a separate directory to contain
mount namespace references.
No functional change is intended with this patch.
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
---
ip/ipnetns.c | 127 ++++++++++++++++++++++++++++-----------------------
1 file changed, 69 insertions(+), 58 deletions(-)
diff --git a/ip/ipnetns.c b/ip/ipnetns.c
index a05d84514326..529790482683 100644
--- a/ip/ipnetns.c
+++ b/ip/ipnetns.c
@@ -758,13 +758,13 @@ static int netns_delete(int argc, char **argv)
return on_netns_del(argv[0], NULL);
}
-static int create_netns_dir(void)
+static int ensure_dir(const char *path)
{
/* Create the base netns directory if it doesn't exist */
- if (mkdir(NETNS_RUN_DIR, S_IRWXU|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH)) {
+ if (mkdir(path, S_IRWXU|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH)) {
if (errno != EEXIST) {
fprintf(stderr, "mkdir %s failed: %s\n",
- NETNS_RUN_DIR, strerror(errno));
+ path, strerror(errno));
return -1;
}
}
@@ -899,53 +899,15 @@ static pid_t bind_ns_files_from_child(const char *ns_name, pid_t target_pid,
exit(EXIT_SUCCESS);
}
-static int netns_add(int argc, char **argv, bool create)
+static int prepare_ns_mount_dir(const char *target_dir, int mount_flag)
{
- /* This function creates a new network namespace and
- * a new mount namespace and bind them into a well known
- * location in the filesystem based on the name provided.
- *
- * If create is true, a new namespace will be created,
- * otherwise an existing one will be attached to the file.
- *
- * The mount namespace is created so that any necessary
- * userspace tweaks like remounting /sys, or bind mounting
- * a new /etc/resolv.conf can be shared between users.
- */
- const char *name;
- pid_t pid, child;
- int event_fd;
+ int made_dir_mount = 0;
int lock;
- int made_netns_run_dir_mount = 0;
- if (create) {
- if (argc < 1) {
- fprintf(stderr, "No netns name specified\n");
- return -1;
- }
- pid = getpid();
- } else {
- if (argc < 2) {
- fprintf(stderr, "No netns name and PID specified\n");
- return -1;
- }
-
- if (get_s32(&pid, argv[1], 0) || !pid) {
- fprintf(stderr, "Invalid PID: %s\n", argv[1]);
- return -1;
- }
- }
- name = argv[0];
-
- if (create_netns_dir())
+ if (ensure_dir(target_dir))
return -1;
- /* Make it possible for network namespace mounts to propagate between
- * mount namespaces. This makes it likely that a unmounting a network
- * namespace file in one namespace will unmount the network namespace
- * file in all namespaces allowing the network namespace to be freed
- * sooner.
- * These setup steps need to happen only once, as if multiple ip processes
+ /* These setup steps need to happen only once, as if multiple ip processes
* try to attempt the same operation at the same time, the mountpoints will
* be recursively created multiple times, eventually causing the system
* to lock up. For example, this has been observed when multiple netns
@@ -955,23 +917,23 @@ static int netns_add(int argc, char **argv, bool create)
* this cannot happen, but proceed nonetheless if it cannot happen for any
* reason.
*/
- lock = open(NETNS_RUN_DIR, O_RDONLY|O_DIRECTORY, 0);
+ lock = open(target_dir, O_RDONLY|O_DIRECTORY, 0);
if (lock < 0) {
- fprintf(stderr, "Cannot open netns runtime directory \"%s\": %s\n",
- NETNS_RUN_DIR, strerror(errno));
+ fprintf(stderr, "Cannot open ns runtime directory \"%s\": %s\n",
+ target_dir, strerror(errno));
return -1;
}
if (flock(lock, LOCK_EX) < 0) {
- fprintf(stderr, "Warning: could not flock netns runtime directory \"%s\": %s\n",
- NETNS_RUN_DIR, strerror(errno));
+ fprintf(stderr, "Warning: could not flock ns runtime directory \"%s\": %s\n",
+ target_dir, strerror(errno));
close(lock);
lock = -1;
}
- while (mount("", NETNS_RUN_DIR, "none", MS_SHARED | MS_REC, NULL)) {
+ while (mount("", target_dir, "none", mount_flag | MS_REC, NULL)) {
/* Fail unless we need to make the mount point */
- if (errno != EINVAL || made_netns_run_dir_mount) {
+ if (errno != EINVAL || made_dir_mount) {
fprintf(stderr, "mount --make-shared %s failed: %s\n",
- NETNS_RUN_DIR, strerror(errno));
+ target_dir, strerror(errno));
if (lock != -1) {
flock(lock, LOCK_UN);
close(lock);
@@ -979,23 +941,72 @@ static int netns_add(int argc, char **argv, bool create)
return -1;
}
- /* Upgrade NETNS_RUN_DIR to a mount point */
- if (mount(NETNS_RUN_DIR, NETNS_RUN_DIR, "none", MS_BIND | MS_REC, NULL)) {
+ /* Upgrade target directory to a mount point */
+ if (mount(target_dir, target_dir, "none", MS_BIND | MS_REC, NULL)) {
fprintf(stderr, "mount --bind %s %s failed: %s\n",
- NETNS_RUN_DIR, NETNS_RUN_DIR, strerror(errno));
+ target_dir, target_dir, strerror(errno));
if (lock != -1) {
flock(lock, LOCK_UN);
close(lock);
}
return -1;
}
- made_netns_run_dir_mount = 1;
+ made_dir_mount = 1;
}
if (lock != -1) {
flock(lock, LOCK_UN);
close(lock);
}
+ return 0;
+}
+
+static int netns_add(int argc, char **argv, bool create)
+{
+ /* This function creates a new network namespace and
+ * a new mount namespace and bind them into a well known
+ * location in the filesystem based on the name provided.
+ *
+ * If create is true, a new namespace will be created,
+ * otherwise an existing one will be attached to the file.
+ *
+ * The mount namespace is created so that any necessary
+ * userspace tweaks like remounting /sys, or bind mounting
+ * a new /etc/resolv.conf can be shared between users.
+ */
+ const char *name;
+ pid_t pid, child;
+ int event_fd;
+
+ if (create) {
+ if (argc < 1) {
+ fprintf(stderr, "No netns name specified\n");
+ return -1;
+ }
+ pid = getpid();
+ } else {
+ if (argc < 2) {
+ fprintf(stderr, "No netns name and PID specified\n");
+ return -1;
+ }
+
+ if (get_s32(&pid, argv[1], 0) || !pid) {
+ fprintf(stderr, "Invalid PID: %s\n", argv[1]);
+ return -1;
+ }
+ }
+ name = argv[0];
+
+ /* Pass the MS_SHARED flag to the mount of the network namespace
+ * directory to make it possible for network namespace mounts to
+ * propagate between mount namespaces. This makes it likely that a
+ * unmounting a network namespace file in one namespace will unmount the
+ * network namespace file in all namespaces allowing the network
+ * namespace to be freed sooner.
+ */
+ if (prepare_ns_mount_dir(NETNS_RUN_DIR, MS_SHARED))
+ return -1;
+
child = bind_ns_files_from_child(name, pid, &event_fd);
if (child < 0)
exit(EXIT_FAILURE);
@@ -1079,7 +1090,7 @@ static int netns_monitor(int argc, char **argv)
return -1;
}
- if (create_netns_dir())
+ if (ensure_dir(NETNS_RUN_DIR))
return -1;
if (inotify_add_watch(fd, NETNS_RUN_DIR, IN_CREATE | IN_DELETE) < 0) {
--
2.42.0
^ permalink raw reply related [flat|nested] 17+ messages in thread* [RFC PATCH iproute2-next 3/5] lib/namespace: Factor out code for reuse
2023-10-09 18:27 [RFC PATCH iproute2-next 0/5] Persisting of mount namespaces along with network namespaces Toke Høiland-Jørgensen
2023-10-09 18:27 ` [RFC PATCH iproute2-next 1/5] ip: Mount netns in child process instead of from inside the new namespace Toke Høiland-Jørgensen
2023-10-09 18:27 ` [RFC PATCH iproute2-next 2/5] ip: Split out code creating namespace mount dir so it can be reused Toke Høiland-Jørgensen
@ 2023-10-09 18:27 ` Toke Høiland-Jørgensen
2023-10-09 18:27 ` [RFC PATCH iproute2-next 4/5] ip: Also create and persist mount namespace when creating netns Toke Høiland-Jørgensen
` (2 subsequent siblings)
5 siblings, 0 replies; 17+ messages in thread
From: Toke Høiland-Jørgensen @ 2023-10-09 18:27 UTC (permalink / raw)
To: David Ahern, Stephen Hemminger
Cc: netdev, Toke Høiland-Jørgensen, Nicolas Dichtel,
Christian Brauner, Eric W . Biederman, David Laight
Factor out the code that switches namespaces and the code that sets up a new
mount namespace into utility functions that can be reused when we add mount
namespace pinning.
No functional change is intended with this patch.
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
---
include/namespace.h | 1 +
lib/namespace.c | 73 ++++++++++++++++++++++++++++++++-------------
2 files changed, 54 insertions(+), 20 deletions(-)
diff --git a/include/namespace.h b/include/namespace.h
index e47f9b5d49d1..b694a12e8397 100644
--- a/include/namespace.h
+++ b/include/namespace.h
@@ -49,6 +49,7 @@ static inline int setns(int fd, int nstype)
}
#endif /* HAVE_SETNS */
+int prepare_mountns(const char *name, bool do_unshare);
int netns_switch(char *netns);
int netns_get_fd(const char *netns);
int netns_foreach(int (*func)(char *nsname, void *arg), void *arg);
diff --git a/lib/namespace.c b/lib/namespace.c
index 1202fa85f97d..5e310762f34b 100644
--- a/lib/namespace.c
+++ b/lib/namespace.c
@@ -11,6 +11,25 @@
#include "utils.h"
#include "namespace.h"
+static struct namespace_typename {
+ int type; /* CLONE_NEW* */
+ const char *name; /* <type> */
+} namespace_names[] = {
+ { .type = CLONE_NEWNET, .name = "network" },
+ { .type = CLONE_NEWNS, .name = "mount" },
+ { .name = NULL }
+};
+
+static const char *ns_typename(int nstype)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(namespace_names); i++)
+ if (namespace_names[i].type == nstype)
+ return namespace_names[i].name;
+ return NULL;
+}
+
static void bind_etc(const char *name)
{
char etc_netns_path[sizeof(NETNS_ETC_DIR) + NAME_MAX];
@@ -42,30 +61,12 @@ static void bind_etc(const char *name)
closedir(dir);
}
-int netns_switch(char *name)
+int prepare_mountns(const char *name, bool do_unshare)
{
- char net_path[PATH_MAX];
- int netns;
unsigned long mountflags = 0;
struct statvfs fsstat;
- snprintf(net_path, sizeof(net_path), "%s/%s", NETNS_RUN_DIR, name);
- netns = open(net_path, O_RDONLY | O_CLOEXEC);
- if (netns < 0) {
- fprintf(stderr, "Cannot open network namespace \"%s\": %s\n",
- name, strerror(errno));
- return -1;
- }
-
- if (setns(netns, CLONE_NEWNET) < 0) {
- fprintf(stderr, "setting the network namespace \"%s\" failed: %s\n",
- name, strerror(errno));
- close(netns);
- return -1;
- }
- close(netns);
-
- if (unshare(CLONE_NEWNS) < 0) {
+ if (do_unshare && unshare(CLONE_NEWNS) < 0) {
fprintf(stderr, "unshare failed: %s\n", strerror(errno));
return -1;
}
@@ -97,6 +98,38 @@ int netns_switch(char *name)
return 0;
}
+static int switch_ns(const char *parent_dir, const char *name, int nstype)
+{
+ char ns_path[PATH_MAX];
+ int ns_fd;
+
+ snprintf(ns_path, sizeof(ns_path), "%s/%s", parent_dir, name);
+ ns_fd = open(ns_path, O_RDONLY | O_CLOEXEC);
+ if (ns_fd < 0) {
+ fprintf(stderr, "Cannot open %s namespace \"%s\": %s\n",
+ ns_typename(nstype), name, strerror(errno));
+ return -1;
+ }
+
+ if (setns(ns_fd, nstype) < 0) {
+ fprintf(stderr, "setting the %s namespace \"%s\" failed: %s\n",
+ ns_typename(nstype), name, strerror(errno));
+ close(ns_fd);
+ return -1;
+ }
+ close(ns_fd);
+ return 0;
+}
+
+int netns_switch(char *name)
+{
+
+ if (switch_ns(NETNS_RUN_DIR, name, CLONE_NEWNET))
+ return -1;
+
+ return prepare_mountns(name, true);
+}
+
int netns_get_fd(const char *name)
{
char pathbuf[PATH_MAX];
--
2.42.0
^ permalink raw reply related [flat|nested] 17+ messages in thread* [RFC PATCH iproute2-next 4/5] ip: Also create and persist mount namespace when creating netns
2023-10-09 18:27 [RFC PATCH iproute2-next 0/5] Persisting of mount namespaces along with network namespaces Toke Høiland-Jørgensen
` (2 preceding siblings ...)
2023-10-09 18:27 ` [RFC PATCH iproute2-next 3/5] lib/namespace: Factor out code for reuse Toke Høiland-Jørgensen
@ 2023-10-09 18:27 ` Toke Høiland-Jørgensen
2023-10-09 18:27 ` [RFC PATCH iproute2-next 5/5] lib/namespace: Also mount a bpffs instance inside new mount namespaces Toke Høiland-Jørgensen
2023-10-09 20:32 ` [RFC PATCH iproute2-next 0/5] Persisting of mount namespaces along with network namespaces Eric W. Biederman
5 siblings, 0 replies; 17+ messages in thread
From: Toke Høiland-Jørgensen @ 2023-10-09 18:27 UTC (permalink / raw)
To: David Ahern, Stephen Hemminger
Cc: netdev, Toke Høiland-Jørgensen, Nicolas Dichtel,
Christian Brauner, Eric W . Biederman, David Laight
When creating a new network namespace, persist not only the network namespace
reference itself, but also create and persist a new mount namespace that is
paired with the network namespace. This means that multiple subsequent
invocations of 'ip netns exec' will reuse the same mount namespace instead of
creating a new namespace on every entry, as was the behaviour before this patch.
The persistent mount namespace has the benefit that any new mounts created
inside the namespace will persist. Most notably, this is useful when using bpffs
instances along with 'ip netns', as these were previously transient to a single
'ip netns' invocation.
To preserve backwards compatibility, when changing namespaces we will fall back
to the old behaviour of creating a new mount namespace when switching netns, if
we can't find a persisted namespace to enter. This can happen if the netns
instance was created with a previous version of iproute2 that doesn't persist
the mount namespace.
One caveat of the mount namespace persistence is that we can't make the
containing directory mount shared, the way we do with the netns mounts. This
means that if 'ip netns del' is invoked *inside* a namespace created with 'ip
netns', the mount namespace reference will not be deleted and will stick around
in the original mount namespace where it was created. This is unavoidable
because it is not possible to create a bind-mounted reference to a mount
namespace inside that same mount namespace (as that would create a circular
reference).
In such a situation, we may end up with the network namespace reference being
removed but the mount namespace reference sticking around (the same thing can
happen if 'ip netns del' is executed with an older version of iproute2). In this
situation, a subsequent 'ip netns add' with the same namespace name will end up
reusing the old mount namespace reference.
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
---
Makefile | 2 ++
ip/ipnetns.c | 64 +++++++++++++++++++++++++++++++++++++++++++------
lib/namespace.c | 8 ++++++-
3 files changed, 66 insertions(+), 8 deletions(-)
diff --git a/Makefile b/Makefile
index 5c559c8dc805..aeb1ddc53c6a 100644
--- a/Makefile
+++ b/Makefile
@@ -19,6 +19,7 @@ SBINDIR?=/sbin
CONF_ETC_DIR?=/etc/iproute2
CONF_USR_DIR?=$(LIBDIR)/iproute2
NETNS_RUN_DIR?=/var/run/netns
+MNTNS_RUN_DIR?=/var/run/netns-mnt
NETNS_ETC_DIR?=/etc/netns
DATADIR?=$(PREFIX)/share
HDRDIR?=$(PREFIX)/include/iproute2
@@ -41,6 +42,7 @@ endif
DEFINES+=-DCONF_USR_DIR=\"$(CONF_USR_DIR)\" \
-DCONF_ETC_DIR=\"$(CONF_ETC_DIR)\" \
-DNETNS_RUN_DIR=\"$(NETNS_RUN_DIR)\" \
+ -DMNTNS_RUN_DIR=\"$(MNTNS_RUN_DIR)\" \
-DNETNS_ETC_DIR=\"$(NETNS_ETC_DIR)\" \
-DCONF_COLOR=$(CONF_COLOR)
diff --git a/ip/ipnetns.c b/ip/ipnetns.c
index 529790482683..551819577755 100644
--- a/ip/ipnetns.c
+++ b/ip/ipnetns.c
@@ -733,13 +733,24 @@ static int netns_identify(int argc, char **argv)
static int on_netns_del(char *nsname, void *arg)
{
- char netns_path[PATH_MAX];
+ char ns_path[PATH_MAX];
+ struct stat st;
+
+ snprintf(ns_path, sizeof(ns_path), "%s/%s", MNTNS_RUN_DIR, nsname);
+ if (!stat(ns_path, &st)) { /* may not exist if created by old iproute2 */
+ umount2(ns_path, MNT_DETACH);
+ if (unlink(ns_path) < 0) {
+ fprintf(stderr, "Cannot remove namespace file \"%s\": %s\n",
+ ns_path, strerror(errno));
+ return -1;
+ }
+ }
- snprintf(netns_path, sizeof(netns_path), "%s/%s", NETNS_RUN_DIR, nsname);
- umount2(netns_path, MNT_DETACH);
- if (unlink(netns_path) < 0) {
+ snprintf(ns_path, sizeof(ns_path), "%s/%s", NETNS_RUN_DIR, nsname);
+ umount2(ns_path, MNT_DETACH);
+ if (unlink(ns_path) < 0) {
fprintf(stderr, "Cannot remove namespace file \"%s\": %s\n",
- netns_path, strerror(errno));
+ ns_path, strerror(errno));
return -1;
}
return 0;
@@ -885,17 +896,46 @@ static int bind_ns_file(const char *parent, const char *nsfile,
return 0;
}
+static ino_t get_mnt_ino(pid_t pid)
+{
+ char path[PATH_MAX];
+ struct stat st;
+
+ snprintf(path, sizeof(path), "/proc/%u/ns/mnt", (unsigned) pid);
+
+ if (stat(path, &st) != 0) {
+ fprintf(stderr, "stat of %s failed: %s\n",
+ path, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ return st.st_ino;
+}
+
static pid_t bind_ns_files_from_child(const char *ns_name, pid_t target_pid,
int *fd)
{
+ ino_t mnt_ino;
pid_t child;
+ mnt_ino = get_mnt_ino(getpid());
+
child = fork_and_wait(fd);
if (child)
return child;
if (bind_ns_file(NETNS_RUN_DIR, "net", ns_name, target_pid))
exit(EXIT_FAILURE);
+
+ /* We can only bind the mount namespace reference if the target pid is
+ * actually in a different mount namespace than ourselves. We ignore any
+ * errors in creating the mount namespace reference because an old
+ * namespace mount may be present if a network namespace with the same
+ * name was previously removed by an older version of iproute2; in this
+ * case that old reference will just be reused.
+ */
+ if (mnt_ino != get_mnt_ino(target_pid))
+ bind_ns_file(MNTNS_RUN_DIR, "mnt", ns_name, target_pid);
+
exit(EXIT_SUCCESS);
}
@@ -1003,8 +1043,13 @@ static int netns_add(int argc, char **argv, bool create)
* unmounting a network namespace file in one namespace will unmount the
* network namespace file in all namespaces allowing the network
* namespace to be freed sooner.
+ *
+ * The mount namespace directory cannot be shared because it's not
+ * possible to mount references to a mount namespace inside that
+ * namespace itself.
*/
- if (prepare_ns_mount_dir(NETNS_RUN_DIR, MS_SHARED))
+ if (prepare_ns_mount_dir(NETNS_RUN_DIR, MS_SHARED) ||
+ prepare_ns_mount_dir(MNTNS_RUN_DIR, MS_SLAVE))
return -1;
child = bind_ns_files_from_child(name, pid, &event_fd);
@@ -1012,12 +1057,17 @@ static int netns_add(int argc, char **argv, bool create)
exit(EXIT_FAILURE);
if (create) {
- if (unshare(CLONE_NEWNET) < 0) {
+ if (unshare(CLONE_NEWNET | CLONE_NEWNS) < 0) {
fprintf(stderr, "Failed to create a new network namespace \"%s\": %s\n",
name, strerror(errno));
close(event_fd);
exit(EXIT_FAILURE);
}
+
+ if (prepare_mountns(name, false)) {
+ close(event_fd);
+ exit(EXIT_FAILURE);
+ }
}
return sync_with_child(child, event_fd);
diff --git a/lib/namespace.c b/lib/namespace.c
index 5e310762f34b..5f2449fb0003 100644
--- a/lib/namespace.c
+++ b/lib/namespace.c
@@ -127,7 +127,13 @@ int netns_switch(char *name)
if (switch_ns(NETNS_RUN_DIR, name, CLONE_NEWNET))
return -1;
- return prepare_mountns(name, true);
+ /* Try to enter an existing persisted mount namespace. If this fails,
+ * preserve the old behaviour of creating a new namespace on entry.
+ */
+ if (switch_ns(MNTNS_RUN_DIR, name, CLONE_NEWNS))
+ return prepare_mountns(name, true);
+
+ return 0;
}
int netns_get_fd(const char *name)
--
2.42.0
^ permalink raw reply related [flat|nested] 17+ messages in thread* [RFC PATCH iproute2-next 5/5] lib/namespace: Also mount a bpffs instance inside new mount namespaces
2023-10-09 18:27 [RFC PATCH iproute2-next 0/5] Persisting of mount namespaces along with network namespaces Toke Høiland-Jørgensen
` (3 preceding siblings ...)
2023-10-09 18:27 ` [RFC PATCH iproute2-next 4/5] ip: Also create and persist mount namespace when creating netns Toke Høiland-Jørgensen
@ 2023-10-09 18:27 ` Toke Høiland-Jørgensen
2023-10-09 20:32 ` [RFC PATCH iproute2-next 0/5] Persisting of mount namespaces along with network namespaces Eric W. Biederman
5 siblings, 0 replies; 17+ messages in thread
From: Toke Høiland-Jørgensen @ 2023-10-09 18:27 UTC (permalink / raw)
To: David Ahern, Stephen Hemminger
Cc: netdev, Toke Høiland-Jørgensen, Nicolas Dichtel,
Christian Brauner, Eric W . Biederman, David Laight
When creating a new mount namespace, we remount /sys inside that namespace,
which means there is no bpffs available unless it is manually remounted later.
To make it easier to work with BPF in combination with 'ip netns', make sure we
always mount a bpffs instance to /sys/fs/bpf after creating a new namespace.
Since bpffs may not always be available, we only warn if the mounting fails, but
carry on regardless.
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
---
lib/namespace.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/lib/namespace.c b/lib/namespace.c
index 5f2449fb0003..62456ab24e4f 100644
--- a/lib/namespace.c
+++ b/lib/namespace.c
@@ -93,6 +93,9 @@ int prepare_mountns(const char *name, bool do_unshare)
return -1;
}
+ if (mount("bpf", "/sys/fs/bpf", "bpf", mountflags, NULL) < 0)
+ fprintf(stderr, "could not mount /sys/fs/bpf inside namespace: %s. continuing anyway\n",strerror(errno));
+
/* Setup bind mounts for config files in /etc */
bind_etc(name);
return 0;
--
2.42.0
^ permalink raw reply related [flat|nested] 17+ messages in thread* Re: [RFC PATCH iproute2-next 0/5] Persisting of mount namespaces along with network namespaces
2023-10-09 18:27 [RFC PATCH iproute2-next 0/5] Persisting of mount namespaces along with network namespaces Toke Høiland-Jørgensen
` (4 preceding siblings ...)
2023-10-09 18:27 ` [RFC PATCH iproute2-next 5/5] lib/namespace: Also mount a bpffs instance inside new mount namespaces Toke Høiland-Jørgensen
@ 2023-10-09 20:32 ` Eric W. Biederman
2023-10-09 22:03 ` Toke Høiland-Jørgensen
2023-10-10 8:42 ` David Laight
5 siblings, 2 replies; 17+ messages in thread
From: Eric W. Biederman @ 2023-10-09 20:32 UTC (permalink / raw)
To: Toke Høiland-Jørgensen
Cc: David Ahern, Stephen Hemminger, netdev, Nicolas Dichtel,
Christian Brauner, David Laight
Toke Høiland-Jørgensen <toke@redhat.com> writes:
> The 'ip netns' command is used for setting up network namespaces with persistent
> named references, and is integrated into various other commands of iproute2 via
> the -n switch.
>
> This is useful both for testing setups and for simple script-based namespacing
> but has one drawback: the lack of persistent mounts inside the spawned
> namespace. This is particularly apparent when working with BPF programs that use
> pinning to bpffs: by default no bpffs is available inside a namespace, and
> even if mounting one, that fs disappears as soon as the calling
> command exits.
It would be entirely reasonable to copy mounts like /sys/fs/bpf from the
original mount namespace into the temporary mount namespace used by
"ip netns".
I would call it a bug that "ip netns" doesn't do that already.
I suspect that "ip netns" does copy the mounts from the old sysfs onto
the new sysfs is your entire problem.
Or is their a reason that bpffs should be per network namespace?
> The underlying cause for this is that iproute2 will create a new mount namespace
> every time it switches into a network namespace. This is needed to be able to
> mount a /sys filesystem that shows the correct network device information, but
> has the unfortunate side effect of making mounts entirely transient for any 'ip
> netns' invocation.
Mount propagation can be made to work if necessary, that would solve the
transient problem.
> This series is an attempt to fix this situation, by persisting a mount namespace
> alongside the persistent network namespace (in a separate directory,
> /run/netns-mnt). Doing this allows us to still have a consistent /sys inside
> the namespace, but with persistence so any mounts survive.
I really don't like that direction.
"ip netns" was designed and really should continue to be a command that
makes the world look like it has a single network namespace, for
compatibility with old code. Part of that old code "ip netns" supports
is "ip" itself.
I think you are making bpffs unnecessarily per network namespace.
> This mode does come with some caveats. I'm sending this as RFC to get feedback
> on whether this is the right thing to do, especially considering backwards
> compatibility. On balance, I think that the approach taken here of
> unconditionally persisting the mount namespace, and using that persistent
> reference whenever it exists, is better than the current behaviour, and that
> while it does represent a change in behaviour it is backwards compatible in a
> way that won't cause issues. But please do comment on this; see the patch
> description of patch 4 for details.
As I understand it this will cause a problem for any application that
is network namespace aware and does not use "ip netns" to wrap itself.
I am fairly certain that pinning the mount namespace will result in
never seeing an update of /etc/resolve.conf. At least if you
are on a system that has /etc/netns/NAME/resolve.conf
Unless I am missing something I think you are trying to solve the wrong
problem. I think all it will take is for the new mount of /sys to have
the same mounts on it as the previous mount of /sys.
Eric
^ permalink raw reply [flat|nested] 17+ messages in thread* Re: [RFC PATCH iproute2-next 0/5] Persisting of mount namespaces along with network namespaces
2023-10-09 20:32 ` [RFC PATCH iproute2-next 0/5] Persisting of mount namespaces along with network namespaces Eric W. Biederman
@ 2023-10-09 22:03 ` Toke Høiland-Jørgensen
2023-10-10 0:14 ` Eric W. Biederman
2023-10-10 8:42 ` David Laight
1 sibling, 1 reply; 17+ messages in thread
From: Toke Høiland-Jørgensen @ 2023-10-09 22:03 UTC (permalink / raw)
To: Eric W. Biederman
Cc: David Ahern, Stephen Hemminger, netdev, Nicolas Dichtel,
Christian Brauner, David Laight
"Eric W. Biederman" <ebiederm@xmission.com> writes:
> Toke Høiland-Jørgensen <toke@redhat.com> writes:
>
>> The 'ip netns' command is used for setting up network namespaces with persistent
>> named references, and is integrated into various other commands of iproute2 via
>> the -n switch.
>>
>> This is useful both for testing setups and for simple script-based namespacing
>> but has one drawback: the lack of persistent mounts inside the spawned
>> namespace. This is particularly apparent when working with BPF programs that use
>> pinning to bpffs: by default no bpffs is available inside a namespace, and
>> even if mounting one, that fs disappears as soon as the calling
>> command exits.
>
> It would be entirely reasonable to copy mounts like /sys/fs/bpf from the
> original mount namespace into the temporary mount namespace used by
> "ip netns".
>
> I would call it a bug that "ip netns" doesn't do that already.
>
> I suspect that "ip netns" does copy the mounts from the old sysfs onto
> the new sysfs is your entire problem.
How would it do that? Walk mtab and remount everything identically after
remounting /sys? Or is there a smarter way to go about this?
> Or is their a reason that bpffs should be per network namespace?
Well, I first ran into this issue because of a bug report to
xdp-tools/libxdp about things not working correctly in network
namespaces:
https://github.com/xdp-project/xdp-tools/issues/364
And libxdp does assume that there's a separate bpffs per network
namespace: it persists things into the bpffs that is tied to the network
devices in the current namespace. So if the bpffs is shared, an
application running inside the network namespace could access XDP
programs loaded in the root namespace. I don't know, but suspect, that
such assumptions would be relatively common in networking BPF programs
that use pinning (the pinning support in libbpf and iproute2 itself at
least have the same leaking problem if the bpffs is shared).
>> The underlying cause for this is that iproute2 will create a new mount namespace
>> every time it switches into a network namespace. This is needed to be able to
>> mount a /sys filesystem that shows the correct network device information, but
>> has the unfortunate side effect of making mounts entirely transient for any 'ip
>> netns' invocation.
>
> Mount propagation can be made to work if necessary, that would solve the
> transient problem.
Is mount propagation different from the remount thing you mentioned
above, or is this something different?
(Sorry for being hopelessly naive about this, as you probably guessed
from my previous email asking about this, I'm only now learning about
all the intricacies fs mounts).
>> This series is an attempt to fix this situation, by persisting a mount namespace
>> alongside the persistent network namespace (in a separate directory,
>> /run/netns-mnt). Doing this allows us to still have a consistent /sys inside
>> the namespace, but with persistence so any mounts survive.
>
> I really don't like that direction.
>
> "ip netns" was designed and really should continue to be a command that
> makes the world look like it has a single network namespace, for
> compatibility with old code. Part of that old code "ip netns" supports
> is "ip" itself.
Well my idea with this change was to keep the functionality as close to
what 'ip' currently does, but just have mounts persist across
invocations.
> I think you are making bpffs unnecessarily per network namespace.
See above.
>> This mode does come with some caveats. I'm sending this as RFC to get feedback
>> on whether this is the right thing to do, especially considering backwards
>> compatibility. On balance, I think that the approach taken here of
>> unconditionally persisting the mount namespace, and using that persistent
>> reference whenever it exists, is better than the current behaviour, and that
>> while it does represent a change in behaviour it is backwards compatible in a
>> way that won't cause issues. But please do comment on this; see the patch
>> description of patch 4 for details.
>
> As I understand it this will cause a problem for any application that
> is network namespace aware and does not use "ip netns" to wrap itself.
>
> I am fairly certain that pinning the mount namespace will result in
> never seeing an update of /etc/resolve.conf. At least if you
> are on a system that has /etc/netns/NAME/resolve.conf
I was actually wondering about that /etc bind mounting support while I
was looking at this code. Could you please elaborate a bit on what that
is used for, exactly? :)
Also, if staleness of the /etc bind mounts is an issue, those could be
redone on every entry, couldn't they?
-Toke
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [RFC PATCH iproute2-next 0/5] Persisting of mount namespaces along with network namespaces
2023-10-09 22:03 ` Toke Høiland-Jørgensen
@ 2023-10-10 0:14 ` Eric W. Biederman
2023-10-10 13:38 ` Toke Høiland-Jørgensen
0 siblings, 1 reply; 17+ messages in thread
From: Eric W. Biederman @ 2023-10-10 0:14 UTC (permalink / raw)
To: Toke Høiland-Jørgensen
Cc: David Ahern, Stephen Hemminger, netdev, Nicolas Dichtel,
Christian Brauner, David Laight
Toke Høiland-Jørgensen <toke@redhat.com> writes:
> "Eric W. Biederman" <ebiederm@xmission.com> writes:
>
>> Toke Høiland-Jørgensen <toke@redhat.com> writes:
>>
>>> The 'ip netns' command is used for setting up network namespaces with persistent
>>> named references, and is integrated into various other commands of iproute2 via
>>> the -n switch.
>>>
>>> This is useful both for testing setups and for simple script-based namespacing
>>> but has one drawback: the lack of persistent mounts inside the spawned
>>> namespace. This is particularly apparent when working with BPF programs that use
>>> pinning to bpffs: by default no bpffs is available inside a namespace, and
>>> even if mounting one, that fs disappears as soon as the calling
>>> command exits.
>>
>> It would be entirely reasonable to copy mounts like /sys/fs/bpf from the
>> original mount namespace into the temporary mount namespace used by
>> "ip netns".
>>
>> I would call it a bug that "ip netns" doesn't do that already.
>>
>> I suspect that "ip netns" does copy the mounts from the old sysfs onto
>> the new sysfs is your entire problem.
>
> How would it do that? Walk mtab and remount everything identically after
> remounting /sys? Or is there a smarter way to go about this?
There are not many places to look so something like this is probably sufficient:
# stat all of the possible/probable mount points and see if there is
# something mounted there. If so recursive bind whatever is there onto
# the new /sys
for dir in /old/sys/fs/* /old/sys/kernel/*; do
if [ $(stat --format '%d' "$dir") = $(stat --format '%d' "$dir/") ; then
newdir = $(echo $dir | sed -e s/old/new/)
mount --rbind $dir/ $newdir/
fi
done
If the concern is being robust for the future the mount points can also
be enumerated by looking in one of /proc/self/mounts,
/proc/self/mountinfo, or /proc/self/mountstats.
I am not certain which is less work parsing a file with lots of fields,
or reading a directory and stating the returned files from readdir.
>> Or is their a reason that bpffs should be per network namespace?
>
> Well, I first ran into this issue because of a bug report to
> xdp-tools/libxdp about things not working correctly in network
> namespaces:
>
> https://github.com/xdp-project/xdp-tools/issues/364
>
> And libxdp does assume that there's a separate bpffs per network
> namespace: it persists things into the bpffs that is tied to the network
> devices in the current namespace. So if the bpffs is shared, an
> application running inside the network namespace could access XDP
> programs loaded in the root namespace. I don't know, but suspect, that
> such assumptions would be relatively common in networking BPF programs
> that use pinning (the pinning support in libbpf and iproute2 itself at
> least have the same leaking problem if the bpffs is shared).
Are the names of the values truly network namespace specific?
I did not see any mention of the things that are persisted in the ticket
you pointed me at, and unfortunately I am not familiar with xdp.
Last I looked until all of the cpu side channels are closed it is
unfortunately unsafe to load ebpf programs with anything less than
CAP_SYS_ADMIN (aka with permission to see and administer the entire
system). So from a system point of view I really don't see a
fundamental danger from having a global /sys/fs/bpf.
If there are name conflicts in /sys/fs/bpf because of duplicate names in
different network namespaces I can see that being a problem.
At that point the name conflicts either need to be fixed or we
fundamentally need to have multiple mount points for bpffs.
Probably under something like /run/netns-mounts/NAME/.
With ip netns updated to mount the appropriate filesystem.
>>> The underlying cause for this is that iproute2 will create a new mount namespace
>>> every time it switches into a network namespace. This is needed to be able to
>>> mount a /sys filesystem that shows the correct network device information, but
>>> has the unfortunate side effect of making mounts entirely transient for any 'ip
>>> netns' invocation.
>>
>> Mount propagation can be made to work if necessary, that would solve the
>> transient problem.
>
> Is mount propagation different from the remount thing you mentioned
> above, or is this something different?
>
> (Sorry for being hopelessly naive about this, as you probably guessed
> from my previous email asking about this, I'm only now learning about
> all the intricacies fs mounts).
Mount propagation is a way to configure a mount namespace (before
creating a new one) that will cause mounts created in the first mount
namespace to be created in it's children, and cause mounts created in
the children to be created in the parent (depending on how things are
configured).
It is not my favorite feature (it makes locking of mount namespaces
terrible) and it is probably too clever by half, unfortunately systemd
started enabling mount propagation by default, so we are stuck with it.
>>> This series is an attempt to fix this situation, by persisting a mount namespace
>>> alongside the persistent network namespace (in a separate directory,
>>> /run/netns-mnt). Doing this allows us to still have a consistent /sys inside
>>> the namespace, but with persistence so any mounts survive.
>>
>> I really don't like that direction.
>>
>> "ip netns" was designed and really should continue to be a command that
>> makes the world look like it has a single network namespace, for
>> compatibility with old code. Part of that old code "ip netns" supports
>> is "ip" itself.
>
> Well my idea with this change was to keep the functionality as close to
> what 'ip' currently does, but just have mounts persist across
> invocations.
>
>> I think you are making bpffs unnecessarily per network namespace.
>
> See above.
>
>>> This mode does come with some caveats. I'm sending this as RFC to get feedback
>>> on whether this is the right thing to do, especially considering backwards
>>> compatibility. On balance, I think that the approach taken here of
>>> unconditionally persisting the mount namespace, and using that persistent
>>> reference whenever it exists, is better than the current behaviour, and that
>>> while it does represent a change in behaviour it is backwards compatible in a
>>> way that won't cause issues. But please do comment on this; see the patch
>>> description of patch 4 for details.
>>
>> As I understand it this will cause a problem for any application that
>> is network namespace aware and does not use "ip netns" to wrap itself.
>>
>> I am fairly certain that pinning the mount namespace will result in
>> never seeing an update of /etc/resolve.conf. At least if you
>> are on a system that has /etc/netns/NAME/resolve.conf
>
> I was actually wondering about that /etc bind mounting support while I
> was looking at this code. Could you please elaborate a bit on what that
> is used for, exactly? :)
The idea is that you can have separate static configuration depending
upon your network namespace.
A real world case is vpning into several company internal networks.
Each company network uses overlapping portions of the 192.168.x.x
network.
Each company network will want it's own dns servers and possibly other
network configuration as well.
For it to make sense you really only need one company network and a home
network. One of which you could stash in a network namespace to prevent
conflicts.
I don't know if supporting that ever caught on very strongly, but
I tried to build a template that would work for that and similar cases.
> Also, if staleness of the /etc bind mounts is an issue, those could be
> redone on every entry, couldn't they?
They already are ;)
Eric
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [RFC PATCH iproute2-next 0/5] Persisting of mount namespaces along with network namespaces
2023-10-10 0:14 ` Eric W. Biederman
@ 2023-10-10 13:38 ` Toke Høiland-Jørgensen
2023-10-10 19:19 ` Eric W. Biederman
0 siblings, 1 reply; 17+ messages in thread
From: Toke Høiland-Jørgensen @ 2023-10-10 13:38 UTC (permalink / raw)
To: Eric W. Biederman
Cc: David Ahern, Stephen Hemminger, netdev, Nicolas Dichtel,
Christian Brauner, David Laight
"Eric W. Biederman" <ebiederm@xmission.com> writes:
> Toke Høiland-Jørgensen <toke@redhat.com> writes:
>
>> "Eric W. Biederman" <ebiederm@xmission.com> writes:
>>
>>> Toke Høiland-Jørgensen <toke@redhat.com> writes:
>>>
>>>> The 'ip netns' command is used for setting up network namespaces with persistent
>>>> named references, and is integrated into various other commands of iproute2 via
>>>> the -n switch.
>>>>
>>>> This is useful both for testing setups and for simple script-based namespacing
>>>> but has one drawback: the lack of persistent mounts inside the spawned
>>>> namespace. This is particularly apparent when working with BPF programs that use
>>>> pinning to bpffs: by default no bpffs is available inside a namespace, and
>>>> even if mounting one, that fs disappears as soon as the calling
>>>> command exits.
>>>
>>> It would be entirely reasonable to copy mounts like /sys/fs/bpf from the
>>> original mount namespace into the temporary mount namespace used by
>>> "ip netns".
>>>
>>> I would call it a bug that "ip netns" doesn't do that already.
>>>
>>> I suspect that "ip netns" does copy the mounts from the old sysfs onto
>>> the new sysfs is your entire problem.
>>
>> How would it do that? Walk mtab and remount everything identically after
>> remounting /sys? Or is there a smarter way to go about this?
>
> There are not many places to look so something like this is probably sufficient:
>
> # stat all of the possible/probable mount points and see if there is
> # something mounted there. If so recursive bind whatever is there onto
> # the new /sys
>
> for dir in /old/sys/fs/* /old/sys/kernel/*; do
> if [ $(stat --format '%d' "$dir") = $(stat --format '%d' "$dir/") ; then
What is this comparison supposed to do? I couldn't find any directories
in /sys/fs/* where this was *not* true, regardless of whether there's a
mount there or not.
> newdir = $(echo $dir | sed -e s/old/new/)
> mount --rbind $dir/ $newdir/
> fi
> done
>
> If the concern is being robust for the future the mount points can also
> be enumerated by looking in one of /proc/self/mounts,
> /proc/self/mountinfo, or /proc/self/mountstats.
>
> I am not certain which is less work parsing a file with lots of fields,
> or reading a directory and stating the returned files from readdir.
Right, fair point.
>>> Or is their a reason that bpffs should be per network namespace?
>>
>> Well, I first ran into this issue because of a bug report to
>> xdp-tools/libxdp about things not working correctly in network
>> namespaces:
>>
>> https://github.com/xdp-project/xdp-tools/issues/364
>>
>> And libxdp does assume that there's a separate bpffs per network
>> namespace: it persists things into the bpffs that is tied to the network
>> devices in the current namespace. So if the bpffs is shared, an
>> application running inside the network namespace could access XDP
>> programs loaded in the root namespace. I don't know, but suspect, that
>> such assumptions would be relatively common in networking BPF programs
>> that use pinning (the pinning support in libbpf and iproute2 itself at
>> least have the same leaking problem if the bpffs is shared).
>
> Are the names of the values truly network namespace specific?
>
> I did not see any mention of the things that are persisted in the ticket
> you pointed me at, and unfortunately I am not familiar with xdp.
>
> Last I looked until all of the cpu side channels are closed it is
> unfortunately unsafe to load ebpf programs with anything less than
> CAP_SYS_ADMIN (aka with permission to see and administer the entire
> system). So from a system point of view I really don't see a
> fundamental danger from having a global /sys/fs/bpf.
>
> If there are name conflicts in /sys/fs/bpf because of duplicate names in
> different network namespaces I can see that being a problem.
Yeah, you're right that someone loading a BPF program generally has
permissions enough that they can break out of any containment if they
want, but applications do make assumptions about the contents of the
pinning directory that can lead to conflicts.
A couple of examples:
- libxdp will persist files in /sys/fs/bpf/dispatch-$ifindex-$prog_id
- If someone sets the 'pinning' attribute on a map definition in a BPF
file, libbpf will pin those files in /sys/fs/bpf/$map_name
The first one leads to obvious conflicts if shared across network
namespaces because of ifindex collisions. The second one leads to
potential false sharing of state across what are supposed to be
independent networking domains (e.g., if the bpffs is shared, loading
xdp-filter inside a namespace will share the state with another instance
loaded in another namespace, which would no doubt be surprising).
> At that point the name conflicts either need to be fixed or we
> fundamentally need to have multiple mount points for bpffs.
> Probably under something like /run/netns-mounts/NAME/.
>
> With ip netns updated to mount the appropriate filesystem.
I don't think it's feasible to fix the conflicts; they've been around
for a while and are part of application API in some cases. Plus, we
don't know of all BPF-using applications.
We could have 'ip' manage separate bpffs mounts per namespace and
bind-mount them into each netns (I think that's what you're suggesting),
but that would basically achieve the same thing as the mountns
persisting I am proposing in this series, but only as a special case for
bpffs. So why not do the more flexible thing and persist the whole
mountns (so applications inside the namespace can actually mount
additional things and have them stick around)? The current behaviour
seems very surprising...
>>>> The underlying cause for this is that iproute2 will create a new mount namespace
>>>> every time it switches into a network namespace. This is needed to be able to
>>>> mount a /sys filesystem that shows the correct network device information, but
>>>> has the unfortunate side effect of making mounts entirely transient for any 'ip
>>>> netns' invocation.
>>>
>>> Mount propagation can be made to work if necessary, that would solve the
>>> transient problem.
>>
>> Is mount propagation different from the remount thing you mentioned
>> above, or is this something different?
>>
>> (Sorry for being hopelessly naive about this, as you probably guessed
>> from my previous email asking about this, I'm only now learning about
>> all the intricacies fs mounts).
>
> Mount propagation is a way to configure a mount namespace (before
> creating a new one) that will cause mounts created in the first mount
> namespace to be created in it's children, and cause mounts created in
> the children to be created in the parent (depending on how things are
> configured).
>
> It is not my favorite feature (it makes locking of mount namespaces
> terrible) and it is probably too clever by half, unfortunately systemd
> started enabling mount propagation by default, so we are stuck with it.
Right. AFAICT the current iproute2 code explicitly tries to avoid that
when creating a mountns (it does a 'mount --make-rslave /'); so you're
saying we should change that?
>>>> This series is an attempt to fix this situation, by persisting a mount namespace
>>>> alongside the persistent network namespace (in a separate directory,
>>>> /run/netns-mnt). Doing this allows us to still have a consistent /sys inside
>>>> the namespace, but with persistence so any mounts survive.
>>>
>>> I really don't like that direction.
>>>
>>> "ip netns" was designed and really should continue to be a command that
>>> makes the world look like it has a single network namespace, for
>>> compatibility with old code. Part of that old code "ip netns" supports
>>> is "ip" itself.
>>
>> Well my idea with this change was to keep the functionality as close to
>> what 'ip' currently does, but just have mounts persist across
>> invocations.
>>
>>> I think you are making bpffs unnecessarily per network namespace.
>>
>> See above.
>>
>>>> This mode does come with some caveats. I'm sending this as RFC to get feedback
>>>> on whether this is the right thing to do, especially considering backwards
>>>> compatibility. On balance, I think that the approach taken here of
>>>> unconditionally persisting the mount namespace, and using that persistent
>>>> reference whenever it exists, is better than the current behaviour, and that
>>>> while it does represent a change in behaviour it is backwards compatible in a
>>>> way that won't cause issues. But please do comment on this; see the patch
>>>> description of patch 4 for details.
>>>
>>> As I understand it this will cause a problem for any application that
>>> is network namespace aware and does not use "ip netns" to wrap itself.
>>>
>>> I am fairly certain that pinning the mount namespace will result in
>>> never seeing an update of /etc/resolve.conf. At least if you
>>> are on a system that has /etc/netns/NAME/resolve.conf
>>
>> I was actually wondering about that /etc bind mounting support while I
>> was looking at this code. Could you please elaborate a bit on what that
>> is used for, exactly? :)
>
> The idea is that you can have separate static configuration depending
> upon your network namespace.
>
> A real world case is vpning into several company internal networks.
> Each company network uses overlapping portions of the 192.168.x.x
> network.
> Each company network will want it's own dns servers and possibly other
> network configuration as well.
>
> For it to make sense you really only need one company network and a home
> network. One of which you could stash in a network namespace to prevent
> conflicts.
>
> I don't know if supporting that ever caught on very strongly, but
> I tried to build a template that would work for that and similar cases.
Hmm, I actually use a network namespace for something like that myself,
but I'm not using that functionality because I had no idea it existed
until now... :)
>> Also, if staleness of the /etc bind mounts is an issue, those could be
>> redone on every entry, couldn't they?
>
> They already are ;)
Right, by the transient mount namespaces, what I meant was that this
could be preserved even with a persistent mount namespace.
-Toke
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [RFC PATCH iproute2-next 0/5] Persisting of mount namespaces along with network namespaces
2023-10-10 13:38 ` Toke Høiland-Jørgensen
@ 2023-10-10 19:19 ` Eric W. Biederman
2023-10-11 13:49 ` Toke Høiland-Jørgensen
0 siblings, 1 reply; 17+ messages in thread
From: Eric W. Biederman @ 2023-10-10 19:19 UTC (permalink / raw)
To: Toke Høiland-Jørgensen
Cc: David Ahern, Stephen Hemminger, netdev, Nicolas Dichtel,
Christian Brauner, David Laight
Toke Høiland-Jørgensen <toke@redhat.com> writes:
> "Eric W. Biederman" <ebiederm@xmission.com> writes:
>
>> Toke Høiland-Jørgensen <toke@redhat.com> writes:
>>
>>> "Eric W. Biederman" <ebiederm@xmission.com> writes:
>>>
>>>> Toke Høiland-Jørgensen <toke@redhat.com> writes:
>>
>> There are not many places to look so something like this is probably sufficient:
>>
>> # stat all of the possible/probable mount points and see if there is
>> # something mounted there. If so recursive bind whatever is there onto
>> # the new /sys
>>
>> for dir in /old/sys/fs/* /old/sys/kernel/*; do
>> if [ $(stat --format '%d' "$dir") = $(stat --format '%d' "$dir/") ; then
>
> What is this comparison supposed to do? I couldn't find any directories
> in /sys/fs/* where this was *not* true, regardless of whether there's a
> mount there or not.
Bah. I think I got my logic scrambled. I can only get it to work
by comparing the filesystems device on /sys/fs to the device on
/sys/fs/cgroup etc.
The idea is that st_dev changes between filesystems. So you can detect
a filesystem change based on st_dev.
I thought the $dir vs $dir/ would have allowed stating the underlying
directory verses the mount, but apparently my memory go that one wrong.
Which makes my command actually something like:
sys_dev=$(stat --format='%d' /sys)
for dir in /old/sys/fs/* /old/sys/kernel/*; do
if [ $(stat --format '%d' "$dir") -ne $sys_dev ] ; then
echo $dir is a mount point
fi
done
>>>> Or is their a reason that bpffs should be per network namespace?
>>>
>>> Well, I first ran into this issue because of a bug report to
>>> xdp-tools/libxdp about things not working correctly in network
>>> namespaces:
>>>
>>> https://github.com/xdp-project/xdp-tools/issues/364
>>>
>>> And libxdp does assume that there's a separate bpffs per network
>>> namespace: it persists things into the bpffs that is tied to the network
>>> devices in the current namespace. So if the bpffs is shared, an
>>> application running inside the network namespace could access XDP
>>> programs loaded in the root namespace. I don't know, but suspect, that
>>> such assumptions would be relatively common in networking BPF programs
>>> that use pinning (the pinning support in libbpf and iproute2 itself at
>>> least have the same leaking problem if the bpffs is shared).
>>
>> Are the names of the values truly network namespace specific?
>>
>> I did not see any mention of the things that are persisted in the ticket
>> you pointed me at, and unfortunately I am not familiar with xdp.
>>
>> Last I looked until all of the cpu side channels are closed it is
>> unfortunately unsafe to load ebpf programs with anything less than
>> CAP_SYS_ADMIN (aka with permission to see and administer the entire
>> system). So from a system point of view I really don't see a
>> fundamental danger from having a global /sys/fs/bpf.
>>
>> If there are name conflicts in /sys/fs/bpf because of duplicate names in
>> different network namespaces I can see that being a problem.
>
> Yeah, you're right that someone loading a BPF program generally has
> permissions enough that they can break out of any containment if they
> want, but applications do make assumptions about the contents of the
> pinning directory that can lead to conflicts.
>
> A couple of examples:
>
> - libxdp will persist files in /sys/fs/bpf/dispatch-$ifindex-$prog_id
>
> - If someone sets the 'pinning' attribute on a map definition in a BPF
> file, libbpf will pin those files in /sys/fs/bpf/$map_name
>
> The first one leads to obvious conflicts if shared across network
> namespaces because of ifindex collisions. The second one leads to
> potential false sharing of state across what are supposed to be
> independent networking domains (e.g., if the bpffs is shared, loading
> xdp-filter inside a namespace will share the state with another instance
> loaded in another namespace, which would no doubt be surprising).
Sigh. So non-default network namespaces can't use /sys/fs/bpf,
because of silly userspace assumptions. So the entries need to be
namespaced to prevent conflicts.
>> At that point the name conflicts either need to be fixed or we
>> fundamentally need to have multiple mount points for bpffs.
>> Probably under something like /run/netns-mounts/NAME/.
>>
>> With ip netns updated to mount the appropriate filesystem.
>
> I don't think it's feasible to fix the conflicts; they've been around
> for a while and are part of application API in some cases. Plus, we
> don't know of all BPF-using applications.
>
> We could have 'ip' manage separate bpffs mounts per namespace and
> bind-mount them into each netns (I think that's what you're suggesting),
> but that would basically achieve the same thing as the mountns
> persisting I am proposing in this series, but only as a special case for
> bpffs. So why not do the more flexible thing and persist the whole
> mountns (so applications inside the namespace can actually mount
> additional things and have them stick around)? The current behaviour
> seems very surprising...
I don't like persisting the entire mount namespace because it is hard
for a system administrator to see, it is difficult for something outside
of that mount namespace to access, and it is as easy to persist a
mistake as it is to persist something deliberate.
My proposal:
On "ip netns add NAME"
- create the network namespace and mount it at /run/netns/NAME
- mount the appropriate sysfs at /run/netns-mounts/NAME/sys
- mount the appropriate bpffs at /run/netns-mounts/NAME/sys/fs/bpf
On "ip netns delete NAME"
- umount --recursive /run/netns-mounts/NAME
- unlink /run/netns-mounts/NAME
- cleanup /run/netns/NAME as we do today.
On "ip netns exec NAME"
- Walk through /run/netns-mounts/NAME like we do /etc/netns/NAME/
and perform bind mounts.
That way everything that needs to persist really and truly persists.
Applications that don't use "ip netns exec" can continue to use the
network namespace and everything that goes along with it without
problems.
>> Mount propagation is a way to configure a mount namespace (before
>> creating a new one) that will cause mounts created in the first mount
>> namespace to be created in it's children, and cause mounts created in
>> the children to be created in the parent (depending on how things are
>> configured).
>>
>> It is not my favorite feature (it makes locking of mount namespaces
>> terrible) and it is probably too clever by half, unfortunately systemd
>> started enabling mount propagation by default, so we are stuck with it.
>
> Right. AFAICT the current iproute2 code explicitly tries to avoid that
> when creating a mountns (it does a 'mount --make-rslave /'); so you're
> saying we should change that?
If it makes sense.
I believe I added the 'mount --make-rslave /' because otherwise all
mount activity was propagating back, and making a mess. Especially when
I was unmounting /sys.
I am not a huge fan of mount propagation it has lots of surprising
little details that need to be set just right, to not cause problems.
With my proposal above I think we could in some carefully chosen
places enable mount propagation without problem. But I would really
like to see an application that is performing mounts inside of
"ip netns exec" to see how it matters.
Code without concrete real world test use cases tends to get things
wrong.
Eric
^ permalink raw reply [flat|nested] 17+ messages in thread* Re: [RFC PATCH iproute2-next 0/5] Persisting of mount namespaces along with network namespaces
2023-10-10 19:19 ` Eric W. Biederman
@ 2023-10-11 13:49 ` Toke Høiland-Jørgensen
2023-10-11 14:55 ` Eric W. Biederman
0 siblings, 1 reply; 17+ messages in thread
From: Toke Høiland-Jørgensen @ 2023-10-11 13:49 UTC (permalink / raw)
To: Eric W. Biederman
Cc: David Ahern, Stephen Hemminger, netdev, Nicolas Dichtel,
Christian Brauner, David Laight
"Eric W. Biederman" <ebiederm@xmission.com> writes:
> Toke Høiland-Jørgensen <toke@redhat.com> writes:
>
>> "Eric W. Biederman" <ebiederm@xmission.com> writes:
>>
>>> Toke Høiland-Jørgensen <toke@redhat.com> writes:
>>>
>>>> "Eric W. Biederman" <ebiederm@xmission.com> writes:
>>>>
>>>>> Toke Høiland-Jørgensen <toke@redhat.com> writes:
>>>
>>> There are not many places to look so something like this is probably sufficient:
>>>
>>> # stat all of the possible/probable mount points and see if there is
>>> # something mounted there. If so recursive bind whatever is there onto
>>> # the new /sys
>>>
>>> for dir in /old/sys/fs/* /old/sys/kernel/*; do
>>> if [ $(stat --format '%d' "$dir") = $(stat --format '%d' "$dir/") ; then
>>
>> What is this comparison supposed to do? I couldn't find any directories
>> in /sys/fs/* where this was *not* true, regardless of whether there's a
>> mount there or not.
>
> Bah. I think I got my logic scrambled. I can only get it to work
> by comparing the filesystems device on /sys/fs to the device on
> /sys/fs/cgroup etc.
>
> The idea is that st_dev changes between filesystems. So you can detect
> a filesystem change based on st_dev.
>
> I thought the $dir vs $dir/ would have allowed stating the underlying
> directory verses the mount, but apparently my memory go that one wrong.
>
> Which makes my command actually something like:
>
> sys_dev=$(stat --format='%d' /sys)
>
> for dir in /old/sys/fs/* /old/sys/kernel/*; do
> if [ $(stat --format '%d' "$dir") -ne $sys_dev ] ; then
> echo $dir is a mount point
> fi
> done
Ah, right that makes sense! I thought I was missing something when I
couldn't get your other example to work...
>>>>> Or is their a reason that bpffs should be per network namespace?
>>>>
>>>> Well, I first ran into this issue because of a bug report to
>>>> xdp-tools/libxdp about things not working correctly in network
>>>> namespaces:
>>>>
>>>> https://github.com/xdp-project/xdp-tools/issues/364
>>>>
>>>> And libxdp does assume that there's a separate bpffs per network
>>>> namespace: it persists things into the bpffs that is tied to the network
>>>> devices in the current namespace. So if the bpffs is shared, an
>>>> application running inside the network namespace could access XDP
>>>> programs loaded in the root namespace. I don't know, but suspect, that
>>>> such assumptions would be relatively common in networking BPF programs
>>>> that use pinning (the pinning support in libbpf and iproute2 itself at
>>>> least have the same leaking problem if the bpffs is shared).
>>>
>>> Are the names of the values truly network namespace specific?
>>>
>>> I did not see any mention of the things that are persisted in the ticket
>>> you pointed me at, and unfortunately I am not familiar with xdp.
>>>
>>> Last I looked until all of the cpu side channels are closed it is
>>> unfortunately unsafe to load ebpf programs with anything less than
>>> CAP_SYS_ADMIN (aka with permission to see and administer the entire
>>> system). So from a system point of view I really don't see a
>>> fundamental danger from having a global /sys/fs/bpf.
>>>
>>> If there are name conflicts in /sys/fs/bpf because of duplicate names in
>>> different network namespaces I can see that being a problem.
>>
>> Yeah, you're right that someone loading a BPF program generally has
>> permissions enough that they can break out of any containment if they
>> want, but applications do make assumptions about the contents of the
>> pinning directory that can lead to conflicts.
>>
>> A couple of examples:
>>
>> - libxdp will persist files in /sys/fs/bpf/dispatch-$ifindex-$prog_id
>>
>> - If someone sets the 'pinning' attribute on a map definition in a BPF
>> file, libbpf will pin those files in /sys/fs/bpf/$map_name
>>
>> The first one leads to obvious conflicts if shared across network
>> namespaces because of ifindex collisions. The second one leads to
>> potential false sharing of state across what are supposed to be
>> independent networking domains (e.g., if the bpffs is shared, loading
>> xdp-filter inside a namespace will share the state with another instance
>> loaded in another namespace, which would no doubt be surprising).
>
> Sigh. So non-default network namespaces can't use /sys/fs/bpf,
> because of silly userspace assumptions. So the entries need to be
> namespaced to prevent conflicts.
Yup, basically.
>>> At that point the name conflicts either need to be fixed or we
>>> fundamentally need to have multiple mount points for bpffs.
>>> Probably under something like /run/netns-mounts/NAME/.
>>>
>>> With ip netns updated to mount the appropriate filesystem.
>>
>> I don't think it's feasible to fix the conflicts; they've been around
>> for a while and are part of application API in some cases. Plus, we
>> don't know of all BPF-using applications.
>>
>> We could have 'ip' manage separate bpffs mounts per namespace and
>> bind-mount them into each netns (I think that's what you're suggesting),
>> but that would basically achieve the same thing as the mountns
>> persisting I am proposing in this series, but only as a special case for
>> bpffs. So why not do the more flexible thing and persist the whole
>> mountns (so applications inside the namespace can actually mount
>> additional things and have them stick around)? The current behaviour
>> seems very surprising...
>
> I don't like persisting the entire mount namespace because it is hard
> for a system administrator to see, it is difficult for something outside
> of that mount namespace to access, and it is as easy to persist a
> mistake as it is to persist something deliberate.
>
> My proposal:
>
> On "ip netns add NAME"
> - create the network namespace and mount it at /run/netns/NAME
> - mount the appropriate sysfs at /run/netns-mounts/NAME/sys
> - mount the appropriate bpffs at /run/netns-mounts/NAME/sys/fs/bpf
>
> On "ip netns delete NAME"
> - umount --recursive /run/netns-mounts/NAME
> - unlink /run/netns-mounts/NAME
> - cleanup /run/netns/NAME as we do today.
>
> On "ip netns exec NAME"
> - Walk through /run/netns-mounts/NAME like we do /etc/netns/NAME/
> and perform bind mounts.
If we setup the full /sys hierarchy in /run/netns-mounts/NAME this
basically becomes a single recursive bind mount, doesn't it?
What about if we also include bind mounts from the host namespace into
that separate /sys instance? Will those be included into a recursive
bind into /sys inside the mount-ns, or will we have to walk the tree and
do separate bind mounts for each directory?
Anyway, this scheme sounds like it'll solve the issue I was trying to
address so I don't mind doing it this way. I'll try it out and respin
the patch series.
>>> Mount propagation is a way to configure a mount namespace (before
>>> creating a new one) that will cause mounts created in the first mount
>>> namespace to be created in it's children, and cause mounts created in
>>> the children to be created in the parent (depending on how things are
>>> configured).
>>>
>>> It is not my favorite feature (it makes locking of mount namespaces
>>> terrible) and it is probably too clever by half, unfortunately systemd
>>> started enabling mount propagation by default, so we are stuck with it.
>>
>> Right. AFAICT the current iproute2 code explicitly tries to avoid that
>> when creating a mountns (it does a 'mount --make-rslave /'); so you're
>> saying we should change that?
>
> If it makes sense.
>
> I believe I added the 'mount --make-rslave /' because otherwise all
> mount activity was propagating back, and making a mess. Especially when
> I was unmounting /sys.
>
> I am not a huge fan of mount propagation it has lots of surprising
> little details that need to be set just right, to not cause problems.
Ah, you were talking about propagation from inside the mountns to
outside? Didn't catch that at first...
> With my proposal above I think we could in some carefully chosen
> places enable mount propagation without problem.
One thing that comes to mind would be that if we create persistent /sys
instances in /run/netns-mounts per the above, it would make sense for
any modifications done inside the netns to be propagated back to the
mount in /run; is this possible with a bind mount? Not sure I quite
understand how propagation would work in this case (since it would be a
separate (bind) mount point inside the namespace).
> But I would really like to see an application that is performing
> mounts inside of "ip netns exec" to see how it matters.
Two examples come to mind:
- I believe there are some applications that will mount a private bpffs
instance for their own use case. Not sure if those applications switch
in and out of namespaces, though, and if they do whether they are
namespace-aware themselves
- Interactive use ('ip netns exec $SHELL'), which I sometimes use for
testing various things. I've mostly had issues with bpffs in this
setting, though, so if we solve that as per the above, maybe that's
not needed.
> Code without concrete real world test use cases tends to get things
> wrong.
Heh, amen to that :)
-Toke
^ permalink raw reply [flat|nested] 17+ messages in thread* Re: [RFC PATCH iproute2-next 0/5] Persisting of mount namespaces along with network namespaces
2023-10-11 13:49 ` Toke Høiland-Jørgensen
@ 2023-10-11 14:55 ` Eric W. Biederman
2023-10-11 15:03 ` Toke Høiland-Jørgensen
0 siblings, 1 reply; 17+ messages in thread
From: Eric W. Biederman @ 2023-10-11 14:55 UTC (permalink / raw)
To: Toke Høiland-Jørgensen
Cc: David Ahern, Stephen Hemminger, netdev, Nicolas Dichtel,
Christian Brauner, David Laight
Toke Høiland-Jørgensen <toke@redhat.com> writes:
> "Eric W. Biederman" <ebiederm@xmission.com> writes:
>
>> Toke Høiland-Jørgensen <toke@redhat.com> writes:
>>
>>> "Eric W. Biederman" <ebiederm@xmission.com> writes:
>>>
>>>> Toke Høiland-Jørgensen <toke@redhat.com> writes:
>>>>
>>>>> "Eric W. Biederman" <ebiederm@xmission.com> writes:
>>>>>
>>>>>> Toke Høiland-Jørgensen <toke@redhat.com> writes:
>> My proposal:
>>
>> On "ip netns add NAME"
>> - create the network namespace and mount it at /run/netns/NAME
>> - mount the appropriate sysfs at /run/netns-mounts/NAME/sys
>> - mount the appropriate bpffs at /run/netns-mounts/NAME/sys/fs/bpf
>>
>> On "ip netns delete NAME"
>> - umount --recursive /run/netns-mounts/NAME
>> - unlink /run/netns-mounts/NAME
>> - cleanup /run/netns/NAME as we do today.
>>
>> On "ip netns exec NAME"
>> - Walk through /run/netns-mounts/NAME like we do /etc/netns/NAME/
>> and perform bind mounts.
>
> If we setup the full /sys hierarchy in /run/netns-mounts/NAME this
> basically becomes a single recursive bind mount, doesn't it?
Yes.
> What about if we also include bind mounts from the host namespace into
> that separate /sys instance? Will those be included into a recursive
> bind into /sys inside the mount-ns, or will we have to walk the tree and
> do separate bind mounts for each directory?
if /run/netns-mounts/NAME/sys has everything you want.
mount --rbind /run/netns-mounts/NAME/sys /sys
Will result in a /sys that has everything you want.
> Anyway, this scheme sounds like it'll solve the issue I was trying to
> address so I don't mind doing it this way. I'll try it out and respin
> the patch series.
Thanks that sounds like a way forward.
>>>> Mount propagation is a way to configure a mount namespace (before
>>>> creating a new one) that will cause mounts created in the first mount
>>>> namespace to be created in it's children, and cause mounts created in
>>>> the children to be created in the parent (depending on how things are
>>>> configured).
>>>>
>>>> It is not my favorite feature (it makes locking of mount namespaces
>>>> terrible) and it is probably too clever by half, unfortunately systemd
>>>> started enabling mount propagation by default, so we are stuck with it.
>>>
>>> Right. AFAICT the current iproute2 code explicitly tries to avoid that
>>> when creating a mountns (it does a 'mount --make-rslave /'); so you're
>>> saying we should change that?
>>
>> If it makes sense.
>>
>> I believe I added the 'mount --make-rslave /' because otherwise all
>> mount activity was propagating back, and making a mess. Especially when
>> I was unmounting /sys.
>>
>> I am not a huge fan of mount propagation it has lots of surprising
>> little details that need to be set just right, to not cause problems.
>
> Ah, you were talking about propagation from inside the mountns to
> outside? Didn't catch that at first...
>
>> With my proposal above I think we could in some carefully chosen
>> places enable mount propagation without problem.
>
> One thing that comes to mind would be that if we create persistent /sys
> instances in /run/netns-mounts per the above, it would make sense for
> any modifications done inside the netns to be propagated back to the
> mount in /run; is this possible with a bind mount? Not sure I quite
> understand how propagation would work in this case (since it would be a
> separate (bind) mount point inside the namespace).
Basically yes, but the challenge is in the details.
If the initial propagation is setup properly it will work. The
weirdness is how propagation works. There is a weird detail that
it needs to be setup on the parent and not on the mount point.
I think the formula is something like:
mount --bind /run/netns-mounts/NAME/sys/ /run/netns-mounts/NAME/sys/
mount --make-rshared /run/netns-mounts/NAME/sys/
mount -t sysfs /run/netns-mounts/NAME/sys
My memory is that systemd by default does
mount --make-rshared /
So the challenge may be to simply limit what is propagated to a
controlled subset.
Eric
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [RFC PATCH iproute2-next 0/5] Persisting of mount namespaces along with network namespaces
2023-10-11 14:55 ` Eric W. Biederman
@ 2023-10-11 15:03 ` Toke Høiland-Jørgensen
0 siblings, 0 replies; 17+ messages in thread
From: Toke Høiland-Jørgensen @ 2023-10-11 15:03 UTC (permalink / raw)
To: Eric W. Biederman
Cc: David Ahern, Stephen Hemminger, netdev, Nicolas Dichtel,
Christian Brauner, David Laight
"Eric W. Biederman" <ebiederm@xmission.com> writes:
> Toke Høiland-Jørgensen <toke@redhat.com> writes:
>
>> "Eric W. Biederman" <ebiederm@xmission.com> writes:
>>
>>> Toke Høiland-Jørgensen <toke@redhat.com> writes:
>>>
>>>> "Eric W. Biederman" <ebiederm@xmission.com> writes:
>>>>
>>>>> Toke Høiland-Jørgensen <toke@redhat.com> writes:
>>>>>
>>>>>> "Eric W. Biederman" <ebiederm@xmission.com> writes:
>>>>>>
>>>>>>> Toke Høiland-Jørgensen <toke@redhat.com> writes:
>
>>> My proposal:
>>>
>>> On "ip netns add NAME"
>>> - create the network namespace and mount it at /run/netns/NAME
>>> - mount the appropriate sysfs at /run/netns-mounts/NAME/sys
>>> - mount the appropriate bpffs at /run/netns-mounts/NAME/sys/fs/bpf
>>>
>>> On "ip netns delete NAME"
>>> - umount --recursive /run/netns-mounts/NAME
>>> - unlink /run/netns-mounts/NAME
>>> - cleanup /run/netns/NAME as we do today.
>>>
>>> On "ip netns exec NAME"
>>> - Walk through /run/netns-mounts/NAME like we do /etc/netns/NAME/
>>> and perform bind mounts.
>>
>> If we setup the full /sys hierarchy in /run/netns-mounts/NAME this
>> basically becomes a single recursive bind mount, doesn't it?
>
> Yes.
>
>> What about if we also include bind mounts from the host namespace into
>> that separate /sys instance? Will those be included into a recursive
>> bind into /sys inside the mount-ns, or will we have to walk the tree and
>> do separate bind mounts for each directory?
>
> if /run/netns-mounts/NAME/sys has everything you want.
>
> mount --rbind /run/netns-mounts/NAME/sys /sys
>
> Will result in a /sys that has everything you want.
>
>> Anyway, this scheme sounds like it'll solve the issue I was trying to
>> address so I don't mind doing it this way. I'll try it out and respin
>> the patch series.
>
> Thanks that sounds like a way forward.
>
>
>>>>> Mount propagation is a way to configure a mount namespace (before
>>>>> creating a new one) that will cause mounts created in the first mount
>>>>> namespace to be created in it's children, and cause mounts created in
>>>>> the children to be created in the parent (depending on how things are
>>>>> configured).
>>>>>
>>>>> It is not my favorite feature (it makes locking of mount namespaces
>>>>> terrible) and it is probably too clever by half, unfortunately systemd
>>>>> started enabling mount propagation by default, so we are stuck with it.
>>>>
>>>> Right. AFAICT the current iproute2 code explicitly tries to avoid that
>>>> when creating a mountns (it does a 'mount --make-rslave /'); so you're
>>>> saying we should change that?
>>>
>>> If it makes sense.
>>>
>>> I believe I added the 'mount --make-rslave /' because otherwise all
>>> mount activity was propagating back, and making a mess. Especially when
>>> I was unmounting /sys.
>>>
>>> I am not a huge fan of mount propagation it has lots of surprising
>>> little details that need to be set just right, to not cause problems.
>>
>> Ah, you were talking about propagation from inside the mountns to
>> outside? Didn't catch that at first...
>>
>>> With my proposal above I think we could in some carefully chosen
>>> places enable mount propagation without problem.
>>
>> One thing that comes to mind would be that if we create persistent /sys
>> instances in /run/netns-mounts per the above, it would make sense for
>> any modifications done inside the netns to be propagated back to the
>> mount in /run; is this possible with a bind mount? Not sure I quite
>> understand how propagation would work in this case (since it would be a
>> separate (bind) mount point inside the namespace).
>
> Basically yes, but the challenge is in the details.
>
> If the initial propagation is setup properly it will work. The
> weirdness is how propagation works. There is a weird detail that
> it needs to be setup on the parent and not on the mount point.
>
> I think the formula is something like:
>
> mount --bind /run/netns-mounts/NAME/sys/ /run/netns-mounts/NAME/sys/
> mount --make-rshared /run/netns-mounts/NAME/sys/
> mount -t sysfs /run/netns-mounts/NAME/sys
>
> My memory is that systemd by default does
>
> mount --make-rshared /
>
> So the challenge may be to simply limit what is propagated to a
> controlled subset.
Alright, I'll play around with it and bug you some more if I can't get
it to work properly; thanks for the pointers! :)
-Toke
^ permalink raw reply [flat|nested] 17+ messages in thread
* RE: [RFC PATCH iproute2-next 0/5] Persisting of mount namespaces along with network namespaces
2023-10-09 20:32 ` [RFC PATCH iproute2-next 0/5] Persisting of mount namespaces along with network namespaces Eric W. Biederman
2023-10-09 22:03 ` Toke Høiland-Jørgensen
@ 2023-10-10 8:42 ` David Laight
2023-10-10 19:32 ` Eric W. Biederman
1 sibling, 1 reply; 17+ messages in thread
From: David Laight @ 2023-10-10 8:42 UTC (permalink / raw)
To: 'Eric W. Biederman', Toke Høiland-Jørgensen
Cc: David Ahern, Stephen Hemminger, netdev@vger.kernel.org,
Nicolas Dichtel, Christian Brauner
From: Eric W. Biederman
> Sent: 09 October 2023 21:33
>
> Toke Høiland-Jørgensen <toke@redhat.com> writes:
>
> > The 'ip netns' command is used for setting up network namespaces with persistent
> > named references, and is integrated into various other commands of iproute2 via
> > the -n switch.
> >
> > This is useful both for testing setups and for simple script-based namespacing
> > but has one drawback: the lack of persistent mounts inside the spawned
> > namespace. This is particularly apparent when working with BPF programs that use
> > pinning to bpffs: by default no bpffs is available inside a namespace, and
> > even if mounting one, that fs disappears as soon as the calling
> > command exits.
>
> It would be entirely reasonable to copy mounts like /sys/fs/bpf from the
> original mount namespace into the temporary mount namespace used by
> "ip netns".
>
> I would call it a bug that "ip netns" doesn't do that already.
>
> I suspect that "ip netns" does copy the mounts from the old sysfs onto
> the new sysfs is your entire problem.
When I was getting a program to run in multiple network namespaces
(has sockets in 2 namespaces) I rather expected that netns(net_ns_fd,0)
would 'magically' change /proc/net to refer to the new namespace.
I think that could be done in the code that follows the /proc/net
mountpoint - IIRC something similar is done for /proc/self.
However that would need flags to both setns() and 'ip netns exec'
since programs will rely on the existing behaviour.
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [RFC PATCH iproute2-next 0/5] Persisting of mount namespaces along with network namespaces
2023-10-10 8:42 ` David Laight
@ 2023-10-10 19:32 ` Eric W. Biederman
2023-10-10 21:51 ` David Laight
0 siblings, 1 reply; 17+ messages in thread
From: Eric W. Biederman @ 2023-10-10 19:32 UTC (permalink / raw)
To: David Laight
Cc: Toke Høiland-Jørgensen, David Ahern, Stephen Hemminger,
netdev@vger.kernel.org, Nicolas Dichtel, Christian Brauner
David Laight <David.Laight@ACULAB.COM> writes:
> From: Eric W. Biederman
>> Sent: 09 October 2023 21:33
>>
>> Toke Høiland-Jørgensen <toke@redhat.com> writes:
>>
>> > The 'ip netns' command is used for setting up network namespaces with persistent
>> > named references, and is integrated into various other commands of iproute2 via
>> > the -n switch.
>> >
>> > This is useful both for testing setups and for simple script-based namespacing
>> > but has one drawback: the lack of persistent mounts inside the spawned
>> > namespace. This is particularly apparent when working with BPF programs that use
>> > pinning to bpffs: by default no bpffs is available inside a namespace, and
>> > even if mounting one, that fs disappears as soon as the calling
>> > command exits.
>>
>> It would be entirely reasonable to copy mounts like /sys/fs/bpf from the
>> original mount namespace into the temporary mount namespace used by
>> "ip netns".
>>
>> I would call it a bug that "ip netns" doesn't do that already.
>>
>> I suspect that "ip netns" does copy the mounts from the old sysfs onto
>> the new sysfs is your entire problem.
>
> When I was getting a program to run in multiple network namespaces
> (has sockets in 2 namespaces) I rather expected that netns(net_ns_fd,0)
> would 'magically' change /proc/net to refer to the new namespace.
> I think that could be done in the code that follows the /proc/net
> mountpoint - IIRC something similar is done for /proc/self.
/proc/self/net does follow your current network namespace last I looked.
Of course if you are threaded you may need to look at
/proc/thread-self/net as your network namespace is per thread.
It is also quite evil. The problem is that having different entries
cached under the same name is a major mess. Ever since I made that
mistake I have been aiming at designs that don't fight the dcache.
Even in that case I think I limited it to just a entry where
ugliness happens.
> However that would need flags to both setns() and 'ip netns exec'
> since programs will rely on the existing behaviour.
You might want to look again.
Eric
^ permalink raw reply [flat|nested] 17+ messages in thread
* RE: [RFC PATCH iproute2-next 0/5] Persisting of mount namespaces along with network namespaces
2023-10-10 19:32 ` Eric W. Biederman
@ 2023-10-10 21:51 ` David Laight
0 siblings, 0 replies; 17+ messages in thread
From: David Laight @ 2023-10-10 21:51 UTC (permalink / raw)
To: 'Eric W. Biederman'
Cc: Toke Høiland-Jørgensen, David Ahern, Stephen Hemminger,
netdev@vger.kernel.org, Nicolas Dichtel, Christian Brauner
From: Eric W. Biederman
> Sent: 10 October 2023 20:33
>
> David Laight <David.Laight@ACULAB.COM> writes:
>
> > From: Eric W. Biederman
> >> Sent: 09 October 2023 21:33
> >>
...
> > When I was getting a program to run in multiple network namespaces
> > (has sockets in 2 namespaces) I rather expected that netns(net_ns_fd,0)
> > would 'magically' change /proc/net to refer to the new namespace.
> > I think that could be done in the code that follows the /proc/net
> > mountpoint - IIRC something similar is done for /proc/self.
>
> /proc/self/net does follow your current network namespace last I looked.
>
> Of course if you are threaded you may need to look at
> /proc/thread-self/net as your network namespace is per thread.
Yes, I remember that now, and /proc/net is the wrong symlink.
> It is also quite evil. The problem is that having different entries
> cached under the same name is a major mess. Ever since I made that
> mistake I have been aiming at designs that don't fight the dcache.
>
> Even in that case I think I limited it to just a entry where
> ugliness happens.
It is nice from a user point of view...
I'd guess a 'magic symlink' that points off somewhere fixed
would be a little cleaner.
> > However that would need flags to both setns() and 'ip netns exec'
> > since programs will rely on the existing behaviour.
>
> You might want to look again.
The problem was with /sys/class/net
I ended up doing:
ip netns exec fubar program args 3</sys/class/net
So that open("/sys/class/net/xxx") was inside the fubar namespace
and openat(3, "xxx") was in the default namespace.
But I think:
> On "ip netns add NAME"
> - create the network namespace and mount it at /run/netns/NAME
> - mount the appropriate sysfs at /run/netns-mounts/NAME/sys
> - mount the appropriate bpffs at /run/netns-mounts/NAME/sys/fs/bpf
would make it possible for a program to read (eg)
/sys/class/net/interface/speed for interfaces in multiple
network namespaces.
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
^ permalink raw reply [flat|nested] 17+ messages in thread