netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: "Toke Høiland-Jørgensen" <toke@redhat.com>
To: David Ahern <dsahern@gmail.com>,
	Stephen Hemminger <stephen@networkplumber.org>
Cc: netdev@vger.kernel.org,
	"Toke Høiland-Jørgensen" <toke@redhat.com>,
	"Nicolas Dichtel" <nicolas.dichtel@6wind.com>,
	"Christian Brauner" <brauner@kernel.org>,
	"Eric W . Biederman" <ebiederm@xmission.com>,
	"David Laight" <David.Laight@ACULAB.COM>
Subject: [RFC PATCH iproute2-next 1/5] ip: Mount netns in child process instead of from inside the new namespace
Date: Mon,  9 Oct 2023 20:27:49 +0200	[thread overview]
Message-ID: <20231009182753.851551-2-toke@redhat.com> (raw)
In-Reply-To: <20231009182753.851551-1-toke@redhat.com>

Refactor the netns creation code so that we offload the mounting of the
namespace file to a child process instead of bind mounting from inside the newly
created namespace.

This is done in preparation for also persisting a mount namespace; the mount
namespace reference cannot be bind-mounted from inside that namespace itself, so
we need to mount that from a child process anyway. The child process
approach (as well as some of the helper functions used for it) is adapted from
the code in the unshare(1) utility that is part of util-linux.

No functional change is intended with this patch.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
---
 ip/ipnetns.c | 184 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 130 insertions(+), 54 deletions(-)

diff --git a/ip/ipnetns.c b/ip/ipnetns.c
index 9d996832aef8..a05d84514326 100644
--- a/ip/ipnetns.c
+++ b/ip/ipnetns.c
@@ -1,4 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#include <stdlib.h>
 #define _ATFILE_SOURCE
 #include <sys/file.h>
 #include <sys/types.h>
@@ -7,6 +8,7 @@
 #include <sys/inotify.h>
 #include <sys/mount.h>
 #include <sys/syscall.h>
+#include <sys/eventfd.h>
 #include <stdio.h>
 #include <string.h>
 #include <sched.h>
@@ -25,6 +27,9 @@
 #include "namespace.h"
 #include "json_print.h"
 
+/* synchronize parent and child by pipe */
+#define PIPE_SYNC_BYTE	0x06
+
 static int usage(void)
 {
 	fprintf(stderr,
@@ -46,7 +51,6 @@ static int usage(void)
 static struct rtnl_handle rtnsh = { .fd = -1 };
 
 static int have_rtnl_getnsid = -1;
-static int saved_netns = -1;
 static struct link_filter filter;
 
 static int ipnetns_accept_msg(struct rtnl_ctrl_data *ctrl,
@@ -768,31 +772,131 @@ static int create_netns_dir(void)
 	return 0;
 }
 
-/* Obtain a FD for the current namespace, so we can reenter it later */
-static void netns_save(void)
+/**
+ * waitchild() - Wait for a process to exit successfully
+ * @pid: PID of the process to wait for
+ *
+ * Wait for a process to exit successfully and return its exit status.
+ */
+static int waitchild(int pid)
 {
-	if (saved_netns != -1)
-		return;
+	int rc, status;
+
+	do {
+		rc = waitpid(pid, &status, 0);
+		if (rc < 0) {
+			if (errno == EINTR)
+				continue;
+			return -errno;
+		}
+		if (WIFEXITED(status) &&
+		    WEXITSTATUS(status) != EXIT_SUCCESS)
+			return WEXITSTATUS(status);
+	} while (rc < 0);
 
-	saved_netns = open("/proc/self/ns/net", O_RDONLY | O_CLOEXEC);
-	if (saved_netns == -1) {
-		perror("Cannot open init namespace");
-		exit(1);
+	return 0;
+}
+
+/**
+ * sync_with_child() - Tell our child we're ready and wait for it to exit
+ * @pid: The pid of our child
+ * @fd: A file descriptor created with eventfd()
+ *
+ * This tells a child created with fork_and_wait() that we are ready for it to
+ * continue. Once we have done that, wait for our child to exit.
+ */
+static int sync_with_child(pid_t pid, int fd)
+{
+	uint64_t ch = PIPE_SYNC_BYTE;
+
+	write(fd, &ch, sizeof(ch));
+	close(fd);
+
+	return waitchild(pid);
+}
+
+/**
+ * fork_and_wait() - Fork and wait to be sync'd with
+ * @fd - A file descriptor created with eventfd() which should be passed to
+ *       sync_with_child()
+ *
+ * This creates an eventfd and forks. The parent process returns immediately,
+ * but the child waits for a %PIPE_SYNC_BYTE on the eventfd before returning.
+ * This allows the parent to perform some tasks before the child starts its
+ * work. The parent should call sync_with_child() once it is ready for the
+ * child to continue.
+ *
+ * Return: The pid from fork()
+ */
+static pid_t fork_and_wait(int *fd)
+{
+	uint64_t ch;
+	pid_t pid;
+
+	*fd = eventfd(0, 0);
+	if (*fd < 0) {
+		fprintf(stderr, "eventfd failed: %s\n", strerror(errno));
+		exit(EXIT_FAILURE);
+	}
+
+	pid = fork();
+	if (pid < 0) {
+		fprintf(stderr, "fork failed: %s\n", strerror(errno));
+		exit(EXIT_FAILURE);
+	}
+
+	if (!pid) {
+		/* wait for the our parent to tell us to continue */
+		if (read(*fd, (char *)&ch, sizeof(ch)) != sizeof(ch) ||
+		    ch != PIPE_SYNC_BYTE) {
+			fprintf(stderr, "failed to read eventfd\n");
+			exit(EXIT_FAILURE);
+		}
+		close(*fd);
 	}
+
+	return pid;
 }
 
-static void netns_restore(void)
+static int bind_ns_file(const char *parent, const char *nsfile,
+			const char *ns_name, pid_t target_pid)
 {
-	if (saved_netns == -1)
-		return;
+	char ns_path[PATH_MAX], proc_path[PATH_MAX];
+	int fd;
 
-	if (setns(saved_netns, CLONE_NEWNET)) {
-		perror("setns");
-		exit(1);
+	snprintf(ns_path, sizeof(ns_path), "%s/%s", parent, ns_name);
+	snprintf(proc_path, sizeof(proc_path), "/proc/%d/ns/%s", target_pid, nsfile);
+
+	/* Create the filesystem state */
+	fd = open(ns_path, O_RDONLY|O_CREAT|O_EXCL, 0);
+	if (fd < 0) {
+		fprintf(stderr, "Cannot create namespace file \"%s\": %s\n",
+			ns_path, strerror(errno));
+		return -1;
 	}
+	close(fd);
 
-	close(saved_netns);
-	saved_netns = -1;
+	if (mount(proc_path, ns_path, "none", MS_BIND, NULL) < 0) {
+		fprintf(stderr, "Bind %s -> %s failed: %s\n", proc_path,
+			ns_path, strerror(errno));
+		unlink(ns_path);
+		return -1;
+	}
+	return 0;
+}
+
+static pid_t bind_ns_files_from_child(const char *ns_name, pid_t target_pid,
+				      int *fd)
+{
+	pid_t child;
+
+	child = fork_and_wait(fd);
+	if (child)
+		return child;
+
+	if (bind_ns_file(NETNS_RUN_DIR, "net", ns_name, target_pid))
+		exit(EXIT_FAILURE);
+	exit(EXIT_SUCCESS);
 }
 
 static int netns_add(int argc, char **argv, bool create)
@@ -808,10 +912,9 @@ static int netns_add(int argc, char **argv, bool create)
 	 * userspace tweaks like remounting /sys, or bind mounting
 	 * a new /etc/resolv.conf can be shared between users.
 	 */
-	char netns_path[PATH_MAX], proc_path[PATH_MAX];
 	const char *name;
-	pid_t pid;
-	int fd;
+	pid_t pid, child;
+	int event_fd;
 	int lock;
 	int made_netns_run_dir_mount = 0;
 
@@ -820,6 +923,7 @@ static int netns_add(int argc, char **argv, bool create)
 			fprintf(stderr, "No netns name specified\n");
 			return -1;
 		}
+		pid = getpid();
 	} else {
 		if (argc < 2) {
 			fprintf(stderr, "No netns name and PID specified\n");
@@ -833,8 +937,6 @@ static int netns_add(int argc, char **argv, bool create)
 	}
 	name = argv[0];
 
-	snprintf(netns_path, sizeof(netns_path), "%s/%s", NETNS_RUN_DIR, name);
-
 	if (create_netns_dir())
 		return -1;
 
@@ -894,46 +996,20 @@ static int netns_add(int argc, char **argv, bool create)
 		close(lock);
 	}
 
-	/* Create the filesystem state */
-	fd = open(netns_path, O_RDONLY|O_CREAT|O_EXCL, 0);
-	if (fd < 0) {
-		fprintf(stderr, "Cannot create namespace file \"%s\": %s\n",
-			netns_path, strerror(errno));
-		return -1;
-	}
-	close(fd);
+	child = bind_ns_files_from_child(name, pid, &event_fd);
+	if (child < 0)
+		exit(EXIT_FAILURE);
 
 	if (create) {
-		netns_save();
 		if (unshare(CLONE_NEWNET) < 0) {
 			fprintf(stderr, "Failed to create a new network namespace \"%s\": %s\n",
 				name, strerror(errno));
-			goto out_delete;
+			close(event_fd);
+			exit(EXIT_FAILURE);
 		}
-
-		strcpy(proc_path, "/proc/self/ns/net");
-	} else {
-		snprintf(proc_path, sizeof(proc_path), "/proc/%d/ns/net", pid);
-	}
-
-	/* Bind the netns last so I can watch for it */
-	if (mount(proc_path, netns_path, "none", MS_BIND, NULL) < 0) {
-		fprintf(stderr, "Bind %s -> %s failed: %s\n",
-			proc_path, netns_path, strerror(errno));
-		goto out_delete;
 	}
-	netns_restore();
 
-	return 0;
-out_delete:
-	if (create) {
-		netns_restore();
-		netns_delete(argc, argv);
-	} else if (unlink(netns_path) < 0) {
-		fprintf(stderr, "Cannot remove namespace file \"%s\": %s\n",
-			netns_path, strerror(errno));
-	}
-	return -1;
+	return sync_with_child(child, event_fd);
 }
 
 int set_netnsid_from_name(const char *name, int nsid)
-- 
2.42.0


  reply	other threads:[~2023-10-09 18:28 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-10-09 18:27 [RFC PATCH iproute2-next 0/5] Persisting of mount namespaces along with network namespaces Toke Høiland-Jørgensen
2023-10-09 18:27 ` Toke Høiland-Jørgensen [this message]
2023-10-09 18:27 ` [RFC PATCH iproute2-next 2/5] ip: Split out code creating namespace mount dir so it can be reused Toke Høiland-Jørgensen
2023-10-09 18:27 ` [RFC PATCH iproute2-next 3/5] lib/namespace: Factor out code for reuse Toke Høiland-Jørgensen
2023-10-09 18:27 ` [RFC PATCH iproute2-next 4/5] ip: Also create and persist mount namespace when creating netns Toke Høiland-Jørgensen
2023-10-09 18:27 ` [RFC PATCH iproute2-next 5/5] lib/namespace: Also mount a bpffs instance inside new mount namespaces Toke Høiland-Jørgensen
2023-10-09 20:32 ` [RFC PATCH iproute2-next 0/5] Persisting of mount namespaces along with network namespaces Eric W. Biederman
2023-10-09 22:03   ` Toke Høiland-Jørgensen
2023-10-10  0:14     ` Eric W. Biederman
2023-10-10 13:38       ` Toke Høiland-Jørgensen
2023-10-10 19:19         ` Eric W. Biederman
2023-10-11 13:49           ` Toke Høiland-Jørgensen
2023-10-11 14:55             ` Eric W. Biederman
2023-10-11 15:03               ` Toke Høiland-Jørgensen
2023-10-10  8:42   ` David Laight
2023-10-10 19:32     ` Eric W. Biederman
2023-10-10 21:51       ` David Laight

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20231009182753.851551-2-toke@redhat.com \
    --to=toke@redhat.com \
    --cc=David.Laight@ACULAB.COM \
    --cc=brauner@kernel.org \
    --cc=dsahern@gmail.com \
    --cc=ebiederm@xmission.com \
    --cc=netdev@vger.kernel.org \
    --cc=nicolas.dichtel@6wind.com \
    --cc=stephen@networkplumber.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).