All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Serge E. Hallyn" <serue@us.ibm.com>
To: Oren Laadan <orenl@cs.columbia.edu>,
	Louis.Rilling@kerlabs.com, Matt Helsley <matthltc@us.ibm.com>
Cc: lkml <linux-kernel@vger.kernel.org>,
	Linux Containers <containers@lists.osdl.org>
Subject: [PATCH] user-ns: Nested pidns support (v3)
Date: Tue, 23 Mar 2010 00:20:06 -0500	[thread overview]
Message-ID: <20100323052006.GA16217@us.ibm.com> (raw)
In-Reply-To: <20100323051839.GA16123@us.ibm.com>

Support restart of nested pid namespaces.  Parse the ckpt_vpid
array to decide the vpids to specify for each task's eclone().

Changelog:
	Mar 22: Some bugfixes to handle a more complex testcase,
		and accept array of __s32 instead of array of struct
		ckpt_vpid from kernel.

Signed-off-by: Serge Hallyn <serue@us.ibm.com>
---
 include/linux/checkpoint.h     |    2 +-
 include/linux/checkpoint_hdr.h |   11 +++
 restart.c                      |  187 ++++++++++++++++++++++++++++++++++++---
 3 files changed, 184 insertions(+), 16 deletions(-)

diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 53b8b2c..8d021b9 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -14,7 +14,7 @@
  *  distribution for more details.
  */
 
-#define CHECKPOINT_VERSION 5
+#define CHECKPOINT_VERSION 6
 
 /* checkpoint user flags */
 #define CHECKPOINT_SUBTREE 0x1
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index e8eaf23..27c3f92 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -111,6 +111,8 @@ enum {
 #define CKPT_HDR_GROUPINFO CKPT_HDR_GROUPINFO
 	CKPT_HDR_TASK_CREDS,
 #define CKPT_HDR_TASK_CREDS CKPT_HDR_TASK_CREDS
+	CKPT_HDR_VPIDS,
+#define CKPT_HDR_VPIDS CKPT_HDR_VPIDS
 
 	/* 201-299: reserved for arch-dependent */
 
@@ -321,11 +323,20 @@ struct ckpt_hdr_tree {
 } __attribute__((aligned(8)));
 
 struct ckpt_pids {
+	/* these pids are in root_nsproxy's pid ns */
 	__s32 vpid;
 	__s32 vppid;
 	__s32 vtgid;
 	__s32 vpgid;
 	__s32 vsid;
+	__s32 rsid; /* real pid - in checkpointer's pid_ns */
+	__s32 depth; /* pidns depth */
+} __attribute__((aligned(8)));
+
+/* number of vpids */
+struct ckpt_hdr_vpids {
+	struct ckpt_hdr h;
+	__s32 nr_vpids;
 } __attribute__((aligned(8)));
 
 /* pids */
diff --git a/restart.c b/restart.c
index 0c74bb6..608750e 100644
--- a/restart.c
+++ b/restart.c
@@ -244,6 +244,9 @@ struct task {
 
 	struct task *phantom;	/* pointer to place-holdler task (if any) */
 
+	int vidx;		/* index into vpid array, -1 if none */
+	int piddepth;
+
 	pid_t pid;		/* process IDs, our bread-&-butter */
 	pid_t ppid;
 	pid_t tgid;
@@ -272,6 +275,7 @@ struct ckpt_ctx {
 	int pipe_in;
 	int pipe_out;
 	int pids_nr;
+	int vpids_nr;
 
 	int pipe_child[2];	/* for children to report status */
 	int pipe_feed[2];	/* for feeder to provide input */
@@ -279,6 +283,7 @@ struct ckpt_ctx {
 
 	struct ckpt_pids *pids_arr;
 	struct ckpt_pids *copy_arr;
+	__s32 *vpids_arr;
 
 	struct task *tasks_arr;
 	int tasks_nr;
@@ -291,6 +296,7 @@ struct ckpt_ctx {
 	char header_arch[BUFSIZE];
 	char container[BUFSIZE];
 	char tree[BUFSIZE];
+	char vpids[BUFSIZE];
 	char buf[BUFSIZE];
 	struct app_restart_args *args;
 
@@ -316,6 +322,7 @@ static int ckpt_remount_devpts(struct ckpt_ctx *ctx);
 
 static int ckpt_build_tree(struct ckpt_ctx *ctx);
 static int ckpt_init_tree(struct ckpt_ctx *ctx);
+static int assign_vpids(struct ckpt_ctx *ctx);
 static int ckpt_set_creator(struct ckpt_ctx *ctx, struct task *task);
 static int ckpt_placeholder_task(struct ckpt_ctx *ctx, struct task *task);
 static int ckpt_propagate_session(struct ckpt_ctx *ctx, struct task *session);
@@ -339,6 +346,7 @@ static int ckpt_write_header(struct ckpt_ctx *ctx);
 static int ckpt_write_header_arch(struct ckpt_ctx *ctx);
 static int ckpt_write_container(struct ckpt_ctx *ctx);
 static int ckpt_write_tree(struct ckpt_ctx *ctx);
+static int ckpt_write_vpids(struct ckpt_ctx *ctx);
 
 static int _ckpt_read(int fd, void *buf, int count);
 static int ckpt_read(int fd, void *buf, int count);
@@ -350,6 +358,7 @@ static int ckpt_read_header(struct ckpt_ctx *ctx);
 static int ckpt_read_header_arch(struct ckpt_ctx *ctx);
 static int ckpt_read_container(struct ckpt_ctx *ctx);
 static int ckpt_read_tree(struct ckpt_ctx *ctx);
+static int ckpt_read_vpids(struct ckpt_ctx *ctx);
 
 static int hash_init(struct ckpt_ctx *ctx);
 static void hash_exit(struct ckpt_ctx *ctx);
@@ -883,6 +892,12 @@ int app_restart(struct app_restart_args *args)
 		exit(1);
 	}
 
+	ret = ckpt_read_vpids(&ctx);
+	if (ret < 0) {
+		ckpt_perror("read c/r tree");
+		exit(1);
+	}
+
 	/* build creator-child-relationship tree */
 	if (hash_init(&ctx) < 0)
 		exit(1);
@@ -891,6 +906,10 @@ int app_restart(struct app_restart_args *args)
 	if (ret < 0)
 		exit(1);
 
+	ret = assign_vpids(&ctx);
+	if (ret < 0)
+		exit(1);
+
 	ret = ckpt_fork_feeder(&ctx);
 	if (ret < 0)
 		exit(1);
@@ -1218,13 +1237,13 @@ static int ckpt_coordinator_pidns(struct ckpt_ctx *ctx)
 
 	return ret;
 }
-#else
+#else /* CLONE_NEWPID */
 static int ckpt_coordinator_pidns(struct ckpt_ctx *ctx)
 {
 	ckpt_err("logical error: ckpt_coordinator_pidns unexpected\n");
 	exit(1);
 }
-#endif
+#endif /* CLONE_NEWPID */
 
 static int ckpt_coordinator(struct ckpt_ctx *ctx)
 {
@@ -2050,8 +2069,8 @@ static pid_t ckpt_fork_child(struct ckpt_ctx *ctx, struct task *child)
 	struct clone_args clone_args;
 	genstack stk;
 	unsigned long flags = SIGCHLD;
-	size_t nr_pids = 1;
 	pid_t pid = 0;
+	pid_t *pids = &pid;
 
 	ckpt_dbg("forking child vpid %d flags %#x\n", child->pid, child->flags);
 
@@ -2067,29 +2086,76 @@ static pid_t ckpt_fork_child(struct ckpt_ctx *ctx, struct task *child)
 		flags |= CLONE_PARENT;
 	}
 
+	memset(&clone_args, 0, sizeof(clone_args));
+	clone_args.nr_pids = 1;
 	/* select pid if --pids, otherwise it's 0 */
-	if (ctx->args->pids)
-		pid = child->pid;
+	if (ctx->args->pids) {
+		int i, depth = child->piddepth + 1;
 
-#ifdef CLONE_NEWPID
-	/* but for new pidns, don't specify a pid */
- 	if (child->flags & TASK_NEWPID) {
-		flags |= CLONE_NEWPID;
-		pid = 0;
-	}
+		clone_args.nr_pids = depth;
+		pids = malloc(sizeof(pid_t) * depth);
+		if (!pids) {
+			perror("ckpt_fork_child pids malloc");
+			return -1;
+		}
+
+		memset(pids, 0, sizeof(pid_t) * depth);
+		pids[0] = child->pid;
+		int j;
+		for (i = child->piddepth-1, j=0; i >= 0; i--, j++)
+			pids[j+1] = ctx->vpids_arr[child->vidx + j];
+
+#ifndef CLONE_NEWPID
+		if (child->piddepth > child->creator->piddepth) {
+			ckpt_err("nested pidns but CLONE_NEWPID undefined");
+			errno = -EINVAL;
+			return -1;
+		} else if (child->flags & TASK_NEWPID) {
+			ckpt_err("TASK_NEWPID set but CLONE_NEWPID undefined");
+			errno = -EINVAL;
+			return -1;
+		}
+#else /* CLONE_NEWPID */
+		if (child->piddepth > child->creator->piddepth) {
+			child->flags |= TASK_NEWPID;
+			flags |= CLONE_NEWPID;
+			clone_args.nr_pids--;
+		} else if (child->flags & TASK_NEWPID) {
+			/* The TASK_NEWPID could have been set for root task */
+			pids[0] = 0;
+			flags |= CLONE_NEWPID;
+		}
+		if (flags & CLONE_NEWPID && !ctx->args->pidns) {
+			ckpt_err("Must use --pidns for nested pidns container");
+			errno = -EINVAL;
+			return -1;
+		}
+#if 0
+		if (flags & CLONE_NEWPID)
+			clone_args.nr_pids--;
 #endif
+#endif /* CLONE_NEWPID */
+	}
 
 	if (child->flags & (TASK_SIBLING | TASK_THREAD))
 		child->real_parent = getppid();
 	else
 		child->real_parent = _getpid();
 
-	memset(&clone_args, 0, sizeof(clone_args));
 	clone_args.child_stack = (unsigned long)genstack_base(stk);
 	clone_args.child_stack_size = genstack_size(stk);
-	clone_args.nr_pids = nr_pids;
 
-	pid = eclone(ckpt_fork_stub, child, flags, &clone_args, &pid);
+	int who;
+
+	who = ((void *)child - (void *) &ctx->tasks_arr[0]) / sizeof(struct task);
+	ckpt_dbg("task %d forking with flags %lx numpids %d\n",
+		child->pid, flags, clone_args.nr_pids);
+	int i;
+	for (i=0; i<clone_args.nr_pids; i++)
+		ckpt_dbg("task %d pid[%d]=%d\n", child->pid, i, pids[i]);
+	pid = eclone(ckpt_fork_stub, child, flags, &clone_args, pids);
+	if (pids != &pid)
+		free(pids);
 	if (pid < 0) {
 		ckpt_perror("eclone");
 		genstack_release(stk);
@@ -2269,6 +2335,9 @@ static int ckpt_do_feeder(void *data)
 	if (ckpt_write_tree(ctx) < 0)
 		ckpt_abort(ctx, "write c/r tree");
 
+	if (ckpt_write_vpids(ctx) < 0)
+		ckpt_abort(ctx, "write vpids");
+
 	/* read rest -> write rest */
 	if (ctx->args->inspect)
 		ckpt_read_write_inspect(ctx);
@@ -2461,6 +2530,8 @@ static int ckpt_read_obj(struct ckpt_ctx *ctx,
 		errno = EINVAL;
 		return -1;
 	}
+	if (h->len == sizeof(*h))
+	return 0;
 	return ckpt_read(STDIN_FILENO, buf, h->len - sizeof(*h));
 }
 
@@ -2609,8 +2680,75 @@ static int ckpt_read_tree(struct ckpt_ctx *ctx)
 	}
 
 	ret = ckpt_read_obj_ptr(ctx, ctx->pids_arr, len, CKPT_HDR_BUFFER);
-	if (ret < 0)
+	if (ret < 0) {
 		free(ctx->pids_arr);
+		return ret;
+	}
+
+	return ret;
+}
+
+/* Give each task its pidns depth and its start index (vidx) in vpids_arr. */
+static int assign_vpids(struct ckpt_ctx *ctx)
+{
+	int d, i, hidx, tidx;
+	struct task *t;
+
+	for (hidx = 0, tidx = 0; tidx < ctx->pids_nr; tidx++) {
+		t = &ctx->tasks_arr[tidx];
+		d = t->piddepth = ctx->pids_arr[tidx].depth;
+		if (!d) {
+			ckpt_dbg("task[%d].vidx = -1\n", tidx);
+			t->vidx = -1;
+			continue;
+		}
+		/* depth was read from the input stream: check before indexing */
+		if (d < 0 || d > ctx->vpids_nr - hidx) {
+			ckpt_err("Error parsing vpids array");
+			return -1;
+		}
+		t->vidx = hidx;
+		ckpt_dbg("task[%d].vidx = %d (depth %d, rpid %d)\n",
+			tidx, hidx, t->piddepth, ctx->pids_arr[tidx].vpid);
+		for (i = 0; i < d; i++)
+			ckpt_dbg("task[%d].vpid[%d] = %d\n", tidx, i,
+				ctx->vpids_arr[hidx + i]);
+		hidx += d;
+	}
+
+	return 0;
+}
+
+/* Read the vpids header, then the nr_vpids-long __s32 array (if any). */
+static int ckpt_read_vpids(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_vpids *h = (struct ckpt_hdr_vpids *) ctx->vpids;
+	int len, ret;
+
+	ret = ckpt_read_obj_type(ctx, h, sizeof(*h), CKPT_HDR_VPIDS);
+	if (ret < 0)
+		return ret;
+
+	ckpt_dbg("number of vpids: %d\n", h->nr_vpids);
+
+	if (h->nr_vpids < 0) {
+		ckpt_err("invalid number of vpids %d", h->nr_vpids);
+		errno = EINVAL;
+		return -1;
+	}
+	ctx->vpids_nr = h->nr_vpids;
+	if (!ctx->vpids_nr)	/* header only: no array object to read */
+		return 0;
+
+	len = sizeof(__s32) * ctx->vpids_nr;
+	ctx->vpids_arr = malloc(len);
+	if (!ctx->vpids_arr)
+		return -1;
+
+	ret = ckpt_read_obj_ptr(ctx, ctx->vpids_arr, len, CKPT_HDR_BUFFER);
+	if (ret < 0) {
+		free(ctx->vpids_arr);
+		ctx->vpids_arr = NULL;	/* don't leave a dangling pointer */
+	}
+	return ret;
+}
@@ -2685,6 +2823,25 @@ static int ckpt_write_tree(struct ckpt_ctx *ctx)
 	return 0;
 }
 
+static int ckpt_write_vpids(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_vpids *h;
+	int len;
+	/* Write the vpids header held in ctx->vpids, then the vpids array. */
+	h = (struct ckpt_hdr_vpids *) ctx->vpids;
+	if (ckpt_write_obj(ctx, (struct ckpt_hdr *) h) < 0)
+		ckpt_abort(ctx, "write vpids hdr");
+	/* With zero vpids only the header is written. */
+	if (!ctx->vpids_nr)
+		return 0;
+	len = sizeof(__s32) * ctx->vpids_nr;
+	if (ckpt_write_obj_ptr(ctx, ctx->vpids_arr, len, CKPT_HDR_BUFFER) < 0)
+		ckpt_abort(ctx, "write vpids");
+	ckpt_dbg("wrote %d bytes for %d vpids\n", len, ctx->vpids_nr);
+
+	return 0;
+}
+
 /*
  * a simple hash implementation
  */
-- 
1.7.0

  reply	other threads:[~2010-03-23  5:20 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-03-23  5:18 [PATCH] linux-cr: nested pid namespaces (v3) Serge E. Hallyn
2010-03-23  5:20 ` Serge E. Hallyn [this message]
2010-03-23  7:14 ` Louis Rilling
2010-03-23 13:52   ` Serge E. Hallyn
2010-03-24  9:56     ` Louis Rilling
2010-03-23 14:46   ` Serge E. Hallyn
     [not found] ` <20100323051839.GA16123-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2010-03-30  4:51   ` Oren Laadan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20100323052006.GA16217@us.ibm.com \
    --to=serue@us.ibm.com \
    --cc=Louis.Rilling@kerlabs.com \
    --cc=containers@lists.osdl.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=matthltc@us.ibm.com \
    --cc=orenl@cs.columbia.edu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.