* [PATCH] user-ns: Nested pidns support (v3)
2010-03-23 5:18 [PATCH] linux-cr: nested pid namespaces (v3) Serge E. Hallyn
@ 2010-03-23 5:20 ` Serge E. Hallyn
2010-03-23 7:14 ` [PATCH] linux-cr: nested pid namespaces (v3) Louis Rilling
1 sibling, 0 replies; 6+ messages in thread
From: Serge E. Hallyn @ 2010-03-23 5:20 UTC (permalink / raw)
To: Oren Laadan, Louis.Rilling, Matt Helsley; +Cc: lkml, Linux Containers
Support restart of nested pid namespaces. Parse the ckpt_vpid
array to decide the vpids to specify for each task's eclone().
Changelog:
Mar 22: Some bugfixes to handle a more complex testcase,
and accept array of __s32 instead of array of struct
cktp_vpid from kernel.
Signed-off-by: Serge Hallyn <serue@us.ibm.com>
---
include/linux/checkpoint.h | 2 +-
include/linux/checkpoint_hdr.h | 11 +++
restart.c | 187 ++++++++++++++++++++++++++++++++++++---
3 files changed, 184 insertions(+), 16 deletions(-)
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 53b8b2c..8d021b9 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -14,7 +14,7 @@
* distribution for more details.
*/
-#define CHECKPOINT_VERSION 5
+#define CHECKPOINT_VERSION 6
/* checkpoint user flags */
#define CHECKPOINT_SUBTREE 0x1
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index e8eaf23..27c3f92 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -111,6 +111,8 @@ enum {
#define CKPT_HDR_GROUPINFO CKPT_HDR_GROUPINFO
CKPT_HDR_TASK_CREDS,
#define CKPT_HDR_TASK_CREDS CKPT_HDR_TASK_CREDS
+ CKPT_HDR_VPIDS,
+#define CKPT_HDR_VPIDS CKPT_HDR_VPIDS
/* 201-299: reserved for arch-dependent */
@@ -321,11 +323,20 @@ struct ckpt_hdr_tree {
} __attribute__((aligned(8)));
struct ckpt_pids {
+ /* these pids are in root_nsproxy's pid ns */
__s32 vpid;
__s32 vppid;
__s32 vtgid;
__s32 vpgid;
__s32 vsid;
+ __s32 rsid; /* real pid - in checkpointer's pid_ns */
+ __s32 depth; /* pidns depth */
+} __attribute__((aligned(8)));
+
+/* number of vpids */
+struct ckpt_hdr_vpids {
+ struct ckpt_hdr h;
+ __s32 nr_vpids;
} __attribute__((aligned(8)));
/* pids */
diff --git a/restart.c b/restart.c
index 0c74bb6..608750e 100644
--- a/restart.c
+++ b/restart.c
@@ -244,6 +244,9 @@ struct task {
struct task *phantom; /* pointer to place-holdler task (if any) */
+ int vidx; /* index into vpid array, -1 if none */
+ int piddepth;
+
pid_t pid; /* process IDs, our bread-&-butter */
pid_t ppid;
pid_t tgid;
@@ -272,6 +275,7 @@ struct ckpt_ctx {
int pipe_in;
int pipe_out;
int pids_nr;
+ int vpids_nr;
int pipe_child[2]; /* for children to report status */
int pipe_feed[2]; /* for feeder to provide input */
@@ -279,6 +283,7 @@ struct ckpt_ctx {
struct ckpt_pids *pids_arr;
struct ckpt_pids *copy_arr;
+ __s32 *vpids_arr;
struct task *tasks_arr;
int tasks_nr;
@@ -291,6 +296,7 @@ struct ckpt_ctx {
char header_arch[BUFSIZE];
char container[BUFSIZE];
char tree[BUFSIZE];
+ char vpids[BUFSIZE];
char buf[BUFSIZE];
struct app_restart_args *args;
@@ -316,6 +322,7 @@ static int ckpt_remount_devpts(struct ckpt_ctx *ctx);
static int ckpt_build_tree(struct ckpt_ctx *ctx);
static int ckpt_init_tree(struct ckpt_ctx *ctx);
+static int assign_vpids(struct ckpt_ctx *ctx);
static int ckpt_set_creator(struct ckpt_ctx *ctx, struct task *task);
static int ckpt_placeholder_task(struct ckpt_ctx *ctx, struct task *task);
static int ckpt_propagate_session(struct ckpt_ctx *ctx, struct task *session);
@@ -339,6 +346,7 @@ static int ckpt_write_header(struct ckpt_ctx *ctx);
static int ckpt_write_header_arch(struct ckpt_ctx *ctx);
static int ckpt_write_container(struct ckpt_ctx *ctx);
static int ckpt_write_tree(struct ckpt_ctx *ctx);
+static int ckpt_write_vpids(struct ckpt_ctx *ctx);
static int _ckpt_read(int fd, void *buf, int count);
static int ckpt_read(int fd, void *buf, int count);
@@ -350,6 +358,7 @@ static int ckpt_read_header(struct ckpt_ctx *ctx);
static int ckpt_read_header_arch(struct ckpt_ctx *ctx);
static int ckpt_read_container(struct ckpt_ctx *ctx);
static int ckpt_read_tree(struct ckpt_ctx *ctx);
+static int ckpt_read_vpids(struct ckpt_ctx *ctx);
static int hash_init(struct ckpt_ctx *ctx);
static void hash_exit(struct ckpt_ctx *ctx);
@@ -883,6 +892,12 @@ int app_restart(struct app_restart_args *args)
exit(1);
}
+ ret = ckpt_read_vpids(&ctx);
+ if (ret < 0) {
+ ckpt_perror("read c/r tree");
+ exit(1);
+ }
+
/* build creator-child-relationship tree */
if (hash_init(&ctx) < 0)
exit(1);
@@ -891,6 +906,10 @@ int app_restart(struct app_restart_args *args)
if (ret < 0)
exit(1);
+ ret = assign_vpids(&ctx);
+ if (ret < 0)
+ exit(1);
+
ret = ckpt_fork_feeder(&ctx);
if (ret < 0)
exit(1);
@@ -1218,13 +1237,13 @@ static int ckpt_coordinator_pidns(struct ckpt_ctx *ctx)
return ret;
}
-#else
+#else /* CLONE_NEWPID */
static int ckpt_coordinator_pidns(struct ckpt_ctx *ctx)
{
ckpt_err("logical error: ckpt_coordinator_pidns unexpected\n");
exit(1);
}
-#endif
+#endif /* CLONE_NEWPID */
static int ckpt_coordinator(struct ckpt_ctx *ctx)
{
@@ -2050,8 +2069,8 @@ static pid_t ckpt_fork_child(struct ckpt_ctx *ctx, struct task *child)
struct clone_args clone_args;
genstack stk;
unsigned long flags = SIGCHLD;
- size_t nr_pids = 1;
pid_t pid = 0;
+ pid_t *pids = &pid;
ckpt_dbg("forking child vpid %d flags %#x\n", child->pid, child->flags);
@@ -2067,29 +2086,76 @@ static pid_t ckpt_fork_child(struct ckpt_ctx *ctx, struct task *child)
flags |= CLONE_PARENT;
}
+ memset(&clone_args, 0, sizeof(clone_args));
+ clone_args.nr_pids = 1;
/* select pid if --pids, otherwise it's 0 */
- if (ctx->args->pids)
- pid = child->pid;
+ if (ctx->args->pids) {
+ int i, depth = child->piddepth + 1;
-#ifdef CLONE_NEWPID
- /* but for new pidns, don't specify a pid */
- if (child->flags & TASK_NEWPID) {
- flags |= CLONE_NEWPID;
- pid = 0;
- }
+ clone_args.nr_pids = depth;
+ pids = malloc(sizeof(pid_t) * depth);
+ if (!pids) {
+ perror("ckpt_fork_child pids malloc");
+ return -1;
+ }
+
+ memset(pids, 0, sizeof(pid_t) * depth);
+ pids[0] = child->pid;
+ int j;
+ for (i = child->piddepth-1, j=0; i >= 0; i--, j++)
+ pids[j+1] = ctx->vpids_arr[child->vidx + j];
+
+#ifndef CLONE_NEWPID
+ if (child->piddepth > child->creator->piddepth) {
+ ckpt_err("nested pidns but CLONE_NEWPID undefined");
+ errno = -EINVAL;
+ return -1;
+ } else if (child->flags & TASK_NEWPID) {
+ ckpt_err("TASK_NEWPID set but CLONE_NEWPID undefined");
+ errno = -EINVAL;
+ return -1;
+ }
+#else /* CLONE_NEWPID */
+ if (child->piddepth > child->creator->piddepth) {
+ child->flags |= TASK_NEWPID;
+ flags |= CLONE_NEWPID;
+ clone_args.nr_pids--;
+ } else if (child->flags & TASK_NEWPID) {
+ /* The TASK_NEWPID could have been set for root task */
+ pids[0] = 0;
+ flags |= CLONE_NEWPID;
+ }
+ if (flags & CLONE_NEWPID && !ctx->args->pidns) {
+ ckpt_err("Must use --pidns for nested pidns container");
+ errno = -EINVAL;
+ return -1;
+ }
+#if 0
+ if (flags & CLONE_NEWPID)
+ clone_args.nr_pids--;
#endif
+#endif /* CLONE_NEWPID */
+ }
if (child->flags & (TASK_SIBLING | TASK_THREAD))
child->real_parent = getppid();
else
child->real_parent = _getpid();
- memset(&clone_args, 0, sizeof(clone_args));
clone_args.child_stack = (unsigned long)genstack_base(stk);
clone_args.child_stack_size = genstack_size(stk);
- clone_args.nr_pids = nr_pids;
- pid = eclone(ckpt_fork_stub, child, flags, &clone_args, &pid);
+ int who;
+
+ who = ((void *)child - (void *) &ctx->tasks_arr[0]) / sizeof(struct task);
+ ckpt_dbg("task %d forking with flags %lx numpids %d\n",
+ child->pid, flags, clone_args.nr_pids);
+ int i;
+ for (i=0; i<clone_args.nr_pids; i++)
+ ckpt_dbg("task %d pid[%d]=%d\n", child->pid, i, pids[i]);
+ pid = eclone(ckpt_fork_stub, child, flags, &clone_args, pids);
+ if (pids != &pid)
+ free(pids);
if (pid < 0) {
ckpt_perror("eclone");
genstack_release(stk);
@@ -2269,6 +2335,9 @@ static int ckpt_do_feeder(void *data)
if (ckpt_write_tree(ctx) < 0)
ckpt_abort(ctx, "write c/r tree");
+ if (ckpt_write_vpids(ctx) < 0)
+ ckpt_abort(ctx, "write vpids");
+
/* read rest -> write rest */
if (ctx->args->inspect)
ckpt_read_write_inspect(ctx);
@@ -2461,6 +2530,8 @@ static int ckpt_read_obj(struct ckpt_ctx *ctx,
errno = EINVAL;
return -1;
}
+ if (h->len == sizeof(*h))
+ return 0;
return ckpt_read(STDIN_FILENO, buf, h->len - sizeof(*h));
}
@@ -2609,8 +2680,75 @@ static int ckpt_read_tree(struct ckpt_ctx *ctx)
}
ret = ckpt_read_obj_ptr(ctx, ctx->pids_arr, len, CKPT_HDR_BUFFER);
- if (ret < 0)
+ if (ret < 0) {
free(ctx->pids_arr);
+ return ret;
+ }
+
+ return ret;
+}
+
+/* set the vpids pointers in all the tasks */
+static int assign_vpids(struct ckpt_ctx *ctx)
+{
+ int d, hidx, tidx;
+ struct task *t;
+
+ for (hidx = 0, tidx = 0; tidx < ctx->pids_nr; tidx++) {
+ t = &ctx->tasks_arr[tidx];
+ d = t->piddepth = ctx->pids_arr[tidx].depth;
+ if (!d) {
+ ckpt_dbg("task[%d].vidx = -1\n", tidx);
+ t->vidx = -1;
+ continue;
+ }
+ t->vidx = hidx;
+ ckpt_dbg("task[%d].vidx = %d (depth %d, rpid %d)\n",
+ tidx, hidx, t->piddepth, ctx->pids_arr[tidx].vpid);
+ int i;
+ for (i=0; i<t->piddepth; i++)
+ ckpt_dbg("task[%d].vpid[%d] = %d\n", tidx, i,
+ ctx->vpids_arr[hidx+i]);
+ hidx += d;
+ if (hidx > ctx->vpids_nr) {
+ ckpt_err("Error parsing vpids array");
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static int ckpt_read_vpids(struct ckpt_ctx *ctx)
+{
+ struct ckpt_hdr_vpids *h;
+ int len, ret;
+
+ h = (struct ckpt_hdr_vpids *) ctx->vpids;
+ ret = ckpt_read_obj_type(ctx, h, sizeof(*h), CKPT_HDR_VPIDS);
+ if (ret < 0)
+ return ret;
+
+ ckpt_dbg("number of vpids: %d\n", h->nr_vpids);
+
+ if (h->nr_vpids < 0) {
+ ckpt_err("invalid number of vpids %d", h->nr_vpids);
+ errno = EINVAL;
+ return -1;
+ }
+ ctx->vpids_nr = h->nr_vpids;
+ if (!ctx->vpids_nr)
+ return 0;
+
+ len = sizeof(__s32) * ctx->vpids_nr;
+
+ ctx->vpids_arr = malloc(len);
+ if (!ctx->pids_arr)
+ return -1;
+
+ ret = ckpt_read_obj_ptr(ctx, ctx->vpids_arr, len, CKPT_HDR_BUFFER);
+ if (ret < 0)
+ free(ctx->vpids_arr);
return ret;
}
@@ -2685,6 +2823,25 @@ static int ckpt_write_tree(struct ckpt_ctx *ctx)
return 0;
}
+static int ckpt_write_vpids(struct ckpt_ctx *ctx)
+{
+ struct ckpt_hdr_vpids *h;
+ int len;
+
+ h = (struct ckpt_hdr_vpids *) ctx->vpids;
+ if (ckpt_write_obj(ctx, (struct ckpt_hdr *) h) < 0)
+ ckpt_abort(ctx, "write vpids hdr");
+
+ if (!ctx->vpids_nr)
+ return 0;
+ len = sizeof(__s32) * ctx->vpids_nr;
+ if (ckpt_write_obj_ptr(ctx, ctx->vpids_arr, len, CKPT_HDR_BUFFER) < 0)
+ ckpt_abort(ctx, "write vpids");
+ ckpt_dbg("wrote %d bytes for %d vpids\n", len, ctx->vpids_nr);
+
+ return 0;
+}
+
/*
* a simple hash implementation
*/
--
1.7.0
^ permalink raw reply related [flat|nested] 6+ messages in thread* Re: [PATCH] linux-cr: nested pid namespaces (v3)
2010-03-23 5:18 [PATCH] linux-cr: nested pid namespaces (v3) Serge E. Hallyn
2010-03-23 5:20 ` [PATCH] user-ns: Nested pidns support (v3) Serge E. Hallyn
@ 2010-03-23 7:14 ` Louis Rilling
2010-03-23 13:52 ` Serge E. Hallyn
2010-03-23 14:46 ` Serge E. Hallyn
1 sibling, 2 replies; 6+ messages in thread
From: Louis Rilling @ 2010-03-23 7:14 UTC (permalink / raw)
To: Serge E. Hallyn; +Cc: Oren Laadan, Matt Helsley, Linux Containers, lkml
[-- Attachment #1: Type: text/plain, Size: 18347 bytes --]
On Tue, Mar 23, 2010 at 12:18:39AM -0500, Serge E. Hallyn wrote:
> Support checkpoint and restart of tasks in nested pid namespaces.
> We keep the original single pids_array to minimize memory
> allocations. The pids array entries are augmented with a pidns
> depth (relative to the container init's pidns, and an "rpid" which
> is the pid in the checkpointer's pidns (or 0 if no valid pid exists).
> The rpid will be used by userspace to gather more information (like
> /proc/$$/mountinfo) after the kernel sys_checkpoint. If any tasks
> are in nested pid namespace, another single array will hold all of
> the vpids. At restart those are used by userspace to determine how
> to call eclone(). Kernel ignores them.
>
> This patch also adds 'rpid' to struct ckpt_hdr_pids, which is not
> needed for nested pid_ns support, but will be needed for the
> userspace checkpointer to gather additional information (i.e.
> /proc/pid/mountinfo) after sys_checkpoint() completes.
>
> Changelog:
> Mar 22:
> Use Louis Rilling's smarter ckpt_vpids algorithm
> verbatim, to handle pid_ns depths > CKPT_HDR_PIDS_CHUNK,
> as well as fix an apparent bug in my original code.
>
> As Louis suggested, use task_active_pid_ns() rather than
> task->nsproxy->pid_ns. In fact it's a must, bc the
> checkpointed task may be dead and have NULL
> task->nsproxy->pid_ns.
Hm, if task can be dead, then there is a much bigger issue:
task->nsproxy is NULL. Or did I miss something?
To me the real reason is to anticipate pid namespace unsharing. And this
together with setns() will need to re-consider much of the namespace C/R
logic imho. For instance, checkpoint could be done from a foreign task
having entered the container, leak detection should take such foreign
tasks into account (see example below), etc.
>
> Oren: Add spinlock for nsproxy->pidns; use
> ckpt_read_consume() to consume vpids; and use __s32 instead
> of ckpt_vpid struct
>
> Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
> ---
> checkpoint/checkpoint.c | 123 ++++++++++++++++++++++++++++++++++----
> checkpoint/process.c | 22 ++++++-
> checkpoint/restart.c | 43 ++++++++++++-
> checkpoint/sys.c | 2 +
> include/linux/checkpoint.h | 2 +-
> include/linux/checkpoint_hdr.h | 14 ++++
> include/linux/checkpoint_types.h | 3 +
> kernel/nsproxy.c | 9 ++-
> 8 files changed, 195 insertions(+), 23 deletions(-)
>
> diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
> index f27af41..fd61a80 100644
> --- a/checkpoint/checkpoint.c
> +++ b/checkpoint/checkpoint.c
> @@ -27,6 +27,7 @@
> #include <linux/deferqueue.h>
> #include <linux/checkpoint.h>
> #include <linux/checkpoint_hdr.h>
> +#include <linux/pid_namespace.h>
>
> /* unique checkpoint identifier (FIXME: should be per-container ?) */
> static atomic_t ctx_count = ATOMIC_INIT(0);
> @@ -242,6 +243,7 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
> struct task_struct *root = ctx->root_task;
> struct nsproxy *nsproxy;
> int ret = 0;
> + struct pid_namespace *pidns;
>
> ckpt_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns));
>
> @@ -293,10 +295,15 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
> _ckpt_err(ctx, -EPERM, "%(T)Nested net_ns unsupported\n");
> ret = -EPERM;
> }
> - /* no support for >1 private pidns */
> - if (nsproxy->pid_ns != ctx->root_nsproxy->pid_ns) {
> - _ckpt_err(ctx, -EPERM, "%(T)Nested pid_ns unsupported\n");
> - ret = -EPERM;
> + /* pidns must be descendent of root_nsproxy */
> + pidns = nsproxy->pid_ns;
In case of unshared pid namespace, task_active_pid_ns(t) should be checked
instead of t->nsproxy->pid_ns: we can't checkpoint a foreign task.
Thanks,
Louis
> + while (pidns != ctx->root_nsproxy->pid_ns) {
> + if (pidns == &init_pid_ns) {
> + ret = -EPERM;
> + _ckpt_err(ctx, ret, "%(T)stranger pid_ns\n");
> + break;
> + }
> + pidns = pidns->parent;
> }
> rcu_read_unlock();
>
> @@ -305,15 +312,19 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
>
> #define CKPT_HDR_PIDS_CHUNK 256
>
> +/*
> + * Write the pids in ctx->root_nsproxy->pidns. This info is
> + * needed at restart to unambiguously dereference tasks.
> + */
> static int checkpoint_pids(struct ckpt_ctx *ctx)
> {
> struct ckpt_pids *h;
> - struct pid_namespace *ns;
> + struct pid_namespace *root_pidns;
> struct task_struct *task;
> struct task_struct **tasks_arr;
> int nr_tasks, n, pos = 0, ret = 0;
>
> - ns = ctx->root_nsproxy->pid_ns;
> + root_pidns = ctx->root_nsproxy->pid_ns;
> tasks_arr = ctx->tasks_arr;
> nr_tasks = ctx->nr_tasks;
> BUG_ON(nr_tasks <= 0);
> @@ -331,15 +342,21 @@ static int checkpoint_pids(struct ckpt_ctx *ctx)
> do {
> rcu_read_lock();
> for (n = 0; n < min(nr_tasks, CKPT_HDR_PIDS_CHUNK); n++) {
> + struct pid_namespace *task_pidns;
> task = tasks_arr[pos];
>
> - h[n].vpid = task_pid_nr_ns(task, ns);
> - h[n].vtgid = task_tgid_nr_ns(task, ns);
> - h[n].vpgid = task_pgrp_nr_ns(task, ns);
> - h[n].vsid = task_session_nr_ns(task, ns);
> - h[n].vppid = task_tgid_nr_ns(task->real_parent, ns);
> + h[n].vpid = task_pid_nr_ns(task, root_pidns);
> + h[n].vtgid = task_tgid_nr_ns(task, root_pidns);
> + h[n].vpgid = task_pgrp_nr_ns(task, root_pidns);
> + h[n].vsid = task_session_nr_ns(task, root_pidns);
> + h[n].vppid = task_tgid_nr_ns(task->real_parent,
> + root_pidns);
> + task_pidns = task_active_pid_ns(task);
> + h[n].rpid = task_pid_vnr(task);
> + h[n].depth = task_pidns->level - root_pidns->level;
> ckpt_debug("task[%d]: vpid %d vtgid %d parent %d\n",
> pos, h[n].vpid, h[n].vtgid, h[n].vppid);
> + ctx->nr_vpids += h[n].depth;
> pos++;
> }
> rcu_read_unlock();
> @@ -356,6 +373,71 @@ static int checkpoint_pids(struct ckpt_ctx *ctx)
> return ret;
> }
>
> +static int checkpoint_vpids(struct ckpt_ctx *ctx)
> +{
> + __s32 *h; /* vpid array */
> + struct pid_namespace *root_pidns, *task_pidns = NULL, *active_pidns;
> + struct task_struct *task;
> + int ret, nr_tasks = ctx->nr_tasks;
> + int tidx = 0, /* index into task array */
> + hidx = 0, /* pids written into current __s32 chunk */
> + vidx = 0; /* vpid index for current task */
> +
> + root_pidns = ctx->root_nsproxy->pid_ns;
> + nr_tasks = ctx->nr_tasks;
> +
> + ret = ckpt_write_obj_type(ctx, NULL,
> + sizeof(*h) * ctx->nr_vpids,
> + CKPT_HDR_BUFFER);
> + if (ret < 0)
> + return ret;
> +
> + h = ckpt_hdr_get(ctx, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
> + if (!h)
> + return -ENOMEM;
> +
> + do {
> + rcu_read_lock();
> + while (tidx < nr_tasks && hidx < CKPT_HDR_PIDS_CHUNK) {
> + int nsdelta;
> +
> + task = ctx->tasks_arr[tidx];
> + active_pidns = task_active_pid_ns(task);
> + nsdelta = active_pidns->level - root_pidns->level;
> + if (hidx + nsdelta - vidx > CKPT_HDR_PIDS_CHUNK)
> + /*
> + * We will release rcu before recording the
> + * remaining vpids, but neither task nor its
> + * pid can disappear.
> + */
> + nsdelta = CKPT_HDR_PIDS_CHUNK - hidx + vidx;
> +
> + if (vidx == 0)
> + task_pidns = active_pidns;
> + for (; vidx < nsdelta; vidx++) {
> + h[hidx] = task_pid_nr_ns(task, task_pidns);
> + hidx++;
> + task_pidns = task_pidns->parent;
> + }
> +
> + if (task_pidns == root_pidns) {
> + tidx++;
> + vidx = 0;
> + }
> + }
> + rcu_read_unlock();
> +
> + ret = ckpt_kwrite(ctx, h, hidx * sizeof(*h));
> + if (ret < 0)
> + break;
> +
> + hidx = 0;
> + } while (tidx < nr_tasks);
> +
> + _ckpt_hdr_put(ctx, h, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
> + return ret;
> +}
> +
> static int collect_objects(struct ckpt_ctx *ctx)
> {
> int n, ret = 0;
> @@ -466,6 +548,7 @@ static int build_tree(struct ckpt_ctx *ctx)
> static int checkpoint_tree(struct ckpt_ctx *ctx)
> {
> struct ckpt_hdr_tree *h;
> + struct ckpt_hdr_vpids *hvpids;
> int ret;
>
> h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TREE);
> @@ -480,7 +563,23 @@ static int checkpoint_tree(struct ckpt_ctx *ctx)
> return ret;
>
> ret = checkpoint_pids(ctx);
> - return ret;
> + if (ret < 0)
> + return ret;
> +
> + hvpids = ckpt_hdr_get_type(ctx, sizeof(*hvpids), CKPT_HDR_VPIDS);
> + if (!hvpids)
> + return -ENOMEM;
> +
> + hvpids->nr_vpids = ctx->nr_vpids;
> +
> + ret = ckpt_write_obj(ctx, &hvpids->h);
> + ckpt_hdr_put(ctx, hvpids);
> + if (ret < 0)
> + return ret;
> + if (ctx->nr_vpids == 0)
> + return 0;
> +
> + return checkpoint_vpids(ctx);
> }
>
> static struct task_struct *get_freezer_task(struct task_struct *root_task)
> diff --git a/checkpoint/process.c b/checkpoint/process.c
> index f917112..602ba9f 100644
> --- a/checkpoint/process.c
> +++ b/checkpoint/process.c
> @@ -22,7 +22,7 @@
> #include <linux/checkpoint.h>
> #include <linux/checkpoint_hdr.h>
> #include <linux/syscalls.h>
> -
> +#include <linux/pid_namespace.h>
>
> pid_t ckpt_pid_nr(struct ckpt_ctx *ctx, struct pid *pid)
> {
> @@ -51,7 +51,7 @@ struct pid *_ckpt_find_pgrp(struct ckpt_ctx *ctx, pid_t pgid)
> * Find the owner process of this pgid (it must exist
> * if pgrp exists). It must be a thread group leader.
> */
> - pgrp = find_vpid(pgid);
> + pgrp = find_pid_ns(pgid, ctx->root_nsproxy->pid_ns);
> p = pid_task(pgrp, PIDTYPE_PID);
> if (!p || !thread_group_leader(p))
> return NULL;
> @@ -561,6 +561,13 @@ static int restore_task_struct(struct ckpt_ctx *ctx)
> return ret;
> }
>
> +/*
> + * restart is currently serialized, but if/when that changes we want
> + * to make sure that setting nsproxy->pidns in restore_task_ns() is only
> + * done once. That's what checkpoint_nslock is for
> + */
> +DEFINE_SPINLOCK(checkpoint_nslock);
> +
> static int restore_task_ns(struct ckpt_ctx *ctx)
> {
> struct ckpt_hdr_task_ns *h;
> @@ -578,6 +585,10 @@ static int restore_task_ns(struct ckpt_ctx *ctx)
> }
>
> if (nsproxy != task_nsproxy(current)) {
> + spin_lock(&checkpoint_nslock);
> + if (!nsproxy->pid_ns)
> + nsproxy->pid_ns = get_pid_ns(current->nsproxy->pid_ns);
> + spin_unlock(&checkpoint_nslock);
> get_nsproxy(nsproxy);
> switch_task_namespaces(current, nsproxy);
> }
> @@ -829,8 +840,8 @@ static int restore_task_pgid(struct ckpt_ctx *ctx)
>
> pgid = ctx->pids_arr[ctx->active_pid].vpgid;
>
> - if (pgid == task_pgrp_vnr(task)) /* nothing to do */
> - return 0;
> + if (pgid == task_pgrp_nr_ns(task, ctx->root_nsproxy->pid_ns))
> + return 0; /* nothing to do */
>
> if (task->signal->leader) /* (2) */
> return -EINVAL;
> @@ -850,6 +861,9 @@ static int restore_task_pgid(struct ckpt_ctx *ctx)
> if (ctx->uflags & RESTART_TASKSELF)
> ret = 0;
>
> + if (ret < 0)
> + ckpt_err(ctx, ret, "setting pgid\n");
> +
> return ret;
> }
>
> diff --git a/checkpoint/restart.c b/checkpoint/restart.c
> index 6a9644d..c25ce88 100644
> --- a/checkpoint/restart.c
> +++ b/checkpoint/restart.c
> @@ -23,6 +23,7 @@
> #include <asm/syscall.h>
> #include <linux/elf.h>
> #include <linux/deferqueue.h>
> +#include <linux/pid_namespace.h>
> #include <linux/checkpoint.h>
> #include <linux/checkpoint_hdr.h>
>
> @@ -760,6 +761,33 @@ static int restore_read_tree(struct ckpt_ctx *ctx)
> return ret;
> }
>
> +/*
> + * read all the vpids - we don't actually care about them,
> + * userspace did
> + */
> +static int restore_slurp_vpids(struct ckpt_ctx *ctx)
> +{
> + struct ckpt_hdr_vpids *h;
> + int size, ret;
> +
> + h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_VPIDS);
> + if (IS_ERR(h))
> + return PTR_ERR(h);
> + ctx->nr_vpids = h->nr_vpids;
> + ckpt_hdr_put(ctx, h);
> +
> + if (!ctx->nr_vpids)
> + return 0;
> +
> + size = sizeof(__s32) * ctx->nr_vpids;
> + if (size < 0) /* overflow ? */
> + return -EINVAL;
> +
> + ret = ckpt_read_consume(ctx, size + sizeof(struct ckpt_hdr),
> + CKPT_HDR_BUFFER);
> + return ret;
> +}
> +
> static inline int all_tasks_activated(struct ckpt_ctx *ctx)
> {
> return (ctx->active_pid == ctx->nr_pids);
> @@ -848,7 +876,8 @@ static int restore_activate_next(struct ckpt_ctx *ctx)
> pid = get_active_pid(ctx);
>
> rcu_read_lock();
> - task = find_task_by_pid_ns(pid, ctx->root_nsproxy->pid_ns);
> + task = find_task_by_pid_ns(pid,
> + task_active_pid_ns(ctx->root_task));
> /* target task must have same restart context */
> if (task && task->checkpoint_ctx == ctx)
> wake_up_process(task);
> @@ -870,7 +899,7 @@ static int restore_activate_next(struct ckpt_ctx *ctx)
>
> static int wait_task_active(struct ckpt_ctx *ctx)
> {
> - pid_t pid = task_pid_vnr(current);
> + pid_t pid = task_pid_nr_ns(current, task_active_pid_ns(ctx->root_task));
> int ret;
>
> ckpt_debug("pid %d waiting\n", pid);
> @@ -886,7 +915,8 @@ static int wait_task_active(struct ckpt_ctx *ctx)
>
> static int wait_task_sync(struct ckpt_ctx *ctx)
> {
> - ckpt_debug("pid %d syncing\n", task_pid_vnr(current));
> + ckpt_debug("pid %d syncing\n",
> + task_pid_nr_ns(current, task_active_pid_ns(ctx->root_task)));
> wait_event_interruptible(ctx->waitq, ckpt_test_complete(ctx));
> ckpt_debug("task sync done (errno %d)\n", ctx->errno);
> if (ckpt_test_error(ctx))
> @@ -1160,7 +1190,7 @@ static struct task_struct *choose_root_task(struct ckpt_ctx *ctx, pid_t pid)
>
> read_lock(&tasklist_lock);
> list_for_each_entry(task, ¤t->children, sibling) {
> - if (task_pid_vnr(task) == pid) {
> + if (task_pid_nr_ns(task, ctx->coord_pidns) == pid) {
> get_task_struct(task);
> ctx->root_task = task;
> ctx->root_pid = pid;
> @@ -1237,6 +1267,11 @@ static int do_restore_coord(struct ckpt_ctx *ctx, pid_t pid)
> if (ret < 0)
> return ret;
>
> + ret = restore_slurp_vpids(ctx);
> + ckpt_debug("read vpids %d\n", ret);
> + if (ret < 0)
> + return ret;
> +
> if ((ctx->uflags & RESTART_TASKSELF) && ctx->nr_pids != 1)
> return -EINVAL;
>
> diff --git a/checkpoint/sys.c b/checkpoint/sys.c
> index 9e9df9b..45d3e7a 100644
> --- a/checkpoint/sys.c
> +++ b/checkpoint/sys.c
> @@ -22,6 +22,7 @@
> #include <linux/capability.h>
> #include <linux/checkpoint.h>
> #include <linux/deferqueue.h>
> +#include <linux/pid_namespace.h>
>
> /*
> * ckpt_unpriv_allowed - sysctl controlled.
> @@ -276,6 +277,7 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
> ctx->uflags = uflags;
> ctx->kflags = kflags;
> ctx->ktime_begin = ktime_get();
> + ctx->coord_pidns = get_pid_ns(current->nsproxy->pid_ns);
>
> atomic_set(&ctx->refcount, 0);
> INIT_LIST_HEAD(&ctx->pgarr_list);
> diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
> index 792b523..e860bf5 100644
> --- a/include/linux/checkpoint.h
> +++ b/include/linux/checkpoint.h
> @@ -10,7 +10,7 @@
> * distribution for more details.
> */
>
> -#define CHECKPOINT_VERSION 5
> +#define CHECKPOINT_VERSION 6
>
> /* checkpoint user flags */
> #define CHECKPOINT_SUBTREE 0x1
> diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
> index 41412d1..20c90b3 100644
> --- a/include/linux/checkpoint_hdr.h
> +++ b/include/linux/checkpoint_hdr.h
> @@ -117,6 +117,8 @@ enum {
> #define CKPT_HDR_GROUPINFO CKPT_HDR_GROUPINFO
> CKPT_HDR_TASK_CREDS,
> #define CKPT_HDR_TASK_CREDS CKPT_HDR_TASK_CREDS
> + CKPT_HDR_VPIDS,
> +#define CKPT_HDR_VPIDS CKPT_HDR_VPIDS
>
> /* 201-299: reserved for arch-dependent */
>
> @@ -327,11 +329,23 @@ struct ckpt_hdr_tree {
> } __attribute__((aligned(8)));
>
> struct ckpt_pids {
> + /* These pids are in the root_nsproxy's pid ns */
> __s32 vpid;
> __s32 vppid;
> __s32 vtgid;
> __s32 vpgid;
> __s32 vsid;
> + /* rpid is the real pid, in checkpointer's pidns. This is
> + * so checkpointer in userspace can get more info about the
> + * task (i.e. /proc/pid/mountinfo) */
> + __s32 rpid;
> + __s32 depth; /* pid namespace depth relative to container init */
> +} __attribute__((aligned(8)));
> +
> +/* number of vpids */
> +struct ckpt_hdr_vpids {
> + struct ckpt_hdr h;
> + __s32 nr_vpids;
> } __attribute__((aligned(8)));
>
> /* pids */
> diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
> index ecd3e91..2fb79cf 100644
> --- a/include/linux/checkpoint_types.h
> +++ b/include/linux/checkpoint_types.h
> @@ -72,6 +72,9 @@ struct ckpt_ctx {
> struct task_struct **tasks_arr; /* array of all tasks [checkpoint] */
> int nr_tasks; /* size of tasks array */
>
> + int nr_vpids;
> + struct pid_namespace *coord_pidns; /* coordinator pid_ns */
> +
> /* [multi-process restart] */
> struct ckpt_pids *pids_arr; /* array of all pids [restart] */
> int nr_pids; /* size of pids array */
> diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
> index 0da0d83..6d86240 100644
> --- a/kernel/nsproxy.c
> +++ b/kernel/nsproxy.c
> @@ -364,8 +364,13 @@ static struct nsproxy *do_restore_ns(struct ckpt_ctx *ctx)
> get_net(net_ns);
> nsproxy->net_ns = net_ns;
>
> - get_pid_ns(current->nsproxy->pid_ns);
> - nsproxy->pid_ns = current->nsproxy->pid_ns;
> + /*
> + * The pid_ns will get assigned the first time that we
> + * assign the nsproxy to a task. The task had unshared
> + * its pid_ns in userspace before calling restart, and
> + * we want to keep using that pid_ns.
> + */
> + nsproxy->pid_ns = NULL;
> }
> out:
> if (ret < 0)
> --
> 1.6.1
>
> _______________________________________________
> Containers mailing list
> Containers@lists.linux-foundation.org
> https://lists.linux-foundation.org/mailman/listinfo/containers
--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes
[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 197 bytes --]
^ permalink raw reply [flat|nested] 6+ messages in thread