* [PATCH] linux-cr: Handle nested pid namespaces
[not found] ` <1268371676-3029-1-git-send-email-serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2010-03-12 5:27 ` serue-r/Jw6+rmf7HQT0dZR+AlfA
0 siblings, 0 replies; 3+ messages in thread
From: serue-r/Jw6+rmf7HQT0dZR+AlfA @ 2010-03-12 5:27 UTC (permalink / raw)
To: orenl-eQaUEPhvms7ENvBUuze7eA; +Cc: containers-qjLDD68F18O7TbgM5vRIOg
From: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
In place of one big pids array, checkpoint one struct ckpt_hdr_pids
per task. It contains pid/ppid/etc in the root nsproxy's pidns, and
is followed by a list of all virtual pids in child pid namespaces, if
any.
When an nsproxy is created during do_restore_ns(), we don't yet set
its pid_ns; instead we wait until a task attaches that new nsproxy to
itself. I *think* the nsproxy will generally get recreated by the
task which will use it, but we may as well be sure by having the
pid_ns set when the nsproxy is first assigned.
This patch applies on top of ckpt-v19-dev-serge. With this patch
applied (and the corresponding user-cr patch), all cr_tests pass,
including a new pidns test.
Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
checkpoint/checkpoint.c | 91 +++++++++++++++++++++++--------------
checkpoint/process.c | 27 +++++++-----
checkpoint/restart.c | 59 ++++++++++++++++--------
checkpoint/sys.c | 8 +++-
include/linux/checkpoint_hdr.h | 19 +++++---
include/linux/checkpoint_types.h | 3 +
kernel/nsproxy.c | 9 +++-
7 files changed, 141 insertions(+), 75 deletions(-)
diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index b3c1c4f..d40a092 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -27,6 +27,7 @@
#include <linux/deferqueue.h>
#include <linux/checkpoint.h>
#include <linux/checkpoint_hdr.h>
+#include <linux/pid_namespace.h>
/* unique checkpoint identifier (FIXME: should be per-container ?) */
static atomic_t ctx_count = ATOMIC_INIT(0);
@@ -237,6 +238,7 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
{
struct task_struct *root = ctx->root_task;
struct nsproxy *nsproxy;
+ struct pid_namespace *pidns;
int ret = 0;
ckpt_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns));
@@ -289,66 +291,85 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
_ckpt_err(ctx, -EPERM, "%(T)Nested net_ns unsupported\n");
ret = -EPERM;
}
- /* no support for >1 private pidns */
- if (nsproxy->pid_ns != ctx->root_nsproxy->pid_ns) {
- _ckpt_err(ctx, -EPERM, "%(T)Nested pid_ns unsupported\n");
- ret = -EPERM;
+ /* pidns must be descendent of root_nsproxy */
+ pidns = nsproxy->pid_ns;
+ while (pidns != ctx->root_nsproxy->pid_ns) {
+ if (pidns == &init_pid_ns) {
+ ret = -EPERM;
+ _ckpt_err(ctx, ret, "%(T)stranger pid_ns\n");
+ break;
+ }
+ pidns = pidns->parent;
}
rcu_read_unlock();
return ret;
}
-#define CKPT_HDR_PIDS_CHUNK 256
+/* called under rcu_read_lock */
+static void copy_task(struct ckpt_hdr_pids *h, struct task_struct *t,
+ struct pid_namespace *root_pid_ns,
+ struct pid_namespace *task_pid_ns)
+{
+ int i = 0;
+ __s32 *pids;
+
+ h->pid = task_pid_nr_ns(t, root_pid_ns);
+ h->tgid = task_tgid_nr_ns(t, root_pid_ns);
+ h->pgid = task_pgrp_nr_ns(t, root_pid_ns);
+ h->sid = task_session_nr_ns(t, root_pid_ns);
+ h->ppid = task_tgid_nr_ns(t->real_parent, root_pid_ns);
+ h->rpid = task_pid_vnr(t);
+ pids = h->vpids;
+
+ while (task_pid_ns != root_pid_ns) {
+ pids[i++] = task_pid_nr_ns(t, task_pid_ns);
+ task_pid_ns = task_pid_ns->parent;
+ }
+}
static int checkpoint_pids(struct ckpt_ctx *ctx)
{
- struct ckpt_pids *h;
- struct pid_namespace *ns;
+ struct ckpt_hdr_pids *h;
+ struct pid_namespace *root_pidns;
struct task_struct *task;
struct task_struct **tasks_arr;
- int nr_tasks, n, pos = 0, ret = 0;
+ int nr_tasks, i, ret = 0;
- ns = ctx->root_nsproxy->pid_ns;
+ root_pidns = ctx->root_nsproxy->pid_ns;
tasks_arr = ctx->tasks_arr;
nr_tasks = ctx->nr_tasks;
BUG_ON(nr_tasks <= 0);
- ret = ckpt_write_obj_type(ctx, NULL,
- sizeof(*h) * nr_tasks,
- CKPT_HDR_BUFFER);
- if (ret < 0)
- return ret;
+ for (i=0; i<nr_tasks; i++) {
+ int nsdelta, size;
+ struct pid_namespace *task_pidns;
- h = ckpt_hdr_get(ctx, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
- if (!h)
- return -ENOMEM;
+ task = tasks_arr[i];
+ rcu_read_lock();
+ task_pidns = task_nsproxy(task)->pid_ns;
+ rcu_read_unlock();
+
+ nsdelta = task_pidns->level - root_pidns->level;
+ size = sizeof(*h) + nsdelta * sizeof(__s32);
+
+ h = ckpt_hdr_get_type(ctx, size, CKPT_HDR_PID);
+ if (!h)
+ return -ENOMEM;
- do {
rcu_read_lock();
- for (n = 0; n < min(nr_tasks, CKPT_HDR_PIDS_CHUNK); n++) {
- task = tasks_arr[pos];
-
- h[n].vpid = task_pid_nr_ns(task, ns);
- h[n].vtgid = task_tgid_nr_ns(task, ns);
- h[n].vpgid = task_pgrp_nr_ns(task, ns);
- h[n].vsid = task_session_nr_ns(task, ns);
- h[n].vppid = task_tgid_nr_ns(task->real_parent, ns);
- ckpt_debug("task[%d]: vpid %d vtgid %d parent %d\n",
- pos, h[n].vpid, h[n].vtgid, h[n].vppid);
- pos++;
- }
+ copy_task(h, task, root_pidns, task_pidns);
rcu_read_unlock();
+ ckpt_debug("task[%d]: pid %d tgid %d parent %d\n",
+ i, h->pid, h->tgid, h->ppid);
- n = min(nr_tasks, CKPT_HDR_PIDS_CHUNK);
- ret = ckpt_kwrite(ctx, h, n * sizeof(*h));
+ ret = ckpt_write_obj(ctx, &h->h);
+ ckpt_hdr_put(ctx, h);
if (ret < 0)
break;
- nr_tasks -= n;
- } while (nr_tasks > 0);
+ }
- _ckpt_hdr_put(ctx, h, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
return ret;
}
diff --git a/checkpoint/process.c b/checkpoint/process.c
index f917112..bb44960 100644
--- a/checkpoint/process.c
+++ b/checkpoint/process.c
@@ -22,7 +22,7 @@
#include <linux/checkpoint.h>
#include <linux/checkpoint_hdr.h>
#include <linux/syscalls.h>
-
+#include <linux/pid_namespace.h>
pid_t ckpt_pid_nr(struct ckpt_ctx *ctx, struct pid *pid)
{
@@ -36,12 +36,6 @@ struct pid *_ckpt_find_pgrp(struct ckpt_ctx *ctx, pid_t pgid)
struct pid *pgrp;
if (pgid == 0) {
- /*
- * At checkpoint the pgid owner lived in an ancestor
- * pid-ns. The best we can do (sanely and safely) is
- * to examine the parent of this restart's root: if in
- * a distinct pid-ns, use its pgrp; otherwise fail.
- */
p = ctx->root_task->real_parent;
if (p->nsproxy->pid_ns == current->nsproxy->pid_ns)
return NULL;
@@ -51,7 +45,7 @@ struct pid *_ckpt_find_pgrp(struct ckpt_ctx *ctx, pid_t pgid)
* Find the owner process of this pgid (it must exist
* if pgrp exists). It must be a thread group leader.
*/
- pgrp = find_vpid(pgid);
+ pgrp = find_pid_ns(pgid, ctx->root_nsproxy->pid_ns);
p = pid_task(pgrp, PIDTYPE_PID);
if (!p || !thread_group_leader(p))
return NULL;
@@ -578,6 +572,14 @@ static int restore_task_ns(struct ckpt_ctx *ctx)
}
if (nsproxy != task_nsproxy(current)) {
+ /*
+ * This is *kinda* shady to do without any locking. However
+ * it is safe because each task is restarted separately in
+ * serial. If that ever changes, we'll need a spinlock?
+ */
+ if (!nsproxy->pid_ns)
+ nsproxy->pid_ns = get_pid_ns(current->nsproxy->pid_ns);
+
get_nsproxy(nsproxy);
switch_task_namespaces(current, nsproxy);
}
@@ -827,10 +829,10 @@ static int restore_task_pgid(struct ckpt_ctx *ctx)
if (!thread_group_leader(task)) /* (1) */
return 0;
- pgid = ctx->pids_arr[ctx->active_pid].vpgid;
+ pgid = ctx->vpgids_arr[ctx->active_pid];
- if (pgid == task_pgrp_vnr(task)) /* nothing to do */
- return 0;
+ if (pgid == task_pgrp_nr_ns(task, ctx->root_nsproxy->pid_ns))
+ return 0; /* nothing to do */
if (task->signal->leader) /* (2) */
return -EINVAL;
@@ -850,6 +852,9 @@ static int restore_task_pgid(struct ckpt_ctx *ctx)
if (ctx->uflags & RESTART_TASKSELF)
ret = 0;
+ if (ret < 0)
+ ckpt_err(ctx, ret, "setting pgid\n");
+
return ret;
}
diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index 0891952..4d1b804 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -145,7 +145,7 @@ void restore_debug_free(struct ckpt_ctx *ctx)
ckpt_debug("kflags %lu uflags %lu oflags %lu", ctx->kflags,
ctx->uflags, ctx->oflags);
for (i = 0; i < ctx->nr_pids; i++)
- ckpt_debug("task[%d] to run %d\n", i, ctx->pids_arr[i].vpid);
+ ckpt_debug("task[%d] to run %d\n", i, ctx->vpids_arr[i]);
list_for_each_entry_safe(s, p, &ctx->task_status, list) {
if (s->flags & RESTART_DBG_COORD)
@@ -417,7 +417,8 @@ void *ckpt_read_obj_type(struct ckpt_ctx *ctx, int len, int type)
h = ckpt_read_obj(ctx, len, len);
if (IS_ERR(h)) {
- ckpt_err(ctx, PTR_ERR(h), "Expecting to read type %d\n", type);
+ ckpt_err(ctx, PTR_ERR(h), "Expecting to read type %d len %d\n",
+ type, len);
return h;
}
@@ -722,34 +723,51 @@ static int restore_read_tail(struct ckpt_ctx *ctx)
return ret;
}
+#define CKPT_MAX_PIDS_SZ 99999
/* restore_read_tree - read the tasks tree into the checkpoint context */
static int restore_read_tree(struct ckpt_ctx *ctx)
{
struct ckpt_hdr_tree *h;
- int size, ret;
+ int i, size;
h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TREE);
if (IS_ERR(h))
return PTR_ERR(h);
- ret = -EINVAL;
+ ctx->nr_pids = h->nr_tasks;
+ ckpt_hdr_put(ctx, h);
+
if (h->nr_tasks <= 0)
- goto out;
+ return -EINVAL;
- ctx->nr_pids = h->nr_tasks;
- size = sizeof(*ctx->pids_arr) * ctx->nr_pids;
+ size = sizeof(pid_t) * ctx->nr_pids;
if (size <= 0) /* overflow ? */
- goto out;
+ return -EINVAL;
- ctx->pids_arr = kmalloc(size, GFP_KERNEL);
- if (!ctx->pids_arr) {
- ret = -ENOMEM;
- goto out;
+ ctx->vpids_arr = kmalloc(size, GFP_KERNEL);
+ ctx->vpgids_arr = kmalloc(size, GFP_KERNEL);
+ if (!ctx->vpids_arr || !ctx->vpgids_arr)
+ return -ENOMEM;
+
+ for (i=0; i<ctx->nr_pids; i++) {
+ struct ckpt_hdr_pids *p;
+
+ p = ckpt_read_obj(ctx, 0, CKPT_MAX_PIDS_SZ);
+ if (!p)
+ return -EINVAL;
+ if (p->h.type != CKPT_HDR_PID) {
+ ckpt_hdr_put(ctx, p);
+ return -EINVAL;
+ }
+ if (p->h.len < sizeof(*p)) {
+ ckpt_hdr_put(ctx, p);
+ return -EINVAL;
+ }
+ ctx->vpids_arr[i] = p->pid;
+ ctx->vpgids_arr[i] = p->pgid;
+ ckpt_hdr_put(ctx, p);
}
- ret = _ckpt_read_buffer(ctx, ctx->pids_arr, size);
- out:
- ckpt_hdr_put(ctx, h);
- return ret;
+ return 0;
}
static inline int all_tasks_activated(struct ckpt_ctx *ctx)
@@ -760,7 +778,7 @@ static inline int all_tasks_activated(struct ckpt_ctx *ctx)
static inline pid_t get_active_pid(struct ckpt_ctx *ctx)
{
int active = ctx->active_pid;
- return active >= 0 ? ctx->pids_arr[active].vpid : 0;
+ return active >= 0 ? ctx->vpids_arr[active] : 0;
}
static inline int is_task_active(struct ckpt_ctx *ctx, pid_t pid)
@@ -862,7 +880,7 @@ static int restore_activate_next(struct ckpt_ctx *ctx)
static int wait_task_active(struct ckpt_ctx *ctx)
{
- pid_t pid = task_pid_vnr(current);
+ pid_t pid = task_pid_nr_ns(current, ctx->root_nsproxy->pid_ns);
int ret;
ckpt_debug("pid %d waiting\n", pid);
@@ -878,7 +896,8 @@ static int wait_task_active(struct ckpt_ctx *ctx)
static int wait_task_sync(struct ckpt_ctx *ctx)
{
- ckpt_debug("pid %d syncing\n", task_pid_vnr(current));
+ ckpt_debug("pid %d syncing\n",
+ task_pid_nr_ns(current, ctx->root_nsproxy->pid_ns));
wait_event_interruptible(ctx->waitq, ckpt_test_complete(ctx));
ckpt_debug("task sync done (errno %d)\n", ctx->errno);
if (ckpt_test_error(ctx))
@@ -1152,7 +1171,7 @@ static struct task_struct *choose_root_task(struct ckpt_ctx *ctx, pid_t pid)
read_lock(&tasklist_lock);
list_for_each_entry(task, &current->children, sibling) {
- if (task_pid_vnr(task) == pid) {
+ if (task_pid_nr_ns(task, ctx->coord_pidns) == pid) {
get_task_struct(task);
ctx->root_task = task;
ctx->root_pid = pid;
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index b605784..90caf61 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -22,6 +22,7 @@
#include <linux/capability.h>
#include <linux/checkpoint.h>
#include <linux/deferqueue.h>
+#include <linux/pid_namespace.h>
/*
* ckpt_unpriv_allowed - sysctl controlled.
@@ -243,6 +244,8 @@ static void ckpt_ctx_free(struct ckpt_ctx *ctx)
if (ctx->tasks_arr)
task_arr_free(ctx);
+ if (ctx->coord_pidns)
+ put_pid_ns(ctx->coord_pidns);
if (ctx->root_nsproxy)
put_nsproxy(ctx->root_nsproxy);
if (ctx->root_task)
@@ -252,7 +255,8 @@ static void ckpt_ctx_free(struct ckpt_ctx *ctx)
free_page((unsigned long) ctx->scratch_page);
- kfree(ctx->pids_arr);
+ kfree(ctx->vpids_arr);
+ kfree(ctx->vpgids_arr);
sock_listening_list_free(&ctx->listen_sockets);
@@ -273,6 +277,8 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
ctx->kflags = kflags;
ctx->ktime_begin = ktime_get();
+ ctx->coord_pidns = get_pid_ns(current->nsproxy->pid_ns);
+
atomic_set(&ctx->refcount, 0);
INIT_LIST_HEAD(&ctx->pgarr_list);
INIT_LIST_HEAD(&ctx->pgarr_pool);
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 41412d1..7957b3b 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -117,6 +117,8 @@ enum {
#define CKPT_HDR_GROUPINFO CKPT_HDR_GROUPINFO
CKPT_HDR_TASK_CREDS,
#define CKPT_HDR_TASK_CREDS CKPT_HDR_TASK_CREDS
+ CKPT_HDR_PID,
+#define CKPT_HDR_PID CKPT_HDR_PID
/* 201-299: reserved for arch-dependent */
@@ -326,12 +328,17 @@ struct ckpt_hdr_tree {
__s32 nr_tasks;
} __attribute__((aligned(8)));
-struct ckpt_pids {
- __s32 vpid;
- __s32 vppid;
- __s32 vtgid;
- __s32 vpgid;
- __s32 vsid;
+struct ckpt_hdr_pids {
+ struct ckpt_hdr h;
+ __s32 rpid; /* pid in checkpointer's pid_ns */
+ /* The rest of these are in container init's pid_ns */
+ __s32 pid;
+ __s32 ppid;
+ __s32 tgid;
+ __s32 pgid;
+ __s32 sid;
+ /* followed by pids in pid_ns up to root->nsproxy->pid_ns */
+ __s32 vpids[0];
} __attribute__((aligned(8)));
/* pids */
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index 5d5e00d..57a7d80 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -37,6 +37,7 @@ struct ckpt_ctx {
int root_init; /* [container] root init ? */
pid_t root_pid; /* [container] root pid */
struct task_struct *root_task; /* [container] root task */
+ struct pid_namespace *coord_pidns; /* coordinator pid_ns */
struct nsproxy *root_nsproxy; /* [container] root nsproxy */
struct task_struct *root_freezer; /* [container] root task */
char lsm_name[SECURITY_NAME_MAX + 1]; /* security module at ckpt */
@@ -74,6 +75,8 @@ struct ckpt_ctx {
/* [multi-process restart] */
struct ckpt_pids *pids_arr; /* array of all pids [restart] */
+ pid_t *vpids_arr; /* array of all pids, in container pidns */
+ pid_t *vpgids_arr; /* array of all vpgids, in container pidns */
int nr_pids; /* size of pids array */
atomic_t nr_total; /* total tasks count (with ghosts) */
int active_pid; /* (next) position in pids array */
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 0da0d83..6d86240 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -364,8 +364,13 @@ static struct nsproxy *do_restore_ns(struct ckpt_ctx *ctx)
get_net(net_ns);
nsproxy->net_ns = net_ns;
- get_pid_ns(current->nsproxy->pid_ns);
- nsproxy->pid_ns = current->nsproxy->pid_ns;
+ /*
+ * The pid_ns will get assigned the first time that we
+ * assign the nsproxy to a task. The task had unshared
+ * its pid_ns in userspace before calling restart, and
+ * we want to keep using that pid_ns.
+ */
+ nsproxy->pid_ns = NULL;
}
out:
if (ret < 0)
--
1.6.1
^ permalink raw reply related [flat|nested] 3+ messages in thread
* [PATCH linux-cr] Handle nested pid namespaces
@ 2010-03-18 20:19 Serge E. Hallyn
2010-03-18 20:22 ` [PATCH] user-cr: " Serge E. Hallyn
0 siblings, 1 reply; 3+ messages in thread
From: Serge E. Hallyn @ 2010-03-18 20:19 UTC (permalink / raw)
To: Oren Laadan; +Cc: Linux Containers, lkml
[ Patch against https://www.linux-cr.org/redmine/tab/show/kernel-cr ]
In place of one big pids array, checkpoint one struct ckpt_hdr_pids
per task. It contains pid/ppid/etc in the root nsproxy's pidns, and
is followed by a list of all virtual pids in child pid namespaces, if
any.
When an nsproxy is created during do_restore_ns(), we don't yet set
its pid_ns, waiting instead until a task attaches that new nsproxy to
itself. I *think* the nsproxy will generally get recreated by the
task which will use it, but we may as well be sure by having the pid_ns
set when the nsproxy is first assigned.
This patch applies on top of ckpt-v20. With this patch applied (and
the corresponding user-cr patch), all cr_tests pass, including a new
pidns test (which is in branch pidns.1 until this patch goes into
ckpt-v20-dev).
Please apply.
Changelog:
Mar 18: bump checkpoint image format version
Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
---
checkpoint/checkpoint.c | 91 +++++++++++++++++++++++--------------
checkpoint/process.c | 27 +++++++-----
checkpoint/restart.c | 59 ++++++++++++++++--------
checkpoint/sys.c | 8 +++-
include/linux/checkpoint.h | 2 +-
include/linux/checkpoint_hdr.h | 19 +++++---
include/linux/checkpoint_types.h | 3 +
kernel/nsproxy.c | 9 +++-
8 files changed, 142 insertions(+), 76 deletions(-)
diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index f27af41..55e14c3 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -27,6 +27,7 @@
#include <linux/deferqueue.h>
#include <linux/checkpoint.h>
#include <linux/checkpoint_hdr.h>
+#include <linux/pid_namespace.h>
/* unique checkpoint identifier (FIXME: should be per-container ?) */
static atomic_t ctx_count = ATOMIC_INIT(0);
@@ -241,6 +242,7 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
{
struct task_struct *root = ctx->root_task;
struct nsproxy *nsproxy;
+ struct pid_namespace *pidns;
int ret = 0;
ckpt_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns));
@@ -293,66 +295,85 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
_ckpt_err(ctx, -EPERM, "%(T)Nested net_ns unsupported\n");
ret = -EPERM;
}
- /* no support for >1 private pidns */
- if (nsproxy->pid_ns != ctx->root_nsproxy->pid_ns) {
- _ckpt_err(ctx, -EPERM, "%(T)Nested pid_ns unsupported\n");
- ret = -EPERM;
+ /* pidns must be descendent of root_nsproxy */
+ pidns = nsproxy->pid_ns;
+ while (pidns != ctx->root_nsproxy->pid_ns) {
+ if (pidns == &init_pid_ns) {
+ ret = -EPERM;
+ _ckpt_err(ctx, ret, "%(T)stranger pid_ns\n");
+ break;
+ }
+ pidns = pidns->parent;
}
rcu_read_unlock();
return ret;
}
-#define CKPT_HDR_PIDS_CHUNK 256
+/* called under rcu_read_lock */
+static void copy_task(struct ckpt_hdr_pids *h, struct task_struct *t,
+ struct pid_namespace *root_pid_ns,
+ struct pid_namespace *task_pid_ns)
+{
+ int i = 0;
+ __s32 *pids;
+
+ h->pid = task_pid_nr_ns(t, root_pid_ns);
+ h->tgid = task_tgid_nr_ns(t, root_pid_ns);
+ h->pgid = task_pgrp_nr_ns(t, root_pid_ns);
+ h->sid = task_session_nr_ns(t, root_pid_ns);
+ h->ppid = task_tgid_nr_ns(t->real_parent, root_pid_ns);
+ h->rpid = task_pid_vnr(t);
+ pids = h->vpids;
+
+ while (task_pid_ns != root_pid_ns) {
+ pids[i++] = task_pid_nr_ns(t, task_pid_ns);
+ task_pid_ns = task_pid_ns->parent;
+ }
+}
static int checkpoint_pids(struct ckpt_ctx *ctx)
{
- struct ckpt_pids *h;
- struct pid_namespace *ns;
+ struct ckpt_hdr_pids *h;
+ struct pid_namespace *root_pidns;
struct task_struct *task;
struct task_struct **tasks_arr;
- int nr_tasks, n, pos = 0, ret = 0;
+ int nr_tasks, i, ret = 0;
- ns = ctx->root_nsproxy->pid_ns;
+ root_pidns = ctx->root_nsproxy->pid_ns;
tasks_arr = ctx->tasks_arr;
nr_tasks = ctx->nr_tasks;
BUG_ON(nr_tasks <= 0);
- ret = ckpt_write_obj_type(ctx, NULL,
- sizeof(*h) * nr_tasks,
- CKPT_HDR_BUFFER);
- if (ret < 0)
- return ret;
+ for (i = 0; i < nr_tasks; i++) {
+ int nsdelta, size;
+ struct pid_namespace *task_pidns;
- h = ckpt_hdr_get(ctx, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
- if (!h)
- return -ENOMEM;
+ task = tasks_arr[i];
+ rcu_read_lock();
+ task_pidns = task_nsproxy(task)->pid_ns;
+ rcu_read_unlock();
+
+ nsdelta = task_pidns->level - root_pidns->level;
+ size = sizeof(*h) + nsdelta * sizeof(__s32);
+
+ h = ckpt_hdr_get_type(ctx, size, CKPT_HDR_PID);
+ if (!h)
+ return -ENOMEM;
- do {
rcu_read_lock();
- for (n = 0; n < min(nr_tasks, CKPT_HDR_PIDS_CHUNK); n++) {
- task = tasks_arr[pos];
-
- h[n].vpid = task_pid_nr_ns(task, ns);
- h[n].vtgid = task_tgid_nr_ns(task, ns);
- h[n].vpgid = task_pgrp_nr_ns(task, ns);
- h[n].vsid = task_session_nr_ns(task, ns);
- h[n].vppid = task_tgid_nr_ns(task->real_parent, ns);
- ckpt_debug("task[%d]: vpid %d vtgid %d parent %d\n",
- pos, h[n].vpid, h[n].vtgid, h[n].vppid);
- pos++;
- }
+ copy_task(h, task, root_pidns, task_pidns);
rcu_read_unlock();
+ ckpt_debug("task[%d]: pid %d tgid %d parent %d\n",
+ i, h->pid, h->tgid, h->ppid);
- n = min(nr_tasks, CKPT_HDR_PIDS_CHUNK);
- ret = ckpt_kwrite(ctx, h, n * sizeof(*h));
+ ret = ckpt_write_obj(ctx, &h->h);
+ ckpt_hdr_put(ctx, h);
if (ret < 0)
break;
- nr_tasks -= n;
- } while (nr_tasks > 0);
+ }
- _ckpt_hdr_put(ctx, h, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
return ret;
}
diff --git a/checkpoint/process.c b/checkpoint/process.c
index f917112..bb44960 100644
--- a/checkpoint/process.c
+++ b/checkpoint/process.c
@@ -22,7 +22,7 @@
#include <linux/checkpoint.h>
#include <linux/checkpoint_hdr.h>
#include <linux/syscalls.h>
-
+#include <linux/pid_namespace.h>
pid_t ckpt_pid_nr(struct ckpt_ctx *ctx, struct pid *pid)
{
@@ -36,12 +36,6 @@ struct pid *_ckpt_find_pgrp(struct ckpt_ctx *ctx, pid_t pgid)
struct pid *pgrp;
if (pgid == 0) {
- /*
- * At checkpoint the pgid owner lived in an ancestor
- * pid-ns. The best we can do (sanely and safely) is
- * to examine the parent of this restart's root: if in
- * a distinct pid-ns, use its pgrp; otherwise fail.
- */
p = ctx->root_task->real_parent;
if (p->nsproxy->pid_ns == current->nsproxy->pid_ns)
return NULL;
@@ -51,7 +45,7 @@ struct pid *_ckpt_find_pgrp(struct ckpt_ctx *ctx, pid_t pgid)
* Find the owner process of this pgid (it must exist
* if pgrp exists). It must be a thread group leader.
*/
- pgrp = find_vpid(pgid);
+ pgrp = find_pid_ns(pgid, ctx->root_nsproxy->pid_ns);
p = pid_task(pgrp, PIDTYPE_PID);
if (!p || !thread_group_leader(p))
return NULL;
@@ -578,6 +572,14 @@ static int restore_task_ns(struct ckpt_ctx *ctx)
}
if (nsproxy != task_nsproxy(current)) {
+ /*
+ * This is *kinda* shady to do without any locking. However
+ * it is safe because each task is restarted separately in
+ * serial. If that ever changes, we'll need a spinlock?
+ */
+ if (!nsproxy->pid_ns)
+ nsproxy->pid_ns = get_pid_ns(current->nsproxy->pid_ns);
+
get_nsproxy(nsproxy);
switch_task_namespaces(current, nsproxy);
}
@@ -827,10 +829,10 @@ static int restore_task_pgid(struct ckpt_ctx *ctx)
if (!thread_group_leader(task)) /* (1) */
return 0;
- pgid = ctx->pids_arr[ctx->active_pid].vpgid;
+ pgid = ctx->vpgids_arr[ctx->active_pid];
- if (pgid == task_pgrp_vnr(task)) /* nothing to do */
- return 0;
+ if (pgid == task_pgrp_nr_ns(task, ctx->root_nsproxy->pid_ns))
+ return 0; /* nothing to do */
if (task->signal->leader) /* (2) */
return -EINVAL;
@@ -850,6 +852,9 @@ static int restore_task_pgid(struct ckpt_ctx *ctx)
if (ctx->uflags & RESTART_TASKSELF)
ret = 0;
+ if (ret < 0)
+ ckpt_err(ctx, ret, "setting pgid\n");
+
return ret;
}
diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index 6a9644d..84713c7 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -145,7 +145,7 @@ void restore_debug_free(struct ckpt_ctx *ctx)
ckpt_debug("kflags %lu uflags %lu oflags %lu", ctx->kflags,
ctx->uflags, ctx->oflags);
for (i = 0; i < ctx->nr_pids; i++)
- ckpt_debug("task[%d] to run %d\n", i, ctx->pids_arr[i].vpid);
+ ckpt_debug("task[%d] to run %d\n", i, ctx->vpids_arr[i]);
list_for_each_entry_safe(s, p, &ctx->task_status, list) {
if (s->flags & RESTART_DBG_COORD)
@@ -420,7 +420,8 @@ void *ckpt_read_obj_type(struct ckpt_ctx *ctx, int len, int type)
h = ckpt_read_obj(ctx, len, len);
if (IS_ERR(h)) {
- ckpt_err(ctx, PTR_ERR(h), "Expecting to read type %d\n", type);
+ ckpt_err(ctx, PTR_ERR(h), "Expecting to read type %d len %d\n",
+ type, len);
return h;
}
@@ -730,34 +731,51 @@ static int restore_read_tail(struct ckpt_ctx *ctx)
return ret;
}
+#define CKPT_MAX_PIDS_SZ 99999
/* restore_read_tree - read the tasks tree into the checkpoint context */
static int restore_read_tree(struct ckpt_ctx *ctx)
{
struct ckpt_hdr_tree *h;
- int size, ret;
+ int i, size;
h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TREE);
if (IS_ERR(h))
return PTR_ERR(h);
- ret = -EINVAL;
+ ctx->nr_pids = h->nr_tasks;
+ ckpt_hdr_put(ctx, h);
+
if (h->nr_tasks <= 0)
- goto out;
+ return -EINVAL;
- ctx->nr_pids = h->nr_tasks;
- size = sizeof(*ctx->pids_arr) * ctx->nr_pids;
+ size = sizeof(pid_t) * ctx->nr_pids;
if (size <= 0) /* overflow ? */
- goto out;
+ return -EINVAL;
- ctx->pids_arr = kmalloc(size, GFP_KERNEL);
- if (!ctx->pids_arr) {
- ret = -ENOMEM;
- goto out;
+ ctx->vpids_arr = kmalloc(size, GFP_KERNEL);
+ ctx->vpgids_arr = kmalloc(size, GFP_KERNEL);
+ if (!ctx->vpids_arr || !ctx->vpgids_arr)
+ return -ENOMEM;
+
+ for (i = 0; i < ctx->nr_pids; i++) {
+ struct ckpt_hdr_pids *p;
+
+ p = ckpt_read_obj(ctx, 0, CKPT_MAX_PIDS_SZ);
+ if (!p)
+ return -EINVAL;
+ if (p->h.type != CKPT_HDR_PID) {
+ ckpt_hdr_put(ctx, p);
+ return -EINVAL;
+ }
+ if (p->h.len < sizeof(*p)) {
+ ckpt_hdr_put(ctx, p);
+ return -EINVAL;
+ }
+ ctx->vpids_arr[i] = p->pid;
+ ctx->vpgids_arr[i] = p->pgid;
+ ckpt_hdr_put(ctx, p);
}
- ret = _ckpt_read_buffer(ctx, ctx->pids_arr, size);
- out:
- ckpt_hdr_put(ctx, h);
- return ret;
+ return 0;
}
static inline int all_tasks_activated(struct ckpt_ctx *ctx)
@@ -768,7 +786,7 @@ static inline int all_tasks_activated(struct ckpt_ctx *ctx)
static inline pid_t get_active_pid(struct ckpt_ctx *ctx)
{
int active = ctx->active_pid;
- return active >= 0 ? ctx->pids_arr[active].vpid : 0;
+ return active >= 0 ? ctx->vpids_arr[active] : 0;
}
static inline int is_task_active(struct ckpt_ctx *ctx, pid_t pid)
@@ -870,7 +888,7 @@ static int restore_activate_next(struct ckpt_ctx *ctx)
static int wait_task_active(struct ckpt_ctx *ctx)
{
- pid_t pid = task_pid_vnr(current);
+ pid_t pid = task_pid_nr_ns(current, ctx->root_nsproxy->pid_ns);
int ret;
ckpt_debug("pid %d waiting\n", pid);
@@ -886,7 +904,8 @@ static int wait_task_active(struct ckpt_ctx *ctx)
static int wait_task_sync(struct ckpt_ctx *ctx)
{
- ckpt_debug("pid %d syncing\n", task_pid_vnr(current));
+ ckpt_debug("pid %d syncing\n",
+ task_pid_nr_ns(current, ctx->root_nsproxy->pid_ns));
wait_event_interruptible(ctx->waitq, ckpt_test_complete(ctx));
ckpt_debug("task sync done (errno %d)\n", ctx->errno);
if (ckpt_test_error(ctx))
@@ -1160,7 +1179,7 @@ static struct task_struct *choose_root_task(struct ckpt_ctx *ctx, pid_t pid)
read_lock(&tasklist_lock);
list_for_each_entry(task, &current->children, sibling) {
- if (task_pid_vnr(task) == pid) {
+ if (task_pid_nr_ns(task, ctx->coord_pidns) == pid) {
get_task_struct(task);
ctx->root_task = task;
ctx->root_pid = pid;
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 9e9df9b..5df72b0 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -22,6 +22,7 @@
#include <linux/capability.h>
#include <linux/checkpoint.h>
#include <linux/deferqueue.h>
+#include <linux/pid_namespace.h>
/*
* ckpt_unpriv_allowed - sysctl controlled.
@@ -247,6 +248,8 @@ static void ckpt_ctx_free(struct ckpt_ctx *ctx)
if (ctx->tasks_arr)
task_arr_free(ctx);
+ if (ctx->coord_pidns)
+ put_pid_ns(ctx->coord_pidns);
if (ctx->root_nsproxy)
put_nsproxy(ctx->root_nsproxy);
if (ctx->root_task)
@@ -256,7 +259,8 @@ static void ckpt_ctx_free(struct ckpt_ctx *ctx)
free_page((unsigned long) ctx->scratch_page);
- kfree(ctx->pids_arr);
+ kfree(ctx->vpids_arr);
+ kfree(ctx->vpgids_arr);
sock_listening_list_free(&ctx->listen_sockets);
@@ -277,6 +281,8 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
ctx->kflags = kflags;
ctx->ktime_begin = ktime_get();
+ ctx->coord_pidns = get_pid_ns(current->nsproxy->pid_ns);
+
atomic_set(&ctx->refcount, 0);
INIT_LIST_HEAD(&ctx->pgarr_list);
INIT_LIST_HEAD(&ctx->pgarr_pool);
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 792b523..e860bf5 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -10,7 +10,7 @@
* distribution for more details.
*/
-#define CHECKPOINT_VERSION 5
+#define CHECKPOINT_VERSION 6
/* checkpoint user flags */
#define CHECKPOINT_SUBTREE 0x1
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 41412d1..7957b3b 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -117,6 +117,8 @@ enum {
#define CKPT_HDR_GROUPINFO CKPT_HDR_GROUPINFO
CKPT_HDR_TASK_CREDS,
#define CKPT_HDR_TASK_CREDS CKPT_HDR_TASK_CREDS
+ CKPT_HDR_PID,
+#define CKPT_HDR_PID CKPT_HDR_PID
/* 201-299: reserved for arch-dependent */
@@ -326,12 +328,17 @@ struct ckpt_hdr_tree {
__s32 nr_tasks;
} __attribute__((aligned(8)));
-struct ckpt_pids {
- __s32 vpid;
- __s32 vppid;
- __s32 vtgid;
- __s32 vpgid;
- __s32 vsid;
+struct ckpt_hdr_pids {
+ struct ckpt_hdr h;
+ __s32 rpid; /* pid in checkpointer's pid_ns */
+ /* The rest of these are in container init's pid_ns */
+ __s32 pid;
+ __s32 ppid;
+ __s32 tgid;
+ __s32 pgid;
+ __s32 sid;
+ /* followed by pids in pid_ns up to root->nsproxy->pid_ns */
+ __s32 vpids[0];
} __attribute__((aligned(8)));
/* pids */
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index ecd3e91..0ae78a7 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -37,6 +37,7 @@ struct ckpt_ctx {
int root_init; /* [container] root init ? */
pid_t root_pid; /* [container] root pid */
struct task_struct *root_task; /* [container] root task */
+ struct pid_namespace *coord_pidns; /* coordinator pid_ns */
struct nsproxy *root_nsproxy; /* [container] root nsproxy */
struct task_struct *root_freezer; /* [container] root task */
char lsm_name[SECURITY_NAME_MAX + 1]; /* security module at ckpt */
@@ -74,6 +75,8 @@ struct ckpt_ctx {
/* [multi-process restart] */
struct ckpt_pids *pids_arr; /* array of all pids [restart] */
+ pid_t *vpids_arr; /* pids array in container pidns */
+ pid_t *vpgids_arr; /* vpgids array in container pidns */
int nr_pids; /* size of pids array */
atomic_t nr_total; /* total tasks count (with ghosts) */
int active_pid; /* (next) position in pids array */
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 0da0d83..6d86240 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -364,8 +364,13 @@ static struct nsproxy *do_restore_ns(struct ckpt_ctx *ctx)
get_net(net_ns);
nsproxy->net_ns = net_ns;
- get_pid_ns(current->nsproxy->pid_ns);
- nsproxy->pid_ns = current->nsproxy->pid_ns;
+ /*
+ * The pid_ns will get assigned the first time that we
+ * assign the nsproxy to a task. The task had unshared
+ * its pid_ns in userspace before calling restart, and
+ * we want to keep using that pid_ns.
+ */
+ nsproxy->pid_ns = NULL;
}
out:
if (ret < 0)
--
1.6.1
^ permalink raw reply related [flat|nested] 3+ messages in thread
* [PATCH] user-cr: Handle nested pid namespaces
2010-03-18 20:19 [PATCH linux-cr] Handle nested pid namespaces Serge E. Hallyn
@ 2010-03-18 20:22 ` Serge E. Hallyn
0 siblings, 0 replies; 3+ messages in thread
From: Serge E. Hallyn @ 2010-03-18 20:22 UTC (permalink / raw)
To: Oren Laadan; +Cc: Linux Containers, lkml
[ Patch against https://www.linux-cr.org/redmine/tab/show/user-cr ]
Make userspace use eclone to recreate all original checkpointed
pids in nested pid namespaces.
Note that the kernel doesn't actually care about the vpids in the
child pid namespaces; they are recorded just for us. We parse them
to decide how to tell eclone to recreate the full hierarchical pid
and pidns trees.
Changelog:
Mar 18: bump checkpoint image format version #
Signed-off-by: Serge Hallyn <serue@us.ibm.com>
---
include/linux/checkpoint.h | 2 +-
include/linux/checkpoint_hdr.h | 17 ++-
restart.c | 289 ++++++++++++++++++++++++----------------
3 files changed, 184 insertions(+), 124 deletions(-)
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 53b8b2c..8d021b9 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -14,7 +14,7 @@
* distribution for more details.
*/
-#define CHECKPOINT_VERSION 5
+#define CHECKPOINT_VERSION 6
/* checkpoint user flags */
#define CHECKPOINT_SUBTREE 0x1
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index e8eaf23..cbd6ab2 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -111,6 +111,8 @@ enum {
#define CKPT_HDR_GROUPINFO CKPT_HDR_GROUPINFO
CKPT_HDR_TASK_CREDS,
#define CKPT_HDR_TASK_CREDS CKPT_HDR_TASK_CREDS
+ CKPT_HDR_PID,
+#define CKPT_HDR_PID CKPT_HDR_PID
/* 201-299: reserved for arch-dependent */
@@ -320,12 +322,15 @@ struct ckpt_hdr_tree {
__s32 nr_tasks;
} __attribute__((aligned(8)));
-struct ckpt_pids {
- __s32 vpid;
- __s32 vppid;
- __s32 vtgid;
- __s32 vpgid;
- __s32 vsid;
+struct ckpt_hdr_pids {
+ struct ckpt_hdr h;
+ __s32 rpid;
+ __s32 pid;
+ __s32 ppid;
+ __s32 tgid;
+ __s32 pgid;
+ __s32 sid;
+ __s32 vpids[0];
} __attribute__((aligned(8)));
/* pids */
diff --git a/restart.c b/restart.c
index 0c74bb6..d0fe224 100644
--- a/restart.c
+++ b/restart.c
@@ -244,10 +244,12 @@ struct task {
struct task *phantom; /* pointer to place-holdler task (if any) */
+ int piddepth;
pid_t pid; /* process IDs, our bread-&-butter */
pid_t ppid;
pid_t tgid;
pid_t sid;
+ __s32 *vpids;
pid_t rpid; /* [restart without vpids] actual (real) pid */
@@ -267,6 +269,16 @@ struct task zero_task;
#define TASK_NEWPID 0x20 /* starts a new pid namespace */
#define TASK_DEAD 0x40 /* dead task (dummy) */
+struct uckpt_pid {
+ int depth;
+ __s32 rpid;
+ __s32 pid;
+ __s32 ppid;
+ __s32 tgid;
+ __s32 pgid;
+ __s32 sid;
+ __s32 vpids[0];
+};
struct ckpt_ctx {
pid_t root_pid;
int pipe_in;
@@ -277,8 +289,7 @@ struct ckpt_ctx {
int pipe_feed[2]; /* for feeder to provide input */
int pipe_coord[2]; /* for coord to report status (if needed) */
- struct ckpt_pids *pids_arr;
- struct ckpt_pids *copy_arr;
+ struct uckpt_pid **orig_pids;
struct task *tasks_arr;
int tasks_nr;
@@ -1465,14 +1476,14 @@ static int ckpt_zero_pid(struct ckpt_ctx *ctx)
pid = ckpt_alloc_pid(ctx);
if (pid < 0)
return -1;
- if (ckpt_setup_task(ctx, pid, ctx->pids_arr[0].vpid) < 0)
+ if (ckpt_setup_task(ctx, pid, ctx->orig_pids[0]->pid) < 0)
return -1;
return pid;
}
static int ckpt_init_tree(struct ckpt_ctx *ctx)
{
- struct ckpt_pids *pids_arr = ctx->pids_arr;
+ struct uckpt_pid **orig_pids = ctx->orig_pids;
int pids_nr = ctx->pids_nr;
struct task *task;
pid_t root_pid;
@@ -1480,8 +1491,8 @@ static int ckpt_init_tree(struct ckpt_ctx *ctx)
pid_t zero_pid = 0;
int i;
- root_pid = pids_arr[0].vpid;
- root_sid = pids_arr[0].vsid;
+ root_pid = orig_pids[0]->pid;
+ root_sid = orig_pids[0]->sid;
/*
* The case where root_sid != root_pid is special. It must be
@@ -1515,24 +1526,26 @@ static int ckpt_init_tree(struct ckpt_ctx *ctx)
task->flags = 0;
- if (!ckpt_valid_pid(ctx, pids_arr[i].vpid, "pid", i))
+ if (!ckpt_valid_pid(ctx, orig_pids[i]->pid, "pid", i))
return -1;
- else if (!ckpt_valid_pid(ctx, pids_arr[i].vtgid, "tgid", i))
+ else if (!ckpt_valid_pid(ctx, orig_pids[i]->tgid, "tgid", i))
return -1;
- else if (!ckpt_valid_pid(ctx, pids_arr[i].vsid, "sid", i))
+ else if (!ckpt_valid_pid(ctx, orig_pids[i]->sid, "sid", i))
return -1;
- else if (!ckpt_valid_pid(ctx, pids_arr[i].vpgid, "pgid", i))
+ else if (!ckpt_valid_pid(ctx, orig_pids[i]->pgid, "pgid", i))
return -1;
- if (pids_arr[i].vsid == root_sid)
- pids_arr[i].vsid = 0;
- if (pids_arr[i].vpgid == root_sid)
- pids_arr[i].vpgid = 0;
+ if (orig_pids[i]->sid == root_sid)
+ orig_pids[i]->sid = 0;
+ if (orig_pids[i]->pgid == root_sid)
+ orig_pids[i]->pgid = 0;
- task->pid = pids_arr[i].vpid;
- task->ppid = pids_arr[i].vppid;
- task->tgid = pids_arr[i].vtgid;
- task->sid = pids_arr[i].vsid;
+ task->piddepth = orig_pids[i]->depth;
+ task->pid = orig_pids[i]->pid;
+ task->ppid = orig_pids[i]->ppid;
+ task->tgid = orig_pids[i]->tgid;
+ task->sid = orig_pids[i]->sid;
+ task->vpids = orig_pids[i]->vpids;
task->children = NULL;
task->next_sib = NULL;
@@ -1553,10 +1566,10 @@ static int ckpt_init_tree(struct ckpt_ctx *ctx)
for (i = 0; i < pids_nr; i++) {
pid_t sid;
- sid = pids_arr[i].vsid;
+ sid = orig_pids[i]->sid;
- /* Remember if we find any vsid/vpgid - see below */
- if (pids_arr[i].vsid == 0 || pids_arr[i].vpgid == 0)
+ /* Remember if we find any sid/pgid - see below */
+ if (orig_pids[i]->sid == 0 || orig_pids[i]->pgid == 0)
zero_pid = 1;
/*
* An unaccounted-for sid belongs to a task that was a
@@ -1579,7 +1592,7 @@ static int ckpt_init_tree(struct ckpt_ctx *ctx)
* need to add it with the same sid as current (and
* other) threads.
*/
- if (ckpt_setup_task(ctx, pids_arr[i].vtgid, sid) < 0)
+ if (ckpt_setup_task(ctx, orig_pids[i]->tgid, sid) < 0)
return -1;
/*
@@ -1590,7 +1603,7 @@ static int ckpt_init_tree(struct ckpt_ctx *ctx)
* same sid as us: all tasks with same pgrp must have
* their sid matching.
*/
- if (ckpt_setup_task(ctx, pids_arr[i].vpgid, sid) < 0)
+ if (ckpt_setup_task(ctx, orig_pids[i]->pgid, sid) < 0)
return -1;
}
@@ -1604,13 +1617,13 @@ static int ckpt_init_tree(struct ckpt_ctx *ctx)
if (zero_pid < 0)
return -1;
for (i = 0; i < pids_nr; i++) {
- if (pids_arr[i].vsid == 0) {
- pids_arr[i].vsid = zero_pid;
- pids_arr[i].vppid = zero_pid;
+ if (orig_pids[i]->sid == 0) {
+ orig_pids[i]->sid = zero_pid;
+ orig_pids[i]->ppid = zero_pid;
}
- if (pids_arr[i].vpgid == 0) {
- pids_arr[i].vpgid = zero_pid;
- pids_arr[i].vppid = zero_pid;
+ if (orig_pids[i]->pgid == 0) {
+ orig_pids[i]->pgid = zero_pid;
+ orig_pids[i]->ppid = zero_pid;
}
}
}
@@ -2050,8 +2063,8 @@ static pid_t ckpt_fork_child(struct ckpt_ctx *ctx, struct task *child)
struct clone_args clone_args;
genstack stk;
unsigned long flags = SIGCHLD;
- size_t nr_pids = 1;
pid_t pid = 0;
+ pid_t *pids = &pid;
ckpt_dbg("forking child vpid %d flags %#x\n", child->pid, child->flags);
@@ -2067,29 +2080,46 @@ static pid_t ckpt_fork_child(struct ckpt_ctx *ctx, struct task *child)
flags |= CLONE_PARENT;
}
+ memset(&clone_args, 0, sizeof(clone_args));
+ clone_args.nr_pids = 1;
/* select pid if --pids, otherwise it's 0 */
- if (ctx->args->pids)
- pid = child->pid;
+ if (ctx->args->pids) {
+ int i, depth = child->piddepth + 1;
-#ifdef CLONE_NEWPID
- /* but for new pidns, don't specify a pid */
- if (child->flags & TASK_NEWPID) {
- flags |= CLONE_NEWPID;
- pid = 0;
+ clone_args.nr_pids = depth;
+ pids = malloc(sizeof(pid_t) * depth);
+ if (!pids) {
+ perror("ckpt_fork_child pids malloc");
+ return -1;
+ }
+
+ pids[0] = child->pid;
+ for (i = 1; i <= child->piddepth; i++)
+ pids[i] = child->vpids[i-1];
+
+ if (child->piddepth > child->creator->piddepth) {
+ child->flags |= TASK_NEWPID;
+ flags |= CLONE_NEWPID;
+ } else if (child->flags & TASK_NEWPID) {
+ /* The TASK_NEWPID could have been set for root task */
+ pids[0] = 0;
+ flags |= CLONE_NEWPID;
+ }
+ if (flags & CLONE_NEWPID)
+ clone_args.nr_pids--;
}
-#endif
if (child->flags & (TASK_SIBLING | TASK_THREAD))
child->real_parent = getppid();
else
child->real_parent = _getpid();
- memset(&clone_args, 0, sizeof(clone_args));
clone_args.child_stack = (unsigned long)genstack_base(stk);
clone_args.child_stack_size = genstack_size(stk);
- clone_args.nr_pids = nr_pids;
- pid = eclone(ckpt_fork_stub, child, flags, &clone_args, &pid);
+ pid = eclone(ckpt_fork_stub, child, flags, &clone_args, pids);
+ if (pids != &pid)
+ free(pids);
if (pid < 0) {
ckpt_perror("eclone");
genstack_release(stk);
@@ -2298,7 +2328,7 @@ static int ckpt_do_feeder(void *data)
static int ckpt_adjust_pids(struct ckpt_ctx *ctx)
{
struct pid_swap swap;
- int n, m, len, ret;
+ int n, m, ret;
pid_t coord_sid;
coord_sid = getsid(0);
@@ -2313,22 +2343,7 @@ static int ckpt_adjust_pids(struct ckpt_ctx *ctx)
* but correct should be: [][][B][][A][]...
*/
- len = sizeof(struct ckpt_pids) * ctx->pids_nr;
-
-#ifdef CHECKPOINT_DEBUG
- ckpt_dbg("====== PIDS ARRAY\n");
- for (m = 0; m < ctx->pids_nr; m++) {
- struct ckpt_pids *p;
- p = &ctx->pids_arr[m];
- ckpt_dbg("[%d] pid %d ppid %d sid %d pgid %d\n",
- m, p->vpid, p->vppid, p->vsid, p->vpgid);
- }
- ckpt_dbg("............\n");
-#endif
-
- memcpy(ctx->copy_arr, ctx->pids_arr, len);
-
- /* read in 'pid_swap' data and adjust ctx->pids_arr */
+ /* read in 'pid_swap' data and adjust ctx->orig_pids */
for (n = 0; n < ctx->tasks_nr; n++) {
/* get pid info from next task */
ret = read(ctx->pipe_in, &swap, sizeof(swap));
@@ -2341,31 +2356,16 @@ static int ckpt_adjust_pids(struct ckpt_ctx *ctx)
ckpt_dbg("c/r swap old %d new %d\n", swap.old, swap.new);
for (m = 0; m < ctx->pids_nr; m++) {
- if (ctx->pids_arr[m].vpid == swap.old)
- ctx->copy_arr[m].vpid = swap.new;
- if (ctx->pids_arr[m].vtgid == swap.old)
- ctx->copy_arr[m].vtgid = swap.new;
- if (ctx->pids_arr[m].vsid == swap.old)
- ctx->copy_arr[m].vsid = swap.new;
- if (ctx->pids_arr[m].vpgid == swap.old)
- ctx->copy_arr[m].vpgid = swap.new;
- }
- }
-
- memcpy(ctx->pids_arr, ctx->copy_arr, len);
-
-#ifdef CHECKPOINT_DEBUG
- if (!ctx->args->pids) {
- ckpt_dbg("====== PIDS ARRAY (swaped)\n");
- for (m = 0; m < ctx->pids_nr; m++) {
- struct ckpt_pids *p;
- p = &ctx->pids_arr[m];
- ckpt_dbg("[%d] pid %d ppid %d sid %d pgid %d\n",
- m, p->vpid, p->vppid, p->vsid, p->vpgid);
+ if (ctx->orig_pids[m]->pid == swap.old)
+ ctx->orig_pids[m]->pid = swap.new;
+ if (ctx->orig_pids[m]->tgid == swap.old)
+ ctx->orig_pids[m]->tgid = swap.new;
+ if (ctx->orig_pids[m]->sid == swap.old)
+ ctx->orig_pids[m]->sid = swap.new;
+ if (ctx->orig_pids[m]->pgid == swap.old)
+ ctx->orig_pids[m]->pgid = swap.new;
}
- ckpt_dbg("............\n");
}
-#endif
close(ctx->pipe_in);
return 0;
@@ -2479,21 +2479,6 @@ static int ckpt_read_obj_type(struct ckpt_ctx *ctx, void *buf, int n, int type)
return 0;
}
-static int ckpt_read_obj_ptr(struct ckpt_ctx *ctx, void *buf, int n, int type)
-{
- struct ckpt_hdr h;
- int ret;
-
- ret = ckpt_read_obj(ctx, &h, buf, n + sizeof(h));
- if (ret < 0)
- return ret;
- if (h.type != type) {
- errno = EINVAL;
- return -1;
- }
- return 0;
-}
-
static int ckpt_read_obj_buffer(struct ckpt_ctx *ctx, void *buf, int n)
{
return ckpt_read_obj_type(ctx, buf, BUFSIZE, CKPT_HDR_BUFFER);
@@ -2575,10 +2560,64 @@ static int ckpt_read_container(struct ckpt_ctx *ctx)
return ckpt_read_obj_type(ctx, ptr, 200, CKPT_HDR_LSM_INFO);
}
+#define MAX_PID_SZ 999999
+static int ckpt_read_pids(struct ckpt_ctx *ctx)
+{
+ struct uckpt_pid *p;
+ struct ckpt_hdr_pids *h;
+ int ret, i, numpids, size;
+ char *buf;
+
+ size = sizeof(struct ckpt_hdr_pids **) * ctx->pids_nr;
+
+ ctx->orig_pids = malloc(size);
+ if (!ctx->orig_pids)
+ return -1;
+
+ buf = malloc(MAX_PID_SZ);
+ if (!buf)
+ return -ENOMEM;
+ h = (struct ckpt_hdr_pids *) buf;
+
+ for (i=0; i<ctx->pids_nr; i++) {
+ int j;
+ __s32 *vpid;
+
+ ret = ckpt_read_obj_type(ctx, h, MAX_PID_SZ, CKPT_HDR_PID);
+ if (ret < 0)
+ goto out;
+ numpids = h->h.len - sizeof(struct ckpt_hdr_pids);
+ numpids /= sizeof(__s32);
+ size = sizeof(struct uckpt_pid) + numpids * sizeof(__s32);
+ p = malloc(size);
+ if (!p) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ctx->orig_pids[i] = p;
+ p->rpid = h->rpid;
+ p->depth = numpids;
+ p->pid = h->pid;
+ p->ppid = h->ppid;
+ p->tgid = h->tgid;
+ p->pgid = h->pgid;
+ p->sid = h->sid;
+ vpid = h->vpids;
+ for (j=0; j<numpids; j++)
+ p->vpids[j] = vpid[j];
+ }
+
+ ret = 0;
+
+out:
+ free(buf);
+ return ret;
+}
+
static int ckpt_read_tree(struct ckpt_ctx *ctx)
{
struct ckpt_hdr_tree *h;
- int len, ret;
+ int ret;
h = (struct ckpt_hdr_tree *) ctx->tree;
ret = ckpt_read_obj_type(ctx, h, sizeof(*h), CKPT_HDR_TREE);
@@ -2598,21 +2637,7 @@ static int ckpt_read_tree(struct ckpt_ctx *ctx)
ctx->pids_nr = h->nr_tasks;
- len = sizeof(struct ckpt_pids) * ctx->pids_nr;
-
- ctx->pids_arr = malloc(len);
- ctx->copy_arr = malloc(len);
- if (!ctx->pids_arr || !ctx->copy_arr) {
- if (ctx->pids_arr)
- free(ctx->pids_arr);
- return -1;
- }
-
- ret = ckpt_read_obj_ptr(ctx, ctx->pids_arr, len, CKPT_HDR_BUFFER);
- if (ret < 0)
- free(ctx->pids_arr);
-
- return ret;
+ return ckpt_read_pids(ctx);
}
static int ckpt_write_header(struct ckpt_ctx *ctx)
@@ -2669,20 +2694,50 @@ static int ckpt_write_container(struct ckpt_ctx *ctx)
return ckpt_write_obj(ctx, (struct ckpt_hdr *) ptr);
}
+static int write_out_a_pid(struct ckpt_ctx *ctx, int i)
+{
+ struct ckpt_hdr_pids *h;
+ struct uckpt_pid *p = ctx->orig_pids[i];
+ int ret, size;
+ __s32 *pids;
+
+ size = p->depth * sizeof(__s32);
+ size += sizeof(*h);
+ h = malloc(size);
+ if (!h)
+ return -ENOMEM;
+ h->h.len = size;
+ h->h.type = CKPT_HDR_PID;
+ h->pid = p->pid;
+ h->rpid = p->rpid;
+ h->ppid = p->ppid;
+ h->tgid = p->tgid;
+ h->pgid = p->pgid;
+ h->sid = p->sid;
+ pids = h->vpids;
+ for (i=0; i < p->depth; i++)
+ pids[i] = p->vpids[i];
+ ret = ckpt_write_obj(ctx, &h->h);
+ free(h);
+ return ret;
+}
+
static int ckpt_write_tree(struct ckpt_ctx *ctx)
{
struct ckpt_hdr_tree *h;
- int len;
+ int i, ret = 0;
h = (struct ckpt_hdr_tree *) ctx->tree;
if (ckpt_write_obj(ctx, (struct ckpt_hdr *) h) < 0)
ckpt_abort(ctx, "write tree");
- len = sizeof(struct ckpt_pids) * ctx->pids_nr;
- if (ckpt_write_obj_ptr(ctx, ctx->pids_arr, len, CKPT_HDR_BUFFER) < 0)
- ckpt_abort(ctx, "write pids");
+ for (i = 0; i < ctx->pids_nr; i++) {
+ ret = write_out_a_pid(ctx, i);
+ if (ret < 0)
+ ckpt_abort(ctx, "write pids");
+ }
- return 0;
+ return ret;
}
/*
--
1.7.0
^ permalink raw reply related [flat|nested] 3+ messages in thread
end of thread, other threads:[~2010-03-18 20:22 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-03-18 20:19 [PATCH linux-cr] Handle nested pid namespaces Serge E. Hallyn
2010-03-18 20:22 ` [PATCH] user-cr: " Serge E. Hallyn
-- strict thread matches above, loose matches on Subject: below --
2010-03-12 5:27 cr: handle " serue-r/Jw6+rmf7HQT0dZR+AlfA
[not found] ` <1268371676-3029-1-git-send-email-serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2010-03-12 5:27 ` [PATCH] linux-cr: Handle " serue-r/Jw6+rmf7HQT0dZR+AlfA
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.