linux-api.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
To: Andrew Morton <akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
Cc: Linus Torvalds <torvalds-3NddpPZAyC0@public.gmane.org>,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org,
	linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	Thomas Gleixner <tglx-hfZtesqFncYOwBW4kG4KsQ@public.gmane.org>,
	Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>,
	Dave Hansen
	<dave-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>,
	Ingo Molnar <mingo-X9Un+BFzKDI@public.gmane.org>,
	"H. Peter Anvin" <hpa-YMNOUZJC4hwAvxtiuMwx3w@public.gmane.org>,
	Alexander Viro
	<viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn@public.gmane.org>,
	Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
Subject: [RFC v9][PATCH 12/13] Checkpoint multiple processes
Date: Mon, 10 Nov 2008 11:37:39 -0500	[thread overview]
Message-ID: <1226335060-7061-13-git-send-email-orenl@cs.columbia.edu> (raw)
In-Reply-To: <1226335060-7061-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>

Checkpointing of multiple processes works by recording the tasks tree
structure below a given task (usually this task is the container init).

For a given task, do a DFS scan of the tasks tree and collect them
into an array (keeping a reference to each task). Using DFS simplifies
the recreation of tasks either in user space or kernel space. For each
task collected, test if it can be checkpointed, and save its pid, tgid,
and ppid.

The actual work is divided into two passes: a first scan counts the
tasks, then memory is allocated and a second scan fills the array.

The logic is suitable for creation of processes during restart either
in userspace or by the kernel.

Currently we ignore threads and zombies, as well as session ids.

Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 checkpoint/checkpoint.c        |  228 +++++++++++++++++++++++++++++++++++++---
 checkpoint/sys.c               |   16 +++
 include/linux/checkpoint.h     |    3 +
 include/linux/checkpoint_hdr.h |   13 ++-
 4 files changed, 243 insertions(+), 17 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index dd9f325..04b0c4a 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -206,19 +206,6 @@ static int cr_write_task(struct cr_ctx *ctx, struct task_struct *t)
 {
 	int ret;
 
-	/* TODO: verity that the task is frozen (unless self) */
-
-	if (task->state == TASK_DEAD) {
-		pr_warning("c/r: task may not be in state TASK_DEAD\n");
-		return -EAGAIN;
-	}
-
-	if (!atomic_read(&task->may_checkpoint)) {
-		pr_warning("c/r: task %d may not checkpoint\n",
-			   task_pid_vnr(task));
-		return -BUSY;
-	}
-
 	ret = cr_write_task_struct(ctx, t);
 	cr_debug("task_struct: ret %d\n", ret);
 	if (ret < 0)
@@ -241,6 +228,205 @@ static int cr_write_task(struct cr_ctx *ctx, struct task_struct *t)
 	return ret;
 }
 
+/* dump the entire state of a given task */
+static int cr_write_all_tasks(struct cr_ctx *ctx)
+{
+	int n, ret = 0;
+
+	for (n = 0; n < ctx->tasks_nr; n++) {
+		cr_debug("dumping task #%d\n", n);
+		ret = cr_write_task(ctx, ctx->tasks_arr[n]);
+		if (ret < 0)
+			break;
+	}
+
+	return ret;
+}
+
+static int cr_may_checkpoint_task(struct task_struct *t, struct cr_ctx *ctx)
+{
+	cr_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns));
+
+	if (t->state == TASK_DEAD) {
+		pr_warning("c/r: task %d is TASK_DEAD\n", task_pid_vnr(t));
+		return -EAGAIN;
+	}
+
+	if (!atomic_read(&t->may_checkpoint)) {
+		pr_warning("c/r: task %d uncheckpointable\n", task_pid_vnr(t));
+		return -EBUSY;
+	}
+
+	if (!ptrace_may_access(t, PTRACE_MODE_READ))
+		return -EPERM;
+
+	/* FIXME: verify that the task is frozen (unless self) */
+
+	/* FIXME: change this for nested containers */
+	if (task_nsproxy(t) != ctx->root_nsproxy)
+		return -EPERM;
+
+	return 0;
+}
+
+#define CR_HDR_PIDS_CHUNK	256
+
+static int cr_write_pids(struct cr_ctx *ctx)
+{
+	struct cr_hdr_pids *hh;
+	struct pid_namespace *ns;
+	struct task_struct *task;
+	struct task_struct **tasks_arr;
+	int tasks_nr, n, ret = 0, pos = 0;
+
+	ns = ctx->root_nsproxy->pid_ns;
+	tasks_arr = ctx->tasks_arr;
+	tasks_nr = ctx->tasks_nr;
+
+	hh = cr_hbuf_get(ctx, sizeof(*hh) * CR_HDR_PIDS_CHUNK);
+
+	while (tasks_nr > 0) {
+		rcu_read_lock();
+		for (n = min(tasks_nr, CR_HDR_PIDS_CHUNK); n; n--) {
+			task = tasks_arr[pos];
+
+			/* is this task cool ? */
+			ret = cr_may_checkpoint_task(task, ctx);
+			if (ret < 0) {
+				rcu_read_unlock();
+				goto out;
+			}
+			hh[pos].vpid = task_pid_nr_ns(task, ns);
+			hh[pos].vtgid = task_tgid_nr_ns(task, ns);
+			hh[pos].vppid = task_tgid_nr_ns(task->real_parent, ns);
+			cr_debug("task[%d]: vpid %d vtgid %d parent %d\n", pos,
+				 hh[pos].vpid, hh[pos].vtgid, hh[pos].vppid);
+			pos++;
+		}
+		rcu_read_unlock();
+
+		n = min(tasks_nr, CR_HDR_PIDS_CHUNK);
+		ret = cr_kwrite(ctx, hh, n * sizeof(*hh));
+		if (ret < 0)
+			break;
+
+		tasks_nr -= n;
+	}
+ out:
+	cr_hbuf_put(ctx, sizeof(*hh));
+	return ret;
+}
+
+/* count number of tasks in tree (and optionally fill pid's in array) */
+static int cr_tree_count_tasks(struct cr_ctx *ctx)
+{
+	struct task_struct *root = ctx->root_task;
+	struct task_struct *task = root;
+	struct task_struct *parent = NULL;
+	struct task_struct **tasks_arr = ctx->tasks_arr;
+	int tasks_nr = ctx->tasks_nr;
+	int nr = 0;
+
+	read_lock(&tasklist_lock);
+
+	/* count tasks via DFS scan of the tree */
+	while (1) {
+		if (tasks_arr) {
+			/* unlikely, but ... */
+			if (nr == tasks_nr)
+				return -EBUSY;	/* cleanup in cr_ctx_free() */
+			tasks_arr[nr] = task;
+			get_task_struct(task);
+		}
+
+		nr++;
+
+		/* if has children - proceed with child */
+		if (!list_empty(&task->children)) {
+			parent = task;
+			task = list_entry(task->children.next,
+					  struct task_struct, sibling);
+			continue;
+		}
+
+		while (task != root) {
+			/* if has sibling - proceed with sibling */
+			if (!list_is_last(&task->sibling, &parent->children)) {
+				task = list_entry(task->sibling.next,
+						  struct task_struct, sibling);
+				break;
+			}
+
+			/* else, trace back to parent and proceed */
+			task = parent;
+			parent = parent->real_parent;
+		}
+
+		if (task == root)
+			break;
+	}
+
+	read_unlock(&tasklist_lock);
+	return nr;
+}
+
+/*
+ * cr_build_tree - scan the tasks tree in DFS order and fill in array
+ * @ctx: checkpoint context
+ *
+ * Using DFS order simplifies the restart logic to re-create the tasks.
+ *
+ * On success, ctx->tasks_arr will be allocated and populated with all
+ * tasks (reference taken), and ctx->tasks_nr will hold the total count.
+ * The array is cleaned up by cr_ctx_free().
+ */
+static int cr_build_tree(struct cr_ctx *ctx)
+{
+	int n, m;
+
+	/* count tasks (no side effects) */
+	n = cr_tree_count_tasks(ctx);
+
+	ctx->tasks_nr = n;
+	ctx->tasks_arr = kzalloc(n * sizeof(*ctx->tasks_arr), GFP_KERNEL);
+	if (!ctx->tasks_arr)
+		return -ENOMEM;
+
+	/* count again (now will fill array) */
+	m = cr_tree_count_tasks(ctx);
+
+	/* unlikely, but ... (cleanup in cr_ctx_free) */
+	if (m < 0)
+		return m;
+	else if (m != n)
+		return -EBUSY;
+
+	return 0;
+}
+
+/* dump the array that describes the tasks tree */
+static int cr_write_tree(struct cr_ctx *ctx)
+{
+	struct cr_hdr h;
+	struct cr_hdr_tree *hh;
+	int ret;
+
+	h.type = CR_HDR_TREE;
+	h.len = sizeof(*hh);
+	h.parent = 0;
+
+	hh = cr_hbuf_get(ctx, sizeof(*hh));
+	hh->tasks_nr = ctx->tasks_nr;
+
+	ret = cr_write_obj(ctx, &h, hh);
+	cr_hbuf_put(ctx, sizeof(*hh));
+	if (ret < 0)
+		return ret;
+
+	ret = cr_write_pids(ctx);
+	return ret;
+}
+
 static int cr_get_container(struct cr_ctx *ctx, pid_t pid)
 {
 	struct task_struct *task = NULL;
@@ -258,7 +444,7 @@ static int cr_get_container(struct cr_ctx *ctx, pid_t pid)
 	if (!task)
 		goto out;
 
-#if 0	/* enable to use containers */
+#if 0	/* enable with containers */
 	if (!is_container_init(task)) {
 		err = -EINVAL;
 		goto out;
@@ -280,7 +466,7 @@ static int cr_get_container(struct cr_ctx *ctx, pid_t pid)
 	if (!nsproxy)
 		goto out;
 
-	/* TODO: verify that the container is frozen */
+	/* FIXME: verify that the container is frozen */
 
 	ctx->root_task = task;
 	ctx->root_nsproxy = nsproxy;
@@ -319,12 +505,22 @@ int do_checkpoint(struct cr_ctx *ctx, pid_t pid)
 	ret = cr_ctx_checkpoint(ctx, pid);
 	if (ret < 0)
 		goto out;
+
+	ret = cr_build_tree(ctx);
+	if (ret < 0)
+		goto out;
+
 	ret = cr_write_head(ctx);
 	if (ret < 0)
 		goto out;
-	ret = cr_write_task(ctx, ctx->root_task);
+	ret = cr_write_tree(ctx);
 	if (ret < 0)
 		goto out;
+
+	ret = cr_write_all_tasks(ctx);
+	if (ret < 0)
+		goto out;
+
 	ret = cr_write_tail(ctx);
 	if (ret < 0)
 		goto out;
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index dfe63ca..fbaf683 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -130,6 +130,19 @@ void cr_hbuf_put(struct cr_ctx *ctx, int n)
  * restart operation, and persists until the operation is completed.
  */
 
+static void cr_task_arr_free(struct cr_ctx *ctx)
+{
+	int n;
+
+	for (n = 0; n < ctx->tasks_nr; n++) {
+		if (ctx->tasks_arr[n]) {
+			put_task_struct(ctx->tasks_arr[n]);
+			ctx->tasks_arr[n] = NULL;
+		}
+	}
+	kfree(ctx->tasks_arr);
+}
+
 static void cr_ctx_free(struct cr_ctx *ctx)
 {
 	if (ctx->file)
@@ -143,6 +156,9 @@ static void cr_ctx_free(struct cr_ctx *ctx)
 	cr_pgarr_free(ctx);
 	cr_objhash_free(ctx);
 
+	if (ctx->tasks_arr)
+		cr_task_arr_free(ctx);
+
 	if (ctx->root_nsproxy)
 		put_nsproxy(ctx->root_nsproxy);
 	if (ctx->root_task)
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 74a8597..b142171 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -34,6 +34,9 @@ struct cr_ctx {
 	void *hbuf;		/* temporary buffer for headers */
 	int hpos;		/* position in headers buffer */
 
+	struct task_struct **tasks_arr;	/* array of all tasks in container */
+	int tasks_nr;			/* size of tasks array */
+
 	struct cr_objhash *objhash;	/* hash for shared objects */
 
 	struct list_head pgarr_list;	/* page array to dump VMA contents */
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 3a21179..5114bdd 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -35,7 +35,8 @@ enum {
 	CR_HDR_STRING,
 	CR_HDR_FNAME,
 
-	CR_HDR_TASK = 101,
+	CR_HDR_TREE = 101,
+	CR_HDR_TASK,
 	CR_HDR_THREAD,
 	CR_HDR_CPU,
 
@@ -71,6 +72,16 @@ struct cr_hdr_tail {
 	__u64 magic;
 } __attribute__((aligned(8)));
 
+struct cr_hdr_tree {
+	__u32 tasks_nr;
+} __attribute__((aligned(8)));
+
+struct cr_hdr_pids {
+	__s32 vpid;
+	__s32 vtgid;
+	__s32 vppid;
+} __attribute__((aligned(8)));
+
 struct cr_hdr_task {
 	__u32 state;
 	__u32 exit_state;
-- 
1.5.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-api" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

  parent reply	other threads:[~2008-11-10 16:37 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-11-10 16:37 [RFC v9][PATCH 00/13] Kernel based checkpoint/restart Oren Laadan
     [not found] ` <1226335060-7061-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2008-11-10 16:37   ` [RFC v9][PATCH 01/13] Create syscalls: sys_checkpoint, sys_restart Oren Laadan
2008-11-10 16:37   ` [RFC v9][PATCH 02/13] Checkpoint/restart: initial documentation Oren Laadan
2008-11-10 16:37   ` [RFC v9][PATCH 03/13] General infrastructure for checkpoint restart Oren Laadan
2008-11-10 16:37   ` [RFC v9][PATCH 04/13] x86 support for checkpoint/restart Oren Laadan
2008-11-10 16:37   ` [RFC v9][PATCH 07/13] Infrastructure for shared objects Oren Laadan
2008-11-10 16:37   ` [RFC v9][PATCH 08/13] Dump open file descriptors Oren Laadan
2008-11-10 16:37   ` [RFC v9][PATCH 09/13] Restore open file descriprtors Oren Laadan
2008-11-10 16:37   ` Oren Laadan [this message]
     [not found]     ` <1226335060-7061-13-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2008-11-14  3:41       ` [RFC v9][PATCH 12/13] Checkpoint multiple processes Serge E. Hallyn
2008-11-10 16:37   ` [RFC v9][PATCH 13/13] Restart " Oren Laadan
2008-11-12  5:03   ` [RFC v9][PATCH 00/13] Kernel based checkpoint/restart Serge E. Hallyn
2008-11-10 16:37 ` [RFC v9][PATCH 05/13] Dump memory address space Oren Laadan
     [not found]   ` <1226335060-7061-6-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2008-11-10 20:12     ` Serge E. Hallyn
2008-11-10 20:34       ` Oren Laadan
2008-11-11 16:45     ` Serge E. Hallyn
     [not found]       ` <20081111164517.GA15999-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2008-11-11 23:53         ` Oren Laadan
2008-11-10 16:37 ` [RFC v9][PATCH 06/13] Restore " Oren Laadan
2008-11-10 16:37 ` [RFC v9][PATCH 10/13] External checkpoint of a task other than ourself Oren Laadan
2008-11-10 16:37 ` [RFC v9][PATCH 11/13] Track in-kernel when we expect checkpoint/restart to work Oren Laadan, Dave Hansen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1226335060-7061-13-git-send-email-orenl@cs.columbia.edu \
    --to=orenl-eqauephvms7envbuuze7ea@public.gmane.org \
    --cc=akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org \
    --cc=containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org \
    --cc=dave-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org \
    --cc=hpa-YMNOUZJC4hwAvxtiuMwx3w@public.gmane.org \
    --cc=linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
    --cc=linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
    --cc=linux-mm-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org \
    --cc=mingo-X9Un+BFzKDI@public.gmane.org \
    --cc=serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org \
    --cc=tglx-hfZtesqFncYOwBW4kG4KsQ@public.gmane.org \
    --cc=torvalds-3NddpPZAyC0@public.gmane.org \
    --cc=viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).