All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Serge E. Hallyn" <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
To: Linux Containers <containers-qjLDD68F18O7TbgM5vRIOg@public.gmane.org>
Subject: [PATCH 2/3] restart debug: add final process tree status
Date: Tue, 29 Sep 2009 11:54:02 -0500	[thread overview]
Message-ID: <20090929165402.GA10114@us.ibm.com> (raw)
In-Reply-To: <20090929165342.GA10076-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>

Have tasks in sys_restart keep some status in a list off
of checkpoint_ctx, and print this info when the checkpoint_ctx
is freed.

This is mostly an RFC - in particular the error tracking is
pretty half-hearted so far.  But the info it does spit out
helped me figure out the coordinator syncing problem
fixed by the previous patch.

Sample dmesg output:
[4568:4568:c/r:free_per_task_status:200] 4 tasks registered, nr_tasks was 0 nr_total 0
[4568:4568:c/r:free_per_task_status:202] active pid was 1, ctx->errno 0
[4568:4568:c/r:free_per_task_status:204] kflags 6 uflags 0 oflags 1
[4568:4568:c/r:free_per_task_status:206] task 0 to run was 4568
[4568:4568:c/r:free_per_task_status:209] pid 4566
[4568:4568:c/r:free_per_task_status:211] it was coordinator
[4568:4568:c/r:free_per_task_status:219] it was running
[4568:4568:c/r:free_per_task_status:209] pid 4570
[4568:4568:c/r:free_per_task_status:213] it was a ghost
[4568:4568:c/r:free_per_task_status:209] pid 4569
[4568:4568:c/r:free_per_task_status:213] it was a ghost
[4568:4568:c/r:free_per_task_status:209] pid 4568
[4568:4568:c/r:free_per_task_status:215] it was the root task
[4568:4568:c/r:free_per_task_status:221] it was a normal task

So, when one task died before hitting sys_restart, the first line would
show '3 tasks registered'.

Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 checkpoint/restart.c             |  105 ++++++++++++++++++++++++++++++++++++++
 checkpoint/sys.c                 |   49 ++++++++++++++++++
 include/linux/checkpoint_types.h |   20 +++++++
 3 files changed, 174 insertions(+), 0 deletions(-)

diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index 849bda5..1085ed5 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -26,6 +26,98 @@
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
 
+#ifdef CONFIG_CHECKPOINT_DEBUG
+static struct ckpt_task_status *ckpt_debug_checkin(struct ckpt_ctx *ctx)
+{
+	struct ckpt_task_status *s;	/* new status entry for current */
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return NULL;	/* callers skip tagging when this is NULL */
+	s->pid = current->pid;
+	s->error = 0;
+	s->flags = RESTART_DBG_WAITING;	/* cleared by the log_* helpers */
+	if (current == ctx->root_task)
+		s->flags |= RESTART_DBG_ROOT;
+	list_add_tail(&s->list, &ctx->per_task_status);	/* NOTE(review): no lock */
+	return s;
+}
+
+static struct ckpt_task_status *getme(struct ckpt_ctx *ctx)
+{
+	struct ckpt_task_status *s;
+
+	/* current's entry or NULL; the cursor is invalid once the loop ends */
+	list_for_each_entry(s, &ctx->per_task_status, list) {
+		if (s->pid == current->pid)
+			return s;
+	}
+	return NULL;
+}
+
+static void ckpt_debug_coord(struct ckpt_ctx *ctx)
+{
+	struct ckpt_task_status *s;
+	/* Check current in and tag its entry as the coordinator. */
+	s = ckpt_debug_checkin(ctx);
+	if (s)
+		s->flags |= RESTART_DBG_COORD;
+}
+
+static void ckpt_debug_ghost(struct ckpt_ctx *ctx)
+{
+	struct ckpt_task_status *s;
+	/* Check current in and tag its entry as a ghost task. */
+	s = ckpt_debug_checkin(ctx);
+	if (s)
+		s->flags |= RESTART_DBG_GHOST;
+}
+
+static void ckpt_debug_normal(struct ckpt_ctx *ctx)
+{
+	struct ckpt_task_status *s;
+	/* Check current in and tag its entry as a normal task. */
+	s = ckpt_debug_checkin(ctx);
+	if (s)
+		s->flags |= RESTART_DBG_NORMAL;
+}
+
+static void ckpt_debug_log_error(struct ckpt_ctx *ctx, int err)
+{
+	struct ckpt_task_status *s = getme(ctx);
+	if (!s)
+		return;		/* no entry recorded for current */
+	s->error = err;
+	s->flags &= ~RESTART_DBG_WAITING;	/* finished: drop both */
+	s->flags &= ~RESTART_DBG_RUNNING;	/* in-progress states */
+	if (err)
+		s->flags |= RESTART_DBG_FAILED;
+	else
+		s->flags |= RESTART_DBG_SUCCESS;	/* err == 0 */
+}
+
+static void ckpt_debug_log_running(struct ckpt_ctx *ctx)
+{
+	struct ckpt_task_status *s = getme(ctx);
+	if (!s)
+		return;		/* no entry recorded for current */
+	s->flags &= ~RESTART_DBG_WAITING;	/* waiting -> running */
+	s->flags |= RESTART_DBG_RUNNING;
+}
+#else
+static inline void ckpt_debug_checkin(struct ckpt_ctx *ctx)
+{}	/* NOTE(review): void here, but returns a pointer in the debug build */
+static inline void ckpt_debug_coord(struct ckpt_ctx *ctx)
+{}	/* empty stubs: used when CONFIG_CHECKPOINT_DEBUG is not defined */
+static inline void ckpt_debug_ghost(struct ckpt_ctx *ctx)
+{}
+static inline void ckpt_debug_normal(struct ckpt_ctx *ctx)
+{}
+static inline void ckpt_debug_log_error(struct ckpt_ctx *ctx, int err)
+{}
+static inline void ckpt_debug_log_running(struct ckpt_ctx *ctx)
+{}
+#endif
+
 static int _ckpt_read_err(struct ckpt_ctx *ctx, struct ckpt_hdr *h)
 {
 	char *ptr;
@@ -661,6 +753,8 @@ static int do_ghost_task(void)
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
 
+	ckpt_debug_ghost(ctx);
+
 	/*
 	 * Wait until coordinator task has completed prepare_descendants().
 	 * Note that prepare_descendants() wakes each task one a time
@@ -681,6 +775,8 @@ static int do_ghost_task(void)
 	if (atomic_dec_and_test(&ctx->nr_running))
 		complete(&ctx->all_ready);
 
+	ckpt_debug_log_running(ctx);
+
 	current->flags |= PF_RESTARTING;
 
 	/* Finally wait for all ghosts to be woken up - to die */
@@ -688,6 +784,7 @@ static int do_ghost_task(void)
 				 all_tasks_activated(ctx) ||
 				 ckpt_test_ctx_error(ctx));
 
+	ckpt_debug_log_error(ctx, 0);
 	current->exit_signal = -1;
 	ckpt_ctx_put(ctx);
 	do_exit(0);
@@ -704,6 +801,8 @@ static int do_restore_task(void)
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
 
+	ckpt_debug_normal(ctx);
+
 	/*
 	 * Wait until coordinator task has completed prepare_descendants().
 	 * Note that prepare_descendants() wakes each task one a time
@@ -731,6 +830,8 @@ static int do_restore_task(void)
 	if (ret < 0)
 		goto out;
 
+	ckpt_debug_log_running(ctx);
+
 	zombie = restore_task(ctx);
 	if (zombie < 0) {
 		ret = zombie;
@@ -759,6 +860,7 @@ static int do_restore_task(void)
 	if (old_ctx)
 		ckpt_ctx_put(old_ctx);
 
+	ckpt_debug_log_error(ctx, ret);
 	/* if we're first to fail - notify others */
 	if (ret < 0 && !ckpt_test_ctx_error(ctx)) {
 		restore_notify_error(ctx, ret);
@@ -945,6 +1047,9 @@ static int do_restore_coord(struct ckpt_ctx *ctx, pid_t pid)
 	struct ckpt_ctx *old_ctx;
 	int ret;
 
+	ckpt_debug_coord(ctx);
+	ckpt_debug_log_running(ctx);
+
 	ret = restore_read_header(ctx);
 	if (ret < 0)
 		return ret;
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index d48e261..fb3332a 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -188,10 +188,56 @@ static void task_arr_free(struct ckpt_ctx *ctx)
 	kfree(ctx->tasks_arr);
 }
 
+#ifdef CONFIG_CHECKPOINT_DEBUG
+static void free_per_task_status(struct ckpt_ctx *ctx)
+{
+	struct ckpt_task_status *s, *p;
+	int i, count = 0;
+	/* Log ctx-wide restart state, then report and free each entry. */
+	list_for_each_entry(s, &ctx->per_task_status, list)
+		count++;
+	ckpt_debug("%d tasks registered, nr_tasks was %d nr_total %d\n",
+		count, ctx->nr_tasks, atomic_read(&ctx->nr_total));
+	ckpt_debug("active pid was %d, ctx->errno %d\n", ctx->active_pid,
+		ctx->errno);
+	ckpt_debug("kflags %lu uflags %lu oflags %lu\n", ctx->kflags,
+		ctx->uflags, ctx->oflags);
+	for (i = 0; i < ctx->active_pid; i++)
+		ckpt_debug("task %d to run was %d\n", i, ctx->pids_arr[i].vpid);
+	/* _safe variant: each entry is unlinked and freed while iterating. */
+	list_for_each_entry_safe(s, p, &ctx->per_task_status, list) {
+		ckpt_debug("pid %d\n", s->pid);
+		if (s->flags & RESTART_DBG_COORD)
+			ckpt_debug("it was coordinator\n");
+		if (s->flags & RESTART_DBG_GHOST)
+			ckpt_debug("it was a ghost\n");
+		if (s->flags & RESTART_DBG_ROOT)
+			ckpt_debug("it was the root task\n");
+		if (s->flags & RESTART_DBG_WAITING)
+			ckpt_debug("it was still waiting to run restart\n");
+		if (s->flags & RESTART_DBG_RUNNING)
+			ckpt_debug("it was running\n");
+		if (s->flags & RESTART_DBG_NORMAL)
+			ckpt_debug("it was a normal task\n");
+		if (s->flags & RESTART_DBG_FAILED)
+			ckpt_debug("it finished with error %d\n", s->error);
+		if (s->flags & RESTART_DBG_SUCCESS)
+			ckpt_debug("it finished successfully\n");
+		list_del(&s->list);
+		kfree(s);
+	}
+}
+#else
+static inline void free_per_task_status(struct ckpt_ctx *ctx)
+{ }
+#endif
+
 static void ckpt_ctx_free(struct ckpt_ctx *ctx)
 {
 	BUG_ON(atomic_read(&ctx->refcount));
 
+	free_per_task_status(ctx);
+
 	if (ctx->deferqueue)
 		deferqueue_destroy(ctx->deferqueue);
 
@@ -237,6 +283,9 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
 
 	atomic_set(&ctx->refcount, 0);
 	INIT_LIST_HEAD(&ctx->pgarr_list);
+#ifdef CONFIG_CHECKPOINT_DEBUG
+	INIT_LIST_HEAD(&ctx->per_task_status);
+#endif
 	INIT_LIST_HEAD(&ctx->pgarr_pool);
 	init_waitqueue_head(&ctx->waitq);
 	init_waitqueue_head(&ctx->ghostq);
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index 2a854e0..1cda085 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -76,10 +76,30 @@ struct ckpt_ctx {
 	wait_queue_head_t waitq;	/* waitqueue for restarting tasks */
 	wait_queue_head_t ghostq;	/* waitqueue for ghost tasks */
 	struct cred *realcred, *ecred;	/* tmp storage for cred at restart */
+#ifdef CONFIG_CHECKPOINT_DEBUG
+	struct list_head per_task_status;
+#endif
 
 	struct ckpt_stats stats;	/* statistics */
 };
 
+#ifdef CONFIG_CHECKPOINT_DEBUG
+struct ckpt_task_status {
+	pid_t pid;			/* current->pid at check-in */
+#define RESTART_DBG_ROOT	(1 << 0)	/* task is ctx->root_task */
+#define RESTART_DBG_GHOST	(1 << 1)	/* checked in as a ghost */
+#define RESTART_DBG_COORD	(1 << 2)	/* checked in as coordinator */
+#define RESTART_DBG_NORMAL	(1 << 3)	/* checked in as normal task */
+#define RESTART_DBG_WAITING	(1 << 4)	/* checked in, not yet running */
+#define RESTART_DBG_RUNNING	(1 << 5)	/* logged as running */
+#define RESTART_DBG_FAILED	(1 << 6)	/* finished, error != 0 */
+#define RESTART_DBG_SUCCESS	(1 << 7)	/* finished, error == 0 */
+	int flags;			/* bitmask of RESTART_DBG_* */
+	int error;			/* logged result, 0 until set */
+	struct list_head list;		/* on ckpt_ctx.per_task_status */
+};
+#endif
+
 #endif /* __KERNEL__ */
 
 #endif /* _LINUX_CHECKPOINT_TYPES_H_ */
-- 
1.6.1

  parent reply	other threads:[~2009-09-29 16:54 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-09-29 16:53 [PATCH 1/3] restart: make sure all tasks are in sys_restart Serge E. Hallyn
     [not found] ` <20090929165342.GA10076-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-09-29 16:54   ` Serge E. Hallyn [this message]
     [not found]     ` <20090929165402.GA10114-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-10-01  1:57       ` [PATCH 2/3] restart debug: add final process tree status Oren Laadan
     [not found]         ` <4AC40CA0.8020305-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
2009-10-01 15:33           ` Serge E. Hallyn
     [not found]             ` <20091001153356.GA20565-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-10-01 23:29               ` Oren Laadan
2009-09-29 16:54   ` [PATCH 3/3] restart debug: splatter more ckpt_debugs about Serge E. Hallyn
     [not found]     ` <20090929165415.GB10114-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-10-01  1:54       ` Oren Laadan
2009-10-01  1:53   ` [PATCH 1/3] restart: make sure all tasks are in sys_restart Oren Laadan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20090929165402.GA10114@us.ibm.com \
    --to=serue-r/jw6+rmf7hqt0dzr+alfa@public.gmane.org \
    --cc=containers-qjLDD68F18O7TbgM5vRIOg@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.