From: "Serge E. Hallyn" <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
To: Linux Containers <containers-qjLDD68F18O7TbgM5vRIOg@public.gmane.org>
Subject: [PATCH 2/3] restart debug: add final process tree status
Date: Tue, 29 Sep 2009 11:54:02 -0500 [thread overview]
Message-ID: <20090929165402.GA10114@us.ibm.com> (raw)
In-Reply-To: <20090929165342.GA10076-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Have tasks in sys_restart keep some status in a list off
of checkpoint_ctx, and print this info when the checkpoint_ctx
is freed.
This is mostly an RFC - in particular the error tracking is
pretty half-hearted so far. But the info it does spit out
helped me figure out the coordinator syncing problem
fixed by the previous patch.
Sample dmesg output:
[4568:4568:c/r:free_per_task_status:200] 4 tasks registered, nr_tasks was 0 nr_total 0
[4568:4568:c/r:free_per_task_status:202] active pid was 1, ctx->errno 0
[4568:4568:c/r:free_per_task_status:204] kflags 6 uflags 0 oflags 1
[4568:4568:c/r:free_per_task_status:206] task 0 to run was 4568
[4568:4568:c/r:free_per_task_status:209] pid 4566
[4568:4568:c/r:free_per_task_status:211] it was coordinator
[4568:4568:c/r:free_per_task_status:219] it was running
[4568:4568:c/r:free_per_task_status:209] pid 4570
[4568:4568:c/r:free_per_task_status:213] it was a ghost
[4568:4568:c/r:free_per_task_status:209] pid 4569
[4568:4568:c/r:free_per_task_status:213] it was a ghost
[4568:4568:c/r:free_per_task_status:209] pid 4568
[4568:4568:c/r:free_per_task_status:215] it was the root task
[4568:4568:c/r:free_per_task_status:221] it was a normal task
So, when one task died before hitting sys_restart, the first line would
show '3 tasks registered'.
Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
checkpoint/restart.c | 105 ++++++++++++++++++++++++++++++++++++++
checkpoint/sys.c | 49 ++++++++++++++++++
include/linux/checkpoint_types.h | 20 +++++++
3 files changed, 174 insertions(+), 0 deletions(-)
diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index 849bda5..1085ed5 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -26,6 +26,98 @@
#include <linux/checkpoint.h>
#include <linux/checkpoint_hdr.h>
+#ifdef CONFIG_CHECKPOINT_DEBUG
+/* Record current in ctx->per_task_status; NULL if allocation fails. */
+static struct ckpt_task_status *ckpt_debug_checkin(struct ckpt_ctx *ctx)
+{
+	struct ckpt_task_status *st = kmalloc(sizeof(*st), GFP_KERNEL);
+	if (st == NULL)
+		return NULL;
+	st->pid = current->pid;
+	st->error = 0;
+	st->flags = RESTART_DBG_WAITING |
+		    (ctx->root_task == current ? RESTART_DBG_ROOT : 0);
+	/* NOTE(review): list is modified with no lock visible here - confirm */
+	list_add_tail(&st->list, &ctx->per_task_status);
+	return st;
+}
+
+/* Return current's status record, or NULL if none is on ctx's list. */
+static struct ckpt_task_status *getme(struct ckpt_ctx *ctx)
+{
+	struct ckpt_task_status *s;
+
+	/* on a miss the cursor is neither NULL nor a real entry: exit here */
+	list_for_each_entry(s, &ctx->per_task_status, list)
+		if (s->pid == current->pid)
+			return s;
+	return NULL;
+}
+
+/* Check in the calling task and tag its record as the coordinator. */
+static void ckpt_debug_coord(struct ckpt_ctx *ctx)
+{
+	struct ckpt_task_status *st = ckpt_debug_checkin(ctx);
+
+	if (st != NULL)
+		st->flags |= RESTART_DBG_COORD;
+}
+
+/* Check in the calling task and tag its record as a ghost. */
+static void ckpt_debug_ghost(struct ckpt_ctx *ctx)
+{
+	struct ckpt_task_status *st = ckpt_debug_checkin(ctx);
+
+	if (st != NULL)
+		st->flags |= RESTART_DBG_GHOST;
+}
+
+/* Check in the calling task and tag its record as a normal task. */
+static void ckpt_debug_normal(struct ckpt_ctx *ctx)
+{
+	struct ckpt_task_status *st = ckpt_debug_checkin(ctx);
+
+	if (st != NULL)
+		st->flags |= RESTART_DBG_NORMAL;
+}
+
+/*
+ * Finish current's record: store err, set FAILED if non-zero else SUCCESS.
+ */
+static void ckpt_debug_log_error(struct ckpt_ctx *ctx, int err)
+{
+	struct ckpt_task_status *me = getme(ctx);
+
+	if (me == NULL)
+		return;
+	me->flags &= ~(RESTART_DBG_WAITING | RESTART_DBG_RUNNING);
+	me->flags |= err ? RESTART_DBG_FAILED : RESTART_DBG_SUCCESS;
+	me->error = err;
+}
+
+/* Flip current's record from WAITING to RUNNING, if it has one. */
+static void ckpt_debug_log_running(struct ckpt_ctx *ctx)
+{
+	struct ckpt_task_status *me = getme(ctx);
+	if (me != NULL)
+		me->flags = (me->flags & ~RESTART_DBG_WAITING) |
+			    RESTART_DBG_RUNNING;
+}
+#else
+static inline void ckpt_debug_checkin(struct ckpt_ctx *ctx)
+{} /* no-op stub; unlike the debug variant it returns nothing */
+static inline void ckpt_debug_coord(struct ckpt_ctx *ctx)
+{} /* no-op when CONFIG_CHECKPOINT_DEBUG is not set */
+static inline void ckpt_debug_ghost(struct ckpt_ctx *ctx)
+{} /* no-op when CONFIG_CHECKPOINT_DEBUG is not set */
+static inline void ckpt_debug_normal(struct ckpt_ctx *ctx)
+{} /* no-op when CONFIG_CHECKPOINT_DEBUG is not set */
+static inline void ckpt_debug_log_error(struct ckpt_ctx *ctx, int err)
+{} /* no-op when CONFIG_CHECKPOINT_DEBUG is not set */
+static inline void ckpt_debug_log_running(struct ckpt_ctx *ctx)
+{} /* no-op when CONFIG_CHECKPOINT_DEBUG is not set */
+#endif
+
static int _ckpt_read_err(struct ckpt_ctx *ctx, struct ckpt_hdr *h)
{
char *ptr;
@@ -661,6 +753,8 @@ static int do_ghost_task(void)
if (IS_ERR(ctx))
return PTR_ERR(ctx);
+ ckpt_debug_ghost(ctx);
+
/*
* Wait until coordinator task has completed prepare_descendants().
* Note that prepare_descendants() wakes each task one a time
@@ -681,6 +775,8 @@ static int do_ghost_task(void)
if (atomic_dec_and_test(&ctx->nr_running))
complete(&ctx->all_ready);
+ ckpt_debug_log_running(ctx);
+
current->flags |= PF_RESTARTING;
/* Finally wait for all ghosts to be woken up - to die */
@@ -688,6 +784,7 @@ static int do_ghost_task(void)
all_tasks_activated(ctx) ||
ckpt_test_ctx_error(ctx));
+ ckpt_debug_log_error(ctx, 0);
current->exit_signal = -1;
ckpt_ctx_put(ctx);
do_exit(0);
@@ -704,6 +801,8 @@ static int do_restore_task(void)
if (IS_ERR(ctx))
return PTR_ERR(ctx);
+ ckpt_debug_normal(ctx);
+
/*
* Wait until coordinator task has completed prepare_descendants().
* Note that prepare_descendants() wakes each task one a time
@@ -731,6 +830,8 @@ static int do_restore_task(void)
if (ret < 0)
goto out;
+ ckpt_debug_log_running(ctx);
+
zombie = restore_task(ctx);
if (zombie < 0) {
ret = zombie;
@@ -759,6 +860,7 @@ static int do_restore_task(void)
if (old_ctx)
ckpt_ctx_put(old_ctx);
+ ckpt_debug_log_error(ctx, ret);
/* if we're first to fail - notify others */
if (ret < 0 && !ckpt_test_ctx_error(ctx)) {
restore_notify_error(ctx, ret);
@@ -945,6 +1047,9 @@ static int do_restore_coord(struct ckpt_ctx *ctx, pid_t pid)
struct ckpt_ctx *old_ctx;
int ret;
+ ckpt_debug_coord(ctx);
+ ckpt_debug_log_running(ctx);
+
ret = restore_read_header(ctx);
if (ret < 0)
return ret;
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index d48e261..fb3332a 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -188,10 +188,56 @@ static void task_arr_free(struct ckpt_ctx *ctx)
kfree(ctx->tasks_arr);
}
+#ifdef CONFIG_CHECKPOINT_DEBUG
+/* Dump the restart status recorded in ctx, then free every task record. */
+static void free_per_task_status(struct ckpt_ctx *ctx)
+{
+	struct ckpt_task_status *s, *p;
+	int i, count = 0;
+
+	list_for_each_entry(s, &ctx->per_task_status, list)
+		count++;
+	ckpt_debug("%d tasks registered, nr_tasks was %d nr_total %d\n",
+		   count, ctx->nr_tasks, atomic_read(&ctx->nr_total));
+	ckpt_debug("active pid was %d, ctx->errno %d\n", ctx->active_pid,
+		   ctx->errno);
+	ckpt_debug("kflags %lu uflags %lu oflags %lu\n", ctx->kflags,
+		   ctx->uflags, ctx->oflags);
+	for (i = 0; i < ctx->active_pid; i++)
+		ckpt_debug("task %d to run was %d\n", i, ctx->pids_arr[i].vpid);
+	list_for_each_entry_safe(s, p, &ctx->per_task_status, list) {
+		ckpt_debug("pid %d\n", s->pid);
+		if (s->flags & RESTART_DBG_COORD)
+			ckpt_debug("it was coordinator\n");
+		if (s->flags & RESTART_DBG_GHOST)
+			ckpt_debug("it was a ghost\n");
+		if (s->flags & RESTART_DBG_ROOT)
+			ckpt_debug("it was the root task\n");
+		if (s->flags & RESTART_DBG_WAITING)
+			ckpt_debug("it was still waiting to run restart\n");
+		if (s->flags & RESTART_DBG_RUNNING)
+			ckpt_debug("it was running\n");
+		if (s->flags & RESTART_DBG_NORMAL)
+			ckpt_debug("it was a normal task\n");
+		if (s->flags & RESTART_DBG_FAILED)
+			ckpt_debug("it finished with error %d\n", s->error);
+		if (s->flags & RESTART_DBG_SUCCESS)
+			ckpt_debug("it finished successfully\n");
+		list_del(&s->list);
+		kfree(s);
+	}
+}
+#else
+static inline void free_per_task_status(struct ckpt_ctx *ctx)
+{ } /* no-op when CONFIG_CHECKPOINT_DEBUG is not set */
+#endif
+
static void ckpt_ctx_free(struct ckpt_ctx *ctx)
{
BUG_ON(atomic_read(&ctx->refcount));
+ free_per_task_status(ctx);
+
if (ctx->deferqueue)
deferqueue_destroy(ctx->deferqueue);
@@ -237,6 +283,9 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
atomic_set(&ctx->refcount, 0);
INIT_LIST_HEAD(&ctx->pgarr_list);
+#ifdef CONFIG_CHECKPOINT_DEBUG
+ INIT_LIST_HEAD(&ctx->per_task_status);
+#endif
INIT_LIST_HEAD(&ctx->pgarr_pool);
init_waitqueue_head(&ctx->waitq);
init_waitqueue_head(&ctx->ghostq);
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index 2a854e0..1cda085 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -76,10 +76,30 @@ struct ckpt_ctx {
wait_queue_head_t waitq; /* waitqueue for restarting tasks */
wait_queue_head_t ghostq; /* waitqueue for ghost tasks */
struct cred *realcred, *ecred; /* tmp storage for cred at restart */
+#ifdef CONFIG_CHECKPOINT_DEBUG
+ struct list_head per_task_status;
+#endif
struct ckpt_stats stats; /* statistics */
};
+#ifdef CONFIG_CHECKPOINT_DEBUG
+struct ckpt_task_status { /* one record per task that checked in to a restart */
+ pid_t pid; /* current->pid at check-in */
+#define RESTART_DBG_ROOT (1 << 0) /* task was ctx->root_task at check-in */
+#define RESTART_DBG_GHOST (1 << 1) /* checked in from do_ghost_task() */
+#define RESTART_DBG_COORD (1 << 2) /* checked in from do_restore_coord() */
+#define RESTART_DBG_NORMAL (1 << 3) /* checked in from do_restore_task() */
+#define RESTART_DBG_WAITING (1 << 4) /* initial state set at check-in */
+#define RESTART_DBG_RUNNING (1 << 5) /* set by ckpt_debug_log_running() */
+#define RESTART_DBG_FAILED (1 << 6) /* ckpt_debug_log_error() with err != 0 */
+#define RESTART_DBG_SUCCESS (1 << 7) /* ckpt_debug_log_error() with err == 0 */
+ int flags; /* bitmask of the RESTART_DBG_* values above */
+ int error; /* value passed to ckpt_debug_log_error(), 0 until then */
+ struct list_head list; /* link in ctx->per_task_status */
+};
+#endif
+
#endif /* __KERNEL__ */
#endif /* _LINUX_CHECKPOINT_TYPES_H_ */
--
1.6.1
next prev parent reply other threads:[~2009-09-29 16:54 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2009-09-29 16:53 [PATCH 1/3] restart: make sure all tasks are in sys_restart Serge E. Hallyn
[not found] ` <20090929165342.GA10076-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-09-29 16:54 ` Serge E. Hallyn [this message]
[not found] ` <20090929165402.GA10114-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-10-01 1:57 ` [PATCH 2/3] restart debug: add final process tree status Oren Laadan
[not found] ` <4AC40CA0.8020305-RdfvBDnrOixBDgjK7y7TUQ@public.gmane.org>
2009-10-01 15:33 ` Serge E. Hallyn
[not found] ` <20091001153356.GA20565-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-10-01 23:29 ` Oren Laadan
2009-09-29 16:54 ` [PATCH 3/3] restart debug: splatter more ckpt_debugs about Serge E. Hallyn
[not found] ` <20090929165415.GB10114-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-10-01 1:54 ` Oren Laadan
2009-10-01 1:53 ` [PATCH 1/3] restart: make sure all tasks are in sys_restart Oren Laadan
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20090929165402.GA10114@us.ibm.com \
--to=serue-r/jw6+rmf7hqt0dzr+alfa@public.gmane.org \
--cc=containers-qjLDD68F18O7TbgM5vRIOg@public.gmane.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox