* [RFC][PATCH 2/2] CR: handle a single task with private memory maps
@ 2008-07-30 3:27 Oren Laadan
[not found] ` <Pine.LNX.4.64.0807292325290.9868-CXF6herHY6ykSYb+qCZC/1i27PF6R63G9nwVQlTi/Pw@public.gmane.org>
[not found] ` <20080730161535.GB22403@hawkmoon.kerlabs.com>
0 siblings, 2 replies; 37+ messages in thread
From: Oren Laadan @ 2008-07-30 3:27 UTC (permalink / raw)
To: Linux Containers
Expand the template sys_checkpoint and sys_restart to be able to dump
and restore a single task. The task's address space may consist of only
private, simple vma's - anonymous or file-mapped.
This big patch adds a mechanism to transfer data between kernel or user
space to and from the file given by the caller (sys.c), alloc/setup/free
of the checkpoint/restart context (sys.c), output wrappers and basic
checkpoint handling (checkpoint.c), memory dump (ckpt_mem.c), input
wrappers and basic restart handling (restart.c), and finally the memory
restore (rstr_mem.c).
Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
ckpt/Makefile | 1 +
ckpt/checkpoint.c | 366 ++++++++++++++++++++++++++++++++++++++++++++++
ckpt/ckpt.h | 78 ++++++++++
ckpt/ckpt_hdr.h | 143 ++++++++++++++++++
ckpt/ckpt_mem.c | 421 +++++++++++++++++++++++++++++++++++++++++++++++++++++
ckpt/ckpt_mem.h | 32 ++++
ckpt/restart.c | 328 +++++++++++++++++++++++++++++++++++++++++
ckpt/rstr_mem.c | 415 ++++++++++++++++++++++++++++++++++++++++++++++++++++
ckpt/sys.c | 239 ++++++++++++++++++++++++++++++
9 files changed, 2023 insertions(+), 0 deletions(-)
create mode 100644 ckpt/Makefile
create mode 100644 ckpt/checkpoint.c
create mode 100644 ckpt/ckpt.h
create mode 100644 ckpt/ckpt_hdr.h
create mode 100644 ckpt/ckpt_mem.c
create mode 100644 ckpt/ckpt_mem.h
create mode 100644 ckpt/restart.c
create mode 100644 ckpt/rstr_mem.c
create mode 100644 ckpt/sys.c
diff --git a/ckpt/Makefile b/ckpt/Makefile
new file mode 100644
index 0000000..41f205d
--- /dev/null
+++ b/ckpt/Makefile
@@ -0,0 +1 @@
+obj-y += sys.o checkpoint.o restart.o ckpt_mem.o rstr_mem.o
diff --git a/ckpt/checkpoint.c b/ckpt/checkpoint.c
new file mode 100644
index 0000000..1698a35
--- /dev/null
+++ b/ckpt/checkpoint.c
@@ -0,0 +1,366 @@
+/*
+ * Checkpoint logic and helpers
+ *
+ * Copyright (C) 2008 Oren Laadan
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include <linux/version.h>
+#include <linux/sched.h>
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/dcache.h>
+#include <linux/mount.h>
+#include <asm/ptrace.h>
+
+#if defined (CONFIG_X86)
+#include <asm/i387.h>
+#endif
+
+#include "ckpt.h"
+#include "ckpt_hdr.h"
+
+/**
+ * cr_get_fname - return pathname of a given file
+ * @path: path (mount + dentry) of the file
+ * @root: root that the pathname is resolved against
+ * @buf: buffer for pathname
+ * @n: buffer length (in) and pathname length (out)
+ *
+ * If the buffer provided by the caller is too small (-ENAMETOOLONG),
+ * a fresh page is allocated and the lookup is retried in it; the caller
+ * should call cr_put_fname() for cleanup.
+ *
+ * Returns a pointer to the pathname, or an ERR_PTR() on failure (in
+ * which case @n is left untouched and nothing needs to be cleaned up).
+ */
+char *cr_get_fname(struct path *path, struct path *root, char *buf, int *n)
+{
+	char *fname;
+	int len = *n;	/* size of the buffer that fname ends up in */
+
+	fname = __d_path(path, root, buf, len);
+
+	if (IS_ERR(fname) && PTR_ERR(fname) == -ENAMETOOLONG) {
+		buf = (char *) __get_free_pages(GFP_KERNEL, 0);
+		if (!buf)
+			return ERR_PTR(-ENOMEM);
+		/* the fallback buffer is one page, whatever *n was */
+		len = PAGE_SIZE;
+		fname = __d_path(path, root, buf, len);
+		if (IS_ERR(fname))
+			free_pages((unsigned long) buf, 0);
+	}
+
+	/* length is the distance from fname to the end of its buffer */
+	if (!IS_ERR(fname))
+		*n = (buf + len - fname);
+
+	return fname;
+}
+
+/**
+ * cr_put_fname - (possibly) cleanup pathname buffer
+ * @buf: original buffer that was given to cr_get_fname()
+ * @fname: resulting pathname from cr_get_fname()
+ * @n: length of original buffer
+ *
+ * If @fname lies outside [@buf, @buf + @n) then cr_get_fname() had to
+ * allocate a fallback page for it, and that page is freed here.  The
+ * caller's own buffer is never freed.
+ */
+void cr_put_fname(char *buf, char *fname, int n)
+{
+	if (!fname || IS_ERR(fname))
+		return;
+
+	/*
+	 * The fallback buffer is a single order-0 page and fname points
+	 * somewhere inside it, so masking yields the page to free.
+	 */
+	if (fname < buf || fname >= buf + n)
+		free_pages((unsigned long) fname & PAGE_MASK, 0);
+}
+
+/**
+ * cr_write_obj - write a record described by a cr_hdr
+ * @ctx: checkpoint context
+ * @h: record descriptor
+ * @buf: record buffer
+ *
+ * Emits the descriptor itself, then @h->len bytes of payload from @buf.
+ * Returns the result of the last cr_kwrite() performed.
+ */
+int cr_write_obj(struct cr_ctx *ctx, struct cr_hdr *h, void *buf)
+{
+	int err = cr_kwrite(ctx, h, sizeof(*h));
+
+	/* only emit the payload once the descriptor went out */
+	if (err >= 0)
+		err = cr_kwrite(ctx, buf, h->len);
+
+	return err;
+}
+
+/**
+ * cr_write_str - write a string record
+ * @ctx: checkpoint context
+ * @str: string buffer
+ * @n: string length
+ *
+ * Wraps @n bytes of @str in a CR_HDR_STR record with id 0.
+ */
+int cr_write_str(struct cr_ctx *ctx, char *str, int n)
+{
+	struct cr_hdr desc = {
+		.type	= CR_HDR_STR,
+		.len	= n,
+		.id	= 0,
+	};
+
+	return cr_write_obj(ctx, &desc, str);
+}
+
+/*
+ * write the checkpoint header
+ *
+ * Builds a struct cr_hdr_head in ctx->tbuf (magic, kernel version split
+ * out of LINUX_VERSION_CODE, format version 1, the context flags and the
+ * current wall-clock seconds) and writes it as a CR_HDR_HEAD record.
+ * Returns the result of cr_write_obj().
+ */
+static int cr_write_hdr(struct cr_ctx *ctx)
+{
+ struct cr_hdr h;
+ struct cr_hdr_head *hh = ctx->tbuf;
+ struct timeval ktv;
+
+ h.type = CR_HDR_HEAD;
+ h.len = sizeof(*hh);
+ h.id = 0;
+
+ do_gettimeofday(&ktv);
+
+ /* cr_read_hdr() rejects any image whose magic/version fields differ */
+ hh->magic = 0x00a2d200;
+ hh->major = (LINUX_VERSION_CODE >> 16) & 0xff;
+ hh->minor = (LINUX_VERSION_CODE >> 8) & 0xff;
+ hh->patch = (LINUX_VERSION_CODE) & 0xff;
+
+ hh->version = 1;
+
+ hh->flags = ctx->flags;
+ hh->time = ktv.tv_sec;
+
+ return cr_write_obj(ctx, &h, hh);
+}
+
+/*
+ * write the checkpoint trailer: a CR_HDR_TAIL record carrying the tail
+ * magic and a placeholder checksum (both words set to 1 for now)
+ */
+static int cr_write_tail(struct cr_ctx *ctx)
+{
+	struct cr_hdr_tail *tail = ctx->tbuf;
+	struct cr_hdr desc = {
+		.type	= CR_HDR_TAIL,
+		.len	= sizeof(*tail),
+		.id	= 0,
+	};
+
+	tail->magic = 0x002d2a00;
+	/* TBD ... */
+	tail->cksum[1] = 1;
+	tail->cksum[0] = 1;
+
+	return cr_write_obj(ctx, &desc, tail);
+}
+
+/*
+ * dump the task_struct of a given task
+ *
+ * Copies scheduling state, exit information, ids, accounting counters,
+ * start times, fault counters and the command name of @t into a
+ * struct cr_hdr_task staged in ctx->tbuf, and writes it as a
+ * CR_HDR_TASK record tagged with ctx->pid.
+ */
+static int cr_write_task_struct(struct cr_ctx *ctx, struct task_struct *t)
+{
+ struct cr_hdr h;
+ struct cr_hdr_task *hh = ctx->tbuf;
+
+ h.type = CR_HDR_TASK;
+ h.len = sizeof(*hh);
+ h.id = ctx->pid;
+
+ hh->state = t->state;
+ hh->exit_state = t->exit_state;
+ hh->exit_code = t->exit_code;
+ hh->exit_signal = t->exit_signal;
+
+ hh->pid = t->pid;
+ hh->tgid = t->tgid;
+
+ hh->utime = t->utime;
+ hh->stime = t->stime;
+ hh->utimescaled = t->utimescaled;
+ hh->stimescaled = t->stimescaled;
+ hh->gtime = t->gtime;
+ hh->prev_utime = t->prev_utime;
+ hh->prev_stime = t->prev_stime;
+ hh->nvcsw = t->nvcsw;
+ hh->nivcsw = t->nivcsw;
+ hh->start_time_sec = t->start_time.tv_sec;
+ hh->start_time_nsec = t->start_time.tv_nsec;
+ hh->real_start_time_sec = t->real_start_time.tv_sec;
+ hh->real_start_time_nsec = t->real_start_time.tv_nsec;
+ hh->min_flt = t->min_flt;
+ hh->maj_flt = t->maj_flt;
+
+ /* record the array size too, so restart can validate it */
+ hh->task_comm_len = TASK_COMM_LEN;
+ memcpy(hh->comm, t->comm, TASK_COMM_LEN);
+
+ return cr_write_obj(ctx, &h, hh);
+}
+
+#if defined(CONFIG_X86)
+/*
+ * dump the thread_struct of a given task
+ *
+ * Writes a CR_HDR_THREAD record describing the TLS layout (number of GDT
+ * TLS slots, byte size of the array, number of non-empty entries) and
+ * then the raw contents of thread->tls_array.  The array is written
+ * unconditionally, even when no entry is in use.
+ */
+static int cr_write_thread(struct cr_ctx *ctx, struct task_struct *t)
+{
+ struct cr_hdr h;
+ struct cr_hdr_thread *hh = ctx->tbuf;
+ struct thread_struct *thread;
+ struct desc_struct *desc;
+ int ntls = 0;
+ int n, ret;
+
+ h.type = CR_HDR_THREAD;
+ h.len = sizeof(*hh);
+ h.id = ctx->pid;
+
+ thread = &t->thread;
+
+ /* calculate no. of TLS entries that follow */
+ /* (an entry counts as used when either of its two words is non-zero) */
+ desc = thread->tls_array;
+ for (n = GDT_ENTRY_TLS_ENTRIES; n > 0; n--, desc++) {
+ if (desc->a || desc->b)
+ ntls++;
+ }
+
+ hh->gdt_entry_tls_entries = GDT_ENTRY_TLS_ENTRIES;
+ hh->sizeof_tls_array = sizeof(thread->tls_array);
+ hh->ntls = ntls;
+
+ if ((ret = cr_write_obj(ctx, &h, hh)) < 0)
+ return ret;
+
+ /* for simplicity dump the entire array, cherry-pick upon restart */
+ ret = cr_kwrite(ctx, thread->tls_array, sizeof(thread->tls_array));
+
+ CR_PRINTK("ntls %d\n", ntls);
+
+ /* IGNORE RESTART BLOCKS FOR NOW ... */
+
+ return ret;
+}
+#endif
+
+#if defined(CONFIG_X86)
+/*
+ * dump the cpu state and registers of a given task
+ *
+ * Fills a struct cr_hdr_cpu (staged in ctx->tbuf) with the user-mode
+ * registers from task_pt_regs(), the fs/gs selectors, the FPU/SSE save
+ * area and the debug registers, then writes it as a CR_HDR_CPU record.
+ * When @t is the current task the live hardware values are sampled
+ * instead of the copies saved in the thread structure.
+ */
+static int cr_write_cpu(struct cr_ctx *ctx, struct task_struct *t)
+{
+	struct cr_hdr h;
+	struct cr_hdr_cpu *hh = ctx->tbuf;
+	struct thread_struct *thread = &t->thread;
+	struct thread_info *thread_info = task_thread_info(t);
+	struct pt_regs *regs = task_pt_regs(t);
+
+	h.type = CR_HDR_CPU;
+	h.len = sizeof(*hh);
+	h.id = ctx->pid;
+
+	hh->bx = regs->bx;
+	hh->cx = regs->cx;
+	hh->dx = regs->dx;
+	hh->si = regs->si;
+	hh->di = regs->di;
+	hh->bp = regs->bp;
+	hh->ax = regs->ax;
+	hh->ds = regs->ds;
+	hh->es = regs->es;
+	hh->orig_ax = regs->orig_ax;
+	hh->ip = regs->ip;
+	hh->cs = regs->cs;
+	hh->flags = regs->flags;
+	hh->sp = regs->sp;
+	hh->ss = regs->ss;
+
+	/*
+	 * for checkpoint in process context (from within a container)
+	 * the GS and FS registers should be saved from the hardware;
+	 * otherwise they are already saved on the thread structure
+	 */
+	if (t == current) {
+		savesegment(gs, hh->gs);
+		savesegment(fs, hh->fs);
+	} else {
+		hh->gs = thread->gs;
+		hh->fs = thread->fs;
+	}
+
+	/*
+	 * for checkpoint in process context (from within a container),
+	 * the actual syscall is taking place at this very moment; so
+	 * we (optimistically) substitute the future return value (0) of
+	 * this syscall into ax, so that upon restart it will succeed
+	 * (or it will endlessly retry checkpoint...)
+	 */
+	if (t == current) {
+		/*
+		 * Test the signed register value: hh->orig_ax is a __u64,
+		 * so comparing it against 0 could never trigger.
+		 */
+		BUG_ON((long) regs->orig_ax < 0);
+		hh->ax = 0;
+	}
+
+	preempt_disable();
+
+	/* i387 + MMU + SSE logic */
+	hh->used_math = tsk_used_math(t) ? 1 : 0;
+	hh->has_fxsr = cpu_has_fxsr;
+	/* tbuf is reused between records: never emit stale bytes */
+	memset(&hh->xstate, 0, sizeof(hh->xstate));
+	if (hh->used_math) {
+		/*
+		 * normally, no need to unlazy_fpu(), since TS_USEDFPU flag
+		 * has been cleared when task was context-switched out...
+		 * except if we are in process context, in which case we do
+		 */
+		if (thread_info->status & TS_USEDFPU)
+			unlazy_fpu(current);
+
+		/*
+		 * thread->xstate is a pointer to the separately allocated
+		 * save area (xstate_size bytes); copy what it points to,
+		 * not the pointer itself, and never more than fits.
+		 */
+		memcpy(&hh->xstate, thread->xstate,
+		       min_t(size_t, xstate_size, sizeof(hh->xstate)));
+	}
+
+	/* debug regs */
+
+	/*
+	 * for checkpoint in process context (from within a container),
+	 * get the actual registers; otherwise get the saved values.
+	 */
+	if (t == current) {
+		get_debugreg(hh->debugreg0, 0);
+		get_debugreg(hh->debugreg1, 1);
+		get_debugreg(hh->debugreg2, 2);
+		get_debugreg(hh->debugreg3, 3);
+		get_debugreg(hh->debugreg6, 6);
+		get_debugreg(hh->debugreg7, 7);
+	} else {
+		hh->debugreg0 = thread->debugreg0;
+		hh->debugreg1 = thread->debugreg1;
+		hh->debugreg2 = thread->debugreg2;
+		hh->debugreg3 = thread->debugreg3;
+		hh->debugreg6 = thread->debugreg6;
+		hh->debugreg7 = thread->debugreg7;
+	}
+
+	/* TIF_DEBUG is a bit number, not a mask: use the flag accessor */
+	hh->uses_debug = !!test_tsk_thread_flag(t, TIF_DEBUG);
+
+	preempt_enable();
+
+	CR_PRINTK("math %d debug %d\n", hh->used_math, hh->uses_debug);
+
+	return cr_write_obj(ctx, &h, hh);
+}
+#endif
+
+/*
+ * dump the entire state of a given task
+ *
+ * Writes, in order: task_struct, mm (with its vmas), thread and cpu
+ * records.  Each step runs only if the previous one returned 0; the
+ * first non-zero result is returned.
+ *
+ * NOTE(review): cr_write_thread() and cr_write_cpu() are only defined
+ * under CONFIG_X86 in this file but are called unconditionally here.
+ */
+static int cr_write_task(struct cr_ctx *ctx, struct task_struct *t)
+{
+ int ret ;
+
+ BUG_ON(t->state == TASK_DEAD);
+
+ ret = cr_write_task_struct(ctx, t);
+ CR_PRINTK("ret (task_struct) %d\n", ret);
+ if (!ret)
+ ret = cr_write_mm(ctx, t);
+ CR_PRINTK("ret (mm) %d\n", ret);
+ if (!ret)
+ ret = cr_write_thread(ctx, t);
+ CR_PRINTK("ret (thread) %d\n", ret);
+ if (!ret)
+ ret = cr_write_cpu(ctx, t);
+ CR_PRINTK("ret (cpu) %d\n", ret);
+
+ return ret;
+}
+
+/*
+ * do_checkpoint - write a complete checkpoint image for the current task
+ * @ctx: checkpoint context
+ *
+ * Emits header, task state and trailer in that order, stopping at the
+ * first step that returns non-zero.  On success returns the (unique)
+ * checkpoint identifier ctx->crid.
+ */
+int do_checkpoint(struct cr_ctx *ctx)
+{
+	int err;
+
+	/* FIX: need to test whether container is checkpointable */
+
+	err = cr_write_hdr(ctx);
+	if (err)
+		return err;
+
+	err = cr_write_task(ctx, current);
+	if (err)
+		return err;
+
+	err = cr_write_tail(ctx);
+	if (err)
+		return err;
+
+	/* on success, return (unique) checkpoint identifier */
+	return ctx->crid;
+}
diff --git a/ckpt/ckpt.h b/ckpt/ckpt.h
new file mode 100644
index 0000000..699ecb9
--- /dev/null
+++ b/ckpt/ckpt.h
@@ -0,0 +1,78 @@
+/*
+ * Generic container checkpoint-restart
+ *
+ * Copyright (C) 2008 Oren Laadan
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include <linux/path.h>
+#include <linux/fs.h>
+
+struct cr_pgarr;
+
+/*
+ * Checkpoint/restart context: the state shared by all helpers while one
+ * image is being written (checkpoint) or read (restart).
+ */
+struct cr_ctx {
+ pid_t pid; /* container identifier */
+ int crid; /* unique checkpoint id */
+
+ unsigned long flags; /* CR_CTX_CKPT or CR_CTX_RSTR */
+ unsigned long oflags; /* restart: old flags */
+
+ struct file *file; /* image file given by the caller */
+ int total; /* total read/written */
+
+ void *tbuf; /* temp: to avoid many alloc/dealloc */
+ void *hbuf; /* header: to avoid many alloc/dealloc */
+ int hpos; /* bytes of hbuf reserved via cr_hbuf_get() */
+
+ struct cr_pgarr *pgarr; /* head of the page-array chain */
+ struct cr_pgarr *pgcur; /* page-array currently being filled */
+
+ struct path *vfsroot; /* container root */
+};
+
+/* cr_ctx: flags (operation the context was set up for) */
+#define CR_CTX_CKPT 0x1
+#define CR_CTX_RSTR 0x2
+
+/* allocation defaults (page-allocation orders of ctx->tbuf / ctx->hbuf) */
+#define CR_ORDER_TBUF 1
+#define CR_ORDER_HBUF 1
+
+/*
+ * Capacity of each buffer expressed as a number of pointer-sized slots.
+ * NOTE(review): restart.c compares these against byte counts
+ * (cr_hbuf_get(), cr_read_thread()); that is conservative but the
+ * intended unit should be confirmed.
+ */
+#define CR_TBUF_TOTAL ((PAGE_SIZE << CR_ORDER_TBUF) / sizeof(void *))
+#define CR_HBUF_TOTAL ((PAGE_SIZE << CR_ORDER_HBUF) / sizeof(void *))
+
+extern void cr_put_fname(char *buf, char *fname, int n);
+extern char *cr_get_fname(struct path *path, struct path *root, char *buf, int *n);
+
+extern int cr_uwrite(struct cr_ctx *ctx, void *buf, int count);
+extern int cr_kwrite(struct cr_ctx *ctx, void *buf, int count);
+extern int cr_uread(struct cr_ctx *ctx, void *buf, int count);
+extern int cr_kread(struct cr_ctx *ctx, void *buf, int count);
+
+extern void *cr_hbuf_get(struct cr_ctx *ctx, int n);
+extern void cr_hbuf_put(struct cr_ctx *ctx, int n);
+
+struct cr_hdr;
+
+extern int cr_write_obj(struct cr_ctx *ctx, struct cr_hdr *h, void *buf);
+extern int cr_write_str(struct cr_ctx *ctx, char *str, int n);
+extern int cr_write_mm(struct cr_ctx *ctx, struct task_struct *t);
+
+extern int cr_read_obj(struct cr_ctx *ctx, struct cr_hdr *h, void *buf, int n);
+extern int cr_read_obj_type(struct cr_ctx *ctx, void *buf, int n, int type);
+extern int cr_read_str(struct cr_ctx *ctx, void *str, int n);
+extern int cr_read_mm(struct cr_ctx *ctx);
+
+extern int do_checkpoint(struct cr_ctx *ctx);
+extern int do_restart(struct cr_ctx *ctx);
+
+/*
+ * debugging: change "#if 0" to "#if 1" to have CR_PRINTK() log through
+ * printk() with the calling function and line; otherwise it expands to
+ * an empty statement and its arguments are not evaluated.
+ */
+#if 0
+#define CR_PRINTK(str, args...) \
+ printk(KERN_ERR "cr@%s#%d: " str, __func__, __LINE__, ##args)
+#else
+#define CR_PRINTK(...) do {} while (0)
+#endif
diff --git a/ckpt/ckpt_hdr.h b/ckpt/ckpt_hdr.h
new file mode 100644
index 0000000..d5e2043
--- /dev/null
+++ b/ckpt/ckpt_hdr.h
@@ -0,0 +1,143 @@
+/*
+ * Generic container checkpoint-restart
+ *
+ * Copyright (C) 2008 Oren Laadan
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include <linux/types.h>
+
+#if defined(CONFIG_X86)
+#include <asm/processor.h>
+#endif
+
+/* record descriptor that precedes every payload in the image */
+struct cr_hdr {
+ __s16 type; /* one of the CR_HDR_* values */
+ __s16 len; /* payload length in bytes */
+ __u32 id; /* writers store 0 or ctx->pid here */
+};
+
+/* record types stored in cr_hdr.type, grouped by subsystem */
+enum {
+ CR_HDR_HEAD = 1,
+ CR_HDR_STR,
+
+ /* per-task state */
+ CR_HDR_TASK = 101,
+ CR_HDR_THREAD,
+ CR_HDR_CPU,
+
+ /* memory state */
+ CR_HDR_MM = 201,
+ CR_HDR_VMA,
+ CR_HDR_MM_CONTEXT,
+
+ CR_HDR_TAIL = 5001
+};
+
+/* payload of CR_HDR_HEAD: identifies the image and the kernel that made it */
+struct cr_hdr_head {
+ __u32 magic; /* 0x00a2d200, checked on restart */
+ __u16 major; /* kernel version, from LINUX_VERSION_CODE */
+ __u16 minor;
+ __u16 patch;
+ __u16 version; /* image format version (currently 1) */
+ __u32 flags; /* checkpoint options */
+ __u64 time; /* when checkpoint taken */
+};
+
+/* payload of CR_HDR_TAIL: closes the image */
+struct cr_hdr_tail {
+ __u32 magic; /* 0x002d2a00, checked on restart */
+ __u32 cksum[2]; /* placeholder: both words are written as 1 */
+};
+
+/*
+ * payload of CR_HDR_TASK: snapshot of selected task_struct fields
+ *
+ * pid and tgid are 32 bits wide: they are assigned from pid_t values,
+ * which do not fit in 16 bits once pid_max is raised.
+ */
+struct cr_hdr_task {
+	__u64 state;
+	__u32 exit_state;
+	__u32 exit_code, exit_signal;
+
+	__u32 pid;
+	__u32 tgid;
+
+	__u64 utime, stime, utimescaled, stimescaled;
+	__u64 gtime;
+	__u64 prev_utime, prev_stime;
+	__u64 nvcsw, nivcsw;
+	__u64 start_time_sec, start_time_nsec;
+	__u64 real_start_time_sec, real_start_time_nsec;
+	__u64 min_flt, maj_flt;
+
+	__s16 task_comm_len;	/* TASK_COMM_LEN of the writing kernel */
+	char comm[TASK_COMM_LEN];
+};
+
+#if defined(CONFIG_X86)
+/*
+ * payload of CR_HDR_THREAD; the writer follows it with the raw
+ * thread->tls_array (sizeof_tls_array bytes)
+ */
+struct cr_hdr_thread {
+ /* NEED: restart blocks */
+ __s16 gdt_entry_tls_entries; /* GDT_ENTRY_TLS_ENTRIES of the writer */
+ __s16 sizeof_tls_array; /* byte size of the TLS array that follows */
+ __s16 ntls; /* number of TLS entries to follow */
+};
+#endif
+
+#if defined(CONFIG_X86)
+/*
+ * payload of CR_HDR_CPU: user-mode registers, segment selectors, debug
+ * registers and the FPU/SSE save area of one task
+ */
+struct cr_hdr_cpu {
+ /* general registers and selectors, copied from the task's pt_regs */
+ __u64 bx;
+ __u64 cx;
+ __u64 dx;
+ __u64 si;
+ __u64 di;
+ __u64 bp;
+ __u64 ax;
+ __u64 ds;
+ __u64 es;
+ __u64 orig_ax;
+ __u64 ip;
+ __u64 cs;
+ __u64 flags;
+ __u64 sp;
+ __u64 ss;
+ /* fs/gs are not taken from pt_regs: hardware or thread_struct */
+ __u64 fs;
+ __u64 gs;
+
+ __u64 debugreg0;
+ __u64 debugreg1;
+ __u64 debugreg2;
+ __u64 debugreg3;
+ __u64 debugreg6;
+ __u64 debugreg7;
+
+ __u8 uses_debug; /* non-zero: debug registers are restored */
+
+ __u8 used_math; /* non-zero: xstate below is meaningful */
+ __u8 has_fxsr; /* cpu_has_fxsr of the checkpointing cpu */
+ union thread_xstate xstate; /* i387 */
+};
+#endif
+
+/*
+ * payload of CR_HDR_MM: layout markers of an address space
+ *
+ * map_count is 32 bits wide because it is assigned from mm->map_count
+ * (an int); 16 bits are not enough for address spaces with many vmas.
+ */
+struct cr_hdr_mm {
+	__u32 tag;	/* sharing identifier */
+	__u64 start_code, end_code, start_data, end_data;
+	__u64 start_brk, brk, start_stack;
+	__u64 arg_start, arg_end, env_start, env_end;
+	__s32 map_count;	/* number of vma records that follow */
+};
+
+#if defined(CONFIG_X86)
+/*
+ * payload of CR_HDR_MM_CONTEXT; the writer follows it with
+ * nldt * LDT_ENTRY_SIZE bytes of LDT contents
+ */
+struct cr_hdr_mm_context {
+ __s16 ldt_entry_size; /* LDT_ENTRY_SIZE of the writer */
+ __s16 nldt; /* taken from mm->context.size */
+};
+#endif
+
+/*
+ * payload of CR_HDR_VMA: one memory region
+ *
+ * npages is 32 bits wide: a single vma can easily have more than 32767
+ * pages to dump, which a 16-bit counter would silently truncate.
+ */
+struct cr_hdr_vma {
+	__u32 how;	/* CR_VMA_ANON or CR_VMA_FILE */
+
+	__u64 vm_start;
+	__u64 vm_end;
+	__u64 vm_page_prot;
+	__u64 vm_flags;
+	__u64 vm_pgoff;
+
+	__s32 npages;	/* number of page addresses + pages that follow */
+	__s16 namelen;	/* length of the file name record, 0 if none */
+};
diff --git a/ckpt/ckpt_mem.c b/ckpt/ckpt_mem.c
new file mode 100644
index 0000000..12caad0
--- /dev/null
+++ b/ckpt/ckpt_mem.c
@@ -0,0 +1,421 @@
+/*
+ * Checkpoint memory contents
+ *
+ * Copyright (C) 2008 Oren Laadan
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/mm_types.h>
+
+#if defined(CONFIG_X86)
+#include <asm/ldt.h>
+#endif
+
+#include "ckpt.h"
+#include "ckpt_hdr.h"
+#include "ckpt_mem.h"
+
+/*
+ * utilities to alloc, free, and handle 'struct cr_pgarr'
+ * (common to ckpt_mem.c and rstr_mem.c)
+ */
+
+/* allocation order of the addrs[] / pages[] arrays of a page-array */
+#define CR_ORDER_PGARR 0
+/* number of pointer-sized entries that fit in one such array */
+#define CR_PGARR_TOTAL ((PAGE_SIZE << CR_ORDER_PGARR) / sizeof(void *))
+
+/*
+ * release pages referenced by a page-array, and mark the array empty
+ * (nothing used, CR_PGARR_TOTAL slots left) so it can be refilled
+ */
+void _cr_pgarr_release(struct cr_ctx *ctx, struct cr_pgarr *pgarr)
+{
+	/* only checkpoint keeps references to pages */
+	if (ctx->flags & CR_CTX_CKPT) {
+		int i = pgarr->nused;
+
+		CR_PRINTK("release pages (nused %d)\n", pgarr->nused);
+		/* drop references from the last used slot down to slot 0 */
+		while (i > 0)
+			page_cache_release(pgarr->pages[--i]);
+	}
+
+	pgarr->nleft = CR_PGARR_TOTAL;
+	pgarr->nused = 0;
+}
+
+/*
+ * release pages referenced by chain of page-arrays
+ *
+ * Every array in the chain is emptied, and the "current" pointer is
+ * rewound to the head so that the next user refills the chain from the
+ * start instead of appending fresh arrays after the last one used.
+ */
+void cr_pgarr_release(struct cr_ctx *ctx)
+{
+	struct cr_pgarr *pgarr;
+
+	for (pgarr = ctx->pgarr; pgarr; pgarr = pgarr->next)
+		_cr_pgarr_release(ctx, pgarr);
+
+	ctx->pgcur = ctx->pgarr;
+}
+
+/*
+ * free a chain of page-arrays
+ *
+ * Drops any page references still held, frees the addrs[] and pages[]
+ * arrays of each element and then the element itself.  The context is
+ * left with an empty chain.
+ */
+void cr_pgarr_free(struct cr_ctx *ctx)
+{
+	struct cr_pgarr *pgarr, *pgnxt;
+
+	for (pgarr = ctx->pgarr; pgarr; pgarr = pgnxt) {
+		/* fetch the link before the element goes away */
+		pgnxt = pgarr->next;
+
+		_cr_pgarr_release(ctx, pgarr);
+		/* free this element's arrays, not those of the chain head */
+		free_pages((unsigned long) pgarr->addrs, CR_ORDER_PGARR);
+		free_pages((unsigned long) pgarr->pages, CR_ORDER_PGARR);
+		kfree(pgarr);
+	}
+
+	ctx->pgarr = NULL;
+	ctx->pgcur = NULL;
+}
+
+/*
+ * allocate and add a new page-array to chain
+ * @ctx: checkpoint/restart context
+ * @pgnew: where to link a newly allocated page-array
+ *
+ * If the current page-array already has a successor, that one is reused
+ * and becomes current.  Otherwise a new page-array is allocated, stored
+ * in *@pgnew and made current.  Returns the page-array to use, or NULL
+ * if memory could not be allocated.
+ */
+struct cr_pgarr *cr_pgarr_alloc(struct cr_ctx *ctx, struct cr_pgarr **pgnew)
+{
+	struct cr_pgarr *pgarr = ctx->pgcur;
+
+	if (pgarr && pgarr->next) {
+		ctx->pgcur = pgarr->next;
+		return pgarr->next;
+	}
+
+	pgarr = kzalloc(sizeof(*pgarr), GFP_KERNEL);
+	if (!pgarr)
+		return NULL;
+
+	pgarr->nused = 0;
+	pgarr->nleft = CR_PGARR_TOTAL;
+	pgarr->addrs = (unsigned long *)
+		__get_free_pages(GFP_KERNEL, CR_ORDER_PGARR);
+	pgarr->pages = (struct page **)
+		__get_free_pages(GFP_KERNEL, CR_ORDER_PGARR);
+
+	if (likely(pgarr->addrs && pgarr->pages)) {
+		*pgnew = pgarr;
+		ctx->pgcur = pgarr;
+		return pgarr;
+	}
+
+	/*
+	 * Either allocation may be the one that failed; free_pages()
+	 * ignores a zero address, so release both unconditionally.
+	 */
+	free_pages((unsigned long) pgarr->addrs, CR_ORDER_PGARR);
+	free_pages((unsigned long) pgarr->pages, CR_ORDER_PGARR);
+	kfree(pgarr);
+
+	return NULL;
+}
+
+/*
+ * return current page-array (and allocate if needed): the current one
+ * is used as long as it has free slots, otherwise the chain is advanced
+ * or extended through cr_pgarr_alloc() (which may return NULL)
+ */
+struct cr_pgarr *cr_pgarr_prep(struct cr_ctx *ctx)
+{
+	struct cr_pgarr *cur = ctx->pgcur;
+
+	if (likely(cur->nleft))
+		return cur;
+
+	return cr_pgarr_alloc(ctx, &cur->next);
+}
+
+/*
+ * Checkpoint is outside the context of the checkpointee, so one cannot
+ * simply read pages from user-space. Instead, we scan the address space
+ * of the target to cherry-pick pages of interest. Selected pages are
+ * enlisted in a page-array chain (attached to the checkpoint context).
+ * To save their contents, each page is mapped to kernel memory and then
+ * dumped to the file descriptor.
+ */
+
+/**
+ * cr_vma_fill_pgarr - fill a page-array with addr/page tuples for a vma
+ * @ctx - checkpoint context
+ * @pgarr - page-array to fill
+ * @vma - vma to scan
+ * @start - start address (updated)
+ *
+ * Walks @vma from *@start, faulting pages in as needed, and records each
+ * page that must be dumped (taking a reference on it) until the vma ends
+ * or @pgarr is full.  Zero pages and, for file-backed vmas, pages that
+ * still belong to a mapping are skipped.
+ *
+ * Returns the number of tuples added (the caller updates nused/nleft),
+ * or a negative error, in which case the references taken by this call
+ * have been dropped and *@start is unchanged.
+ */
+static int cr_vma_fill_pgarr(struct cr_ctx *ctx, struct cr_pgarr *pgarr,
+			     struct vm_area_struct *vma, unsigned long *start)
+{
+	unsigned long end = vma->vm_end;
+	unsigned long addr = *start;
+	struct page **pagep;
+	unsigned long *addrp;
+	int cow, nr, ret = 0;
+
+	nr = pgarr->nleft;
+	pagep = &pgarr->pages[pgarr->nused];
+	addrp = &pgarr->addrs[pgarr->nused];
+	cow = !!vma->vm_file;
+
+	while (addr < end) {
+		struct page *page;
+
+		/* simplified version of get_user_pages(): already have vma,
+		 * only need FOLL_TOUCH, and (for now) ignore fault stats */
+
+		cond_resched();
+		while (!(page = follow_page(vma, addr, FOLL_TOUCH))) {
+			int fault = handle_mm_fault(vma->vm_mm, vma, addr, 0);
+
+			if (fault & VM_FAULT_ERROR) {
+				if (fault & VM_FAULT_OOM)
+					ret = -ENOMEM;
+				else if (fault & VM_FAULT_SIGBUS)
+					ret = -EFAULT;
+				else
+					BUG();
+				break;
+			}
+			cond_resched();
+		}
+
+		/* the fault failed: page is NULL here, do not touch it */
+		if (ret < 0)
+			break;
+
+		if (IS_ERR(page)) {
+			ret = PTR_ERR(page);
+			break;
+		}
+
+		if (page == ZERO_PAGE(0))
+			page = NULL;	/* zero page: ignore */
+		else if (cow && page_mapping(page) != NULL)
+			page = NULL;	/* clean cow: ignore */
+		else {
+			get_page(page);
+			*(addrp++) = addr;
+			*(pagep++) = page;
+			if (--nr == 0) {
+				/* array is full: resume after this page */
+				addr += PAGE_SIZE;
+				break;
+			}
+		}
+
+		addr += PAGE_SIZE;
+	}
+
+	if (unlikely(ret < 0)) {
+		/* undo the references taken during this call */
+		nr = pgarr->nleft - nr;
+		while (nr--)
+			page_cache_release(*(--pagep));
+		return ret;
+	}
+
+	*start = addr;
+	return (pgarr->nleft - nr);
+}
+
+/**
+ * cr_vma_scan_pages - scan vma for pages that will need to be dumped
+ * @ctx - checkpoint context
+ * @vma - vma to scan
+ *
+ * a list of addr/page tuples is kept in ctx->pgarr page-array chain
+ *
+ * Returns the total number of pages listed, -ENOMEM if no page-array
+ * could be obtained, or the error from cr_vma_fill_pgarr().  On error,
+ * tuples recorded by earlier iterations stay on the chain.
+ */
+static int cr_vma_scan_pages(struct cr_ctx *ctx, struct vm_area_struct *vma)
+{
+ unsigned long addr = vma->vm_start;
+ unsigned long end = vma->vm_end;
+ struct cr_pgarr *pgarr;
+ int nr, total = 0;
+
+ /* each pass fills (at most) one page-array and advances addr */
+ while (addr < end) {
+ if (!(pgarr = cr_pgarr_prep(ctx)))
+ return -ENOMEM;
+ if ((nr = cr_vma_fill_pgarr(ctx, pgarr, vma, &addr)) < 0)
+ return nr;
+ pgarr->nleft -= nr;
+ pgarr->nused += nr;
+ total += nr;
+ }
+
+ CR_PRINTK("total %d\n", total);
+ return total;
+}
+
+/**
+ * cr_vma_dump_pages - dump pages listed in the ctx page-array chain
+ * @ctx - checkpoint context
+ * @total - total number of pages
+ *
+ * Writes all recorded addresses first (one array after another), then
+ * the contents of every recorded page in the same order.  Returns
+ * @total on success (0 if there is nothing to dump) or the negative
+ * error from cr_kwrite().
+ */
+static int cr_vma_dump_pages(struct cr_ctx *ctx, int total)
+{
+ struct cr_pgarr *pgarr;
+ int ret;
+
+ if (!total)
+ return 0;
+
+ /* pass 1: the addresses */
+ for (pgarr = ctx->pgarr; pgarr; pgarr = pgarr->next) {
+ ret = cr_kwrite(ctx, pgarr->addrs,
+ pgarr->nused * sizeof(*pgarr->addrs));
+ if (ret < 0)
+ return ret;
+ }
+
+ /* pass 2: the page contents, each page mapped only while written */
+ for (pgarr = ctx->pgarr; pgarr; pgarr = pgarr->next) {
+ struct page **pages = pgarr->pages;
+ int nr = pgarr->nused;
+ void *ptr;
+
+ while (nr--) {
+ ptr = kmap(*pages);
+ ret = cr_kwrite(ctx, ptr, PAGE_SIZE);
+ kunmap(*pages);
+ if (ret < 0)
+ return ret;
+ pages++;
+ }
+ }
+
+ return total;
+}
+
+/*
+ * write one vma: a CR_HDR_VMA record, the backing file name (if any)
+ * as a string record, then the addresses and contents of its pages.
+ *
+ * Shared, IO, hugetlb and non-linear mappings are refused with -ETXTBSY.
+ * Returns the result of cr_vma_dump_pages() on success (the number of
+ * pages dumped) or a negative error.
+ *
+ * NOTE(review): the record (hh) and the pathname lookup both use
+ * ctx->tbuf; the name is measured from the end of the first PAGE_SIZE
+ * bytes, so it appears to overlap hh only for names close to PAGE_SIZE
+ * in length - confirm.
+ */
+static int cr_write_vma(struct cr_ctx *ctx, struct vm_area_struct *vma)
+{
+ struct cr_hdr h;
+ struct cr_hdr_vma *hh = ctx->tbuf;
+ char *fname = NULL;
+ int how, nr, ret;
+
+ h.type = CR_HDR_VMA;
+ h.len = sizeof(*hh);
+ h.id = ctx->pid;
+
+ hh->vm_start = vma->vm_start;
+ hh->vm_end = vma->vm_end;
+ hh->vm_page_prot = vma->vm_page_prot.pgprot;
+ hh->vm_flags = vma->vm_flags;
+ hh->vm_pgoff = vma->vm_pgoff;
+
+ if (vma->vm_flags & (VM_SHARED | VM_IO | VM_HUGETLB | VM_NONLINEAR)) {
+ printk(KERN_WARNING "CR: unknown VMA %#lx\n", vma->vm_flags);
+ return -ETXTBSY;
+ }
+
+ /* by default assume anon memory */
+ how = CR_VMA_ANON;
+
+ /* if there is a backing file, assume private-mapped */
+ /* (NEED: check if the file is unlinked) */
+ if (vma->vm_file) {
+ nr = PAGE_SIZE;
+ fname = cr_get_fname(&vma->vm_file->f_path,
+ ctx->vfsroot, ctx->tbuf, &nr);
+ if (IS_ERR(fname))
+ return PTR_ERR(fname);
+ hh->namelen = nr;
+ how = CR_VMA_FILE;
+ } else
+ hh->namelen = 0;
+
+ hh->how = how;
+
+ /*
+ * it seems redundant now, but we do it in 3 steps because:
+ * first, the logic is simpler when we know how many pages there
+ * are before dumping them; second, a future optimization will
+ * defer the writeout (dump, and free) to a later step; in which
+ * case all the pages to be dumped will be aggregated on the
+ * checkpoint ctx
+ */
+
+ /* (1) scan: scan through the PTEs of the vma, both to count the
+ * pages to dump, and make those pages COW. keep the list of pages
+ * (and a reference to each page) on the checkpoint ctx */
+ nr = cr_vma_scan_pages(ctx, vma);
+ if (nr < 0) {
+ cr_put_fname(ctx->tbuf, fname, PAGE_SIZE);
+ return nr;
+ }
+
+ hh->npages = nr;
+ ret = cr_write_obj(ctx, &h, hh);
+
+ if (!ret && hh->namelen)
+ ret = cr_write_str(ctx, fname, hh->namelen);
+
+ cr_put_fname(ctx->tbuf, fname, PAGE_SIZE);
+
+ if (ret < 0)
+ return ret;
+
+ /* (2) dump: write out the addresses of all pages in the list (on
+ * the checkpoint ctx) followed by the contents of all pages */
+ ret = cr_vma_dump_pages(ctx, nr);
+
+ /* (3) free: free the extra references to the pages in the list */
+ cr_pgarr_release(ctx);
+
+ return ret;
+}
+
+#if defined(CONFIG_X86)
+/*
+ * write the architecture-specific mm context: a CR_HDR_MM_CONTEXT
+ * record followed by the raw LDT contents
+ *
+ * mm->context.lock is held while the LDT is sampled and written, and is
+ * released on every path out of the function.
+ */
+static int cr_write_mm_context(struct cr_ctx *ctx, struct mm_struct *mm)
+{
+	struct cr_hdr h;
+	struct cr_hdr_mm_context *hh = ctx->tbuf;
+	int ret;
+
+	h.type = CR_HDR_MM_CONTEXT;
+	h.len = sizeof(*hh);
+	h.id = ctx->pid;
+
+	mutex_lock(&mm->context.lock);
+
+	hh->ldt_entry_size = LDT_ENTRY_SIZE;
+	hh->nldt = mm->context.size;
+
+	CR_PRINTK("nldt %d\n", hh->nldt);
+
+	ret = cr_write_obj(ctx, &h, hh);
+	/* skip the LDT dump if the record failed, but still unlock */
+	if (ret >= 0)
+		ret = cr_kwrite(ctx, mm->context.ldt,
+				hh->nldt * LDT_ENTRY_SIZE);
+
+	mutex_unlock(&mm->context.lock);
+
+	return ret;
+}
+#endif
+
+/*
+ * cr_write_mm - dump the address space of a task
+ * @ctx: checkpoint context
+ * @t: task whose mm is dumped
+ *
+ * Writes a CR_HDR_MM record with the layout markers of the mm, then one
+ * record set per vma (under mmap_sem held for reading), then the
+ * architecture-specific mm context.  Returns the result of the last
+ * step performed; negative values are errors.  A task without an mm is
+ * rejected with -EINVAL.
+ */
+int cr_write_mm(struct cr_ctx *ctx, struct task_struct *t)
+{
+	struct cr_hdr h;
+	struct cr_hdr_mm *hh = ctx->tbuf;
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	int ret;
+
+	h.type = CR_HDR_MM;
+	h.len = sizeof(*hh);
+	h.id = ctx->pid;
+
+	/* get_task_mm() returns NULL for a task that has no mm */
+	mm = get_task_mm(t);
+	if (!mm)
+		return -EINVAL;
+
+	hh->tag = 1;	/* non-zero will mean first time encounter */
+
+	hh->start_code = mm->start_code;
+	hh->end_code = mm->end_code;
+	hh->start_data = mm->start_data;
+	hh->end_data = mm->end_data;
+	hh->start_brk = mm->start_brk;
+	hh->brk = mm->brk;
+	hh->start_stack = mm->start_stack;
+	hh->arg_start = mm->arg_start;
+	hh->arg_end = mm->arg_end;
+	hh->env_start = mm->env_start;
+	hh->env_end = mm->env_end;
+
+	hh->map_count = mm->map_count;
+
+	/* FIX: need also mm->flags */
+
+	ret = cr_write_obj(ctx, &h, hh);
+	if (ret < 0)
+		goto out;
+
+	/* write the vma's */
+	down_read(&mm->mmap_sem);
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if ((ret = cr_write_vma(ctx, vma)) < 0)
+			break;
+	}
+	up_read(&mm->mmap_sem);
+
+	if (ret < 0)
+		goto out;
+
+	ret = cr_write_mm_context(ctx, mm);
+
+ out:
+	/* drop the reference taken by get_task_mm() */
+	mmput(mm);
+	return ret;
+}
diff --git a/ckpt/ckpt_mem.h b/ckpt/ckpt_mem.h
new file mode 100644
index 0000000..f9846eb
--- /dev/null
+++ b/ckpt/ckpt_mem.h
@@ -0,0 +1,32 @@
+/*
+ * Generic container checkpoint-restart
+ *
+ * Copyright (C) 2008 Oren Laadan
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include <linux/mm_types.h>
+
+/* page-array chains: each pgarr holds a list of <addr,page> tuples */
+struct cr_pgarr {
+ unsigned long *addrs; /* virtual address of each listed page */
+ struct page **pages; /* the page at the matching index of addrs[] */
+ struct cr_pgarr *next; /* next page-array in the chain */
+ unsigned short nleft; /* free slots remaining */
+ unsigned short nused; /* slots filled so far */
+};
+
+/* vma subtypes (stored in cr_hdr_vma.how) */
+enum {
+ CR_VMA_ANON = 1, /* no backing file */
+ CR_VMA_FILE /* has a backing file; assumed private-mapped */
+};
+
+extern void _cr_pgarr_release(struct cr_ctx *ctx, struct cr_pgarr *pgarr);
+extern void cr_pgarr_release(struct cr_ctx *ctx);
+extern void cr_pgarr_free(struct cr_ctx *ctx);
+extern struct cr_pgarr *cr_pgarr_alloc(struct cr_ctx *ctx, struct cr_pgarr **pgnew);
+extern struct cr_pgarr *cr_pgarr_prep(struct cr_ctx *ctx);
diff --git a/ckpt/restart.c b/ckpt/restart.c
new file mode 100644
index 0000000..9f52851
--- /dev/null
+++ b/ckpt/restart.c
@@ -0,0 +1,328 @@
+/*
+ * Restart logic and helpers
+ *
+ * Copyright (C) 2008 Oren Laadan
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+/*
+ * During restart the code reads in data from the checkpoint image into a
+ * temporary buffer (ctx->hbuf). Because operations can be nested, one
+ * should call cr_hbuf_get() to reserve space in the buffer, and then
+ * cr_hbuf_put() when that space is no longer needed.
+ */
+
+#include <linux/version.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+
+#if defined(CONFIG_X86)
+#include <asm/desc.h>
+#include <asm/i387.h>
+#endif
+
+#include "ckpt.h"
+#include "ckpt_hdr.h"
+
+/**
+ * cr_hbuf_get - reserve space on the hbuf
+ * @ctx: checkpoint context
+ * @n: number of bytes to reserve
+ *
+ * Returns a pointer to @n bytes at the current position of ctx->hbuf
+ * and advances the position.  Running past the limit is a BUG().
+ *
+ * NOTE(review): hpos counts bytes while CR_HBUF_TOTAL is defined as a
+ * number of pointer-sized slots, so the limit enforced here is smaller
+ * than the buffer - confirm the intended unit.
+ */
+void *cr_hbuf_get(struct cr_ctx *ctx, int n)
+{
+ void *ptr;
+
+ BUG_ON(ctx->hpos + n > CR_HBUF_TOTAL);
+ ptr = (void *) (((char *) ctx->hbuf) + ctx->hpos);
+ ctx->hpos += n;
+ return ptr;
+}
+
+/**
+ * cr_hbuf_put - unreserve space on the hbuf
+ * @ctx: checkpoint context
+ * @n: number of bytes to give back
+ *
+ * Undoes a matching cr_hbuf_get(); giving back more than is currently
+ * reserved is a BUG().
+ */
+void cr_hbuf_put(struct cr_ctx *ctx, int n)
+{
+	BUG_ON(n > ctx->hpos);
+	ctx->hpos = ctx->hpos - n;
+}
+
+/**
+ * cr_read_obj - read a whole record (cr_hdr followed by payload)
+ * @ctx: checkpoint context
+ * @h: record descriptor
+ * @buf: record buffer
+ * @n: available buffer size
+ *
+ * Reads the descriptor into @h, then @h->len bytes of payload into @buf.
+ * Returns -EINVAL if the announced length is negative or larger than @n,
+ * otherwise the result of the cr_kread() calls.
+ */
+int cr_read_obj(struct cr_ctx *ctx, struct cr_hdr *h, void *buf, int n)
+{
+	int err = cr_kread(ctx, h, sizeof(*h));
+
+	if (err < 0)
+		return err;
+
+	CR_PRINTK("type %d len %d id %d (%d)\n", h->type, h->len, h->id, n);
+
+	/* the payload must fit in the caller's buffer */
+	if (h->len >= 0 && h->len <= n)
+		return cr_kread(ctx, buf, h->len);
+
+	return -EINVAL;
+}
+
+/**
+ * cr_read_obj_type - read a whole record of expected type
+ * @ctx: checkpoint context
+ * @buf: record buffer
+ * @n: available buffer size
+ * @type: expected record type
+ *
+ * Returns the id stored in the record descriptor when the record was
+ * read (cr_read_obj() returned 0) and has the expected type, -EINVAL on
+ * a type mismatch, or whatever non-zero value cr_read_obj() returned.
+ */
+int cr_read_obj_type(struct cr_ctx *ctx, void *buf, int n, int type)
+{
+	struct cr_hdr desc;
+	int err = cr_read_obj(ctx, &desc, buf, n);
+
+	if (err)
+		return err;
+	if (desc.type != type)
+		return -EINVAL;
+
+	return desc.id;
+}
+
+/**
+ * cr_read_str - read a string record
+ * @ctx: checkpoint context
+ * @str: string buffer
+ * @n: string length
+ *
+ * Thin wrapper: reads a CR_HDR_STR record of at most @n bytes into @str.
+ */
+int cr_read_str(struct cr_ctx *ctx, void *str, int n)
+{
+	int ret;
+
+	ret = cr_read_obj_type(ctx, str, n, CR_HDR_STR);
+	return ret;
+}
+
+/*
+ * read the checkpoint header
+ *
+ * Validates magic, format version and the exact kernel version the
+ * image was taken on, rejects unknown flags, and records the image's
+ * flags in ctx->oflags.  Returns 0 on success or a negative error; the
+ * hbuf space reserved for the record is given back on every path.
+ */
+static int cr_read_hdr(struct cr_ctx *ctx)
+{
+	struct cr_hdr_head *hh = cr_hbuf_get(ctx, sizeof(*hh));
+	int ret;
+
+	ret = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_HEAD);
+	if (ret < 0)
+		goto out;
+
+	ret = -EINVAL;
+
+	if (hh->magic != 0x00a2d200 || hh->version != 1 ||
+	    hh->major != ((LINUX_VERSION_CODE >> 16) & 0xff) ||
+	    hh->minor != ((LINUX_VERSION_CODE >> 8) & 0xff) ||
+	    hh->patch != ((LINUX_VERSION_CODE) & 0xff))
+		goto out;
+
+	if (hh->flags & ~CR_CTX_CKPT)
+		goto out;
+
+	ctx->oflags = hh->flags;
+	ret = 0;
+
+ out:
+	cr_hbuf_put(ctx, sizeof(*hh));
+	return ret;
+}
+
+/*
+ * read the checkpoint trailer
+ *
+ * Checks the tail magic and the placeholder checksum (both words 1).
+ * Returns 0 on success or a negative error; the hbuf space reserved for
+ * the record is given back on every path.
+ */
+static int cr_read_tail(struct cr_ctx *ctx)
+{
+	struct cr_hdr_tail *hh = cr_hbuf_get(ctx, sizeof(*hh));
+	int ret;
+
+	ret = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_TAIL);
+	if (ret < 0)
+		goto out;
+
+	if (hh->magic != 0x002d2a00 ||
+	    hh->cksum[0] != 1 || hh->cksum[1] != 1)
+		ret = -EINVAL;
+	else
+		ret = 0;
+
+ out:
+	cr_hbuf_put(ctx, sizeof(*hh));
+	return ret;
+}
+
+/*
+ * read the task_struct into the current task
+ *
+ * Only the command name is restored for now.  Returns 0 on success or
+ * a negative error; the hbuf space reserved for the record is given
+ * back on every path.
+ */
+static int cr_read_task_struct(struct cr_ctx *ctx)
+{
+	struct cr_hdr_task *hh = cr_hbuf_get(ctx, sizeof(*hh));
+	struct task_struct *t = current;
+	int ret;
+
+	ret = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_TASK);
+	if (ret < 0)
+		goto out;
+
+	/* for now, only restore t->comm */
+	if (hh->task_comm_len < 0 || hh->task_comm_len > TASK_COMM_LEN) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	memset(t->comm, 0, TASK_COMM_LEN);
+	memcpy(t->comm, hh->comm, hh->task_comm_len);
+	ret = 0;
+
+ out:
+	cr_hbuf_put(ctx, sizeof(*hh));
+	return ret;
+}
+
+#if defined(CONFIG_X86)
+/*
+ * read the thread_struct into the current task
+ *
+ * Validates the TLS layout recorded in the image against this kernel,
+ * then consumes the TLS array that follows the record.  The checkpoint
+ * side writes that array unconditionally, so it is always read here to
+ * keep the stream in sync; it is installed only if the image says at
+ * least one entry is in use.  Returns 0 on success or a negative error;
+ * the hbuf space reserved for the record is given back on every path.
+ */
+static int cr_read_thread(struct cr_ctx *ctx)
+{
+	struct cr_hdr_thread *hh = cr_hbuf_get(ctx, sizeof(*hh));
+	struct task_struct *t = current;
+	struct thread_struct *thread = &t->thread;
+	struct desc_struct *buf = ctx->tbuf;
+	int size = sizeof(*buf) * GDT_ENTRY_TLS_ENTRIES;
+	int cpu, ret;
+
+	ret = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_THREAD);
+	if (ret < 0)
+		goto out;
+
+	CR_PRINTK("ntls %d\n", hh->ntls);
+
+	if (hh->gdt_entry_tls_entries != GDT_ENTRY_TLS_ENTRIES ||
+	    hh->sizeof_tls_array != sizeof(thread->tls_array) ||
+	    hh->ntls < 0 || hh->ntls > GDT_ENTRY_TLS_ENTRIES) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	BUG_ON(size > CR_TBUF_TOTAL);
+
+	/* the array is always present in the image, even if ntls == 0 */
+	ret = cr_kread(ctx, buf, size);
+	if (ret < 0)
+		goto out;
+
+	if (hh->ntls > 0) {
+		/* restore TLS by hand: why convert to struct user_desc if
+		 * sys_set_thread_entry() will convert it back ? */
+
+		/* FIX: add sanity checks (eg. that values make sense,
+		 * that we don't overwrite old values, etc) */
+
+		cpu = get_cpu();
+		memcpy(thread->tls_array, buf, size);
+		load_TLS(thread, cpu);
+		put_cpu();
+	}
+
+	ret = 0;
+
+ out:
+	cr_hbuf_put(ctx, sizeof(*hh));
+	return ret;
+}
+#endif
+
+#if defined(CONFIG_X86)
+/*
+ * read the cpu state and registers for the current task
+ *
+ * Restores the user-mode registers into the task's pt_regs, reloads the
+ * fs/gs selectors, restores the FPU/SSE save area and, if the image
+ * used them, the hardware debug registers.  Returns 0 on success or a
+ * negative error.  Preemption is re-enabled and the hbuf space reserved
+ * for the record is given back on every path.
+ *
+ * NOTE(review): the debug registers are only loaded into the hardware;
+ * the copies in the thread structure and TIF_DEBUG are not updated here.
+ */
+static int cr_read_cpu(struct cr_ctx *ctx)
+{
+	struct cr_hdr_cpu *hh = cr_hbuf_get(ctx, sizeof(*hh));
+	struct task_struct *t = current;
+	struct thread_struct *thread = &t->thread;
+	struct pt_regs *regs = task_pt_regs(t);
+	int ret;
+
+	ret = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_CPU);
+	if (ret < 0)
+		goto out;
+
+	/* FIX: sanity check for sensitive registers (eg. eflags) */
+
+	regs->bx = hh->bx;
+	regs->cx = hh->cx;
+	regs->dx = hh->dx;
+	regs->si = hh->si;
+	regs->di = hh->di;
+	regs->bp = hh->bp;
+	regs->ax = hh->ax;
+	regs->ds = hh->ds;
+	regs->es = hh->es;
+	regs->orig_ax = hh->orig_ax;
+	regs->ip = hh->ip;
+	regs->cs = hh->cs;
+	regs->flags = hh->flags;
+	regs->sp = hh->sp;
+	regs->ss = hh->ss;
+
+	thread->gs = hh->gs;
+	thread->fs = hh->fs;
+	loadsegment(gs, hh->gs);
+	loadsegment(fs, hh->fs);
+
+	CR_PRINTK("math %d debug %d\n", hh->used_math, hh->uses_debug);
+
+	if (hh->used_math) {
+		/* checked before preemption is disabled: may bail out */
+		if (hh->has_fxsr != cpu_has_fxsr) {
+			force_sig(SIGFPE, t);
+			ret = -EINVAL;
+			goto out;
+		}
+		/*
+		 * thread->xstate is a pointer to a lazily allocated save
+		 * area; make sure it exists before copying into it.  This
+		 * may sleep, so it must precede preempt_disable().
+		 */
+		ret = init_fpu(t);
+		if (ret < 0)
+			goto out;
+	}
+
+	/* FIX: this should work ... (someone double check !) */
+
+	preempt_disable();
+
+	/* i387 + MMU + SSE */
+	__clear_fpu(t);	/* in case we used FPU in user mode */
+	if (!hh->used_math)
+		clear_used_math();
+	else {
+		/* copy into the save area, not over the pointer to it */
+		memcpy(thread->xstate, &hh->xstate,
+		       min_t(size_t, xstate_size, sizeof(hh->xstate)));
+		set_used_math();
+	}
+
+	/* debug regs */
+	if (hh->uses_debug) {
+		set_debugreg(hh->debugreg0, 0);
+		set_debugreg(hh->debugreg1, 1);
+		set_debugreg(hh->debugreg2, 2);
+		set_debugreg(hh->debugreg3, 3);
+		set_debugreg(hh->debugreg6, 6);
+		set_debugreg(hh->debugreg7, 7);
+	}
+
+	preempt_enable();
+
+	ret = 0;
+
+ out:
+	cr_hbuf_put(ctx, sizeof(*hh));
+	return ret;
+}
+#endif
+
+/*
+ * read the entire state of the current task
+ *
+ * Mirrors cr_write_task(): task_struct, mm, thread and cpu records are
+ * read in that order, each only if the previous step returned 0.
+ *
+ * NOTE(review): cr_read_thread() and cr_read_cpu() are only defined
+ * under CONFIG_X86 in this file but are called unconditionally here.
+ */
+static int cr_read_task(struct cr_ctx *ctx)
+{
+ int ret;
+
+ ret = cr_read_task_struct(ctx);
+ CR_PRINTK("ret (task_struct) %d\n", ret);
+ if (!ret)
+ ret = cr_read_mm(ctx);
+ CR_PRINTK("ret (mm) %d\n", ret);
+ if (!ret)
+ ret = cr_read_thread(ctx);
+ CR_PRINTK("ret (thread) %d\n", ret);
+ if (!ret)
+ ret = cr_read_cpu(ctx);
+ CR_PRINTK("ret (cpu) %d\n", ret);
+
+ return ret;
+}
+
+/*
+ * do_restart - restore the current task from a checkpoint image
+ * @ctx: restart context
+ *
+ * Consumes header, task state and trailer in that order, stopping at
+ * the first step that returns non-zero; returns that step's result.
+ */
+int do_restart(struct cr_ctx *ctx)
+{
+	int err;
+
+	err = cr_read_hdr(ctx);
+	if (err)
+		return err;
+
+	err = cr_read_task(ctx);
+	if (err)
+		return err;
+
+	return cr_read_tail(ctx);
+}
diff --git a/ckpt/rstr_mem.c b/ckpt/rstr_mem.c
new file mode 100644
index 0000000..97fc14a
--- /dev/null
+++ b/ckpt/rstr_mem.c
@@ -0,0 +1,415 @@
+/*
+ * Restart memory contents
+ *
+ * Copyright (C) 2008 Oren Laadan
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include <asm/unistd.h>
+
+#include <linux/sched.h>
+#include <linux/fcntl.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/mm_types.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/err.h>
+#include <asm/cacheflush.h>
+
+#if defined(CONFIG_X86)
+#include <asm/desc.h>
+#include <asm/ldt.h>
+#endif
+
+#include "ckpt.h"
+#include "ckpt_hdr.h"
+#include "ckpt_mem.h"
+
+/*
+ * Unlike checkpoint, restart is executed in the context of each restarting
+ * process: vma regions are restored via a call to mmap(), and the data is
+ * read in directly to the address space of the current process
+ */
+
+/**
+ * cr_vma_read_pages_addr - read addresses of pages to page-array chain
+ * @ctx - restart context
+ * @npages - number of pages
+ *
+ * Reads @npages addresses from the image, in batches bounded by the room
+ * left in the page-array returned by cr_pgarr_prep(). Returns 0 on success,
+ * -ENOMEM if no page-array is available, or the error from cr_kread().
+ */
+static int cr_vma_read_pages_addr(struct cr_ctx *ctx, int npages)
+{
+	while (npages) {
+		struct cr_pgarr *pgarr = cr_pgarr_prep(ctx);
+		int batch, err;
+
+		if (!pgarr)
+			return -ENOMEM;
+
+		/* presumably cr_pgarr_prep() returns an array with room left */
+		batch = min(npages, (int) pgarr->nleft);
+		err = cr_kread(ctx, pgarr->addrs, batch * sizeof(unsigned long));
+		if (err < 0)
+			return err;
+
+		/* account for the addresses just stored */
+		pgarr->nused += batch;
+		pgarr->nleft -= batch;
+		npages -= batch;
+	}
+
+	return 0;
+}
+
+/**
+ * cr_vma_read_pages_data - read in data of pages in page-array chain
+ * @ctx - restart context
+ * @npages - number of pages
+ *
+ * Walks the page-array chain starting at ctx->pgarr and, for every address
+ * recorded in it, reads PAGE_SIZE bytes from the image directly into that
+ * user address. Returns 0 on success or the error from cr_uread().
+ */
+static int cr_vma_read_pages_data(struct cr_ctx *ctx, int npages)
+{
+	struct cr_pgarr *pgarr = ctx->pgarr;
+
+	while (npages) {
+		unsigned long *addrs = pgarr->addrs;
+		int nr = pgarr->nused;
+		int i, err;
+
+		/* this array accounts for 'nr' of the expected pages */
+		npages -= nr;
+
+		for (i = 0; i < nr; i++) {
+			err = cr_uread(ctx, (void *) addrs[i], PAGE_SIZE);
+			if (err < 0)
+				return err;
+		}
+
+		pgarr = pgarr->next;
+	}
+
+	return 0;
+}
+
+/**
+ * cr_vma_writable - add or remove VM_WRITE on the vma found at @start
+ * @mm - address space to operate on
+ * @start - start of the address range
+ * @end - end of the address range
+ * @writable - non-zero to set VM_WRITE, zero to clear it
+ *
+ * This is useful when restoring the memory of a read-only vma. The lookup
+ * and the protection change are done with mm->mmap_sem held for writing.
+ * The change is applied to the whole vma (vm_start..vm_end), not only to
+ * the @start..@end range. Returns the result of mprotect_fixup(), or
+ * -EINVAL if no vma overlapping the range is found.
+ * NOTE(review): -EINVAL is also returned when the vma is already in the
+ * requested state (flags stays 0) - confirm that this is intended.
+ */
+static int cr_vma_writable(struct mm_struct *mm, unsigned long start,
+ unsigned long end, int writable)
+{
+ struct vm_area_struct *vma, *prev;
+ unsigned long flags = 0;
+ int ret = -EINVAL;
+
+ CR_PRINTK("vma %#lx-%#lx writable %d\n", start, end, writable);
+
+ down_write(&mm->mmap_sem);
+ vma = find_vma_prev(mm, start, &prev);
+ /* reject a missing vma or one that does not overlap the range */
+ if (unlikely(!vma || vma->vm_start > end || vma->vm_end < start))
+ goto out;
+ /* compute new flags only if VM_WRITE actually needs to change */
+ if (writable && !(vma->vm_flags & VM_WRITE))
+ flags = vma->vm_flags | VM_WRITE;
+ else if (!writable && (vma->vm_flags & VM_WRITE))
+ flags = vma->vm_flags & ~VM_WRITE;
+ CR_PRINTK("flags %#lx\n", flags);
+ if (flags)
+ ret = mprotect_fixup(vma, &prev, vma->vm_start,
+ vma->vm_end, flags);
+ out:
+ up_write(&mm->mmap_sem);
+ return ret;
+}
+
+/**
+ * cr_vma_read_pages - read in the pages needed to restore a vma
+ * @ctx - restart context
+ * @cr_vma - vma descriptor from restart
+ *
+ * Reads the list of page addresses and then the page contents for the vma
+ * described by @cr_vma into the address space of the current process. A vma
+ * without VM_WRITE is made writable for the duration of the read and has
+ * its protection restored afterwards. Returns 0 if there are no pages,
+ * otherwise 0 on success or a negative error code.
+ * NOTE(review): on a read error the function returns before the page-array
+ * chain is reset and before the protection is restored - confirm callers
+ * treat that as fatal.
+ */
+static int cr_vma_read_pages(struct cr_ctx *ctx, struct cr_hdr_vma *cr_vma)
+{
+ struct mm_struct *mm = current->mm;
+ int ret = 0;
+
+ /* nothing was dumped for this vma */
+ if (!cr_vma->npages)
+ return 0;
+
+ /* in the unlikely case that this vma is read-only */
+ if (!(cr_vma->vm_flags & VM_WRITE))
+ ret = cr_vma_writable(mm, cr_vma->vm_start, cr_vma->vm_end, 1);
+
+ /* addresses come first in the image, followed by the page contents */
+ if (!ret)
+ ret = cr_vma_read_pages_addr(ctx, cr_vma->npages);
+ if (!ret)
+ ret = cr_vma_read_pages_data(ctx, cr_vma->npages);
+ if (ret < 0)
+ return ret;
+
+ cr_pgarr_release(ctx); /* reset page-array chain */
+
+ /* restore original protection for this vma */
+ if (!(cr_vma->vm_flags & VM_WRITE))
+ ret = cr_vma_writable(mm, cr_vma->vm_start, cr_vma->vm_end, 0);
+
+ return ret;
+}
+
+/**
+ * cr_calc_map_prot_bits - convert vm_flags to mmap protection
+ * orig_vm_flags: source vm_flags
+ *
+ * Translates the VM_READ/VM_WRITE/VM_EXEC bits of a saved vma into the
+ * matching PROT_READ/PROT_WRITE/PROT_EXEC bits for mmap. Returns the
+ * resulting protection mask.
+ */
+static unsigned long cr_calc_map_prot_bits(unsigned long orig_vm_flags)
+{
+	unsigned long vm_prot = 0;
+
+	if (orig_vm_flags & VM_READ)
+		vm_prot |= PROT_READ;
+	if (orig_vm_flags & VM_WRITE)
+		vm_prot |= PROT_WRITE;
+	if (orig_vm_flags & VM_EXEC)
+		vm_prot |= PROT_EXEC;
+
+	/*
+	 * PROT_SEM is deliberately not derived here: it is an mmap prot bit,
+	 * not a vm_flags bit, so testing it against vm_flags would really
+	 * test whichever VM_* flag happens to share its numeric value.
+	 */
+
+	return vm_prot;
+}
+
+/**
+ * cr_calc_map_flags_bits - convert vm_flags to mmap flags
+ * orig_vm_flags: source vm_flags
+ *
+ * MAP_FIXED is always set; the grows-down, deny-write and executable
+ * bits are carried over, and VM_MAYSHARE selects MAP_SHARED over
+ * MAP_PRIVATE.
+ */
+static unsigned long cr_calc_map_flags_bits(unsigned long orig_vm_flags)
+{
+	unsigned long map_flags = MAP_FIXED;
+
+	map_flags |= (orig_vm_flags & VM_GROWSDOWN) ? MAP_GROWSDOWN : 0;
+	map_flags |= (orig_vm_flags & VM_DENYWRITE) ? MAP_DENYWRITE : 0;
+	map_flags |= (orig_vm_flags & VM_EXECUTABLE) ? MAP_EXECUTABLE : 0;
+	map_flags |= (orig_vm_flags & VM_MAYSHARE) ? MAP_SHARED : MAP_PRIVATE;
+
+	return map_flags;
+}
+
+/**
+ * cr_read_vma - restore a single vma from the checkpoint image
+ * @ctx - restart context
+ * @mm - address space being restored
+ *
+ * Reads a cr_hdr_vma record (and, for file mappings, the file name that
+ * follows it), recreates the mapping at its original address with
+ * do_mmap_pgoff() and then reads the dumped pages into it.
+ * Returns 0 on success or a negative error code.
+ */
+static int cr_read_vma(struct cr_ctx *ctx, struct mm_struct *mm)
+{
+	struct cr_hdr_vma *hh = cr_hbuf_get(ctx, sizeof(*hh));
+	unsigned long vm_size, vm_flags, vm_prot, vm_pgoff;
+	unsigned long addr;
+	unsigned long flags;
+	struct file *file = NULL;
+	char *fname = NULL;
+	int ret;
+
+	ret = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_VMA);
+	if (ret < 0)
+		return ret;
+
+	CR_PRINTK("vma %#lx-%#lx npages %d namelen %d\n",
+		  (unsigned long) hh->vm_start, (unsigned long) hh->vm_end,
+		  (int) hh->npages, (int) hh->namelen);
+
+	/* basic sanity checks on the values read from the image */
+	if (hh->vm_end < hh->vm_start)
+		return -EINVAL;
+	if (hh->npages < 0 || hh->namelen < 0)
+		return -EINVAL;
+
+	vm_size = hh->vm_end - hh->vm_start;
+	vm_prot = cr_calc_map_prot_bits(hh->vm_flags);
+	vm_flags = cr_calc_map_flags_bits(hh->vm_flags);
+	vm_pgoff = hh->vm_pgoff;
+
+	/* a file-mapped vma is followed by the name of its file */
+	if (hh->namelen) {
+		fname = ctx->tbuf;
+		ret = cr_read_str(ctx, fname, PAGE_SIZE);
+		if (ret < 0)
+			return ret;
+	}
+
+	CR_PRINTK("vma fname '%s' how %d\n", fname, hh->how);
+
+	switch (hh->how) {
+
+	case CR_VMA_ANON:		/* anonymous private mapping */
+		if (hh->namelen)
+			return -EINVAL;
+		/* vm_pgoff for anonymous mapping is the "global" page
+		   offset (namely from addr 0x0), so we force a zero */
+		vm_pgoff = 0;
+		break;
+
+	case CR_VMA_FILE:		/* private mapping from a file */
+		if (!hh->namelen)
+			return -EINVAL;
+		/* O_RDWR only needed if both (VM_WRITE|VM_SHARED) are set */
+		flags = hh->vm_flags & (VM_WRITE | VM_SHARED);
+		flags = (flags == (VM_WRITE | VM_SHARED) ? O_RDWR : O_RDONLY);
+		file = filp_open(fname, flags, 0);
+		if (IS_ERR(file))
+			return PTR_ERR(file);
+		break;
+
+	default:
+		return -EINVAL;
+
+	}
+
+	/*
+	 * do_mmap_pgoff() modifies the vma list and must be called with
+	 * mmap_sem held for writing. The semaphore is dropped again right
+	 * away because cr_vma_read_pages() may take it via cr_vma_writable().
+	 */
+	down_write(&mm->mmap_sem);
+	addr = do_mmap_pgoff(file, (unsigned long) hh->vm_start,
+			     vm_size, vm_prot, vm_flags, vm_pgoff);
+	up_write(&mm->mmap_sem);
+	CR_PRINTK("vma size %#lx prot %#lx flags %#lx pgoff %#lx => %#lx\n",
+		  vm_size, vm_prot, vm_flags, vm_pgoff, addr);
+
+	/* the file (if opened) is now referenced by the vma */
+	if (file)
+		filp_close(file, NULL);
+
+	if (IS_ERR((void*) addr))
+		return (PTR_ERR((void *) addr));
+
+	/*
+	 * CR_VMA_ANON: read in memory as is
+	 * CR_VMA_FILE: read in memory as is
+	 * (more to follow ...)
+	 */
+
+	switch (hh->how) {
+	case CR_VMA_ANON:
+	case CR_VMA_FILE:
+		/* standard case: read the data into the memory */
+		ret = cr_vma_read_pages(ctx, hh);
+		break;
+	}
+
+	if (ret < 0)
+		return ret;
+
+	if (vm_prot & PROT_EXEC)
+		flush_icache_range(hh->vm_start, hh->vm_end);
+
+	cr_hbuf_put(ctx, sizeof(*hh));
+	CR_PRINTK("vma retval %d\n", ret);
+	return 0;
+}
+
+#if defined(CONFIG_X86)
+
+extern asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount);
+
+/*
+ * Restore the LDT of @mm from the image: read a cr_hdr_mm_context record
+ * followed by hh->nldt raw descriptors, install each one through
+ * sys_modify_ldt() and finally reload the LDT.
+ * Returns 0 on success or a negative error code.
+ */
+static int cr_read_mm_context(struct cr_ctx *ctx, struct mm_struct *mm)
+{
+	struct cr_hdr_mm_context *hh = cr_hbuf_get(ctx, sizeof(*hh));
+	int n, ret;
+
+	ret = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_MM_CONTEXT);
+	if (ret < 0)
+		return ret;
+
+	CR_PRINTK("nldt %d\n", hh->nldt);
+
+	if (hh->nldt < 0 || hh->ldt_entry_size != LDT_ENTRY_SIZE)
+		return -EINVAL;
+
+	/* to utilize the syscall modify_ldt() we first convert the data
+	 * in the checkpoint image from 'struct desc_struct' to 'struct
+	 * user_desc' with reverse logic of include/asm/desc.h:fill_ldt() */
+
+	for (n = 0; n < hh->nldt; n++) {
+		/* zero every field that is not set explicitly below */
+		struct user_desc info = { .entry_number = n };
+		struct desc_struct desc;
+		mm_segment_t old_fs;
+
+		ret = cr_kread(ctx, &desc, LDT_ENTRY_SIZE);
+		if (ret < 0)
+			return ret;
+
+		/* the base is split over base0 (15..0), base1 (23..16) and
+		 * base2 (31..24); the limit over limit0 (15..0) and limit
+		 * (19..16) */
+		info.base_addr = desc.base0 | (desc.base1 << 16) |
+				 ((unsigned int) desc.base2 << 24);
+		info.limit = desc.limit0 | (desc.limit << 16);
+		info.seg_32bit = desc.d;
+		info.contents = desc.type >> 2;
+		info.read_exec_only = (desc.type >> 1) ^ 1;
+		info.limit_in_pages = desc.g;
+		info.seg_not_present = desc.p ^ 1;
+		info.useable = desc.avl;
+
+		/* 'info' lives in kernel memory, so lift the address limit
+		 * for the duration of the syscall */
+		old_fs = get_fs();
+		set_fs(get_ds());
+		ret = sys_modify_ldt(1, &info, sizeof(info));
+		set_fs(old_fs);
+
+		if (ret < 0)
+			return ret;
+	}
+
+	load_LDT(&mm->context);
+
+	cr_hbuf_put(ctx, sizeof(*hh));
+	return 0;
+}
+#endif
+
+/*
+ * Unmap every vma of @mm, one at a time. Stops at the first failure and
+ * returns the error from do_munmap(); returns 0 once all are unmapped.
+ */
+static int cr_destroy_mm(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma, *next;
+
+	for (vma = mm->mmap; vma; vma = next) {
+		int err;
+
+		/* fetch the successor before this vma is unmapped */
+		next = vma->vm_next;
+
+		err = do_munmap(mm, vma->vm_start, vma->vm_end-vma->vm_start);
+		if (err < 0)
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Restore the address space of the current task from the image: read the
+ * cr_hdr_mm record, unmap all existing vmas of current->mm, restore the
+ * mm layout fields, recreate hh->map_count vmas via cr_read_vma() and
+ * finally restore the mm context. Returns 0 on success or a negative
+ * error code.
+ */
+int cr_read_mm(struct cr_ctx *ctx)
+{
+ struct cr_hdr_mm *hh = cr_hbuf_get(ctx, sizeof(*hh));
+ struct mm_struct *mm;
+ int nr, ret;
+
+ ret = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_MM);
+ if (ret < 0)
+ return ret;
+
+ CR_PRINTK("map_count %d\n", hh->map_count);
+
+ /* XXX need more sanity checks */
+ if (hh->start_code > hh->end_code ||
+ hh->start_data > hh->end_data || hh->map_count < 0)
+ return -EINVAL;
+
+ mm = current->mm;
+
+ /* point of no return -- destruct current mm */
+ down_write(&mm->mmap_sem);
+ ret = cr_destroy_mm(mm);
+ up_write(&mm->mmap_sem);
+
+ if (ret < 0)
+ return ret;
+
+ /* restore the layout boundaries recorded at checkpoint time */
+ mm->start_code = hh->start_code;
+ mm->end_code = hh->end_code;
+ mm->start_data = hh->start_data;
+ mm->end_data = hh->end_data;
+ mm->start_brk = hh->start_brk;
+ mm->brk = hh->brk;
+ mm->start_stack = hh->start_stack;
+ mm->arg_start = hh->arg_start;
+ mm->arg_end = hh->arg_end;
+ mm->env_start = hh->env_start;
+ mm->env_end = hh->env_end;
+
+ /* FIX: need also mm->flags */
+
+ /* one vma record follows in the image for each mapping */
+ for (nr = hh->map_count; nr; nr--) {
+ ret = cr_read_vma(ctx, mm);
+ if (ret < 0)
+ return ret;
+ }
+
+ cr_hbuf_put(ctx, sizeof(*hh));
+
+ return cr_read_mm_context(ctx, mm);
+}
diff --git a/ckpt/sys.c b/ckpt/sys.c
new file mode 100644
index 0000000..95ebfc7
--- /dev/null
+++ b/ckpt/sys.c
@@ -0,0 +1,239 @@
+/*
+ * Generic container checkpoint-restart
+ *
+ * Copyright (C) 2008 Oren Laadan
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/uaccess.h>
+#include <linux/capability.h>
+
+#include "ckpt.h"
+#include "ckpt_mem.h"
+
+/*
+ * helpers to write/read to/from the image file descriptor
+ *
+ * cr_uwrite() - write a user-space buffer to the checkpoint image
+ * cr_kwrite() - write a kernel-space buffer to the checkpoint image
+ * cr_uread() - read from the checkpoint image to a user-space buffer
+ * cr_kread() - read from the checkpoint image to a kernel-space buffer
+ *
+ */
+
+/* (temporarily added file_pos_read() and file_pos_write() because they
+ * are static in fs/read_write.c... should cleanup and remove later) */
+/* return the current position (f_pos) of @file */
+static inline loff_t file_pos_read(struct file *file)
+{
+ return file->f_pos;
+}
+
+/* set the current position (f_pos) of @file to @pos */
+static inline void file_pos_write(struct file *file, loff_t pos)
+{
+ file->f_pos = pos;
+}
+
+/*
+ * cr_uwrite - write a user-space buffer to the checkpoint image
+ *
+ * Keeps calling vfs_write() until all @count bytes of @buf are written,
+ * updating the file position after every call. A write that returns an
+ * error aborts with that error; a write of zero bytes aborts with -EIO.
+ * On success adds @count to ctx->total and returns 0.
+ */
+int cr_uwrite(struct cr_ctx *ctx, void *buf, int count)
+{
+	struct file *file = ctx->file;
+	int nleft = count;
+
+	while (nleft) {
+		loff_t pos = file_pos_read(file);
+		ssize_t done;
+
+		done = vfs_write(file, (char __user *) buf, nleft, &pos);
+		file_pos_write(file, pos);
+		if (unlikely(done <= 0))	/* zero tolerance */
+			return (done ? : -EIO);
+
+		buf += done;
+		nleft -= done;
+	}
+
+	ctx->total += count;
+	return 0;
+}
+
+/*
+ * cr_kwrite - write a kernel-space buffer to the checkpoint image
+ *
+ * Wraps cr_uwrite() with the address limit switched to KERNEL_DS so that
+ * a kernel buffer is accepted, and restores the previous limit afterwards.
+ */
+int cr_kwrite(struct cr_ctx *ctx, void *buf, int count)
+{
+	mm_segment_t saved_fs = get_fs();
+	int err;
+
+	set_fs(KERNEL_DS);
+	err = cr_uwrite(ctx, buf, count);
+	set_fs(saved_fs);
+
+	return err;
+}
+
+/*
+ * cr_uread - read from the checkpoint image to a user-space buffer
+ *
+ * Keeps calling vfs_read() until all @count bytes are read into @buf,
+ * updating the file position after every call. A read that returns an
+ * error aborts with that error; a read of zero bytes aborts with -EIO.
+ * On success adds @count to ctx->total and returns 0.
+ */
+int cr_uread(struct cr_ctx *ctx, void *buf, int count)
+{
+	struct file *file = ctx->file;
+	int nleft = count;
+
+	while (nleft) {
+		loff_t pos = file_pos_read(file);
+		ssize_t done;
+
+		done = vfs_read(file, (char __user *) buf, nleft, &pos);
+		file_pos_write(file, pos);
+		if (unlikely(done <= 0))	/* zero tolerance */
+			return (done ? : -EIO);
+
+		buf += done;
+		nleft -= done;
+	}
+
+	ctx->total += count;
+	return 0;
+}
+
+/*
+ * cr_kread - read from the checkpoint image to a kernel-space buffer
+ *
+ * Wraps cr_uread() with the address limit switched to KERNEL_DS so that
+ * a kernel buffer is accepted, and restores the previous limit afterwards.
+ */
+int cr_kread(struct cr_ctx *ctx, void *buf, int count)
+{
+	mm_segment_t saved_fs = get_fs();
+	int err;
+
+	set_fs(KERNEL_DS);
+	err = cr_uread(ctx, buf, count);
+	set_fs(saved_fs);
+
+	return err;
+}
+
+
+/*
+ * helpers to manage CR contexts: allocated for each checkpoint and/or
+ * restart operation, and persists until the operation is completed.
+ */
+
+static atomic_t cr_ctx_count; /* unique checkpoint identifier */
+
+/*
+ * cr_ctx_free - release a checkpoint/restart context
+ *
+ * Drops the file and vfs root references if they are set, frees the
+ * page-array chain and the tbuf/hbuf buffers, then frees @ctx itself.
+ * Also used by cr_ctx_alloc() to undo a partially set up context.
+ */
+void cr_ctx_free(struct cr_ctx *ctx)
+{
+
+ /* file and vfsroot are only set once allocation has succeeded */
+ if (ctx->file)
+ fput(ctx->file);
+ if (ctx->vfsroot)
+ path_put(ctx->vfsroot);
+
+ cr_pgarr_free(ctx);
+
+ free_pages((unsigned long) ctx->tbuf, CR_ORDER_TBUF);
+ free_pages((unsigned long) ctx->hbuf, CR_ORDER_HBUF);
+
+ kfree(ctx);
+}
+
+/*
+ * cr_ctx_alloc - allocate and set up a checkpoint/restart context
+ * @pid: stored in ctx->pid
+ * @file: image file; an extra reference is taken for the context
+ * @flags: stored in ctx->flags
+ *
+ * Allocates the context with its tbuf/hbuf buffers and first page-array,
+ * takes references on @file and on the caller's vfs root, and assigns a
+ * unique ctx->crid. Returns the new context, or NULL if out of memory.
+ */
+struct cr_ctx *cr_ctx_alloc(pid_t pid, struct file *file, unsigned long flags)
+{
+	struct cr_ctx *ctx;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return NULL;
+
+	ctx->tbuf = (void *) __get_free_pages(GFP_KERNEL, CR_ORDER_TBUF);
+	ctx->hbuf = (void *) __get_free_pages(GFP_KERNEL, CR_ORDER_HBUF);
+	if (!ctx->tbuf || !ctx->hbuf)
+		goto nomem;
+
+	if (!cr_pgarr_alloc(ctx, &ctx->pgarr))
+		goto nomem;
+
+	ctx->pid = pid;
+	ctx->flags = flags;
+
+	ctx->file = file;
+	get_file(file);
+
+	/* assume checkpointer is in container's root vfs */
+	ctx->vfsroot = &current->fs->root;
+	path_get(ctx->vfsroot);
+
+	ctx->crid = atomic_inc_return(&cr_ctx_count);
+
+	return ctx;
+
+ nomem:
+	/* ctx was zeroed by kzalloc, so no file/vfsroot reference is held */
+	cr_ctx_free(ctx);
+	return NULL;
+}
+
+/**
+ * sys_checkpoint - checkpoint a container
+ * @pid: pid of the container init(1) process
+ * @fd: file to which dump the checkpoint image
+ * @flags: checkpoint operation flags
+ *
+ * Requires CAP_SYS_ADMIN. Returns the result of do_checkpoint(), or
+ * -EPERM, -EBADF, -EINVAL (non-zero @flags) or -ENOMEM.
+ */
+asmlinkage long sys_checkpoint(pid_t pid, int fd, unsigned long flags)
+{
+	struct cr_ctx *ctx;
+	struct file *file;
+	int fput_needed;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	file = fget_light(fd, &fput_needed);
+	if (!file)
+		return -EBADF;
+
+	/* no flags for now */
+	if (flags) {
+		/* drop the reference taken by fget_light() */
+		fput_light(file, fput_needed);
+		return -EINVAL;
+	}
+
+	ctx = cr_ctx_alloc(pid, file, flags | CR_CTX_CKPT);
+	if (!ctx) {
+		fput_light(file, fput_needed);
+		return -ENOMEM;
+	}
+
+	ret = do_checkpoint(ctx);
+
+	cr_ctx_free(ctx);
+	fput_light(file, fput_needed);
+	CR_PRINTK("ckpt retval = %d\n", ret);
+	return ret;
+}
+
+/**
+ * sys_restart - restart a container
+ * @crid: checkpoint image identifier
+ * @fd: file from which read the checkpoint image
+ * @flags: restart operation flags
+ *
+ * Requires CAP_SYS_ADMIN. Returns the result of do_restart(), or
+ * -EPERM, -EBADF, -EINVAL (non-zero @flags) or -ENOMEM.
+ */
+asmlinkage long sys_restart(int crid, int fd, unsigned long flags)
+{
+	struct cr_ctx *ctx;
+	struct file *file;
+	int fput_needed;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	file = fget_light(fd, &fput_needed);
+	if (!file)
+		return -EBADF;
+
+	/* no flags for now */
+	if (flags) {
+		/* drop the reference taken by fget_light() */
+		fput_light(file, fput_needed);
+		return -EINVAL;
+	}
+
+	ctx = cr_ctx_alloc(crid, file, flags | CR_CTX_RSTR);
+	if (!ctx) {
+		fput_light(file, fput_needed);
+		return -ENOMEM;
+	}
+
+	ret = do_restart(ctx);
+
+	cr_ctx_free(ctx);
+	fput_light(file, fput_needed);
+	CR_PRINTK("restart retval = %d\n", ret);
+	return ret;
+}
--
1.5.4.3
^ permalink raw reply related [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <Pine.LNX.4.64.0807292325290.9868-CXF6herHY6ykSYb+qCZC/1i27PF6R63G9nwVQlTi/Pw@public.gmane.org>
@ 2008-07-30 4:51 ` KOSAKI Motohiro
[not found] ` <20080730132257.9DF2.KOSAKI.MOTOHIRO-+CUm20s59erQFUHtdCDX3A@public.gmane.org>
2008-07-30 20:58 ` Dave Hansen
2008-07-30 22:07 ` Serge E. Hallyn
2 siblings, 1 reply; 37+ messages in thread
From: KOSAKI Motohiro @ 2008-07-30 4:51 UTC (permalink / raw)
To: Oren Laadan; +Cc: Linux Containers
Hi
> Expand the template sys_checkpoint and sys_restart to be able to dump
> and restore a single task. The task's address space may consist of only
> private, simple vma's - anonymous or file-mapped.
>
> This big patch adds a mechanism to transfer data between kernel or user
> space to and from the file given by the caller (sys.c), alloc/setup/free
> of the checkpoint/restart context (sys.c), output wrappers and basic
> checkpoint handling (checkpoint.c), memory dump (ckpt_mem.c), input
> wrappers and basic restart handling (restart.c), and finally the memory
> restore (rstr_mem.c).
>
> Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
Please write documentation describing the memory dump file format,
and split save and restore into two patches.
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
@ 2008-07-30 16:52 Serge E. Hallyn
[not found] ` <20080730165249.GA23802-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
0 siblings, 1 reply; 37+ messages in thread
From: Serge E. Hallyn @ 2008-07-30 16:52 UTC (permalink / raw)
To: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
Cc: Louis.Rilling-aw0BnHfMbSpBDgjK7y7TUQ
This list is getting on my nerves. Louis, I'm sorry the threading
is going to get messed up.
----- Forwarded message from mailman-bounces-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org -----
Subject: Content filtered message notification
From: mailman-bounces-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
To: containers-owner-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
Date: Wed, 30 Jul 2008 09:16:12 -0700
The attached message matched the containers mailing list's content
filtering rules and was prevented from being forwarded on to the list
membership. You are receiving the only remaining copy of the
discarded message.
Date: Wed, 30 Jul 2008 18:15:35 +0200
From: Louis Rilling <Louis.Rilling-aw0BnHfMbSpBDgjK7y7TUQ@public.gmane.org>
To: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
Cc: Linux Containers <containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org>
Subject: Re: [RFC][PATCH 2/2] CR: handle a single task with private memory
maps
Reply-To: Louis.Rilling-aw0BnHfMbSpBDgjK7y7TUQ@public.gmane.org
Hi Oren,
On Tue, Jul 29, 2008 at 11:27:17PM -0400, Oren Laadan wrote:
>
> Expand the template sys_checkpoint and sys_restart to be able to dump
> and restore a single task. The task's address space may consist of only
> private, simple vma's - anonymous or file-mapped.
>
> This big patch adds a mechanism to transfer data between kernel or user
> space to and from the file given by the caller (sys.c), alloc/setup/free
> of the checkpoint/restart context (sys.c), output wrappers and basic
> checkpoint handling (checkpoint.c), memory dump (ckpt_mem.c), input
> wrappers and basic restart handling (restart.c), and finally the memory
> restore (rstr_mem.c).
This looks globally clean to me, but I'm sure that others will have stronger
arguments against or in favor of it.
Just a few comments inline, in case it helps.
[...]
> diff --git a/ckpt/checkpoint.c b/ckpt/checkpoint.c
> new file mode 100644
> index 0000000..1698a35
> --- /dev/null
> +++ b/ckpt/checkpoint.c
[...]
> +/* dump the task_struct of a given task */
> +static int cr_write_task_struct(struct cr_ctx *ctx, struct task_struct *t)
> +{
> + struct cr_hdr h;
> + struct cr_hdr_task *hh = ctx->tbuf;
> +
> + h.type = CR_HDR_TASK;
> + h.len = sizeof(*hh);
> + h.id = ctx->pid;
> +
> + hh->state = t->state;
> + hh->exit_state = t->exit_state;
> + hh->exit_code = t->exit_code;
> + hh->exit_signal = t->exit_signal;
> +
> + hh->pid = t->pid;
> + hh->tgid = t->tgid;
IIRC, it is assumed that pid and tgid will be restored before actually calling
sys_restart(), eg by giving the proper pid and clone flags to a variant of
do_fork(). So, maybe these ids are useless here and should be put earlier in the
checkpoint header (see also the matching comment below in cr_read_task_struct()).
[...]
> diff --git a/ckpt/ckpt_mem.c b/ckpt/ckpt_mem.c
> new file mode 100644
> index 0000000..12caad0
> --- /dev/null
> +++ b/ckpt/ckpt_mem.c
[...]
> +/**
> + * cr_vma_fill_pgarr - fill a page-array with addr/page tuples for a vma
> + * @ctx - checkpoint context
> + * @pgarr - page-array to fill
> + * @vma - vma to scan
> + * @start - start address (updated)
> + */
> +static int cr_vma_fill_pgarr(struct cr_ctx *ctx, struct cr_pgarr *pgarr,
> + struct vm_area_struct *vma, unsigned long *start)
> +{
> + unsigned long end = vma->vm_end;
> + unsigned long addr = *start;
> + struct page **pagep;
> + unsigned long *addrp;
> + int cow, nr, ret = 0;
> +
> + nr = pgarr->nleft;
> + pagep = &pgarr->pages[pgarr->nused];
> + addrp = &pgarr->addrs[pgarr->nused];
> + cow = !!vma->vm_file;
> +
> + while (addr < end) {
> + struct page *page;
> +
> + /* simplified version of get_user_pages(): already have vma,
> + * only need FOLL_TOUCH, and (for now) ignore fault stats */
> +
> + cond_resched();
> + while (!(page = follow_page(vma, addr, FOLL_TOUCH))) {
> + ret = handle_mm_fault(vma->vm_mm, vma, addr, 0);
> + if (ret & VM_FAULT_ERROR) {
> + if (ret & VM_FAULT_OOM)
> + ret = -ENOMEM;
> + else if (ret & VM_FAULT_SIGBUS)
> + ret = -EFAULT;
> + else
> + BUG();
> + break;
> + }
> + cond_resched();
> + }
I guess that 'ret' should be checked somewhere after this loop.
> +
> + if (IS_ERR(page)) {
> + ret = PTR_ERR(page);
> + break;
> + }
> +
> + if (page == ZERO_PAGE(0))
> + page = NULL; /* zero page: ignore */
> + else if (cow && page_mapping(page) != NULL)
> + page = NULL; /* clean cow: ignore */
> + else {
> + get_page(page);
> + *(addrp++) = addr;
> + *(pagep++) = page;
> + if (--nr == 0) {
> + addr += PAGE_SIZE;
> + break;
> + }
> + }
> +
> + addr += PAGE_SIZE;
> + }
> +
> + if (unlikely(ret < 0)) {
> + nr = pgarr->nleft - nr;
> + while (nr--)
> + page_cache_release(*(--pagep));
> + return ret;
> + }
> +
> + *start = addr;
> + return (pgarr->nleft - nr);
> +}
> +
[...]
> +int cr_write_mm(struct cr_ctx *ctx, struct task_struct *t)
> +{
> + struct cr_hdr h;
> + struct cr_hdr_mm *hh = ctx->tbuf;
> + struct mm_struct *mm;
> + struct vm_area_struct *vma;
> + int ret;
> +
> + h.type = CR_HDR_MM;
> + h.len = sizeof(*hh);
> + h.id = ctx->pid;
> +
> + mm = get_task_mm(t);
> +
> + hh->tag = 1; /* non-zero will mean first time encounter */
> +
> + hh->start_code = mm->start_code;
> + hh->end_code = mm->end_code;
> + hh->start_data = mm->start_data;
> + hh->end_data = mm->end_data;
> + hh->start_brk = mm->start_brk;
> + hh->brk = mm->brk;
> + hh->start_stack = mm->start_stack;
> + hh->arg_start = mm->arg_start;
> + hh->arg_end = mm->arg_end;
> + hh->env_start = mm->env_start;
> + hh->env_end = mm->env_end;
> +
> + hh->map_count = mm->map_count;
Some fields above should also be protected with mmap_sem, like ->brk,
->map_count, and possibly others (I'm not a memory expert though).
> +
> + /* FIX: need also mm->flags */
> +
> + ret = cr_write_obj(ctx, &h, hh);
> + if (ret < 0)
> + goto out;
> +
> + /* write the vma's */
> + down_read(&mm->mmap_sem);
> + for (vma = mm->mmap; vma; vma = vma->vm_next) {
> + if ((ret = cr_write_vma(ctx, vma)) < 0)
> + break;
> + }
> + up_read(&mm->mmap_sem);
> +
> + if (ret < 0)
> + goto out;
> +
> + ret = cr_write_mm_context(ctx, mm);
> +
> + out:
> + mmput(mm);
> + return ret;
> +}
[...]
> diff --git a/ckpt/restart.c b/ckpt/restart.c
> new file mode 100644
> index 0000000..9f52851
> --- /dev/null
> +++ b/ckpt/restart.c
[...]
> +/* read the task_struct into the current task */
> +static int cr_read_task_struct(struct cr_ctx *ctx)
> +{
> + struct cr_hdr_task *hh = cr_hbuf_get(ctx, sizeof(*hh));
> + struct task_struct *t = current;
> + int ret;
> +
> + ret = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_TASK);
> + if (ret < 0)
> + return ret;
> +
> + /* for now, only restore t->comm */
+ /* current should already have correct pid and tgid */
> + if (hh->task_comm_len < 0 || hh->task_comm_len > TASK_COMM_LEN)
> + return -EINVAL;
> +
> + memset(t->comm, 0, TASK_COMM_LEN);
> + memcpy(t->comm, hh->comm, hh->task_comm_len);
> +
> + cr_hbuf_put(ctx, sizeof(*hh));
> + return 0;
> +}
> +
Louis
--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes
----- End forwarded message -----
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <20080730165249.GA23802-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2008-07-30 17:40 ` Dave Hansen
2008-07-31 13:59 ` Louis Rilling
0 siblings, 1 reply; 37+ messages in thread
From: Dave Hansen @ 2008-07-30 17:40 UTC (permalink / raw)
To: Serge E. Hallyn
Cc: Louis.Rilling-aw0BnHfMbSpBDgjK7y7TUQ,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
On Wed, 2008-07-30 at 11:52 -0500, Serge E. Hallyn wrote:
>
> This list is getting on my nerves. Louis, I'm sorry the threading
> is going to get messed up.
I think I just cleared out the mime type filtering.
-- Dave
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <20080730132257.9DF2.KOSAKI.MOTOHIRO-+CUm20s59erQFUHtdCDX3A@public.gmane.org>
@ 2008-07-30 18:22 ` Oren Laadan
0 siblings, 0 replies; 37+ messages in thread
From: Oren Laadan @ 2008-07-30 18:22 UTC (permalink / raw)
To: KOSAKI Motohiro; +Cc: Linux Containers
KOSAKI Motohiro wrote:
> Hi
>
>> Expand the template sys_checkpoint and sys_restart to be able to dump
>> and restore a single task. The task's address space may consist of only
>> private, simple vma's - anonymous or file-mapped.
>>
>> This big patch adds a mechanism to transfer data between kernel or user
>> space to and from the file given by the caller (sys.c), alloc/setup/free
>> of the checkpoint/restart context (sys.c), output wrappers and basic
>> checkpoint handling (checkpoint.c), memory dump (ckpt_mem.c), input
>> wrappers and basic restart handling (restart.c), and finally the memory
>> restore (rstr_mem.c).
>>
>> Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
>
> please write a documentation of describe memory dump file format,
> and split save and restore to two patches.
While save and restore functionality is already split to different source
files, I can easily refine the patch.
Dump file format: as agreed during the OLS, the format will be nested (as
in "depth-first" as opposed to "breadth-first"). The rationale is to be
able to stream the entire checkpoint image without file seeks. The suggested
layout looks like this:
1. Image header: information about kernel version, CR version, kernel
configuration, CPU capabilities etc.
2. Container global section: state that is global to the container, e.g.
SysV IPC, network setup.
3. Task tree/forest state: number of tasks and their relationships
4. State of each task (one by one): including task_struct state, thread
state, cpu registers, followed by memory, files, signals etc.
5. Image trailer: marking the end of the image and providing checksum and
the like.
Since this patch is only a proof-of-concept, it has a very simple #1,
no #2 or #3, limited #4 and very simple #5.
This patch still doesn't handle shared objects, but they will be handled
as follows: the first time a shared object is accessed (to dump it) it is
given a unique identifier and dumped in full. The next time(s) the object
is found, only the identifier is saved instead.
A bit more specific about the format: it will be composed of "records",
such that each record has a pre-header that identifies its contents and a
payload. (The idea here is to enable parallel checkpointing in the future
in which multiple threads interleave data from multiple processes into
a single stream).
The pre-header is:
struct cr_hdr {
__s16 type;
__s16 len;
__u32 id;
};
'type' identifies the type of the following payload, 'len' tells its length.
The 'id' identifies the object instance to which it belongs (it is currently
unused). The meaning of the 'id' field may vary depending on the type. For
example, for type CR_HDR_MM, the 'id' will identify the task to which this
MM belongs. The payload varies depending on its type, for instance, the data
describing a task_struct is given by a 'struct cr_hdr_task' (type CR_HDR_TASK)
and so on.
The format of the memory dump is slightly different: for each vma, there is
a 'struct cr_hdr_vma'; if the vma is file-mapped, it will be followed by the file
name. The cr_hdr_vma->npages will tell how many pages were dumped for this vma.
Then it will be followed by the actual data: first a dump of the addresses of
all dumped pages (npages entries) followed by a dump of the contents of all
dumped pages (npages pages). Then will come the next vma and so on.
For a single simple task, the format of the resulting checkpoint image would
look like this (assume 2 vma's, one file mapped with 2 dumped pages and the
other anonymous with 3 dumped pages):
cr_hdr + cr_hdr_head
cr_hdr + cr_hdr_task
cr_hdr + cr_hdr_mm
cr_hdr + cr_hdr_vma + cr_hdr + string
addr1, addr2
page1, page2
cr_hdr + cr_hdr_vma
addr3, addr4, addr5
page3, page4, page5
cr_hdr + cr_hdr_mm_context
cr_hdr + cr_hdr_thread
cr_hdr + cr_hdr_cpu
cr_hdr + cr_hdr_tail
Will add this documentation to the next version of the patch.
Oren.
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <20080730161535.GB22403-Hu8+6S1rdjywhHL9vcZdMVaTQe2KTcn/@public.gmane.org>
@ 2008-07-30 18:27 ` Oren Laadan
[not found] ` <4890B2A8.8010808-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
0 siblings, 1 reply; 37+ messages in thread
From: Oren Laadan @ 2008-07-30 18:27 UTC (permalink / raw)
To: Louis.Rilling-aw0BnHfMbSpBDgjK7y7TUQ; +Cc: Linux Containers
Louis Rilling wrote:
> Hi Oren,
>
> On Tue, Jul 29, 2008 at 11:27:17PM -0400, Oren Laadan wrote:
>> Expand the template sys_checkpoint and sys_restart to be able to dump
>> and restore a single task. The task's address space may consist of only
>> private, simple vma's - anonymous or file-mapped.
>>
>> This big patch adds a mechanism to transfer data between kernel or user
>> space to and from the file given by the caller (sys.c), alloc/setup/free
>> of the checkpoint/restart context (sys.c), output wrappers and basic
>> checkpoint handling (checkpoint.c), memory dump (ckpt_mem.c), input
>> wrappers and basic restart handling (restart.c), and finally the memory
>> restore (rstr_mem.c).
>
> This looks globally clean to me, but I'm sure that others will have stronger
> arguments against or in favor of it.
>
> Just a few comments inline, in case it helps.
>
> [...]
>
>> diff --git a/ckpt/checkpoint.c b/ckpt/checkpoint.c
>> new file mode 100644
>> index 0000000..1698a35
>> --- /dev/null
>> +++ b/ckpt/checkpoint.c
>
> [...]
>
>> +/* dump the task_struct of a given task */
>> +static int cr_write_task_struct(struct cr_ctx *ctx, struct task_struct *t)
>> +{
>> + struct cr_hdr h;
>> + struct cr_hdr_task *hh = ctx->tbuf;
>> +
>> + h.type = CR_HDR_TASK;
>> + h.len = sizeof(*hh);
>> + h.id = ctx->pid;
>> +
>> + hh->state = t->state;
>> + hh->exit_state = t->exit_state;
>> + hh->exit_code = t->exit_code;
>> + hh->exit_signal = t->exit_signal;
>> +
>> + hh->pid = t->pid;
>> + hh->tgid = t->tgid;
>
> IIRC, it is assumed that pid and tgid will be restored before actually calling
> sys_restart(), eg by giving the proper pid and clone flags to a variant of
> do_fork(). So, maybe these ids are useless here and should be put earlier in the
> checkpoint header (see also the matching comment below in cr_read_task_struct()).
oops .. left-overs -- definitely don't belong there anymore.
>
> [...]
>
>> diff --git a/ckpt/ckpt_mem.c b/ckpt/ckpt_mem.c
>> new file mode 100644
>> index 0000000..12caad0
>> --- /dev/null
>> +++ b/ckpt/ckpt_mem.c
>
> [...]
>
>> +/**
>> + * cr_vma_fill_pgarr - fill a page-array with addr/page tuples for a vma
>> + * @ctx - checkpoint context
>> + * @pgarr - page-array to fill
>> + * @vma - vma to scan
>> + * @start - start address (updated)
>> + */
>> +static int cr_vma_fill_pgarr(struct cr_ctx *ctx, struct cr_pgarr *pgarr,
>> + struct vm_area_struct *vma, unsigned long *start)
>> +{
>> + unsigned long end = vma->vm_end;
>> + unsigned long addr = *start;
>> + struct page **pagep;
>> + unsigned long *addrp;
>> + int cow, nr, ret = 0;
>> +
>> + nr = pgarr->nleft;
>> + pagep = &pgarr->pages[pgarr->nused];
>> + addrp = &pgarr->addrs[pgarr->nused];
>> + cow = !!vma->vm_file;
>> +
>> + while (addr < end) {
>> + struct page *page;
>> +
>> + /* simplified version of get_user_pages(): already have vma,
>> + * only need FOLL_TOUCH, and (for now) ignore fault stats */
>> +
>> + cond_resched();
>> + while (!(page = follow_page(vma, addr, FOLL_TOUCH))) {
>> + ret = handle_mm_fault(vma->vm_mm, vma, addr, 0);
>> + if (ret & VM_FAULT_ERROR) {
>> + if (ret & VM_FAULT_OOM)
>> + ret = -ENOMEM;
>> + else if (ret & VM_FAULT_SIGBUS)
>> + ret = -EFAULT;
>> + else
>> + BUG();
>> + break;
>> + }
>> + cond_resched();
>> + }
>
> I guess that 'ret' should be checked somewhere after this loop.
yes; this is where a "break(2)" construct in C would come in handy :)
>
>> +
>> + if (IS_ERR(page)) {
>> + ret = PTR_ERR(page);
>> + break;
>> + }
>> +
>> + if (page == ZERO_PAGE(0))
>> + page = NULL; /* zero page: ignore */
>> + else if (cow && page_mapping(page) != NULL)
>> + page = NULL; /* clean cow: ignore */
>> + else {
>> + get_page(page);
>> + *(addrp++) = addr;
>> + *(pagep++) = page;
>> + if (--nr == 0) {
>> + addr += PAGE_SIZE;
>> + break;
>> + }
>> + }
>> +
>> + addr += PAGE_SIZE;
>> + }
>> +
>> + if (unlikely(ret < 0)) {
>> + nr = pgarr->nleft - nr;
>> + while (nr--)
>> + page_cache_release(*(--pagep));
>> + return ret;
>> + }
>> +
>> + *start = addr;
>> + return (pgarr->nleft - nr);
>> +}
>> +
>
> [...]
>
>> +int cr_write_mm(struct cr_ctx *ctx, struct task_struct *t)
>> +{
>> + struct cr_hdr h;
>> + struct cr_hdr_mm *hh = ctx->tbuf;
>> + struct mm_struct *mm;
>> + struct vm_area_struct *vma;
>> + int ret;
>> +
>> + h.type = CR_HDR_MM;
>> + h.len = sizeof(*hh);
>> + h.id = ctx->pid;
>> +
>> + mm = get_task_mm(t);
>> +
>> + hh->tag = 1; /* non-zero will mean first time encounter */
>> +
>> + hh->start_code = mm->start_code;
>> + hh->end_code = mm->end_code;
>> + hh->start_data = mm->start_data;
>> + hh->end_data = mm->end_data;
>> + hh->start_brk = mm->start_brk;
>> + hh->brk = mm->brk;
>> + hh->start_stack = mm->start_stack;
>> + hh->arg_start = mm->arg_start;
>> + hh->arg_end = mm->arg_end;
>> + hh->env_start = mm->env_start;
>> + hh->env_end = mm->env_end;
>> +
>> + hh->map_count = mm->map_count;
>
> Some fields above should also be protected with mmap_sem, like ->brk,
> ->map_count, and possibly others (I'm not a memory expert though).
true; keep in mind, though, that the container will be frozen during
this time, so nothing should change at all. The only exception would
be if, for instance, someone is killing the container while we save
its state.
>
>> +
>> + /* FIX: need also mm->flags */
>> +
>> + ret = cr_write_obj(ctx, &h, hh);
>> + if (ret < 0)
>> + goto out;
>> +
>> + /* write the vma's */
>> + down_read(&mm->mmap_sem);
>> + for (vma = mm->mmap; vma; vma = vma->vm_next) {
>> + if ((ret = cr_write_vma(ctx, vma)) < 0)
>> + break;
>> + }
>> + up_read(&mm->mmap_sem);
>> +
>> + if (ret < 0)
>> + goto out;
>> +
>> + ret = cr_write_mm_context(ctx, mm);
>> +
>> + out:
>> + mmput(mm);
>> + return ret;
>> +}
>
> [...]
>
>> diff --git a/ckpt/restart.c b/ckpt/restart.c
>> new file mode 100644
>> index 0000000..9f52851
>> --- /dev/null
>> +++ b/ckpt/restart.c
>
> [...]
>
>> +/* read the task_struct into the current task */
>> +static int cr_read_task_struct(struct cr_ctx *ctx)
>> +{
>> + struct cr_hdr_task *hh = cr_hbuf_get(ctx, sizeof(*hh));
>> + struct task_struct *t = current;
>> + int ret;
>> +
>> + ret = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_TASK);
>> + if (ret < 0)
>> + return ret;
>> +
>> + /* for now, only restore t->comm */
>
> + /* current should already have correct pid and tgid */
>
>> + if (hh->task_comm_len < 0 || hh->task_comm_len > TASK_COMM_LEN)
>> + return -EINVAL;
>> +
>> + memset(t->comm, 0, TASK_COMM_LEN);
>> + memcpy(t->comm, hh->comm, hh->task_comm_len);
>> +
>> + cr_hbuf_put(ctx, sizeof(*hh));
>> + return 0;
>> +}
>> +
>
> Louis
>
Thanks,
Oren.
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <Pine.LNX.4.64.0807292325290.9868-CXF6herHY6ykSYb+qCZC/1i27PF6R63G9nwVQlTi/Pw@public.gmane.org>
2008-07-30 4:51 ` KOSAKI Motohiro
@ 2008-07-30 20:58 ` Dave Hansen
2008-07-30 22:07 ` Serge E. Hallyn
2 siblings, 0 replies; 37+ messages in thread
From: Dave Hansen @ 2008-07-30 20:58 UTC (permalink / raw)
To: Oren Laadan; +Cc: Linux Containers
On Tue, 2008-07-29 at 23:27 -0400, Oren Laadan wrote:
> Expand the template sys_checkpoint and sys_restart to be able to dump
> and restore a single task. The task's address space may consist of only
> private, simple vma's - anonymous or file-mapped.
So, can we all agree that this is a good example of the in-kernel
checkpoint/restart approach? It may not be the smallest possible
example, but it certainly demonstrates the approach for me.
-- Dave
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <Pine.LNX.4.64.0807292325290.9868-CXF6herHY6ykSYb+qCZC/1i27PF6R63G9nwVQlTi/Pw@public.gmane.org>
2008-07-30 4:51 ` KOSAKI Motohiro
2008-07-30 20:58 ` Dave Hansen
@ 2008-07-30 22:07 ` Serge E. Hallyn
[not found] ` <20080730220752.GA3518-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2 siblings, 1 reply; 37+ messages in thread
From: Serge E. Hallyn @ 2008-07-30 22:07 UTC (permalink / raw)
To: Oren Laadan; +Cc: Linux Containers
Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
> +int do_checkpoint(struct cr_ctx *ctx)
> +{
> + int ret;
> +
> + /* FIX: need to test whether container is checkpointable */
> +
> + ret = cr_write_hdr(ctx);
> + if (!ret)
> + ret = cr_write_task(ctx, current);
> + if (!ret)
> + ret = cr_write_tail(ctx);
> +
> + /* on success, return (unique) checkpoint identifier */
> + if (!ret)
> + ret = ctx->crid;
Does this crid have a purpose?
> +
> + return ret;
> +}
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <20080730220752.GA3518-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2008-07-30 22:20 ` Oren Laadan
[not found] ` <4890E930.9090204-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
0 siblings, 1 reply; 37+ messages in thread
From: Oren Laadan @ 2008-07-30 22:20 UTC (permalink / raw)
To: Serge E. Hallyn; +Cc: Linux Containers
Serge E. Hallyn wrote:
> Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
>> +int do_checkpoint(struct cr_ctx *ctx)
>> +{
>> + int ret;
>> +
>> + /* FIX: need to test whether container is checkpointable */
>> +
>> + ret = cr_write_hdr(ctx);
>> + if (!ret)
>> + ret = cr_write_task(ctx, current);
>> + if (!ret)
>> + ret = cr_write_tail(ctx);
>> +
>> + /* on success, return (unique) checkpoint identifier */
>> + if (!ret)
>> + ret = ctx->crid;
>
> Does this crid have a purpose?
yes, at least three; all are for the future, but important to set the
meaning of the return value of the syscall already now. The "crid" is
the CR-identifier that identifies the checkpoint. Every checkpoint is
assigned a unique number (using an atomic counter).
1) if a checkpoint is taken and kept in memory (instead of to a file) then
this will be the identifier with which the restart (or cleanup) would refer
to the (in memory) checkpoint image
2) to reduce downtime of the checkpoint, data will be aggregated on the
checkpoint context, as well as referenced to (cow-ed) pages. This data can
persist between calls to sys_checkpoint(), and the 'crid', again, will be
used to identify the (in-memory-to-be-dumped-to-storage) context.
3) for incremental checkpoint (where a successive checkpoint will only
save what has changed since the previous checkpoint) there will be a need
to identify the previous checkpoints (to be able to know where to take
data from during restart). Again, a 'crid' is handy.
[in fact, for the 3rd use, it will make sense to write that number as
part of the checkpoint image header]
Note that by doing so, a process that checkpoints itself (in its own
context), can use code that is similar to the logic of fork():
...
crid = checkpoint(...);
switch (crid) {
case -1:
perror("checkpoint failed");
break;
default:
fprintf(stderr, "checkpoint succeeded, CRID=%d\n", crid);
/* proceed with execution after checkpoint */
...
break;
case 0:
fprintf(stderr, "returned after restart\n");
/* proceed with action required following a restart */
...
break;
}
...
Oren.
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <4890E930.9090204-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2008-07-31 13:57 ` Louis Rilling
[not found] ` <20080731135703.GC22403-Hu8+6S1rdjywhHL9vcZdMVaTQe2KTcn/@public.gmane.org>
2008-07-31 21:25 ` Serge E. Hallyn
1 sibling, 1 reply; 37+ messages in thread
From: Louis Rilling @ 2008-07-31 13:57 UTC (permalink / raw)
To: Oren Laadan; +Cc: Linux Containers
On Wed, Jul 30, 2008 at 06:20:32PM -0400, Oren Laadan wrote:
>
>
> Serge E. Hallyn wrote:
> > Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
> >> +int do_checkpoint(struct cr_ctx *ctx)
> >> +{
> >> + int ret;
> >> +
> >> + /* FIX: need to test whether container is checkpointable */
> >> +
> >> + ret = cr_write_hdr(ctx);
> >> + if (!ret)
> >> + ret = cr_write_task(ctx, current);
> >> + if (!ret)
> >> + ret = cr_write_tail(ctx);
> >> +
> >> + /* on success, return (unique) checkpoint identifier */
> >> + if (!ret)
> >> + ret = ctx->crid;
> >
> > Does this crid have a purpose?
>
> yes, at least three; both are for the future, but important to set the
> meaning of the return value of the syscall already now. The "crid" is
> the CR-identifier that identifies the checkpoint. Every checkpoint is
> assigned a unique number (using an atomic counter).
>
> 1) if a checkpoint is taken and kept in memory (instead of to a file) then
> this will be the identifier with which the restart (or cleanup) would refer
> to the (in memory) checkpoint image
>
> 2) to reduce downtime of the checkpoint, data will be aggregated on the
> checkpoint context, as well as referenced to (cow-ed) pages. This data can
> persist between calls to sys_checkpoint(), and the 'crid', again, will be
> used to identify the (in-memory-to-be-dumped-to-storage) context.
>
> 3) for incremental checkpoint (where a successive checkpoint will only
> save what has changed since the previous checkpoint) there will be a need
> to identify the previous checkpoints (to be able to know where to take
> data from during restart). Again, a 'crid' is handy.
>
> [in fact, for the 3rd use, it will make sense to write that number as
> part of the checkpoint image header]
>
> Note that by doing so, a process that checkpoints itself (in its own
> context), can use code that is similar to the logic of fork():
>
> ...
> crid = checkpoint(...);
> switch (crid) {
> case -1:
> perror("checkpoint failed");
> break;
> default:
> fprintf(stderr, "checkpoint succeeded, CRID=%d\n", ret);
> /* proceed with execution after checkpoint */
> ...
> break;
> case 0:
> fprintf(stderr, "returned after restart\n");
> /* proceed with action required following a restart */
> ...
> break;
> }
> ...
If I understand correctly, this crid can live for quite a long time. So many of
them could be generated while some container would accumulate incremental
checkpoints on, say crid 5, and possibly crid 5 could be reused for another
unrelated checkpoint during that time. This brings the issue of allocating crids
reliably (using something like a pidmap for instance). Moreover, if such ids are
exposed to userspace, we need to remember which ones are allocated across
reboots and migrations.
I'm afraid that this becomes too complex...
It would be way easier if the only (kernel-level) references to a checkpoint
were pointers to its context. Ideally, the only reference would live in a
'struct container' and would be easily updated at restart-time.
My $0.02 ...
Louis
--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
2008-07-30 17:40 ` Dave Hansen
@ 2008-07-31 13:59 ` Louis Rilling
[not found] ` <20080731135910.GD22403-Hu8+6S1rdjywhHL9vcZdMVaTQe2KTcn/@public.gmane.org>
0 siblings, 1 reply; 37+ messages in thread
From: Louis Rilling @ 2008-07-31 13:59 UTC (permalink / raw)
To: Dave Hansen; +Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
On Wed, Jul 30, 2008 at 10:40:35AM -0700, Dave Hansen wrote:
> On Wed, 2008-07-30 at 11:52 -0500, Serge E. Hallyn wrote:
> >
> > This list is getting on my nerves. Louis, I'm sorry the threading
> > is going to get messed up.
>
> I think I just cleared out the mime type filtering.
Could the digital signature be the guilty part of my email?
Louis
--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <4890B2A8.8010808-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2008-07-31 14:08 ` Louis Rilling
[not found] ` <20080731140844.GE22403-Hu8+6S1rdjywhHL9vcZdMVaTQe2KTcn/@public.gmane.org>
0 siblings, 1 reply; 37+ messages in thread
From: Louis Rilling @ 2008-07-31 14:08 UTC (permalink / raw)
To: Oren Laadan; +Cc: Linux Containers
On Wed, Jul 30, 2008 at 02:27:52PM -0400, Oren Laadan wrote:
>
>
> Louis Rilling wrote:
> >> +/**
> >> + * cr_vma_fill_pgarr - fill a page-array with addr/page tuples for a vma
> >> + * @ctx - checkpoint context
> >> + * @pgarr - page-array to fill
> >> + * @vma - vma to scan
> >> + * @start - start address (updated)
> >> + */
> >> +static int cr_vma_fill_pgarr(struct cr_ctx *ctx, struct cr_pgarr *pgarr,
> >> + struct vm_area_struct *vma, unsigned long *start)
> >> +{
> >> + unsigned long end = vma->vm_end;
> >> + unsigned long addr = *start;
> >> + struct page **pagep;
> >> + unsigned long *addrp;
> >> + int cow, nr, ret = 0;
> >> +
> >> + nr = pgarr->nleft;
> >> + pagep = &pgarr->pages[pgarr->nused];
> >> + addrp = &pgarr->addrs[pgarr->nused];
> >> + cow = !!vma->vm_file;
> >> +
> >> + while (addr < end) {
> >> + struct page *page;
> >> +
> >> + /* simplified version of get_user_pages(): already have vma,
> >> + * only need FOLL_TOUCH, and (for now) ignore fault stats */
> >> +
> >> + cond_resched();
> >> + while (!(page = follow_page(vma, addr, FOLL_TOUCH))) {
> >> + ret = handle_mm_fault(vma->vm_mm, vma, addr, 0);
> >> + if (ret & VM_FAULT_ERROR) {
> >> + if (ret & VM_FAULT_OOM)
> >> + ret = -ENOMEM;
> >> + else if (ret & VM_FAULT_SIGBUS)
> >> + ret = -EFAULT;
> >> + else
> >> + BUG();
> >> + break;
> >> + }
> >> + cond_resched();
> >> + }
> >
> > I guess that 'ret' should be checked somewhere after this loop.
>
> yes; this is where a "break(2)" construct in C would come handy :)
Alternatively, putting the inner loop in a separate function often helps to
handle errors in a cleaner way.
>
> >
> >> +
> >> + if (IS_ERR(page)) {
> >> + ret = PTR_ERR(page);
> >> + break;
> >> + }
> >> +
> >> + if (page == ZERO_PAGE(0))
> >> + page = NULL; /* zero page: ignore */
> >> + else if (cow && page_mapping(page) != NULL)
> >> + page = NULL; /* clean cow: ignore */
> >> + else {
> >> + get_page(page);
> >> + *(addrp++) = addr;
> >> + *(pagep++) = page;
> >> + if (--nr == 0) {
> >> + addr += PAGE_SIZE;
> >> + break;
> >> + }
> >> + }
> >> +
> >> + addr += PAGE_SIZE;
> >> + }
> >> +
> >> + if (unlikely(ret < 0)) {
> >> + nr = pgarr->nleft - nr;
> >> + while (nr--)
> >> + page_cache_release(*(--pagep));
> >> + return ret;
> >> + }
> >> +
> >> + *start = addr;
> >> + return (pgarr->nleft - nr);
> >> +}
> >> +
> >> +int cr_write_mm(struct cr_ctx *ctx, struct task_struct *t)
> >> +{
> >> + struct cr_hdr h;
> >> + struct cr_hdr_mm *hh = ctx->tbuf;
> >> + struct mm_struct *mm;
> >> + struct vm_area_struct *vma;
> >> + int ret;
> >> +
> >> + h.type = CR_HDR_MM;
> >> + h.len = sizeof(*hh);
> >> + h.id = ctx->pid;
> >> +
> >> + mm = get_task_mm(t);
> >> +
> >> + hh->tag = 1; /* non-zero will mean first time encounter */
> >> +
> >> + hh->start_code = mm->start_code;
> >> + hh->end_code = mm->end_code;
> >> + hh->start_data = mm->start_data;
> >> + hh->end_data = mm->end_data;
> >> + hh->start_brk = mm->start_brk;
> >> + hh->brk = mm->brk;
> >> + hh->start_stack = mm->start_stack;
> >> + hh->arg_start = mm->arg_start;
> >> + hh->arg_end = mm->arg_end;
> >> + hh->env_start = mm->env_start;
> >> + hh->env_end = mm->env_end;
> >> +
> >> + hh->map_count = mm->map_count;
> >
> > Some fields above should also be protected with mmap_sem, like ->brk,
> > ->map_count, and possibly others (I'm not a memory expert though).
>
> true; keep in mind, though, that the container will be frozen during
> this time, so nothing should change at all. The only exception would
> be if, for instance, someone is killing the container while we save
> its state.
Sure. So you think that taking mm->mmap_sem below is useless? I tend to believe
so, since no other task should share this mm_struct at this time, and we could
state that ptrace should not interfere during restart. However, I'm never
confident when ptrace considerations come in...
>
> >
> >> +
> >> + /* FIX: need also mm->flags */
> >> +
> >> + ret = cr_write_obj(ctx, &h, hh);
> >> + if (ret < 0)
> >> + goto out;
> >> +
> >> + /* write the vma's */
> >> + down_read(&mm->mmap_sem);
> >> + for (vma = mm->mmap; vma; vma = vma->vm_next) {
> >> + if ((ret = cr_write_vma(ctx, vma)) < 0)
> >> + break;
> >> + }
> >> + up_read(&mm->mmap_sem);
> >> +
> >> + if (ret < 0)
> >> + goto out;
> >> +
> >> + ret = cr_write_mm_context(ctx, mm);
> >> +
> >> + out:
> >> + mmput(mm);
> >> + return ret;
> >> +}
> >
Thanks,
Louis
--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <20080731135910.GD22403-Hu8+6S1rdjywhHL9vcZdMVaTQe2KTcn/@public.gmane.org>
@ 2008-07-31 14:14 ` Serge E. Hallyn
0 siblings, 0 replies; 37+ messages in thread
From: Serge E. Hallyn @ 2008-07-31 14:14 UTC (permalink / raw)
To: Louis Rilling
Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Dave Hansen
Quoting Louis Rilling (Louis.Rilling-aw0BnHfMbSpBDgjK7y7TUQ@public.gmane.org):
> On Wed, Jul 30, 2008 at 10:40:35AM -0700, Dave Hansen wrote:
> > On Wed, 2008-07-30 at 11:52 -0500, Serge E. Hallyn wrote:
> > >
> > > This list is getting on my nerves. Louis, I'm sorry the threading
> > > is going to get messed up.
> >
> > I think I just cleared out the mime type filtering.
>
> Could the digital signature be the guily part of my email?
Yeah, that was Dave's guess, and it seems likely. Dave thinks he
unset whatever setting caused the bounce, so you should be fine to
keep the signatures in there.
thanks,
-serge
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <20080731140844.GE22403-Hu8+6S1rdjywhHL9vcZdMVaTQe2KTcn/@public.gmane.org>
@ 2008-07-31 14:44 ` Oren Laadan
0 siblings, 0 replies; 37+ messages in thread
From: Oren Laadan @ 2008-07-31 14:44 UTC (permalink / raw)
To: Louis.Rilling-aw0BnHfMbSpBDgjK7y7TUQ; +Cc: Linux Containers
Louis Rilling wrote:
> On Wed, Jul 30, 2008 at 02:27:52PM -0400, Oren Laadan wrote:
>>
>> Louis Rilling wrote:
>>>> +/**
>>>> + * cr_vma_fill_pgarr - fill a page-array with addr/page tuples for a vma
>>>> + * @ctx - checkpoint context
>>>> + * @pgarr - page-array to fill
>>>> + * @vma - vma to scan
>>>> + * @start - start address (updated)
>>>> + */
>>>> +static int cr_vma_fill_pgarr(struct cr_ctx *ctx, struct cr_pgarr *pgarr,
>>>> + struct vm_area_struct *vma, unsigned long *start)
>>>> +{
>>>> + unsigned long end = vma->vm_end;
>>>> + unsigned long addr = *start;
>>>> + struct page **pagep;
>>>> + unsigned long *addrp;
>>>> + int cow, nr, ret = 0;
>>>> +
>>>> + nr = pgarr->nleft;
>>>> + pagep = &pgarr->pages[pgarr->nused];
>>>> + addrp = &pgarr->addrs[pgarr->nused];
>>>> + cow = !!vma->vm_file;
>>>> +
>>>> + while (addr < end) {
>>>> + struct page *page;
>>>> +
>>>> + /* simplified version of get_user_pages(): already have vma,
>>>> + * only need FOLL_TOUCH, and (for now) ignore fault stats */
>>>> +
>>>> + cond_resched();
>>>> + while (!(page = follow_page(vma, addr, FOLL_TOUCH))) {
>>>> + ret = handle_mm_fault(vma->vm_mm, vma, addr, 0);
>>>> + if (ret & VM_FAULT_ERROR) {
>>>> + if (ret & VM_FAULT_OOM)
>>>> + ret = -ENOMEM;
>>>> + else if (ret & VM_FAULT_SIGBUS)
>>>> + ret = -EFAULT;
>>>> + else
>>>> + BUG();
>>>> + break;
>>>> + }
>>>> + cond_resched();
>>>> + }
>>> I guess that 'ret' should be checked somewhere after this loop.
>> yes; this is where a "break(2)" construct in C would come handy :)
>
> Alternatively, putting the inner loop in a separate function often helps to
> handle errors in a cleaner way.
Also true. I opted to keep it that way to keep the code as similar as
possible to get_user_pages().
Note that the logic can be optimized: instead of traversing the page
table once for each page, we could aggregate a few pages in each round.
I wanted to keep the code simple.
>
>>>> +
>>>> + if (IS_ERR(page)) {
>>>> + ret = PTR_ERR(page);
>>>> + break;
>>>> + }
>>>> +
>>>> + if (page == ZERO_PAGE(0))
>>>> + page = NULL; /* zero page: ignore */
>>>> + else if (cow && page_mapping(page) != NULL)
>>>> + page = NULL; /* clean cow: ignore */
>>>> + else {
>>>> + get_page(page);
>>>> + *(addrp++) = addr;
>>>> + *(pagep++) = page;
>>>> + if (--nr == 0) {
>>>> + addr += PAGE_SIZE;
>>>> + break;
>>>> + }
>>>> + }
>>>> +
>>>> + addr += PAGE_SIZE;
>>>> + }
>>>> +
>>>> + if (unlikely(ret < 0)) {
>>>> + nr = pgarr->nleft - nr;
>>>> + while (nr--)
>>>> + page_cache_release(*(--pagep));
>>>> + return ret;
>>>> + }
>>>> +
>>>> + *start = addr;
>>>> + return (pgarr->nleft - nr);
>>>> +}
>>>> +
>
>
>>>> +int cr_write_mm(struct cr_ctx *ctx, struct task_struct *t)
>>>> +{
>>>> + struct cr_hdr h;
>>>> + struct cr_hdr_mm *hh = ctx->tbuf;
>>>> + struct mm_struct *mm;
>>>> + struct vm_area_struct *vma;
>>>> + int ret;
>>>> +
>>>> + h.type = CR_HDR_MM;
>>>> + h.len = sizeof(*hh);
>>>> + h.id = ctx->pid;
>>>> +
>>>> + mm = get_task_mm(t);
>>>> +
>>>> + hh->tag = 1; /* non-zero will mean first time encounter */
>>>> +
>>>> + hh->start_code = mm->start_code;
>>>> + hh->end_code = mm->end_code;
>>>> + hh->start_data = mm->start_data;
>>>> + hh->end_data = mm->end_data;
>>>> + hh->start_brk = mm->start_brk;
>>>> + hh->brk = mm->brk;
>>>> + hh->start_stack = mm->start_stack;
>>>> + hh->arg_start = mm->arg_start;
>>>> + hh->arg_end = mm->arg_end;
>>>> + hh->env_start = mm->env_start;
>>>> + hh->env_end = mm->env_end;
>>>> +
>>>> + hh->map_count = mm->map_count;
>>> Some fields above should also be protected with mmap_sem, like ->brk,
>>> ->map_count, and possibly others (I'm not a memory expert though).
>> true; keep in mind, though, that the container will be frozen during
>> this time, so nothing should change at all. The only exception would
>> be if, for instance, someone is killing the container while we save
>> its state.
>
> Sure. So you think that taking mm->mmap_sem below is useless? I tend to believe
> so, since no other task should share this mm_struct at this time, and we could
> state that ptrace should not interfere during restart. However, I'm never
> confident when ptrace considerations come in...
Not quite.
Probing the value of mm->brk is always safe, although it may turn out to
yield incorrect value. Traversing the vma's isn't safe, because - if for
instance the target task dies in the middle, it may alter the vma list.
So the mmap_sem protects against the latter.
Anyway, it won't hurt to be extra safe and take the semaphore earlier.
Ptrace, btw, cannot come in because the container is (supposedly) frozen.
Oren.
>>>> +
>>>> + /* FIX: need also mm->flags */
>>>> +
>>>> + ret = cr_write_obj(ctx, &h, hh);
>>>> + if (ret < 0)
>>>> + goto out;
>>>> +
>>>> + /* write the vma's */
>>>> + down_read(&mm->mmap_sem);
>>>> + for (vma = mm->mmap; vma; vma = vma->vm_next) {
>>>> + if ((ret = cr_write_vma(ctx, vma)) < 0)
>>>> + break;
>>>> + }
>>>> + up_read(&mm->mmap_sem);
>>>> +
>>>> + if (ret < 0)
>>>> + goto out;
>>>> +
>>>> + ret = cr_write_mm_context(ctx, mm);
>>>> +
>>>> + out:
>>>> + mmput(mm);
>>>> + return ret;
>>>> +}
>
> Thanks,
>
> Louis
>
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <20080731135703.GC22403-Hu8+6S1rdjywhHL9vcZdMVaTQe2KTcn/@public.gmane.org>
@ 2008-07-31 15:09 ` Oren Laadan
[not found] ` <4891D5C2.8090000-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
0 siblings, 1 reply; 37+ messages in thread
From: Oren Laadan @ 2008-07-31 15:09 UTC (permalink / raw)
To: Louis.Rilling-aw0BnHfMbSpBDgjK7y7TUQ; +Cc: Linux Containers
Louis Rilling wrote:
> On Wed, Jul 30, 2008 at 06:20:32PM -0400, Oren Laadan wrote:
>>
>> Serge E. Hallyn wrote:
>>> Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
>>>> +int do_checkpoint(struct cr_ctx *ctx)
>>>> +{
>>>> + int ret;
>>>> +
>>>> + /* FIX: need to test whether container is checkpointable */
>>>> +
>>>> + ret = cr_write_hdr(ctx);
>>>> + if (!ret)
>>>> + ret = cr_write_task(ctx, current);
>>>> + if (!ret)
>>>> + ret = cr_write_tail(ctx);
>>>> +
>>>> + /* on success, return (unique) checkpoint identifier */
>>>> + if (!ret)
>>>> + ret = ctx->crid;
>>> Does this crid have a purpose?
>> yes, at least three; both are for the future, but important to set the
>> meaning of the return value of the syscall already now. The "crid" is
>> the CR-identifier that identifies the checkpoint. Every checkpoint is
>> assigned a unique number (using an atomic counter).
>>
>> 1) if a checkpoint is taken and kept in memory (instead of to a file) then
>> this will be the identifier with which the restart (or cleanup) would refer
>> to the (in memory) checkpoint image
>>
>> 2) to reduce downtime of the checkpoint, data will be aggregated on the
>> checkpoint context, as well as referenced to (cow-ed) pages. This data can
>> persist between calls to sys_checkpoint(), and the 'crid', again, will be
>> used to identify the (in-memory-to-be-dumped-to-storage) context.
>>
>> 3) for incremental checkpoint (where a successive checkpoint will only
>> save what has changed since the previous checkpoint) there will be a need
>> to identify the previous checkpoints (to be able to know where to take
>> data from during restart). Again, a 'crid' is handy.
>>
>> [in fact, for the 3rd use, it will make sense to write that number as
>> part of the checkpoint image header]
>>
>> Note that by doing so, a process that checkpoints itself (in its own
>> context), can use code that is similar to the logic of fork():
>>
>> ...
>> crid = checkpoint(...);
>> switch (crid) {
>> case -1:
>> perror("checkpoint failed");
>> break;
>> default:
>> fprintf(stderr, "checkpoint succeeded, CRID=%d\n", ret);
>> /* proceed with execution after checkpoint */
>> ...
>> break;
>> case 0:
>> fprintf(stderr, "returned after restart\n");
>> /* proceed with action required following a restart */
>> ...
>> break;
>> }
>> ...
>
> If I understand correctly, this crid can live for quite a long time. So many of
> them could be generated while some container would accumulate incremental
> checkpoints on, say crid 5, and possibly crid 5 could be reused for another
> unrelated checkpoint during that time. This brings the issue of allocating crids
> reliably (using something like a pidmap for instance). Moreover, if such ids are
> exposed to userspace, we need to remember which ones are allocated accross
> reboots and migrations.
>
> I'm afraid that this becomes too complex...
And I'm afraid I didn't explain myself well. So let me rephrase:
CRIDs are always _local_ to a specific node. The local CRID counter is
bumped (atomically) with each checkpoint attempt. The main use case is
for when the checkpoint is kept in memory either briefly (until it is
written back to disk) or for a longer time (use-cases that want to keep
it there). It only remains valid as long as the checkpoint image is
still in memory and has not been committed to storage/network. Think
of it as a way to identify the operation instance.
So they can live quite a long time, but only as long as the original
node is still alive and the checkpoint is still kept in memory. They
are meaningless across reboots and migrations. I don't think a wrap
around is a concern, but we can use 64 bit if that is the case.
Finally, the incremental checkpoint use-case: imagine a container that
is checkpointed regularly every minute. The first checkpoint will be
a full checkpoint, say CRID=1. The second will be incremental with
respect to the first, with CRID=2, and so on for the third and the fourth.
Userspace could use these CRID to name the image files (for example,
app.img.CRID). Assume that we decide (big "if") that the convention is
that the last part of the filename must be the CRID, and if we decide
(another big "if") to save the CRID as part of the checkpoint image --
the part that describes the "incremental nature" of a new checkpoint.
(That part would specify where to get state that wasn't really saved
in the new checkpoint but instead can be retrieved from older ones).
If that was the case, then the logic in the kernel would be fairly
simple: to find (and access) the actual files that hold the data. Note
that in this case the CRIDs are guaranteed to be unique per series of
incremental checkpoints, and an incremental checkpoint is meaningless
across reboots (and we can require that across migration too).
We probably don't want to use something like a pid to identify the
checkpoint (while in memory), because we may have multiple checkpoints
in memory at a time (of the same container).
>
> It would be way easier if the only (kernel-level) references to a checkpoint
> were pointers to its context. Ideally, the only reference would live in a
> 'struct container' and would be easily updated at restart-time.
Consider the following scenario of calls from user-space (which is
how I envision the checkpoint optimized for minimal downtime, in the
future):
1) while (syscall_to_do_precopy) <- do precopy until ready to
if (too_long_already) <- checkpoint or too long
break;
2) freeze_container();
3) crid = checkpoint(.., .., CR_CKPT_LAZY); <- checkpoint container
<- don't commit to disk
<- (minimize downtime)
4) unfreeze_container(); <- now can unfreeze container
<- already as soon as possible
5) ckpt_writeback(crid, fd); <- container is back running. we
<- can commit data to storage or
<- network in the background.
#2 and #4 are done with freezer_cgroup()
#1, #3 and #5 must be syscalls
More specifically, syscall #5 must be able to refer to the result of syscall #3
(that is the CRID !). It is possible that another syscall #3 occurs, on the same
container, between steps 4 and 5 ... but then that checkpoint will be assigned
another, unique CRID.
> My $0.02 ...
Thanks... American or Canadian ? ;)
Oren.
>
> Louis
>
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <4891D5C2.8090000-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2008-07-31 15:58 ` Louis Rilling
[not found] ` <20080731155856.GH22403-Hu8+6S1rdjywhHL9vcZdMVaTQe2KTcn/@public.gmane.org>
0 siblings, 1 reply; 37+ messages in thread
From: Louis Rilling @ 2008-07-31 15:58 UTC (permalink / raw)
To: Oren Laadan; +Cc: Linux Containers
[-- Attachment #1.1: Type: text/plain, Size: 7584 bytes --]
On Thu, Jul 31, 2008 at 11:09:54AM -0400, Oren Laadan wrote:
>
>
> Louis Rilling wrote:
>> On Wed, Jul 30, 2008 at 06:20:32PM -0400, Oren Laadan wrote:
>>>
>>> Serge E. Hallyn wrote:
>>>> Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
>>>>> +int do_checkpoint(struct cr_ctx *ctx)
>>>>> +{
>>>>> + int ret;
>>>>> +
>>>>> + /* FIX: need to test whether container is checkpointable */
>>>>> +
>>>>> + ret = cr_write_hdr(ctx);
>>>>> + if (!ret)
>>>>> + ret = cr_write_task(ctx, current);
>>>>> + if (!ret)
>>>>> + ret = cr_write_tail(ctx);
>>>>> +
>>>>> + /* on success, return (unique) checkpoint identifier */
>>>>> + if (!ret)
>>>>> + ret = ctx->crid;
>>>> Does this crid have a purpose?
>>> yes, at least three; both are for the future, but important to set the
>>> meaning of the return value of the syscall already now. The "crid" is
>>> the CR-identifier that identifies the checkpoint. Every checkpoint is
>>> assigned a unique number (using an atomic counter).
>>>
>>> 1) if a checkpoint is taken and kept in memory (instead of to a file) then
>>> this will be the identifier with which the restart (or cleanup) would refer
>>> to the (in memory) checkpoint image
>>>
>>> 2) to reduce downtime of the checkpoint, data will be aggregated on the
>>> checkpoint context, as well as referenced to (cow-ed) pages. This data can
>>> persist between calls to sys_checkpoint(), and the 'crid', again, will be
>>> used to identify the (in-memory-to-be-dumped-to-storage) context.
>>>
>>> 3) for incremental checkpoint (where a successive checkpoint will only
>>> save what has changed since the previous checkpoint) there will be a need
>>> to identify the previous checkpoints (to be able to know where to take
>>> data from during restart). Again, a 'crid' is handy.
>>>
>>> [in fact, for the 3rd use, it will make sense to write that number as
>>> part of the checkpoint image header]
>>>
>>> Note that by doing so, a process that checkpoints itself (in its own
>>> context), can use code that is similar to the logic of fork():
>>>
>>> ...
>>> crid = checkpoint(...);
>>> switch (crid) {
>>> case -1:
>>> perror("checkpoint failed");
>>> break;
>>> default:
>>> fprintf(stderr, "checkpoint succeeded, CRID=%d\n", ret);
>>> /* proceed with execution after checkpoint */
>>> ...
>>> break;
>>> case 0:
>>> fprintf(stderr, "returned after restart\n");
>>> /* proceed with action required following a restart */
>>> ...
>>> break;
>>> }
>>> ...
>>
>> If I understand correctly, this crid can live for quite a long time. So many of
>> them could be generated while some container would accumulate incremental
>> checkpoints on, say crid 5, and possibly crid 5 could be reused for another
>> unrelated checkpoint during that time. This brings the issue of allocating crids
>> reliably (using something like a pidmap for instance). Moreover, if such ids are
>> exposed to userspace, we need to remember which ones are allocated accross
>> reboots and migrations.
>>
>> I'm afraid that this becomes too complex...
>
> And I'm afraid I didn't explain myself well. So let me rephrase:
>
> CRIDs are always _local_ to a specific node. The local CRID counter is
> bumped (atomically) with each checkpoint attempt. The main use case is
> for when the checkpoint is kept is memory either shortly (until it is
> written back to disk) or for a longer time (use-cases that want to keep
> it there). It only remains valid as long as the checkpoint image is
> still in memory and have not been committed to storage/network. Think
> of it as a way to identify the operation instance.
>
> So they can live quite a long time, but only as long as the original
> node is still alive and the checkpoint is still kept in memory. They
> are meaningless across reboots and migrations. I don't think a wrap
> around is a concern, but we can use 64 bit if that is the case.
>
> Finally, the incremental checkpoint use-case: imagine a container that
> is checkpointed regularly every minutes. The first checkpoint will be
> a full checkpoint, say CRID=1. The second will be incremental with
> respect to the first, with CRID=2, and so on the third and the forth.
> Userspace could use these CRID to name the image files (for example,
> app.img.CRID). Assume that we decide (big "if") that the convention is
> that the last part of the filename must be the CRID, and if we decide
> (another big "if") to save the CRID as part of the checkpoint image --
> the part that describe the "incremental nature" of a new checkpoint.
> (That part would specify where to get state that wasn't really saved
> in the new checkpoint but instead can be retrieved from older ones).
> If that was the case, then the logic in the kernel would be fairly
> to find (and access) the actual files that hold the data. Note, that
> in this case - the CRID are guaranteed to be unique per series of
> incremental checkpoints, and incremental chekcpoint is meaningless
> across reboots (and we can require that across migration too).
Letting the kernel guess where to find the missing data of an incremental
checkpoint seems a bit hazardous indeed. What about just appending incremental
checkpoints to the last full checkpoint file?
>
> We probably don't want to use something like a pid to identify the
> checkpoint (while in memory), because we may have multiple checkpoints
> in memory at a time (of the same container).
Agreed.
>
>>
>> It would be way easier if the only (kernel-level) references to a checkpoint
>> were pointers to its context. Ideally, the only reference would live in a
>> 'struct container' and would be easily updated at restart-time.
>
> Consider the following scenario of calls from user-space (which is
> how I envision the checkpoint optimized for minimal downtime, in the
> future):
>
> 1) while (syscall_to_do_precopy) <- do precopy until ready to
> if (too_long_already) <- checkpoint or too long
> break;
>
> 2) freeze_container();
>
> 3) crid = checkpoint(.., .., CR_CKPT_LAZY); <- checkpoint container
> <- don't commit to disk
> <- (minimize owntime)
>
> 4) unfreeze_container(); <- now can unfreeze container
> <- already as soon as possible
>
> 5) ckpt_writeback(crid, fd); <- container is back running. we
> <- can commit data to storage or
> <- network in the background.
>
> #2 and #4 are done with freezer_cgroup()
>
> #1, #3 and #5 must be syscalls
>
> More specifically, syscall #5 must be able to refer to the result of syscall #3
> (that is the CRID !). It is possible that another syscall #3 occur, on the same
> container, between steps 4 and 5 ... but then that checkpoint will be assigned
> another, unique CRID.
Hm, assuming that, as proposed above, incremental checkpoints are stored in the
same file as the ancestor full checkpoint, why not simply give fd as argument in
#5? I'd expect that the kernel would associate the file descriptor to the
checkpoint until it is finalized (written back, sent over the wire, etc.).
Maybe I'm still missing something...
>
>> My $0.02 ...
>
> Thanks... American or Canadian ? ;)
Since I only have Canadian citizenship, you can guess easily ;)
Thanks for your patient explanations!
Louis
--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes
[-- Attachment #1.2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]
[-- Attachment #2: Type: text/plain, Size: 206 bytes --]
_______________________________________________
Containers mailing list
Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
https://lists.linux-foundation.org/mailman/listinfo/containers
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <20080731155856.GH22403-Hu8+6S1rdjywhHL9vcZdMVaTQe2KTcn/@public.gmane.org>
@ 2008-07-31 16:28 ` Oren Laadan
[not found] ` <4891E849.1050701-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
0 siblings, 1 reply; 37+ messages in thread
From: Oren Laadan @ 2008-07-31 16:28 UTC (permalink / raw)
To: Louis.Rilling-aw0BnHfMbSpBDgjK7y7TUQ; +Cc: Linux Containers
Louis Rilling wrote:
> On Thu, Jul 31, 2008 at 11:09:54AM -0400, Oren Laadan wrote:
>>
>> Louis Rilling wrote:
>>> On Wed, Jul 30, 2008 at 06:20:32PM -0400, Oren Laadan wrote:
>>>> Serge E. Hallyn wrote:
>>>>> Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
>>>>>> +int do_checkpoint(struct cr_ctx *ctx)
>>>>>> +{
>>>>>> + int ret;
>>>>>> +
>>>>>> + /* FIX: need to test whether container is checkpointable */
>>>>>> +
>>>>>> + ret = cr_write_hdr(ctx);
>>>>>> + if (!ret)
>>>>>> + ret = cr_write_task(ctx, current);
>>>>>> + if (!ret)
>>>>>> + ret = cr_write_tail(ctx);
>>>>>> +
>>>>>> + /* on success, return (unique) checkpoint identifier */
>>>>>> + if (!ret)
>>>>>> + ret = ctx->crid;
>>>>> Does this crid have a purpose?
>>>> yes, at least three; both are for the future, but important to set the
>>>> meaning of the return value of the syscall already now. The "crid" is
>>>> the CR-identifier that identifies the checkpoint. Every checkpoint is
>>>> assigned a unique number (using an atomic counter).
>>>>
>>>> 1) if a checkpoint is taken and kept in memory (instead of to a file) then
>>>> this will be the identifier with which the restart (or cleanup) would refer
>>>> to the (in memory) checkpoint image
>>>>
>>>> 2) to reduce downtime of the checkpoint, data will be aggregated on the
>>>> checkpoint context, as well as referenced to (cow-ed) pages. This data can
>>>> persist between calls to sys_checkpoint(), and the 'crid', again, will be
>>>> used to identify the (in-memory-to-be-dumped-to-storage) context.
>>>>
>>>> 3) for incremental checkpoint (where a successive checkpoint will only
>>>> save what has changed since the previous checkpoint) there will be a need
>>>> to identify the previous checkpoints (to be able to know where to take
>>>> data from during restart). Again, a 'crid' is handy.
>>>>
>>>> [in fact, for the 3rd use, it will make sense to write that number as
>>>> part of the checkpoint image header]
>>>>
>>>> Note that by doing so, a process that checkpoints itself (in its own
>>>> context), can use code that is similar to the logic of fork():
>>>>
>>>> ...
>>>> crid = checkpoint(...);
>>>> switch (crid) {
>>>> case -1:
>>>> perror("checkpoint failed");
>>>> break;
>>>> default:
>>>> fprintf(stderr, "checkpoint succeeded, CRID=%d\n", ret);
>>>> /* proceed with execution after checkpoint */
>>>> ...
>>>> break;
>>>> case 0:
>>>> fprintf(stderr, "returned after restart\n");
>>>> /* proceed with action required following a restart */
>>>> ...
>>>> break;
>>>> }
>>>> ...
>>> If I understand correctly, this crid can live for quite a long time. So many of
>>> them could be generated while some container would accumulate incremental
>>> checkpoints on, say crid 5, and possibly crid 5 could be reused for another
>>> unrelated checkpoint during that time. This brings the issue of allocating crids
>>> reliably (using something like a pidmap for instance). Moreover, if such ids are
>>> exposed to userspace, we need to remember which ones are allocated accross
>>> reboots and migrations.
>>>
>>> I'm afraid that this becomes too complex...
>> And I'm afraid I didn't explain myself well. So let me rephrase:
>>
>> CRIDs are always _local_ to a specific node. The local CRID counter is
>> bumped (atomically) with each checkpoint attempt. The main use case is
>> for when the checkpoint is kept is memory either shortly (until it is
>> written back to disk) or for a longer time (use-cases that want to keep
>> it there). It only remains valid as long as the checkpoint image is
>> still in memory and have not been committed to storage/network. Think
>> of it as a way to identify the operation instance.
>>
>> So they can live quite a long time, but only as long as the original
>> node is still alive and the checkpoint is still kept in memory. They
>> are meaningless across reboots and migrations. I don't think a wrap
>> around is a concern, but we can use 64 bit if that is the case.
>>
>> Finally, the incremental checkpoint use-case: imagine a container that
>> is checkpointed regularly every minutes. The first checkpoint will be
>> a full checkpoint, say CRID=1. The second will be incremental with
>> respect to the first, with CRID=2, and so on the third and the forth.
>> Userspace could use these CRID to name the image files (for example,
>> app.img.CRID). Assume that we decide (big "if") that the convention is
>> that the last part of the filename must be the CRID, and if we decide
>> (another big "if") to save the CRID as part of the checkpoint image --
>> the part that describe the "incremental nature" of a new checkpoint.
>> (That part would specify where to get state that wasn't really saved
>> in the new checkpoint but instead can be retrieved from older ones).
>> If that was the case, then the logic in the kernel would be fairly
>> to find (and access) the actual files that hold the data. Note, that
>> in this case - the CRID are guaranteed to be unique per series of
>> incremental checkpoints, and incremental chekcpoint is meaningless
>> across reboots (and we can require that across migration too).
>
> Letting the kernel guess where to find the missing data of an incremental
> checkpoint seems a bit hazardous indeed. What about just appending incremental
> checkpoints to the last full checkpoint file?
It isn't quite a "guess", it's like the kernel assumes that a kernel-helper
resides in some directory - it's a convention. I agree, though, that it may
not be the best method to do it.
As for putting everything in a single file, I prefer not to do that, and I
believe it may not even always be possible.
An incremental would include a section that describes how to find the missing
data from previous checkpoints, so it must have a way to identify a previous
checkpoint.
One way is, as I suggested, to name them with this identifier; another would
be, for example, that the user provides a list of file-descriptors that match
the required identifiers. Other ways may be possible too.
In any event, I think it is now a bit early to discuss the exact format and
logic, when we don't even have a simple checkpoint working :)
Incremental checkpoint is one of a few reasons to use CRIDs. Let us first
agree about CRIDs, and later, when we design incremental checkpoints, decide
on the technical details of incorporating these CRIDs.
(Just to avoid confusion, an incremental checkpoint is _not_ a pre-copy or
live-migration: in a pre-copy, we repeatedly copy the state of the container
without freezing it until the delta is small enough, then we freeze and then
we checkpoint the remaining residues. All this activity belongs to a single
checkpoint. In incremental checkpoints, we talk about multiple checkpoints
that save only the delta with respect to their preceding checkpoint).
>
>> We probably don't want to use something like a pid to identify the
>> checkpoint (while in memory), because we may have multiple checkpoints
>> in memory at a time (of the same container).
>
> Agreed.
>
>>> It would be way easier if the only (kernel-level) references to a checkpoint
>>> were pointers to its context. Ideally, the only reference would live in a
>>> 'struct container' and would be easily updated at restart-time.
>> Consider the following scenario of calls from user-space (which is
>> how I envision the checkpoint optimized for minimal downtime, in the
>> future):
>>
>> 1) while (syscall_to_do_precopy) <- do precopy until ready to
>> if (too_long_already) <- checkpoint or too long
>> break;
>>
>> 2) freeze_container();
>>
>> 3) crid = checkpoint(.., .., CR_CKPT_LAZY); <- checkpoint container
>> <- don't commit to disk
>> <- (minimize owntime)
>>
>> 4) unfreeze_container(); <- now can unfreeze container
>> <- already as soon as possible
>>
>> 5) ckpt_writeback(crid, fd); <- container is back running. we
>> <- can commit data to storage or
>> <- network in the background.
>>
>> #2 and #4 are done with freezer_cgroup()
>>
>> #1, #3 and #5 must be syscalls
>>
>> More specifically, syscall #5 must be able to refer to the result of syscall #3
>> (that is the CRID !). It is possible that another syscall #3 occur, on the same
>> container, between steps 4 and 5 ... but then that checkpoint will be assigned
>> another, unique CRID.
>
> Hm, assuming that, as proposed above, incremental checkpoints are stored in the
> same file as the ancestor full checkpoint, why not simply give fd as argument in
> #5? I'd expect that the kernel would associate the file descriptor to the
> checkpoint until it is finalized (written back, sent over the wire, etc.).
The above procedure, steps 1-5, is for a _single_ checkpoint.
Why would the kernel associate a file descriptor with the checkpoint until it
is finalized ? As far as I'm concerned, the checkpoint call in step 3 can go
without any FD. Also, what happens if there is another checkpoint, of the
same container, taken between steps 4 and 5, how would you tell the difference
or select which one goes in first ? Finally, keeping that FD alive between
multiple checkpoints would require the checkpointer (e.g. a daemon that will
periodically checkpoint) to keep it alive.
I view it differently: a checkpoint held in memory is like a kernel resource,
and requires a handle/identifier for user space to refer to it. Like an IPC
object. Why tie that object to a specific file descriptor ?
The only exception I can see is the need to tie it to some process - the
checkpointer for instance, such that if that process dies without completing
the work, the checkpoint image in memory will be cleaned up.
That, however, is still problematic, because it will not allow you to use
different processes for different steps (above).
Since we are not yet optimizing the checkpoint procedure, just building the
infrastructure, my goal is to convince that a CRID is a desired feature (and
I can certainly see how it will be used in various scenarios).
Oren.
>
> Maybe I'm still missing something...
>
>>> My $0.02 ...
>> Thanks... American or Canadian ? ;)
>
> Since I only have the canadian cityzenship, you can guess easily ;)
>
> Thanks for your patient explanations!
>
> Louis
>
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <4891E849.1050701-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2008-07-31 17:50 ` Louis Rilling
[not found] ` <20080731175058.GI22403-Hu8+6S1rdjywhHL9vcZdMVaTQe2KTcn/@public.gmane.org>
0 siblings, 1 reply; 37+ messages in thread
From: Louis Rilling @ 2008-07-31 17:50 UTC (permalink / raw)
To: Oren Laadan; +Cc: Linux Containers
[-- Attachment #1.1: Type: text/plain, Size: 13111 bytes --]
On Thu, Jul 31, 2008 at 12:28:57PM -0400, Oren Laadan wrote:
>
>
> Louis Rilling wrote:
>> On Thu, Jul 31, 2008 at 11:09:54AM -0400, Oren Laadan wrote:
>>>
>>> Louis Rilling wrote:
>>>> On Wed, Jul 30, 2008 at 06:20:32PM -0400, Oren Laadan wrote:
>>>>> Serge E. Hallyn wrote:
>>>>>> Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
>>>>>>> +int do_checkpoint(struct cr_ctx *ctx)
>>>>>>> +{
>>>>>>> + int ret;
>>>>>>> +
>>>>>>> + /* FIX: need to test whether container is checkpointable */
>>>>>>> +
>>>>>>> + ret = cr_write_hdr(ctx);
>>>>>>> + if (!ret)
>>>>>>> + ret = cr_write_task(ctx, current);
>>>>>>> + if (!ret)
>>>>>>> + ret = cr_write_tail(ctx);
>>>>>>> +
>>>>>>> + /* on success, return (unique) checkpoint identifier */
>>>>>>> + if (!ret)
>>>>>>> + ret = ctx->crid;
>>>>>> Does this crid have a purpose?
>>>>> yes, at least three; both are for the future, but important to set the
>>>>> meaning of the return value of the syscall already now. The "crid" is
>>>>> the CR-identifier that identifies the checkpoint. Every checkpoint is
>>>>> assigned a unique number (using an atomic counter).
>>>>>
>>>>> 1) if a checkpoint is taken and kept in memory (instead of to a file) then
>>>>> this will be the identifier with which the restart (or cleanup) would refer
>>>>> to the (in memory) checkpoint image
>>>>>
>>>>> 2) to reduce downtime of the checkpoint, data will be aggregated on the
>>>>> checkpoint context, as well as referenced to (cow-ed) pages. This data can
>>>>> persist between calls to sys_checkpoint(), and the 'crid', again, will be
>>>>> used to identify the (in-memory-to-be-dumped-to-storage) context.
>>>>>
>>>>> 3) for incremental checkpoint (where a successive checkpoint will only
>>>>> save what has changed since the previous checkpoint) there will be a need
>>>>> to identify the previous checkpoints (to be able to know where to take
>>>>> data from during restart). Again, a 'crid' is handy.
>>>>>
>>>>> [in fact, for the 3rd use, it will make sense to write that number as
>>>>> part of the checkpoint image header]
>>>>>
>>>>> Note that by doing so, a process that checkpoints itself (in its own
>>>>> context), can use code that is similar to the logic of fork():
>>>>>
>>>>> ...
>>>>> crid = checkpoint(...);
>>>>> switch (crid) {
>>>>> case -1:
>>>>> perror("checkpoint failed");
>>>>> break;
>>>>> default:
>>>>> fprintf(stderr, "checkpoint succeeded, CRID=%d\n", ret);
>>>>> /* proceed with execution after checkpoint */
>>>>> ...
>>>>> break;
>>>>> case 0:
>>>>> fprintf(stderr, "returned after restart\n");
>>>>> /* proceed with action required following a restart */
>>>>> ...
>>>>> break;
>>>>> }
>>>>> ...
>>>> If I understand correctly, this crid can live for quite a long time. So many of
>>>> them could be generated while some container would accumulate incremental
>>>> checkpoints on, say crid 5, and possibly crid 5 could be reused for another
>>>> unrelated checkpoint during that time. This brings the issue of allocating crids
>>>> reliably (using something like a pidmap for instance). Moreover, if such ids are
>>>> exposed to userspace, we need to remember which ones are allocated accross
>>>> reboots and migrations.
>>>>
>>>> I'm afraid that this becomes too complex...
>>> And I'm afraid I didn't explain myself well. So let me rephrase:
>>>
>>> CRIDs are always _local_ to a specific node. The local CRID counter is
>>> bumped (atomically) with each checkpoint attempt. The main use case is
>>> for when the checkpoint is kept is memory either shortly (until it is
>>> written back to disk) or for a longer time (use-cases that want to keep
>>> it there). It only remains valid as long as the checkpoint image is
>>> still in memory and have not been committed to storage/network. Think
>>> of it as a way to identify the operation instance.
>>>
>>> So they can live quite a long time, but only as long as the original
>>> node is still alive and the checkpoint is still kept in memory. They
>>> are meaningless across reboots and migrations. I don't think a wrap
>>> around is a concern, but we can use 64 bit if that is the case.
>>>
>>> Finally, the incremental checkpoint use-case: imagine a container that
>>> is checkpointed regularly every minutes. The first checkpoint will be
>>> a full checkpoint, say CRID=1. The second will be incremental with
>>> respect to the first, with CRID=2, and so on the third and the forth.
>>> Userspace could use these CRID to name the image files (for example,
>>> app.img.CRID). Assume that we decide (big "if") that the convention is
>>> that the last part of the filename must be the CRID, and if we decide
>>> (another big "if") to save the CRID as part of the checkpoint image --
>>> the part that describe the "incremental nature" of a new checkpoint.
>>> (That part would specify where to get state that wasn't really saved
>>> in the new checkpoint but instead can be retrieved from older ones).
>>> If that was the case, then the logic in the kernel would be fairly
>>> to find (and access) the actual files that hold the data. Note, that
>>> in this case - the CRID are guaranteed to be unique per series of
>>> incremental checkpoints, and incremental chekcpoint is meaningless
>>> across reboots (and we can require that across migration too).
>>
>> Letting the kernel guess where to find the missing data of an incremental
>> checkpoint seems a bit hazardous indeed. What about just appending incremental
>> checkpoints to the last full checkpoint file?
>
> It isn't quite a "guess", it's like the kernel assumes that a kernel-helper
> resides in some directory - it's a convention. I agree, though, that it may
> not be the best method to do it.
>
> As for putting everything in a single file, I prefer not to do that, and it
> may not even always possible I believe.
>
> An incremental would include a section that describes how to find the missing
> data from previous checkpoints, so it must have a way to identify a previous
> checkpoint.
>
> On way is like I suggested name them with this identifier, another would be,
> for example, that the user provides a list of file-descriptors that match
> the required identifiers. Other ways may be possible too.
>
> In any event, I think it is now bit early to discuss the exact format and
> logic, when we don't even have a simple checkpoint working :)
>
> Incremental checkpoint is one of a few reasons to use CRIDs, let us first
> agree about CRIDs, and later, when we design incremental checkpoints, decide
> on the technical details of incorporating this CRIDs.
>
Agreed, but since your point is to introduce CRIDs, I'd like to be convinced
that they are needed :) At least I'd like to be convinced that they will not
generate hard-to-manage side effects.
> (Just to avoid confusion, an incremental checkpoint is _not_ a pre-copy or
> live-migration: in a pre-copy, we repeatedly copy the state of the container
> without freezing it until the delta is small enough, then we freeze and then
> we checkpoint the remaining residues. All this activity belongs to a single
> checkpoint. In incremental checkpoints, we talk about multiple checkpoints
> that save only the delta with respect to their preceding checkpoint).
Don't worry, I know what incremental checkpointing is.
>
>>
>>> We probably don't want to use something like a pid to identify the
>>> checkpoint (while in memory), because we may have multiple checkpoints
>>> in memory at a time (of the same container).
>>
>> Agreed.
>>
>>>> It would be way easier if the only (kernel-level) references to a checkpoint
>>>> were pointers to its context. Ideally, the only reference would live in a
>>>> 'struct container' and would be easily updated at restart-time.
>>> Consider the following scenario of calls from user-space (which is
>>> how I envision the checkpoint optimized for minimal downtime, in the
>>> future):
>>>
>>> 1) while (syscall_to_do_precopy) <- do precopy until ready to
>>> if (too_long_already) <- checkpoint or too long
>>> break;
>>>
>>> 2) freeze_container();
>>>
>>> 3) crid = checkpoint(.., .., CR_CKPT_LAZY); <- checkpoint container
>>> <- don't commit to disk
>>> <- (minimize owntime)
>>>
>>> 4) unfreeze_container(); <- now can unfreeze container
>>> <- already as soon as possible
>>>
>>> 5) ckpt_writeback(crid, fd); <- container is back running. we
>>> <- can commit data to storage or
>>> <- network in the background.
>>>
>>> #2 and #4 are done with freezer_cgroup()
>>>
>>> #1, #3 and #5 must be syscalls
>>>
>>> More specifically, syscall #5 must be able to refer to the result of syscall #3
>>> (that is the CRID !). It is possible that another syscall #3 occur, on the same
>>> container, between steps 4 and 5 ... but then that checkpoint will be assigned
>>> another, unique CRID.
>>
>> Hm, assuming that, as proposed above, incremental checkpoints are stored in the
>> same file as the ancestor full checkpoint, why not simply give fd as argument in
>> #5? I'd expect that the kernel would associate the file descriptor to the
>> checkpoint until it is finalized (written back, sent over the wire, etc.).
>
> The above procedure, step 1-5 are for a _single_ checkpoint.
This is what I understood.
>
> Why would the kernel associate a file descriptor with the checkpoint until it
> is finalized ? As far as I'm concerned, the checkpoint call in step 3 can go
> without any FD. Also, what happens if there is another checkpoint, of the
> same container, taken between steps 4 and 5, how would you tell the difference
> or select which one goes in first ? Finally, keeping that FD alive between
> multiple checkpoints would require the checkpointer (e.g. a daemon that will
> periodically checkpoint) to keep it alive.
>
> I view it differently: a checkpoint held in memory is like a kernel resource,
> and requires a handle/identifier for user space to refer to it. Like an IPC
> object. Why tie that object to a specific file descriptor ?
> The only exception I can see, is the need to tie it to a some process - the
> checkpointer for instance, such that if that process dies without completing
> the work, the checkpoint image in memory will be cleaned up.
> That, however, still is problematic, because it will not allow you to use
> different procesess for different steps (above).
>
> Since we are not yet optimizing the checkpoint procedure, just building the
> infrastructure, my goal is to convince that a CRID is a desired feature (and
> I can certainly see how it will be used in various scenarios).
Here is probably the source of the misunderstanding. I was assuming that step #3
needed a file descriptor to dump the checkpoint progressively, but reading your
first use-case more carefully might have avoided this misunderstanding :)
Anyway, we can still give a fd to sys_checkpoint() which will identify the
checkpoint for the remaining operations. It's up to userspace to show the
difference between two checkpoints taken (roughly) at the same time. From the
kernel point of view, a file descriptor is enough to make the difference.
Let's consider the three use cases of CRID you mentioned earlier:
1) Checkpointing in memory:
Actually, checkpointing in memory could also be done from userspace using tmpfs.
Again, I agree that this kind of optimization should be discussed later. I'm
just not convinced that this needs a CRID...
2) Reducing downtime of the checkpoint:
If reducing downtime is just a matter of avoiding disk accesses, tmpfs is again
a kind of solution. It even allows swapping if the checkpoint size is too big.
What kind of scenario (other than incremental checkpointing) do you envision
where multiple calls to sys_checkpoint() would use the same checkpoint object?
3) Incremental checkpoint:
I agree that maintaining a fd alive (in a checkpointer daemon for instance) may
look restrictive, but I'm not sure that it is really needed to keep it alive
between consecutive incremental checkpoints. I'd really like to see incremental
checkpointing as an append operation to a checkpoint file. This way the file
could contain the entire checkpoint history. On the other hand, you are not sure
that we could do incremental checkpoint this way, which justifies your need for
a CRID. Perhaps you have an example?
Anyway, do not take this as an attack. I just want to be well convinced that
CRIDs are really needed, and are worth the effort of managing them cleanly.
Exposing them to userspace just scares me a bit.
Btw, if we ever decide to use CRIDs, I'd propose to manage them in some
pseudo-filesystem, like SYSV IPC objects actually are.
Thanks,
Louis
--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes
[-- Attachment #1.2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]
[-- Attachment #2: Type: text/plain, Size: 206 bytes --]
_______________________________________________
Containers mailing list
Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
https://lists.linux-foundation.org/mailman/listinfo/containers
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <20080731175058.GI22403-Hu8+6S1rdjywhHL9vcZdMVaTQe2KTcn/@public.gmane.org>
@ 2008-07-31 19:12 ` Oren Laadan
[not found] ` <48920EA0.1060608-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
0 siblings, 1 reply; 37+ messages in thread
From: Oren Laadan @ 2008-07-31 19:12 UTC (permalink / raw)
To: Louis.Rilling-aw0BnHfMbSpBDgjK7y7TUQ; +Cc: Linux Containers
Louis Rilling wrote:
> On Thu, Jul 31, 2008 at 12:28:57PM -0400, Oren Laadan wrote:
>>
>> Louis Rilling wrote:
>>> On Thu, Jul 31, 2008 at 11:09:54AM -0400, Oren Laadan wrote:
>>>> Louis Rilling wrote:
>>>>> On Wed, Jul 30, 2008 at 06:20:32PM -0400, Oren Laadan wrote:
>>>>>> Serge E. Hallyn wrote:
>>>>>>> Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
>>>>>>>> +int do_checkpoint(struct cr_ctx *ctx)
>>>>>>>> +{
>>>>>>>> + int ret;
>>>>>>>> +
>>>>>>>> + /* FIX: need to test whether container is checkpointable */
>>>>>>>> +
>>>>>>>> + ret = cr_write_hdr(ctx);
>>>>>>>> + if (!ret)
>>>>>>>> + ret = cr_write_task(ctx, current);
>>>>>>>> + if (!ret)
>>>>>>>> + ret = cr_write_tail(ctx);
>>>>>>>> +
>>>>>>>> + /* on success, return (unique) checkpoint identifier */
>>>>>>>> + if (!ret)
>>>>>>>> + ret = ctx->crid;
>>>>>>> Does this crid have a purpose?
>>>>>> yes, at least three; both are for the future, but important to set the
>>>>>> meaning of the return value of the syscall already now. The "crid" is
>>>>>> the CR-identifier that identifies the checkpoint. Every checkpoint is
>>>>>> assigned a unique number (using an atomic counter).
>>>>>>
>>>>>> 1) if a checkpoint is taken and kept in memory (instead of to a file) then
>>>>>> this will be the identifier with which the restart (or cleanup) would refer
>>>>>> to the (in memory) checkpoint image
>>>>>>
>>>>>> 2) to reduce downtime of the checkpoint, data will be aggregated on the
>>>>>> checkpoint context, as well as referenced to (cow-ed) pages. This data can
>>>>>> persist between calls to sys_checkpoint(), and the 'crid', again, will be
>>>>>> used to identify the (in-memory-to-be-dumped-to-storage) context.
>>>>>>
>>>>>> 3) for incremental checkpoint (where a successive checkpoint will only
>>>>>> save what has changed since the previous checkpoint) there will be a need
>>>>>> to identify the previous checkpoints (to be able to know where to take
>>>>>> data from during restart). Again, a 'crid' is handy.
>>>>>>
>>>>>> [in fact, for the 3rd use, it will make sense to write that number as
>>>>>> part of the checkpoint image header]
>>>>>>
>>>>>> Note that by doing so, a process that checkpoints itself (in its own
>>>>>> context), can use code that is similar to the logic of fork():
>>>>>>
>>>>>> ...
>>>>>> crid = checkpoint(...);
>>>>>> switch (crid) {
>>>>>> case -1:
>>>>>> perror("checkpoint failed");
>>>>>> break;
>>>>>> default:
>>>>>> fprintf(stderr, "checkpoint succeeded, CRID=%d\n", ret);
>>>>>> /* proceed with execution after checkpoint */
>>>>>> ...
>>>>>> break;
>>>>>> case 0:
>>>>>> fprintf(stderr, "returned after restart\n");
>>>>>> /* proceed with action required following a restart */
>>>>>> ...
>>>>>> break;
>>>>>> }
>>>>>> ...
>>>>> If I understand correctly, this crid can live for quite a long time. So many of
>>>>> them could be generated while some container would accumulate incremental
>>>>> checkpoints on, say crid 5, and possibly crid 5 could be reused for another
>>>>> unrelated checkpoint during that time. This brings the issue of allocating crids
>>>>> reliably (using something like a pidmap for instance). Moreover, if such ids are
>>>>> exposed to userspace, we need to remember which ones are allocated accross
>>>>> reboots and migrations.
>>>>>
>>>>> I'm afraid that this becomes too complex...
>>>> And I'm afraid I didn't explain myself well. So let me rephrase:
>>>>
>>>> CRIDs are always _local_ to a specific node. The local CRID counter is
>>>> bumped (atomically) with each checkpoint attempt. The main use case is
>>>> for when the checkpoint is kept is memory either shortly (until it is
>>>> written back to disk) or for a longer time (use-cases that want to keep
>>>> it there). It only remains valid as long as the checkpoint image is
>>>> still in memory and have not been committed to storage/network. Think
>>>> of it as a way to identify the operation instance.
>>>>
>>>> So they can live quite a long time, but only as long as the original
>>>> node is still alive and the checkpoint is still kept in memory. They
>>>> are meaningless across reboots and migrations. I don't think a wrap
>>>> around is a concern, but we can use 64 bit if that is the case.
>>>>
>>>> Finally, the incremental checkpoint use-case: imagine a container that
>>>> is checkpointed regularly every minutes. The first checkpoint will be
>>>> a full checkpoint, say CRID=1. The second will be incremental with
>>>> respect to the first, with CRID=2, and so on the third and the forth.
>>>> Userspace could use these CRID to name the image files (for example,
>>>> app.img.CRID). Assume that we decide (big "if") that the convention is
>>>> that the last part of the filename must be the CRID, and if we decide
>>>> (another big "if") to save the CRID as part of the checkpoint image --
>>>> the part that describe the "incremental nature" of a new checkpoint.
>>>> (That part would specify where to get state that wasn't really saved
>>>> in the new checkpoint but instead can be retrieved from older ones).
>>>> If that was the case, then the logic in the kernel would be fairly
>>>> to find (and access) the actual files that hold the data. Note, that
>>>> in this case - the CRID are guaranteed to be unique per series of
>>>> incremental checkpoints, and incremental chekcpoint is meaningless
>>>> across reboots (and we can require that across migration too).
>>> Letting the kernel guess where to find the missing data of an incremental
>>> checkpoint seems a bit hazardous indeed. What about just appending incremental
>>> checkpoints to the last full checkpoint file?
>> It isn't quite a "guess", it's like the kernel assumes that a kernel-helper
>> resides in some directory - it's a convention. I agree, though, that it may
>> not be the best method to do it.
>>
>> As for putting everything in a single file, I prefer not to do that, and it
>> may not even always possible I believe.
>>
>> An incremental would include a section that describes how to find the missing
>> data from previous checkpoints, so it must have a way to identify a previous
>> checkpoint.
>>
>> On way is like I suggested name them with this identifier, another would be,
>> for example, that the user provides a list of file-descriptors that match
>> the required identifiers. Other ways may be possible too.
>>
>> In any event, I think it is now bit early to discuss the exact format and
>> logic, when we don't even have a simple checkpoint working :)
>>
>> Incremental checkpoint is one of a few reasons to use CRIDs, let us first
>> agree about CRIDs, and later, when we design incremental checkpoints, decide
>> on the technical details of incorporating this CRIDs.
>>
>
> Agreed, but since your point is to introduce CRIDs, I'd like to be convinced
> that they are needed :) At least I'd like to be convinced that they will not
> generate hard-to-manage side effects.
>
>> (Just to avoid confusion, an incremental checkpoint is _not_ a pre-copy or
>> live-migration: in a pre-copy, we repeatedly copy the state of the container
>> without freezing it until the delta is small enough, then we freeze and then
>> we checkpoint the remaining residues. All this activity belongs to a single
>> checkpoint. In incremental checkpoints, we talk about multiple checkpoints
>> that save only the delta with respect to their preceding checkpoint).
>
> Don't worry, I know what incremental checkpointing is.
>
>>>> We probably don't want to use something like a pid to identify the
>>>> checkpoint (while in memory), because we may have multiple checkpoints
>>>> in memory at a time (of the same container).
>>> Agreed.
>>>
>>>>> It would be way easier if the only (kernel-level) references to a checkpoint
>>>>> were pointers to its context. Ideally, the only reference would live in a
>>>>> 'struct container' and would be easily updated at restart-time.
>>>> Consider the following scenario of calls from user-space (which is
>>>> how I envision the checkpoint optimized for minimal downtime, in the
>>>> future):
>>>>
>>>> 1) while (syscall_to_do_precopy) <- do precopy until ready to
>>>> if (too_long_already) <- checkpoint or too long
>>>> break;
>>>>
>>>> 2) freeze_container();
>>>>
>>>> 3) crid = checkpoint(.., .., CR_CKPT_LAZY); <- checkpoint container
>>>> <- don't commit to disk
>>>> <- (minimize owntime)
>>>>
>>>> 4) unfreeze_container(); <- now can unfreeze container
>>>> <- already as soon as possible
>>>>
>>>> 5) ckpt_writeback(crid, fd); <- container is back running. we
>>>> <- can commit data to storage or
>>>> <- network in the background.
>>>>
>>>> #2 and #4 are done with freezer_cgroup()
>>>>
>>>> #1, #3 and #5 must be syscalls
>>>>
>>>> More specifically, syscall #5 must be able to refer to the result of syscall #3
>>>> (that is the CRID !). It is possible that another syscall #3 occur, on the same
>>>> container, between steps 4 and 5 ... but then that checkpoint will be assigned
>>>> another, unique CRID.
>>> Hm, assuming that, as proposed above, incremental checkpoints are stored in the
>>> same file as the ancestor full checkpoint, why not simply give fd as argument in
>>> #5? I'd expect that the kernel would associate the file descriptor to the
>>> checkpoint until it is finalized (written back, sent over the wire, etc.).
>> The above procedure, step 1-5 are for a _single_ checkpoint.
>
> This is what I understood.
>
>> Why would the kernel associate a file descriptor with the checkpoint until it
>> is finalized ? As far as I'm concerned, the checkpoint call in step 3 can go
>> without any FD. Also, what happens if there is another checkpoint, of the
>> same container, taken between steps 4 and 5, how would you tell the difference
>> or select which one goes in first ? Finally, keeping that FD alive between
>> multiple checkpoints would require the checkpointer (e.g. a daemon that will
>> periodically checkpoint) to keep it alive.
>>
>> I view it differently: a checkpoint held in memory is like a kernel resource,
>> and requires a handle/identifier for user space to refer to it. Like an IPC
>> object. Why tie that object to a specific file descriptor ?
>> The only exception I can see, is the need to tie it to a some process - the
>> checkpointer for instance, such that if that process dies without completing
>> the work, the checkpoint image in memory will be cleaned up.
>> That, however, still is problematic, because it will not allow you to use
>> different procesess for different steps (above).
>>
>> Since we are not yet optimizing the checkpoint procedure, just building the
>> infrastructure, my goal is to convince that a CRID is a desired feature (and
>> I can certainly see how it will be used in various scenarios).
>
> Here is probably the source of the misunderstanding. I was assuming that step #3
> needed a file descriptor to dump the checkpoint progressively, but reading your
> first use-case more carefully might have avoided this misunderstanding :)
Even without the first use-case (checkpoint in memory), step 3 does not
necessarily need a file-descriptor to which data will be dumped, in the case of
said optimization. Consider a scenario with periodic checkpointing of a long
running application, where we would like to minimize the downtime of the
application due to each checkpoint. The idea is to do steps 1 and 3 entirely
in memory, keep the data in a buffer (see below comment about tmpfs). The
expensive operation of streaming the data to the file-descriptor is only
done in step 5.
(In the case of checkpoint in memory - it is never written to a file. There
are various optimizations to do there for fast restart for which putting the
data in a file doesn't make sense).
As for using tmpfs -- so during step 3 the state of all tasks is saved; part
of it is headers, task data, signals etc, but mostly the memory content. For
as long as the checkpoint is kept in memory (either because it is meant to
stay there, or because it is not committed to the file-descriptor yet), there
is no reason to make a copy of each (dirty) page. On the contrary - the pages
will be marked COW and a reference will be kept, as part of the checkpoint
context. Sure, you can put the rest of the data in a file in tmpfs; but you
probably don't want to copy all the pages to a file in tmpfs - that would be
wasteful.
> Anyway, we can still give a fd to sys_checkpoint() which will identify the
> checkpoint for the remaining operations. It's up to userspace to show the
> difference between two checkpoints taken (roughly) at the same time. From the
> kernel point of view, a file descriptor is enough to make the difference.
That is indeed an option. I haven't given a lot of thought to this approach,
because in Zap I use CRIDs. Three points against this approach are that:
(1) as I said, that would require that the file descriptor remains alive for
as long as we want to keep the checkpoint alive (in memory), and
(2) if the checkpoint is taken by a process from within the container, we
create a situation where a resource held by the process (an FD), is referring
to the checkpoint itself and at the same time also referred to by the
checkpoint (because it is part of the state of a process that is in the
container...). In particular this will necessitate some special case treatment
during the restart operation.
(3) if a given task wants to keep many checkpoints in memory (again, either
permanently or shortly), it will have to keep, forever, a lot of open file
descriptors.
On the other hand, using an FD provides the advantage of a simple cleanup (FD
closed -> checkpoint data discarded) and rids us of the need to come up
with a cleanup strategy.
>
> Let's consider the three use cases of CRID you mentioned earlier:
>
> 1) Checkpointing in memory:
> Actually, checkpointing in memory could also be done from userspace using tmpfs.
> Again, I agree that this kind of optimization should be discussed later. I'm
> just not convinced that this needs a CRID...
See my comment above regarding tmpfs. You are right, however, in that we could
use an FD to tmpfs where the rest of the data (not pages) will be stored.
>
> 2) Reducing downtime of the checkpoint:
> If reducing downtime is just a matter of avoiding disk accesses, tmpfs is again
> a kind of solution. It even allows to swap if the checkpoint size is too big.
> What kind of scenario (other than incremental checkpointing) do you envision
> where multiple calls to sys_checkpoint() would use the same checkpoint object?
Again, see the comment regarding tmpfs. The actual memory copy operation between
the real pages and the space allocated in tmpfs can take substantial time for
applications with large memory (compared to merely marking the pages COW, and
amortizing the cost during regular execution of the application), besides the
extra space overhead. Also, writing tmpfs incurs visible overhead when you care
about milliseconds of downtime; I've seen that with Zap.
> 3) Incremental checkpoint:
> I agree that maintaing a fd alive (in a checkpointer daemon for instance) may
> look restrictive, but I'm not sure that it is really needed to keep it alive
> between consecutive incremental checkpoints. I'd really like to see incremental
> checkpointing as an append operation to a checkpoint file. This way the file
Why ? What's the advantage of having all data in a single file as opposed to
multiple files ?
Recall that the data can be streamed, so when you start to read a file you
don't know a priori how long the checkpoint image is until you have parsed
it all. So you can't easily find the beginning of, say, the 15th checkpoint
in that case.
Depending on the size of your checkpoint, a single file may eventually become
very large in a short time. I have one system that takes a checkpoint every
second of an entire user-desktop ...
One single large file is harder to manage, parse, and inspect, even with
proper user tools. If you wanted to change something inside (for whatever
reasons), that would be difficult to do. Same goes for when you want to
coalesce multiple checkpoints into a single checkpoint (e.g. to save space,
or because you don't care about some of your past)
Ahh.. ok.. I stop here. This is not related to CRID vs. FD anymore :)
> could contain the entire checkpoint history. On the other hand, you are not sure
> that we could do incremental checkpoint this way, which justifies your need for
> a CRID. Perhaps you have an example?
Arguments given above. Note that even with multiple files we don't _need_
CRID, they are merely helpful. Instead, the user could be required to provide
the kernel with an array of file names, corresponding to checkpoint#0 (base),
checkpoint#2, checkpoint#3 etc; In this case, the "incremental state" that
is saved with checkpoint#4, is (a) that it is #4, and (b) for each part of
state that is found in a previous checkpoint, a reference to the serial no.
of that checkpoint is kept.
(The proposal for CRID was that instead of a serial number that starts from
0 with every full (base) checkpoint, we use the CRID).
>
> Anyway, do not take this as an attack. I just want to be well convinced that
On the contrary; your comments are definitely in place.
> CRIDs are really needed, and are worth the effort of managing them cleanly.
> Exposing them to userspace just scares me a bit.
I'm not sure why there is an "effort of managing" them. It's a simple
atomic counter that won't wrap around (use 64 bit if we wish). All in-memory
checkpoint contexts will (also) be in a global linked list and easily located
there by their CRID.
>
> Btw, if we ever decide to use CRIDs, I'd propose to manage them in some
> pseudo-filesystem, like SYSV IPC objects actually are.
Eventually, yes ;)
> Thanks,
>
> Louis
>
Thanks for the comments and stimulating the discussion.
Oren.
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <4890E930.9090204-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2008-07-31 13:57 ` Louis Rilling
@ 2008-07-31 21:25 ` Serge E. Hallyn
1 sibling, 0 replies; 37+ messages in thread
From: Serge E. Hallyn @ 2008-07-31 21:25 UTC (permalink / raw)
To: Oren Laadan; +Cc: Linux Containers
Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
>
>
> Serge E. Hallyn wrote:
>> Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
>>> +int do_checkpoint(struct cr_ctx *ctx)
>>> +{
>>> + int ret;
>>> +
>>> + /* FIX: need to test whether container is checkpointable */
>>> +
>>> + ret = cr_write_hdr(ctx);
>>> + if (!ret)
>>> + ret = cr_write_task(ctx, current);
>>> + if (!ret)
>>> + ret = cr_write_tail(ctx);
>>> +
>>> + /* on success, return (unique) checkpoint identifier */
>>> + if (!ret)
>>> + ret = ctx->crid;
>>
>> Does this crid have a purpose?
>
> yes, at least three; both are for the future, but important to set the
> meaning of the return value of the syscall already now. The "crid" is
> the CR-identifier that identifies the checkpoint. Every checkpoint is
> assigned a unique number (using an atomic counter).
>
> 1) if a checkpoint is taken and kept in memory (instead of to a file) then
> this will be the identifier with which the restart (or cleanup) would refer
> to the (in memory) checkpoint image
>
> 2) to reduce downtime of the checkpoint, data will be aggregated on the
> checkpoint context, as well as referenced to (cow-ed) pages. This data can
> persist between calls to sys_checkpoint(), and the 'crid', again, will be
> used to identify the (in-memory-to-be-dumped-to-storage) context.
>
> 3) for incremental checkpoint (where a successive checkpoint will only
> save what has changed since the previous checkpoint) there will be a need
> to identify the previous checkpoints (to be able to know where to take
> data from during restart). Again, a 'crid' is handy.
>
> [in fact, for the 3rd use, it will make sense to write that number as
> part of the checkpoint image header]
>
> Note that by doing so, a process that checkpoints itself (in its own
> context), can use code that is similar to the logic of fork():
>
> ...
> crid = checkpoint(...);
> switch (crid) {
> case -1:
> perror("checkpoint failed");
> break;
> default:
> fprintf(stderr, "checkpoint succeeded, CRID=%d\n", ret);
> /* proceed with execution after checkpoint */
> ...
> break;
> case 0:
> fprintf(stderr, "returned after restart\n");
> /* proceed with action required following a restart */
> ...
> break;
> }
> ...
Thanks - for this and the later explanations in replies to Louis.
Really I had no doubt it had a purpose :) but wasn't sure what it was.
Quite clear now. Thanks.
-serge
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <48920EA0.1060608-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2008-08-01 10:26 ` Louis Rilling
[not found] ` <20080801102600.GJ22403-Hu8+6S1rdjywhHL9vcZdMVaTQe2KTcn/@public.gmane.org>
0 siblings, 1 reply; 37+ messages in thread
From: Louis Rilling @ 2008-08-01 10:26 UTC (permalink / raw)
To: Oren Laadan; +Cc: Linux Containers
[-- Attachment #1.1: Type: text/plain, Size: 23930 bytes --]
On Thu, Jul 31, 2008 at 03:12:32PM -0400, Oren Laadan wrote:
>
>
> Louis Rilling wrote:
>> On Thu, Jul 31, 2008 at 12:28:57PM -0400, Oren Laadan wrote:
>>>
>>> Louis Rilling wrote:
>>>> On Thu, Jul 31, 2008 at 11:09:54AM -0400, Oren Laadan wrote:
>>>>> Louis Rilling wrote:
>>>>>> On Wed, Jul 30, 2008 at 06:20:32PM -0400, Oren Laadan wrote:
>>>>>>> Serge E. Hallyn wrote:
>>>>>>>> Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
>>>>>>>>> +int do_checkpoint(struct cr_ctx *ctx)
>>>>>>>>> +{
>>>>>>>>> + int ret;
>>>>>>>>> +
>>>>>>>>> + /* FIX: need to test whether container is checkpointable */
>>>>>>>>> +
>>>>>>>>> + ret = cr_write_hdr(ctx);
>>>>>>>>> + if (!ret)
>>>>>>>>> + ret = cr_write_task(ctx, current);
>>>>>>>>> + if (!ret)
>>>>>>>>> + ret = cr_write_tail(ctx);
>>>>>>>>> +
>>>>>>>>> + /* on success, return (unique) checkpoint identifier */
>>>>>>>>> + if (!ret)
>>>>>>>>> + ret = ctx->crid;
>>>>>>>> Does this crid have a purpose?
>>>>>>> yes, at least three; both are for the future, but important to set the
>>>>>>> meaning of the return value of the syscall already now. The "crid" is
>>>>>>> the CR-identifier that identifies the checkpoint. Every checkpoint is
>>>>>>> assigned a unique number (using an atomic counter).
>>>>>>>
>>>>>>> 1) if a checkpoint is taken and kept in memory (instead of to a file) then
>>>>>>> this will be the identifier with which the restart (or cleanup) would refer
>>>>>>> to the (in memory) checkpoint image
>>>>>>>
>>>>>>> 2) to reduce downtime of the checkpoint, data will be aggregated on the
>>>>>>> checkpoint context, as well as referenced to (cow-ed) pages. This data can
>>>>>>> persist between calls to sys_checkpoint(), and the 'crid', again, will be
>>>>>>> used to identify the (in-memory-to-be-dumped-to-storage) context.
>>>>>>>
>>>>>>> 3) for incremental checkpoint (where a successive checkpoint will only
>>>>>>> save what has changed since the previous checkpoint) there will be a need
>>>>>>> to identify the previous checkpoints (to be able to know where to take
>>>>>>> data from during restart). Again, a 'crid' is handy.
>>>>>>>
>>>>>>> [in fact, for the 3rd use, it will make sense to write that number as
>>>>>>> part of the checkpoint image header]
>>>>>>>
>>>>>>> Note that by doing so, a process that checkpoints itself (in its own
>>>>>>> context), can use code that is similar to the logic of fork():
>>>>>>>
>>>>>>> ...
>>>>>>> crid = checkpoint(...);
>>>>>>> switch (crid) {
>>>>>>> case -1:
>>>>>>> perror("checkpoint failed");
>>>>>>> break;
>>>>>>> default:
>>>>>>> fprintf(stderr, "checkpoint succeeded, CRID=%d\n", ret);
>>>>>>> /* proceed with execution after checkpoint */
>>>>>>> ...
>>>>>>> break;
>>>>>>> case 0:
>>>>>>> fprintf(stderr, "returned after restart\n");
>>>>>>> /* proceed with action required following a restart */
>>>>>>> ...
>>>>>>> break;
>>>>>>> }
>>>>>>> ...
>>>>>> If I understand correctly, this crid can live for quite a long time. So many of
>>>>>> them could be generated while some container would accumulate incremental
>>>>>> checkpoints on, say crid 5, and possibly crid 5 could be reused for another
>>>>>> unrelated checkpoint during that time. This brings the issue of allocating crids
>>>>>> reliably (using something like a pidmap for instance). Moreover, if such ids are
>>>>>> exposed to userspace, we need to remember which ones are allocated accross
>>>>>> reboots and migrations.
>>>>>>
>>>>>> I'm afraid that this becomes too complex...
>>>>> And I'm afraid I didn't explain myself well. So let me rephrase:
>>>>>
>>>>> CRIDs are always _local_ to a specific node. The local CRID counter is
>>>>> bumped (atomically) with each checkpoint attempt. The main use case is
>>>>> for when the checkpoint is kept is memory either shortly (until it is
>>>>> written back to disk) or for a longer time (use-cases that want to keep
>>>>> it there). It only remains valid as long as the checkpoint image is
>>>>> still in memory and have not been committed to storage/network. Think
>>>>> of it as a way to identify the operation instance.
>>>>>
>>>>> So they can live quite a long time, but only as long as the original
>>>>> node is still alive and the checkpoint is still kept in memory. They
>>>>> are meaningless across reboots and migrations. I don't think a wrap
>>>>> around is a concern, but we can use 64 bit if that is the case.
>>>>>
>>>>> Finally, the incremental checkpoint use-case: imagine a container that
>>>>> is checkpointed regularly every minutes. The first checkpoint will be
>>>>> a full checkpoint, say CRID=1. The second will be incremental with
>>>>> respect to the first, with CRID=2, and so on the third and the forth.
>>>>> Userspace could use these CRID to name the image files (for example,
>>>>> app.img.CRID). Assume that we decide (big "if") that the convention is
>>>>> that the last part of the filename must be the CRID, and if we decide
>>>>> (another big "if") to save the CRID as part of the checkpoint image --
>>>>> the part that describe the "incremental nature" of a new checkpoint.
>>>>> (That part would specify where to get state that wasn't really saved
>>>>> in the new checkpoint but instead can be retrieved from older ones).
>>>>> If that was the case, then the logic in the kernel would be fairly
>>>>> to find (and access) the actual files that hold the data. Note, that
>>>>> in this case - the CRID are guaranteed to be unique per series of
>>>>> incremental checkpoints, and incremental chekcpoint is meaningless
>>>>> across reboots (and we can require that across migration too).
>>>> Letting the kernel guess where to find the missing data of an incremental
>>>> checkpoint seems a bit hazardous indeed. What about just appending incremental
>>>> checkpoints to the last full checkpoint file?
>>> It isn't quite a "guess", it's like the kernel assumes that a kernel-helper
>>> resides in some directory - it's a convention. I agree, though, that it may
>>> not be the best method to do it.
>>>
>>> As for putting everything in a single file, I prefer not to do that, and it
>>> may not even always possible I believe.
>>>
>>> An incremental would include a section that describes how to find the missing
>>> data from previous checkpoints, so it must have a way to identify a previous
>>> checkpoint.
>>>
>>> On way is like I suggested name them with this identifier, another would be,
>>> for example, that the user provides a list of file-descriptors that match
>>> the required identifiers. Other ways may be possible too.
>>>
>>> In any event, I think it is now bit early to discuss the exact format and
>>> logic, when we don't even have a simple checkpoint working :)
>>>
>>> Incremental checkpoint is one of a few reasons to use CRIDs, let us first
>>> agree about CRIDs, and later, when we design incremental checkpoints, decide
>>> on the technical details of incorporating this CRIDs.
>>>
>>
>> Agreed, but since your point is to introduce CRIDs, I'd like to be convinced
>> that they are needed :) At least I'd like to be convinced that they will not
>> generate hard-to-manage side effects.
>>
>>> (Just to avoid confusion, an incremental checkpoint is _not_ a pre-copy or
>>> live-migration: in a pre-copy, we repeatedly copy the state of the container
>>> without freezing it until the delta is small enough, then we freeze and then
>>> we checkpoint the remaining residues. All this activity belongs to a single
>>> checkpoint. In incremental checkpoints, we talk about multiple checkpoints
>>> that save only the delta with respect to their preceding checkpoint).
>>
>> Don't worry, I know what incremental checkpointing is.
>>
>>>>> We probably don't want to use something like a pid to identify the
>>>>> checkpoint (while in memory), because we may have multiple checkpoints
>>>>> in memory at a time (of the same container).
>>>> Agreed.
>>>>
>>>>>> It would be way easier if the only (kernel-level) references to a checkpoint
>>>>>> were pointers to its context. Ideally, the only reference would live in a
>>>>>> 'struct container' and would be easily updated at restart-time.
>>>>> Consider the following scenario of calls from user-space (which is
>>>>> how I envision the checkpoint optimized for minimal downtime, in the
>>>>> future):
>>>>>
>>>>> 1) while (syscall_to_do_precopy) <- do precopy until ready to
>>>>> if (too_long_already) <- checkpoint or too long
>>>>> break;
>>>>>
>>>>> 2) freeze_container();
>>>>>
>>>>> 3) crid = checkpoint(.., .., CR_CKPT_LAZY); <- checkpoint container
>>>>> <- don't commit to disk
>>>>> <- (minimize owntime)
>>>>>
>>>>> 4) unfreeze_container(); <- now can unfreeze container
>>>>> <- already as soon as possible
>>>>>
>>>>> 5) ckpt_writeback(crid, fd); <- container is back running. we
>>>>> <- can commit data to storage or
>>>>> <- network in the background.
>>>>>
>>>>> #2 and #4 are done with freezer_cgroup()
>>>>>
>>>>> #1, #3 and #5 must be syscalls
>>>>>
>>>>> More specifically, syscall #5 must be able to refer to the result of syscall #3
>>>>> (that is the CRID !). It is possible that another syscall #3 occur, on the same
>>>>> container, between steps 4 and 5 ... but then that checkpoint will be assigned
>>>>> another, unique CRID.
>>>> Hm, assuming that, as proposed above, incremental checkpoints are stored in the
>>>> same file as the ancestor full checkpoint, why not simply give fd as argument in
>>>> #5? I'd expect that the kernel would associate the file descriptor to the
>>>> checkpoint until it is finalized (written back, sent over the wire, etc.).
>>> The above procedure, step 1-5 are for a _single_ checkpoint.
>>
>> This is what I understood.
>>
>>> Why would the kernel associate a file descriptor with the checkpoint until it
>>> is finalized ? As far as I'm concerned, the checkpoint call in step 3 can go
>>> without any FD. Also, what happens if there is another checkpoint, of the
>>> same container, taken between steps 4 and 5, how would you tell the difference
>>> or select which one goes in first ? Finally, keeping that FD alive between
>>> multiple checkpoints would require the checkpointer (e.g. a daemon that will
>>> periodically checkpoint) to keep it alive.
>>>
>>> I view it differently: a checkpoint held in memory is like a kernel resource,
>>> and requires a handle/identifier for user space to refer to it. Like an IPC
>>> object. Why tie that object to a specific file descriptor ?
>>> The only exception I can see, is the need to tie it to a some process - the
>>> checkpointer for instance, such that if that process dies without completing
>>> the work, the checkpoint image in memory will be cleaned up.
>>> That, however, still is problematic, because it will not allow you to use
>>> different processes for different steps (above).
>>>
>>> Since we are not yet optimizing the checkpoint procedure, just building the
>>> infrastructure, my goal is to convince that a CRID is a desired feature (and
>>> I can certainly see how it will be used in various scenarios).
>>
>> Here is probably the source of the misunderstanding. I was assuming that step #3
>> needed a file descriptor to dump the checkpoint progressively, but reading your
>> first use-case more carefully might have avoided this misunderstanding :)
>
> Even without the first use-case (checkpoint in memory), step 3 does not need
> necessarily a file-descriptor to which data will be dumped, in the case of
> said optimization. Consider a scenario with periodic checkpointing of a long
> running application, where we would like to minimize the downtime of the
> application due to each checkpoint. The idea is to do steps 1 and 3 entirely
> in memory, keep the data in a buffer (see below comment about tmpfs). The
> expensive operation of streaming the data to the file-descriptor is only
> done in step 5.
>
> (In the case of checkpoint in memory - it is never written to a file. There
> are various optimization to do there for fast restart for which putting the
> data in a file doesn't make sense).
>
> As for using tmpfs -- so during step 3 the state of all tasks is saved; part
> of it is headers, task data, signals etc, but mostly the memory content. For
> as long as the checkpoint is kept in memory (either because it is meant to
> stay there, or because it is not committed to the file-descriptor yet), there
> is no reason to make a copy of each (dirty) page. On the contrary - the pages
> will be marked COW and a reference will be kept, as part of the checkpoint
> context. Sure, you can put the rest of the data in a file in tmpfs; but you
> probably don't want to copy all the pages to a file in tmpfs - that would be
> wasteful.
I think that memory pages need not be dumped in step #3. They can be kept
just as you mentioned in COW state in the checkpoint context, and be really
dumped only in step #5.
>
>> Anyway, we can still give a fd to sys_checkpoint() which will identify the
>> checkpoint for the remaining operations. It's up to userspace to show the
>> difference between two checkpoints taken (roughly) at the same time. From the
>> kernel point of view, a file descriptor is enough to make the difference.
>
> That is indeed an option. I haven't given a lot of thought to this approach,
> because in Zap I use CRIDs. Three points against this approach are that:
>
> (1) as I said, that would require that the file descriptor remains alive for
> as long as we want to keep the checkpoint alive (in memory), and
Not sure that this is so bad. The checkpointer can transfer the descriptor
to some daemon using the file descriptor transfer feature of UNIX sockets, and
then freely exit.
>
> (2) if the checkpoint is taken by a process from within the container, we
> create a situation where a resource held by the process (an FD), is referring
> to the checkpoint itself and at the same time also referred to by the
> checkpoint (because it is part of the state of a process that is in the
> container...). In particular this will necessitate some special case treatment
> during the restart operation.
Interesting case. This means that the checkpointer would be checkpointed while
inside sys_checkpoint(), and would possibly try to writeback the checkpoint
after restart (going to step #5 as if it was not restarted). So the special
handling is already needed there, right? Like making sys_checkpoint() return an
error upon restart. I'm not sure that the checkpoint fd should really need a
special handling in the special case of self-checkpointing, because the
checkpointer should probably not try to do anything with this checkpoint after a
restart, unless it reopens the checkpoint file for appending new incremental
checkpoints.
Anyway, we are trying to solve an issue that was explicitly forbidden in
previous discussions IIUC, because the whole container is assumed to be frozen
before calling sys_checkpoint(), which means that the checkpointer should live
outside of the container.
>
> (3) if a given task wants to keep many checkpoints in memory (again, either
> permanently or shortly), it will have to keep, forever, a lot of open file
> descriptors.
The only problem I see here is the limitation on the number of file descriptors.
Hm, hundreds of checkpoints in memory looks like memory wastage in some way.
>
> On the other hand, using an FD provide the advantage of a simple cleanup (FD
> closed -> checkpoint data discarded) and ridding us from the need to come up
> with a cleanup strategy.
We would not get this for free unless we add data for this to the file
descriptor. Adding something like an inotify listener (only used by the kernel)
should also make it.
>
>>
>> Let's consider the three use cases of CRID you mentioned earlier:
>>
>> 1) Checkpointing in memory:
>> Actually, checkpointing in memory could also be done from userspace using tmpfs.
>> Again, I agree that this kind of optimization should be discussed later. I'm
>> just not convinced that this needs a CRID...
>
> See my comment about regarding tmpfs. You are right, however, in that we could
> use FD to tmpfs where the rest of the data (not pages) will be stored.
See my comment above ;)
>
>>
>> 2) Reducing downtime of the checkpoint:
>> If reducing downtime is just a matter of avoiding disk accesses, tmpfs is again
>> a kind of solution. It even allows to swap if the checkpoint size is too big.
>> What kind of scenario (other than incremental checkpointing) do you envision
>> where multiple calls to sys_checkpoint() would use the same checkpoint object?
>
> Again, see the comment regarding tmpfs. The actual memory copy operation between
> the real pages and the space allocated in tmpfs can take substantial time for
> applications with large memory (compared to merely marking the pages COW, and
> amortizing the cost during regular execution of the application), besides the
> extra space overhead. Also, writing tmpfs incurs visible overhead when you care
> about milliseconds of downtime; I've seen that with Zap.
Are those milliseconds related to pages or to the kernel structures also?
>
>> 3) Incremental checkpoint:
>> I agree that maintaining a fd alive (in a checkpointer daemon for instance) may
>> look restrictive, but I'm not sure that it is really needed to keep it alive
>> between consecutive incremental checkpoints. I'd really like to see incremental
>> checkpointing as an append operation to a checkpoint file. This way the file
>
> Why ? What's the advantage of having all data in a single file as opposed to
> multiple files ?
- You do not have to look for the previous checkpoints using a to-be-defined
naming scheme, since they are all in the file.
- Userspace makes fewer errors when managing incremental checkpoints.
- You can easily create new branches by just copying the file, restarting from
it, and adding incremental checkpoints to it. (Not sure this branch feature
is really interesting, but it sounds funny :))
>
> Recall that the data can be streamed, so when you start to read a file you
> don't know a-priori how long is the checkpoint image, until you have parsed
> it all; So you can't easily find the beginning of the, say 15th checkpoint
> in that case.
Good point: in append-only mode, we do not know that there are 15 checkpoints
until we reach the 15th one. Perhaps append-only is too restrictive for
incremental checkpoint. OTOH, do we really want to support a unique stream
having multiple checkpoints? Probably not. So rewrite and append looks like a
better option. An incremental checkpoint procedure could look like this:
err = sys_checkpoint(base_fd, out_fd, ...)
where:
- base_fd is a regular file containing the base checkpoint, or -1 if a full
checkpoint should be done. The checkpoint could actually also live in memory,
and the kernel should check that it matches the image pointed to by base_fd.
- out_fd is whatever file/socket/etc. on which we should dump the checkpoint. In
particular, out_fd can equal base_fd and should point to the beginning of the
file if it's a regular file.
If base_fd is a valid file descriptor, sys_checkpoint() would do this:
#1 check the validity of the checkpoint image (possibly compare with in-memory
checkpoint states),
#2 (over)write the position of the next (coming) checkpoint on out_fd (see
explanations below) and its sequence number as well (this actually makes
sequence counters live in the checkpoint image),
#3 write the contents of base_fd to out_fd, marking the records invalidated by
the current checkpoint on the fly (see explanations below),
#4 write the new incremental checkpoint records.
This assumes that a checkpoint image has a place in the header to tell where the
last checkpoint image is. Eventually, each record (task struct, vma, page, etc.)
should contain a field telling which later incremental checkpoint invalidates
it, so that we can restart from any intermediate checkpoint if we like.
Moreover, each intermediate checkpoint would contain a pointer to the start of
the previous and the next one, so that any intermediate checkpoint can be easily
found. This actually makes step #2 and #3 modify the checkpoint image in place,
whenever base_fd and out_fd point to the same file. This disables streaming for
restarts from an intermediate checkpoint, but I don't think this is a real
issue, unless there are use cases outside live-migration?
>
> Depending on the size of your checkpoint, a single file may eventually become
> very large in a short time. I have one system that takes a checkpoint every
> second of an entire user-desktop ...
>
> One single large file is harder to manage, parse, and inspect, even with
> proper user tools. If you wanted to change something inside (for whatever
> reasons), that would be difficult to do. Same goes for when you want to
> coalesce multiple checkpoints into a single checkpoint (e.g. to save space,
> or because you don't care about some of your past)
Ok, this becomes more complex, but feasible I think (see above).
Coalescing checkpoints seems rather easy as soon as checkpoints records are
tagged with the first checkpoint number that invalidates them.
>
> Ahh.. ok.. I stop here. This is not related to CRID vs. FD anymore :)
You're right. Hopefully it is interesting, although a bit early to discuss :)
>
>> could contain the entire checkpoint history. On the other hand, you are not sure
>> that we could do incremental checkpoint this way, which justifies your need for
>> a CRID. Perhaps you have an example?
>
> Arguments given above. Note that even with multiple files we don't _need_
> CRID, they are merely helpful. Instead, the user could be required to provide
> the kernel with an array of file names, corresponding to checkpoint#0 (base),
> checkpoint#2, checkpoint#3 etc; In this case, the "incremental state" that
> is saved with checkpoint#4, is (a) that it is #4, and (b) for each part of
> state that is found in a previous checkpoint, a reference to the serial no.
> of that checkpoint is kept.
See above for a solution based on a single file.
>
> (The proposal for CRID was that instead of a serial number that starts from
> 0 with every full (base) checkpoint, we use the CRID).
>
>>
>> Anyway, do not take this as an attack. I just want to be well convinced that
>
> On the contrary; your comments are definitely in place.
>
>> CRIDs are really needed, and are worth the effort of managing them cleanly.
>> Exposing them to userspace just scares me a bit.
>
> I'm not sure why is there an "effort of managing" them ? It's a simple
> atomic counter, that won't wrap around (use 64 bit if we wish). All in-memory
> checkpoint contexts will be (also) in global linked list and easily located
> there by their CRID.
Ok, as long as no userspace task holds such IDs across reboot or migration. How
would you check this?
>
>>
>> Btw, if we ever decide to use CRIDs, I'd propose to manage them in some
>> pseudo-filesystem, like SYSV IPC objects actually are.
>
> Eventually, yes ;)
>
>> Thanks,
>>
>> Louis
>>
>
> Thanks for the comments and stimulating the discussion.
I should have had many more discussions like this during my PhD. Yours is going
to be definitely better than mine :)
Thanks,
Louis
--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes
[-- Attachment #1.2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]
[-- Attachment #2: Type: text/plain, Size: 206 bytes --]
_______________________________________________
Containers mailing list
Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
https://lists.linux-foundation.org/mailman/listinfo/containers
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <20080801102600.GJ22403-Hu8+6S1rdjywhHL9vcZdMVaTQe2KTcn/@public.gmane.org>
@ 2008-08-01 14:15 ` Oren Laadan
[not found] ` <48931A7E.1040302-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
0 siblings, 1 reply; 37+ messages in thread
From: Oren Laadan @ 2008-08-01 14:15 UTC (permalink / raw)
To: Louis.Rilling-aw0BnHfMbSpBDgjK7y7TUQ; +Cc: Linux Containers
Louis Rilling wrote:
> On Thu, Jul 31, 2008 at 03:12:32PM -0400, Oren Laadan wrote:
>>
>> Louis Rilling wrote:
>>> On Thu, Jul 31, 2008 at 12:28:57PM -0400, Oren Laadan wrote:
>>>> Louis Rilling wrote:
>>>>> On Thu, Jul 31, 2008 at 11:09:54AM -0400, Oren Laadan wrote:
>>>>>> Louis Rilling wrote:
>>>>>>> On Wed, Jul 30, 2008 at 06:20:32PM -0400, Oren Laadan wrote:
>>>>>>>> Serge E. Hallyn wrote:
>>>>>>>>> Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
>>>>>>>>>> +int do_checkpoint(struct cr_ctx *ctx)
>>>>>>>>>> +{
>>>>>>>>>> + int ret;
>>>>>>>>>> +
>>>>>>>>>> + /* FIX: need to test whether container is checkpointable */
>>>>>>>>>> +
>>>>>>>>>> + ret = cr_write_hdr(ctx);
>>>>>>>>>> + if (!ret)
>>>>>>>>>> + ret = cr_write_task(ctx, current);
>>>>>>>>>> + if (!ret)
>>>>>>>>>> + ret = cr_write_tail(ctx);
>>>>>>>>>> +
>>>>>>>>>> + /* on success, return (unique) checkpoint identifier */
>>>>>>>>>> + if (!ret)
>>>>>>>>>> + ret = ctx->crid;
>>>>>>>>> Does this crid have a purpose?
>>>>>>>> yes, at least three; all are for the future, but important to set the
>>>>>>>> meaning of the return value of the syscall already now. The "crid" is
>>>>>>>> the CR-identifier that identifies the checkpoint. Every checkpoint is
>>>>>>>> assigned a unique number (using an atomic counter).
>>>>>>>>
>>>>>>>> 1) if a checkpoint is taken and kept in memory (instead of to a file) then
>>>>>>>> this will be the identifier with which the restart (or cleanup) would refer
>>>>>>>> to the (in memory) checkpoint image
>>>>>>>>
>>>>>>>> 2) to reduce downtime of the checkpoint, data will be aggregated on the
>>>>>>>> checkpoint context, as well as referenced to (cow-ed) pages. This data can
>>>>>>>> persist between calls to sys_checkpoint(), and the 'crid', again, will be
>>>>>>>> used to identify the (in-memory-to-be-dumped-to-storage) context.
>>>>>>>>
>>>>>>>> 3) for incremental checkpoint (where a successive checkpoint will only
>>>>>>>> save what has changed since the previous checkpoint) there will be a need
>>>>>>>> to identify the previous checkpoints (to be able to know where to take
>>>>>>>> data from during restart). Again, a 'crid' is handy.
>>>>>>>>
>>>>>>>> [in fact, for the 3rd use, it will make sense to write that number as
>>>>>>>> part of the checkpoint image header]
>>>>>>>>
>>>>>>>> Note that by doing so, a process that checkpoints itself (in its own
>>>>>>>> context), can use code that is similar to the logic of fork():
>>>>>>>>
>>>>>>>> ...
>>>>>>>> crid = checkpoint(...);
>>>>>>>> switch (crid) {
>>>>>>>> case -1:
>>>>>>>> perror("checkpoint failed");
>>>>>>>> break;
>>>>>>>> default:
>>>>>>>> fprintf(stderr, "checkpoint succeeded, CRID=%d\n", crid);
>>>>>>>> /* proceed with execution after checkpoint */
>>>>>>>> ...
>>>>>>>> break;
>>>>>>>> case 0:
>>>>>>>> fprintf(stderr, "returned after restart\n");
>>>>>>>> /* proceed with action required following a restart */
>>>>>>>> ...
>>>>>>>> break;
>>>>>>>> }
>>>>>>>> ...
>>>>>>> If I understand correctly, this crid can live for quite a long time. So many of
>>>>>>> them could be generated while some container would accumulate incremental
>>>>>>> checkpoints on, say crid 5, and possibly crid 5 could be reused for another
>>>>>>> unrelated checkpoint during that time. This brings the issue of allocating crids
>>>>>>> reliably (using something like a pidmap for instance). Moreover, if such ids are
>>>>>>> exposed to userspace, we need to remember which ones are allocated accross
>>>>>>> reboots and migrations.
>>>>>>>
>>>>>>> I'm afraid that this becomes too complex...
>>>>>> And I'm afraid I didn't explain myself well. So let me rephrase:
>>>>>>
>>>>>> CRIDs are always _local_ to a specific node. The local CRID counter is
>>>>>> bumped (atomically) with each checkpoint attempt. The main use case is
>>>>>> for when the checkpoint is kept in memory either shortly (until it is
>>>>>> written back to disk) or for a longer time (use-cases that want to keep
>>>>>> it there). It only remains valid as long as the checkpoint image is
>>>>>> still in memory and has not been committed to storage/network. Think
>>>>>> of it as a way to identify the operation instance.
>>>>>>
>>>>>> So they can live quite a long time, but only as long as the original
>>>>>> node is still alive and the checkpoint is still kept in memory. They
>>>>>> are meaningless across reboots and migrations. I don't think a wrap
>>>>>> around is a concern, but we can use 64 bit if that is the case.
>>>>>>
>>>>>> Finally, the incremental checkpoint use-case: imagine a container that
>>>>>> is checkpointed regularly every minute. The first checkpoint will be
>>>>>> a full checkpoint, say CRID=1. The second will be incremental with
>>>>>> respect to the first, with CRID=2, and so on the third and the fourth.
>>>>>> Userspace could use these CRID to name the image files (for example,
>>>>>> app.img.CRID). Assume that we decide (big "if") that the convention is
>>>>>> that the last part of the filename must be the CRID, and if we decide
>>>>>> (another big "if") to save the CRID as part of the checkpoint image --
>>>>>> the part that describe the "incremental nature" of a new checkpoint.
>>>>>> (That part would specify where to get state that wasn't really saved
>>>>>> in the new checkpoint but instead can be retrieved from older ones).
>>>>>> If that was the case, then the logic in the kernel would be fairly simple
>>>>>> to find (and access) the actual files that hold the data. Note, that
>>>>>> in this case - the CRID are guaranteed to be unique per series of
>>>>>> incremental checkpoints, and incremental checkpoint is meaningless
>>>>>> across reboots (and we can require that across migration too).
>>>>> Letting the kernel guess where to find the missing data of an incremental
>>>>> checkpoint seems a bit hazardous indeed. What about just appending incremental
>>>>> checkpoints to the last full checkpoint file?
>>>> It isn't quite a "guess", it's like the kernel assumes that a kernel-helper
>>>> resides in some directory - it's a convention. I agree, though, that it may
>>>> not be the best method to do it.
>>>>
>>>> As for putting everything in a single file, I prefer not to do that, and it
>>>> may not even always be possible I believe.
>>>>
>>>> An incremental would include a section that describes how to find the missing
>>>> data from previous checkpoints, so it must have a way to identify a previous
>>>> checkpoint.
>>>>
>>>> One way is, like I suggested, to name them with this identifier; another would be,
>>>> for example, that the user provides a list of file-descriptors that match
>>>> the required identifiers. Other ways may be possible too.
>>>>
>>>> In any event, I think it is now bit early to discuss the exact format and
>>>> logic, when we don't even have a simple checkpoint working :)
>>>>
>>>> Incremental checkpoint is one of a few reasons to use CRIDs, let us first
>>>> agree about CRIDs, and later, when we design incremental checkpoints, decide
>>>> on the technical details of incorporating this CRIDs.
>>>>
>>> Agreed, but since your point is to introduce CRIDs, I'd like to be convinced
>>> that they are needed :) At least I'd like to be convinced that they will not
>>> generate hard-to-manage side effects.
>>>
>>>> (Just to avoid confusion, an incremental checkpoint is _not_ a pre-copy or
>>>> live-migration: in a pre-copy, we repeatedly copy the state of the container
>>>> without freezing it until the delta is small enough, then we freeze and then
>>>> we checkpoint the remaining residues. All this activity belongs to a single
>>>> checkpoint. In incremental checkpoints, we talk about multiple checkpoints
>>>> that save only the delta with respect to their preceding checkpoint).
>>> Don't worry, I know what incremental checkpointing is.
>>>
>>>>>> We probably don't want to use something like a pid to identify the
>>>>>> checkpoint (while in memory), because we may have multiple checkpoints
>>>>>> in memory at a time (of the same container).
>>>>> Agreed.
>>>>>
>>>>>>> It would be way easier if the only (kernel-level) references to a checkpoint
>>>>>>> were pointers to its context. Ideally, the only reference would live in a
>>>>>>> 'struct container' and would be easily updated at restart-time.
>>>>>> Consider the following scenario of calls from user-space (which is
>>>>>> how I envision the checkpoint optimized for minimal downtime, in the
>>>>>> future):
>>>>>>
>>>>>> 1) while (syscall_to_do_precopy) <- do precopy until ready to
>>>>>> if (too_long_already) <- checkpoint or too long
>>>>>> break;
>>>>>>
>>>>>> 2) freeze_container();
>>>>>>
>>>>>> 3) crid = checkpoint(.., .., CR_CKPT_LAZY); <- checkpoint container
>>>>>> <- don't commit to disk
>>>>>> <- (minimize downtime)
>>>>>>
>>>>>> 4) unfreeze_container(); <- now can unfreeze container
>>>>>> <- already as soon as possible
>>>>>>
>>>>>> 5) ckpt_writeback(crid, fd); <- container is back running. we
>>>>>> <- can commit data to storage or
>>>>>> <- network in the background.
>>>>>>
>>>>>> #2 and #4 are done with freezer_cgroup()
>>>>>>
>>>>>> #1, #3 and #5 must be syscalls
>>>>>>
>>>>>> More specifically, syscall #5 must be able to refer to the result of syscall #3
>>>>>> (that is the CRID !). It is possible that another syscall #3 occur, on the same
>>>>>> container, between steps 4 and 5 ... but then that checkpoint will be assigned
>>>>>> another, unique CRID.
>>>>> Hm, assuming that, as proposed above, incremental checkpoints are stored in the
>>>>> same file as the ancestor full checkpoint, why not simply give fd as argument in
>>>>> #5? I'd expect that the kernel would associate the file descriptor to the
>>>>> checkpoint until it is finalized (written back, sent over the wire, etc.).
>>>> The above procedure, step 1-5 are for a _single_ checkpoint.
>>> This is what I understood.
>>>
>>>> Why would the kernel associate a file descriptor with the checkpoint until it
>>>> is finalized ? As far as I'm concerned, the checkpoint call in step 3 can go
>>>> without any FD. Also, what happens if there is another checkpoint, of the
>>>> same container, taken between steps 4 and 5, how would you tell the difference
>>>> or select which one goes in first ? Finally, keeping that FD alive between
>>>> multiple checkpoints would require the checkpointer (e.g. a daemon that will
>>>> periodically checkpoint) to keep it alive.
>>>>
>>>> I view it differently: a checkpoint held in memory is like a kernel resource,
>>>> and requires a handle/identifier for user space to refer to it. Like an IPC
>>>> object. Why tie that object to a specific file descriptor ?
>>>> The only exception I can see, is the need to tie it to a some process - the
>>>> checkpointer for instance, such that if that process dies without completing
>>>> the work, the checkpoint image in memory will be cleaned up.
>>>> That, however, still is problematic, because it will not allow you to use
>>>> different processes for different steps (above).
>>>>
>>>> Since we are not yet optimizing the checkpoint procedure, just building the
>>>> infrastructure, my goal is to convince that a CRID is a desired feature (and
>>>> I can certainly see how it will be used in various scenarios).
>>> Here is probably the source of the misunderstanding. I was assuming that step #3
>>> needed a file descriptor to dump the checkpoint progressively, but reading your
>>> first use-case more carefully might have avoided this misunderstanding :)
>> Even without the first use-case (checkpoint in memory), step 3 does not need
>> necessarily a file-descriptor to which data will be dumped, in the case of
>> said optimization. Consider a scenario with periodic checkpointing of a long
>> running application, where we would like to minimize the downtime of the
>> application due to each checkpoint. The idea is to do steps 1 and 3 entirely
>> in memory, keep the data in a buffer (see below comment about tmpfs). The
>> expensive operation of streaming the data to the file-descriptor is only
>> done in step 5.
>>
>> (In the case of checkpoint in memory - it is never written to a file. There
>> are various optimization to do there for fast restart for which putting the
>> data in a file doesn't make sense).
>>
>> As for using tmpfs -- so during step 3 the state of all tasks is saved; part
>> of it is headers, task data, signals etc, but mostly the memory content. For
>> as long as the checkpoint is kept in memory (either because it is meant to
>> stay there, or because it is not committed to the file-descriptor yet), there
>> is no reason to make a copy of each (dirty) page. On the contrary - the pages
>> will be marked COW and a reference will be kept, as part of the checkpoint
>> context. Sure, you can put the rest of the data in a file in tmpfs; but you
>> probably don't want to copy all the pages to a file in tmpfs - that would be
>> wasteful.
>
> I think that memory pages need not to be dumped in step #3. They can be kept
> just as you mentioned in COW state in the checkpoint context, and be really
> dumped only in step #5.
>
>>> Anyway, we can still give a fd to sys_checkpoint() which will identify the
>>> checkpoint for the remaining operations. It's up to userspace to show the
>>> difference between two checkpoints taken (roughly) at the same time. From the
>>> kernel point of view, a file descriptor is enough to make the difference.
>> That is indeed an option. I haven't given a lot of thought to this approach,
>> because in Zap I use CRIDs. Three points against this approach are that:
>>
>> (1) as I said, that would require that the file descriptor remains alive for
>> as long as we want to keep the checkpoint alive (in memory), and
>
> Not sure that this is so bad. The checkpointer can transfer the descriptor
> to some daemon using the file descriptor transfer feature of UNIX sockets, and
> then freely exit.
Uhh.. that's an evil feature to begin with :o
In any case, it requires that extra logic.
>
>> (2) if the checkpoint is taken by a process from within the container, we
>> create a situation where a resource held by the process (an FD), is referring
>> to the checkpoint itself and at the same time also referred to by the
>> checkpoint (because it is part of the state of a process that is in the
>> container...). In particular this will necessitate some special case treatment
>> during the restart operation.
>
> Interesting case. This means that the checkpointer would be checkpointed while
> inside sys_checkpoint(), and would possibly try to writeback the checkpoint
> after restart (going to step #5 as if it was not restarted). So the special
> handling is already needed there, right? Like making sys_checkpoint() return an
Not quite. See my first reply to Serge earlier in this thread. sys_checkpoint()
returns one of three values: -1 for error, positive (non zero) number which is
the CRID on success, and 0 when it returns from restart. Logic is analogous to
a fork() syscall. No special handling, definitely not in kernel space.
> error upon restart. I'm not sure that the checkpoint fd should really need a
> special handling in the special case of self-checkpointing, because the
> checkpointer should probably not try to do anything with this checkpoint after a
> restart, unless it reopens the checkpoint file for appending new incremental
> checkpoints.
>
> Anyway, we are trying to solve an issue that was explicitly forbidden in
> previous discussions IIUC, because the whole container is assumed to be frozen
> before calling sys_checkpoint(), which means that the checkpointer should live
> outside of the container.
Actually, I made the point in the mini-summit that such a functionality will be
useful, and I have several use cases, and two of them actually implemented
with Zap. The main change from a regular, freeze-entire-container checkpoint
is that one task - the checkpointer - will be allowed not to freeze. Since
it will be doing the checkpoint itself, there is no concern about it not being
frozen (after all, we freeze them so they don't change their state). I already
implemented this in Zap and it proved quite useful. See this paper, for example:
http://www.ncl.cs.columbia.edu/publications/sosp2007_dejaview.pdf
>
>> (3) if a given task wants to keep many checkpoints in memory (again, either
>> permanently or shortly), it will have to keep, forever, a lot of open file
>> descriptors.
>
> The only problem I see here is the limitation on the number of file descriptors.
> Hm, hundreds of checkpoints in memory looks like memory wastage in some way.
"640K ought to be enough for anybody." - Bill Gates, 1981 (actually, according
to this page http://en.wikiquote.org/wiki/Talk:Bill_Gates, it may not have been
him at all ...)
Now seriously, I have at least one use case (the details weren't published yet).
>
>> On the other hand, using an FD provide the advantage of a simple cleanup (FD
>> closed -> checkpoint data discarded) and ridding us from the need to come up
>> with a cleanup strategy.
>
> We would not get this for free unless we add data for this to the file
> descriptor. Adding something like an inotify listener (only used by the kernel)
> should also make it.
Lol .. then we stick to CRID if we have to implement something anyway :)
>
>>> Let's consider the three use cases of CRID you mentioned earlier:
>>>
>>> 1) Checkpointing in memory:
>>> Actually, checkpointing in memory could also be done from userspace using tmpfs.
>>> Again, I agree that this kind of optimization should be discussed later. I'm
>>> just not convinced that this needs a CRID...
>> See my comment about regarding tmpfs. You are right, however, in that we could
>> use FD to tmpfs where the rest of the data (not pages) will be stored.
>
> See my comment above ;)
>
>>> 2) Reducing downtime of the checkpoint:
>>> If reducing downtime is just a matter of avoiding disk accesses, tmpfs is again
>>> a kind of solution. It even allows to swap if the checkpoint size is too big.
>>> What kind of scenario (other than incremental checkpointing) do you envision
>>> where multiple calls to sys_checkpoint() would use the same checkpoint object?
>> Again, see the comment regarding tmpfs. The actual memory copy operation between
>> the real pages and the space allocated in tmpfs can take substantial time for
>> applications with large memory (compared to merely marking the pages COW, and
>> amortizing the cost during regular execution of the application), besides the
>> extra space overhead. Also, writing tmpfs incurs visible overhead when you care
>> about milliseconds of downtime; I've seen that with Zap.
>
> Are those milliseconds related to pages or to the kernel structures also?
It's a visible overhead. I can't remember exactly how much because once I saw
it was expensive, I dropped that path. Even buffer allocation (page allocation
in case of tmpfs) could become an annoyance when it comes to low downtime, so
one optimization in Zap was to pre-allocate the buffers using a good estimate
on their sizes based on past checkpoints.
Finally, there are use-cases in which you'd like a really-super-ultra-fast
checkpoint (e.g. in context), that is under a millisecond (like a partial
fork, to some extent); you do feel the difference then.
>
>>> 3) Incremental checkpoint:
>>> I agree that maintaining a fd alive (in a checkpointer daemon for instance) may
>>> look restrictive, but I'm not sure that it is really needed to keep it alive
>>> between consecutive incremental checkpoints. I'd really like to see incremental
>>> checkpointing as an append operation to a checkpoint file. This way the file
>> Why ? What's the advantage of having all data in a single file as opposed to
>> multiple files ?
>
> - You do not have to look for the previous checkpoints using a to-be-defined
> naming scheme, since they are all in the file.
but if you *want* to look for a previous checkpoint -- you wanna return to an
arbitrary checkpoint in the past ? now you need to look for it.
> - Userspace makes less errors when managing incremental checkpoints.
have you implemented this ? did you experience issues in real life ? user
space will need a way to manage all of it anyway in many aspects. This will
be the last/least of the issues ...
>
> - You can easily create new branches by just copying the file, restarting from
> it, and adding incremental checkpoints to it. (Not sure this branch feature
> is really interesting, but it sounds funny :))
Using multiple files, you can create branches by adding hard-links (or soft-
links) to previous files. Saves space, time, and - I'd argue - easier to
understand and manage.
The branches feature is really interesting, as a matter of fact; Again I refer
you to the paper mentioned above.
>
>> Recall that the data can be streamed, so when you start to read a file you
>> don't know a-priori how long is the checkpoint image, until you have parsed
>> it all; So you can't easily find the beginning of the, say 15th checkpoint
>> in that case.
>
> Good point: in append-only mode, we do not know that there are 15 checkpoints
> until we reach the 15th one. Perhaps append-only is too restrictive for
> incremental checkpoint. OTOH, do we really want to support a unique stream
> having multiple checkpoints? Probably not. So rewrite and append looks like a
> better option. An incremental checkpoint procedure could look like this:
>
> err = sys_checkpoint(base_fd, out_fd, ...)
Re-write + append will end up being very costly (imagine you save the data
on a network file system), both in time and (at least for some time) in
space.
Besides, this scheme begins to sound much more complex than a single file.
Do you really gain so much from not having multiple files, one per checkpoint ?
>
> where:
> - base_fd is a regular file containing the base checkpoint, or -1 if a full
> checkpoint should be done. The checkpoint could actually also live in memory,
> and the kernel should check that it matches the image pointed to by base_fd.
> - out_fd is whatever file/socket/etc. on which we should dump the checkpoint. In
> particular, out_fd can equal base_fd and should point to the beginning of the
> file if it's a regular file.
Excellent example. What if the checkpoint data is streamed over the network;
so you cannot rewrite the file after it has been streamed... Or you will have
to save the entire incremental history in memory :(
The checkpoint - may, or may not live in memory for a long time. Usually not,
by the way, for the usual case it doesn't really make sense to use up memory
for nothing.
>
> If base_fd is a valid file descriptor, sys_checkpoint() would do this:
>
> #1 check the validity of the checkpoint image (possibly compare with in-memory
> checkpoint states),
>
> #2 (over)write the position of the next (coming) checkpoint on out_fd (see
> explanations below) and its sequence number as well (this actually makes
> sequence counters live in the checkpoint image),
>
> #3 write the contents of base_fd to out_fd, marking the records invalidated by
> the current checkpoint on the fly (see explanations below),
>
> #4 write the new incremental checkpoint records.
I truly don't think this scheme is simpler or easier to manage compared to
using multiple files; and I really wonder what is the big advantage of
going through this non-trivial logic ?
>
> This assumes that a checkpoint image has a place in the header to tell where the
> last checkpoint image is. Eventually, each record (task struct, vma, page, etc.)
> should contain a field telling which later incremental checkpoint invalidates
> it, so that we can restart from any intermediate checkpoint if we like.
My experience is that you really need incremental for memory, but not that
necessary for the rest of the state. So the way I did it is - whenever a
vma is saved, if some of its pages are found in previous checkpoints, a
pointer to where the page data resides is given (CRID, position) instead of
the page contents.
>
> Moreover, each intermediate checkpoint would contain a pointer to the start of
> the previous and the next one, so that any intermediate checkpoint can be easily
> found. This actually makes step #2 and #3 modify the checkpoint image in place,
> whenever base_fd and out_fd point to the same file. This disables streaming for
> restarts from an intermediate checkpoint, but I don't think this is a real
> issue, unless there are use cases outside live-migration?
This is not quite possible to do when the data has been streamed through a
socket, for example (can't rewrite); or expensive to do with a network file
system.
Live migration is orthogonal to incremental checkpoint, they have nothing
in common. There are use cases for restarting from an intermediate checkpoint
like the paper I mentioned, as well as fault tolerance, debugging, forensics,
and more.
"Streaming" also refers, as I mentioned above, to the case where you send
the data over a socket (even if not for a live migration, but to a daemon
that would hold it in memory on another node, for example). In that media
you cannot easily rewrite the file.
>
>> Depending on the size of your checkpoint, a single file may eventually become
>> very large in a short time. I have one system that takes a checkpoint every
>> second of an entire user-desktop ...
>>
>> One single large file is harder to manage, parse, and inspect, even with
>> proper user tools. If you wanted to change something inside (for whatever
>> reasons), that would be difficult to do. Same goes for when you want to
>> coalesce multiple checkpoints into a single checkpoint (e.g. to save space,
>> or because you don't care about some of your past)
>
> Ok, this becomes more complex, but feasible I think (see above).
Heh ... of course it is feasible. The question is which alternative is better ?
>
> Coalescing checkpoints seems rather easy as soon as checkpoints records are
> tagged with the first checkpoint number that invalidates them.
>
>> Ahh.. ok.. I stop here. This is not related to CRID vs. FD anymore :)
>
> You're right. Hopefully it is interesting, although a bit early to discuss :)
lol .. I couldn't help it.
>
>>> could contain the entire checkpoint history. On the other hand, you are not sure
>>> that we could do incremental checkpoint this way, which justifies your need for
>>> a CRID. Perhaps you have an example?
>> Arguments given above. Note that even with multiple files we don't _need_
>> CRID, they are merely helpful. Instead, the user could be required to provide
>> the kernel with an array of file names, corresponding to checkpoint#0 (base),
>> checkpoint#2, checkpoint#3 etc; In this case, the "incremental state" that
>> is saved with checkpoint#4, is (a) that it is #4, and (b) for each part of
>> state that is found in a previous checkpoint, a reference to the serial no.
>> of that checkpoint is kept.
>
> See above for a solution based on a single file.
>
>> (The proposal for CRID was that instead of a serial number that starts from
>> 0 with every full (base) checkpoint, we use the CRID).
>>
>>> Anyway, do not take this as an attack. I just want to be well convinced that
>> On the contrary; your comments are definitely in place.
>>
>>> CRIDs are really needed, and are worth the effort of managing them cleanly.
>>> Exposing them to userspace just scares me a bit.
>> I'm not sure why is there an "effort of managing" them ? It's a simple
>> atomic counter, that won't wrap around (use 64 bit if we wish). All in-memory
>> checkpoint contexts will be (also) in global linked list and easily located
>> there by their CRID.
>
> Ok, as long as no userspace task holds such IDs across reboot or migration. How
> would you check this?
Ahhhh.... once again: CRIDs do _not_ make sense across a reboot. Not in the
kernel anyway. (For incremental, they can be used as hints, and userspace
brains are needed there anyway). A CRID identifies a checkpoint _in memory_
and goes away when the checkpoint is removed from memory (canceled, committed)
or when the container goes away, or when the RAM goes away (e.g. reboot).
When I said "hints" for user space, I refer to two use cases actually. One
is the incremental checkpoint where this CRID will be part of the header of
the checkpoint file, and user space will have that number returned by the
syscall and could use it (e.g. to name the files, but also to keep a record
of when/what was checkpointed).
Another is when we will add the capability of file-system snapshot, then
we'll have a way to identify each snapshot (let's say there will be some
identifier to each). Then user space could keep a table with the tuples:
<time, filename, CRID, FSID> to keep track of checkpoint data (FSID stands
for filesystem snapshot identifier).
>
>>> Btw, if we ever decide to use CRIDs, I'd propose to manage them in some
>>> pseudo-filesystem, like SYSV IPC objects actually are.
>> Eventually, yes ;)
>>
>>> Thanks,
>>>
>>> Louis
>>>
>> Thanks for the comments and stimulating the discussion.
>
> I should have had many more discussions like this during my PhD. Yours is going
> to be definitely better than mine :)
:)
Oren.
>
> Thanks,
>
> Louis
>
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <48931A7E.1040302-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2008-08-01 18:00 ` Louis Rilling
[not found] ` <20080801180038.GL22403-Hu8+6S1rdjywhHL9vcZdMVaTQe2KTcn/@public.gmane.org>
0 siblings, 1 reply; 37+ messages in thread
From: Louis Rilling @ 2008-08-01 18:00 UTC (permalink / raw)
To: Oren Laadan; +Cc: Linux Containers
[-- Attachment #1.1: Type: text/plain, Size: 35949 bytes --]
On Fri, Aug 01, 2008 at 10:15:26AM -0400, Oren Laadan wrote:
>
>
> Louis Rilling wrote:
>> On Thu, Jul 31, 2008 at 03:12:32PM -0400, Oren Laadan wrote:
>>>
>>> Louis Rilling wrote:
>>>> On Thu, Jul 31, 2008 at 12:28:57PM -0400, Oren Laadan wrote:
>>>>> Louis Rilling wrote:
>>>>>> On Thu, Jul 31, 2008 at 11:09:54AM -0400, Oren Laadan wrote:
>>>>>>> Louis Rilling wrote:
>>>>>>>> On Wed, Jul 30, 2008 at 06:20:32PM -0400, Oren Laadan wrote:
>>>>>>>>> Serge E. Hallyn wrote:
>>>>>>>>>> Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
>>>>>>>>>>> +int do_checkpoint(struct cr_ctx *ctx)
>>>>>>>>>>> +{
>>>>>>>>>>> + int ret;
>>>>>>>>>>> +
>>>>>>>>>>> + /* FIX: need to test whether container is checkpointable */
>>>>>>>>>>> +
>>>>>>>>>>> + ret = cr_write_hdr(ctx);
>>>>>>>>>>> + if (!ret)
>>>>>>>>>>> + ret = cr_write_task(ctx, current);
>>>>>>>>>>> + if (!ret)
>>>>>>>>>>> + ret = cr_write_tail(ctx);
>>>>>>>>>>> +
>>>>>>>>>>> + /* on success, return (unique) checkpoint identifier */
>>>>>>>>>>> + if (!ret)
>>>>>>>>>>> + ret = ctx->crid;
>>>>>>>>>> Does this crid have a purpose?
>>>>>>>>> yes, at least three; all are for the future, but important to set the
>>>>>>>>> meaning of the return value of the syscall already now. The "crid" is
>>>>>>>>> the CR-identifier that identifies the checkpoint. Every checkpoint is
>>>>>>>>> assigned a unique number (using an atomic counter).
>>>>>>>>>
>>>>>>>>> 1) if a checkpoint is taken and kept in memory (instead of to a file) then
>>>>>>>>> this will be the identifier with which the restart (or cleanup) would refer
>>>>>>>>> to the (in memory) checkpoint image
>>>>>>>>>
>>>>>>>>> 2) to reduce downtime of the checkpoint, data will be aggregated on the
>>>>>>>>> checkpoint context, as well as referenced to (cow-ed) pages. This data can
>>>>>>>>> persist between calls to sys_checkpoint(), and the 'crid', again, will be
>>>>>>>>> used to identify the (in-memory-to-be-dumped-to-storage) context.
>>>>>>>>>
>>>>>>>>> 3) for incremental checkpoint (where a successive checkpoint will only
>>>>>>>>> save what has changed since the previous checkpoint) there will be a need
>>>>>>>>> to identify the previous checkpoints (to be able to know where to take
>>>>>>>>> data from during restart). Again, a 'crid' is handy.
>>>>>>>>>
>>>>>>>>> [in fact, for the 3rd use, it will make sense to write that number as
>>>>>>>>> part of the checkpoint image header]
>>>>>>>>>
>>>>>>>>> Note that by doing so, a process that checkpoints itself (in its own
>>>>>>>>> context), can use code that is similar to the logic of fork():
>>>>>>>>>
>>>>>>>>> ...
>>>>>>>>> crid = checkpoint(...);
>>>>>>>>> switch (crid) {
>>>>>>>>> case -1:
>>>>>>>>> perror("checkpoint failed");
>>>>>>>>> break;
>>>>>>>>> default:
>>>>>>>>> fprintf(stderr, "checkpoint succeeded, CRID=%d\n", ret);
>>>>>>>>> /* proceed with execution after checkpoint */
>>>>>>>>> ...
>>>>>>>>> break;
>>>>>>>>> case 0:
>>>>>>>>> fprintf(stderr, "returned after restart\n");
>>>>>>>>> /* proceed with action required following a restart */
>>>>>>>>> ...
>>>>>>>>> break;
>>>>>>>>> }
>>>>>>>>> ...
>>>>>>>> If I understand correctly, this crid can live for quite a long time. So many of
>>>>>>>> them could be generated while some container would accumulate incremental
>>>>>>>> checkpoints on, say crid 5, and possibly crid 5 could be reused for another
>>>>>>>> unrelated checkpoint during that time. This brings the issue of allocating crids
>>>>>>>> reliably (using something like a pidmap for instance). Moreover, if such ids are
>>>>>>>> exposed to userspace, we need to remember which ones are allocated across
>>>>>>>> reboots and migrations.
>>>>>>>>
>>>>>>>> I'm afraid that this becomes too complex...
>>>>>>> And I'm afraid I didn't explain myself well. So let me rephrase:
>>>>>>>
>>>>>>> CRIDs are always _local_ to a specific node. The local CRID counter is
>>>>>>> bumped (atomically) with each checkpoint attempt. The main use case is
>>>>>>> for when the checkpoint is kept in memory either shortly (until it is
>>>>>>> written back to disk) or for a longer time (use-cases that want to keep
>>>>>>> it there). It only remains valid as long as the checkpoint image is
>>>>>>> still in memory and have not been committed to storage/network. Think
>>>>>>> of it as a way to identify the operation instance.
>>>>>>>
>>>>>>> So they can live quite a long time, but only as long as the original
>>>>>>> node is still alive and the checkpoint is still kept in memory. They
>>>>>>> are meaningless across reboots and migrations. I don't think a wrap
>>>>>>> around is a concern, but we can use 64 bit if that is the case.
>>>>>>>
>>>>>>> Finally, the incremental checkpoint use-case: imagine a container that
>>>>>>> is checkpointed regularly every minutes. The first checkpoint will be
>>>>>>> a full checkpoint, say CRID=1. The second will be incremental with
>>>>>>> respect to the first, with CRID=2, and so on the third and the forth.
>>>>>>> Userspace could use these CRID to name the image files (for example,
>>>>>>> app.img.CRID). Assume that we decide (big "if") that the convention is
>>>>>>> that the last part of the filename must be the CRID, and if we decide
>>>>>>> (another big "if") to save the CRID as part of the checkpoint image --
>>>>>>> the part that describe the "incremental nature" of a new checkpoint.
>>>>>>> (That part would specify where to get state that wasn't really saved
>>>>>>> in the new checkpoint but instead can be retrieved from older ones).
>>>>>>> If that was the case, then it would be fairly easy for the kernel
>>>>>>> to find (and access) the actual files that hold the data. Note, that
>>>>>>> in this case - the CRID are guaranteed to be unique per series of
>>>>>>> incremental checkpoints, and incremental checkpoint is meaningless
>>>>>>> across reboots (and we can require that across migration too).
>>>>>> Letting the kernel guess where to find the missing data of an incremental
>>>>>> checkpoint seems a bit hazardous indeed. What about just appending incremental
>>>>>> checkpoints to the last full checkpoint file?
>>>>> It isn't quite a "guess", it's like the kernel assumes that a kernel-helper
>>>>> resides in some directory - it's a convention. I agree, though, that it may
>>>>> not be the best method to do it.
>>>>>
>>>>> As for putting everything in a single file, I prefer not to do that, and it
>>>>> may not even always possible I believe.
>>>>>
>>>>> An incremental would include a section that describes how to find the missing
>>>>> data from previous checkpoints, so it must have a way to identify a previous
>>>>> checkpoint.
>>>>>
>>>>> One way is, like I suggested, to name them with this identifier; another would be,
>>>>> for example, that the user provides a list of file-descriptors that match
>>>>> the required identifiers. Other ways may be possible too.
>>>>>
>>>>> In any event, I think it is now bit early to discuss the exact format and
>>>>> logic, when we don't even have a simple checkpoint working :)
>>>>>
>>>>> Incremental checkpoint is one of a few reasons to use CRIDs, let us first
>>>>> agree about CRIDs, and later, when we design incremental checkpoints, decide
>>>>> on the technical details of incorporating this CRIDs.
>>>>>
>>>> Agreed, but since your point is to introduce CRIDs, I'd like to be convinced
>>>> that they are needed :) At least I'd like to be convinced that they will not
>>>> generate hard-to-manage side effects.
>>>>
>>>>> (Just to avoid confusion, an incremental checkpoint is _not_ a pre-copy or
>>>>> live-migration: in a pre-copy, we repeatedly copy the state of the container
>>>>> without freezing it until the delta is small enough, then we freeze and then
>>>>> we checkpoint the remaining residues. All this activity belongs to a single
>>>>> checkpoint. In incremental checkpoints, we talk about multiple checkpoints
>>>>> that save only the delta with respect to their preceding checkpoint).
>>>> Don't worry, I know what incremental checkpointing is.
>>>>
>>>>>>> We probably don't want to use something like a pid to identify the
>>>>>>> checkpoint (while in memory), because we may have multiple checkpoints
>>>>>>> in memory at a time (of the same container).
>>>>>> Agreed.
>>>>>>
>>>>>>>> It would be way easier if the only (kernel-level) references to a checkpoint
>>>>>>>> were pointers to its context. Ideally, the only reference would live in a
>>>>>>>> 'struct container' and would be easily updated at restart-time.
>>>>>>> Consider the following scenario of calls from user-space (which is
>>>>>>> how I envision the checkpoint optimized for minimal downtime, in the
>>>>>>> future):
>>>>>>>
>>>>>>> 1) while (syscall_to_do_precopy) <- do precopy until ready to
>>>>>>> if (too_long_already) <- checkpoint or too long
>>>>>>> break;
>>>>>>>
>>>>>>> 2) freeze_container();
>>>>>>>
>>>>>>> 3) crid = checkpoint(.., .., CR_CKPT_LAZY); <- checkpoint container
>>>>>>> <- don't commit to disk
>>>>>>> <- (minimize downtime)
>>>>>>>
>>>>>>> 4) unfreeze_container(); <- now can unfreeze container
>>>>>>> <- already as soon as possible
>>>>>>>
>>>>>>> 5) ckpt_writeback(crid, fd); <- container is back running. we
>>>>>>> <- can commit data to storage or
>>>>>>> <- network in the background.
>>>>>>>
>>>>>>> #2 and #4 are done with freezer_cgroup()
>>>>>>>
>>>>>>> #1, #3 and #5 must be syscalls
>>>>>>>
>>>>>>> More specifically, syscall #5 must be able to refer to the result of syscall #3
>>>>>>> (that is the CRID !). It is possible that another syscall #3 occur, on the same
>>>>>>> container, between steps 4 and 5 ... but then that checkpoint will be assigned
>>>>>>> another, unique CRID.
>>>>>> Hm, assuming that, as proposed above, incremental checkpoints are stored in the
>>>>>> same file as the ancestor full checkpoint, why not simply give fd as argument in
>>>>>> #5? I'd expect that the kernel would associate the file descriptor to the
>>>>>> checkpoint until it is finalized (written back, sent over the wire, etc.).
>>>>> The above procedure, step 1-5 are for a _single_ checkpoint.
>>>> This is what I understood.
>>>>
>>>>> Why would the kernel associate a file descriptor with the checkpoint until it
>>>>> is finalized ? As far as I'm concerned, the checkpoint call in step 3 can go
>>>>> without any FD. Also, what happens if there is another checkpoint, of the
>>>>> same container, taken between steps 4 and 5, how would you tell the difference
>>>>> or select which one goes in first ? Finally, keeping that FD alive between
>>>>> multiple checkpoints would require the checkpointer (e.g. a daemon that will
>>>>> periodically checkpoint) to keep it alive.
>>>>>
>>>>> I view it differently: a checkpoint held in memory is like a kernel resource,
>>>>> and requires a handle/identifier for user space to refer to it. Like an IPC
>>>>> object. Why tie that object to a specific file descriptor ?
>>>>> The only exception I can see, is the need to tie it to a some process - the
>>>>> checkpointer for instance, such that if that process dies without completing
>>>>> the work, the checkpoint image in memory will be cleaned up.
>>>>> That, however, still is problematic, because it will not allow you to use
>>>>> different processes for different steps (above).
>>>>>
>>>>> Since we are not yet optimizing the checkpoint procedure, just building the
>>>>> infrastructure, my goal is to convince that a CRID is a desired feature (and
>>>>> I can certainly see how it will be used in various scenarios).
>>>> Here is probably the source of the misunderstanding. I was assuming that step #3
>>>> needed a file descriptor to dump the checkpoint progressively, but reading your
>>>> first use-case more carefully might have avoided this misunderstanding :)
>>> Even without the first use-case (checkpoint in memory), step 3 does not need
>>> necessarily a file-descriptor to which data will be dumped, in the case of
>>> said optimization. Consider a scenario with periodic checkpointing of a long
>>> running application, where we would like to minimize the downtime of the
>>> application due to each checkpoint. The idea is to do steps 1 and 3 entirely
>>> in memory, keep the data in a buffer (see below comment about tmpfs). The
>>> expensive operation of streaming the data to the file-descriptor is only
>>> done in step 5.
>>>
>>> (In the case of checkpoint in memory - it is never written to a file. There
>>> are various optimization to do there for fast restart for which putting the
>>> data in a file doesn't make sense).
>>>
>>> As for using tmpfs -- so during step 3 the state of all tasks is saved; part
>>> of it is headers, task data, signals etc, but mostly the memory content. For
>>> as long as the checkpoint is kept in memory (either because it is meant to
>>> stay there, or because it is not committed to the file-descriptor yet), there
>>> is no reason to make a copy of each (dirty) page. On the contrary - the pages
>>> will be marked COW and a reference will be kept, as part of the checkpoint
>>> context. Sure, you can put the rest of the data in a file in tmpfs; but you
>>> probably don't want to copy all the pages to a file in tmpfs - that would be
>>> wasteful.
>>
>> I think that memory pages need not to be dumped in step #3. They can be kept
>> just as you mentioned in COW state in the checkpoint context, and be really
>> dumped only in step #5.
>>
>>>> Anyway, we can still give a fd to sys_checkpoint() which will identify the
>>>> checkpoint for the remaining operations. It's up to userspace to show the
>>>> difference between two checkpoints taken (roughly) at the same time. From the
>>>> kernel point of view, a file descriptor is enough to make the difference.
>>> That is indeed an option. I haven't given a lot of thought to this approach,
>>> because in Zap I use CRIDs. Three points against this approach are that:
>>>
>>> (1) as I said, that would require that the file descriptor remains alive for
>>> as long as we want to keep the checkpoint alive (in memory), and
>>
>> Not sure that this is so bad. The checkpointer can transfer the descriptor
>> to some daemon using the file descriptor transfer feature of UNIX sockets, and
>> then freely exit.
>
> Uhh.. that's an evil feature to begin with :o
> In any case, it requires that extra logic.
>
>>
>>> (2) if the checkpoint is taken by a process from within the container, we
>>> create a situation where a resource held by the process (an FD), is referring
>>> to the checkpoint itself and at the same time also referred to by the
>>> checkpoint (because it is part of the state of a process that is in the
>>> container...). In particular this will necessitate some special case treatment
>>> during the restart operation.
>>
>> Interesting case. This means that the checkpointer would be checkpointed while
>> inside sys_checkpoint(), and would possibly try to writeback the checkpoint
>> after restart (going to step #5 as if it was not restarted). So the special
>> handling is already needed there, right? Like making sys_checkpoint() return an
>
> Not quite. See my first reply to Serge earlier in this thread. sys_checkpoint()
> returns one of three values: -1 for error, positive (non zero) number which is
> the CRID on success, and 0 when it returns from restart. Logic is analogous to
> a fork() syscall. No special handling, definitely not in kernel space.
Sorry I had not these details in mind anymore.
Returning 0 in case of a restart is what I called a special handling. You won't
do this for the other tasks, so this is special. Since userspace must cope with
it anyway, userspace can be clever enough to avoid using the fd on restart, or
stupid enough to destroy its checkpoint after restart.
>
>> error upon restart. I'm not sure that the checkpoint fd should really need a
>> special handling in the special case of self-checkpointing, because the
>> checkpointer should probably not try to do anything with this checkpoint after a
>> restart, unless it reopens the checkpoint file for appending new incremental
>> checkpoints.
>>
>> Anyway, we are trying to solve an issue that was explicitly forbidden in
>> previous discussions IIUC, because the whole container is assumed to be frozen
>> before calling sys_checkpoint(), which means that the checkpointer should live
>> outside of the container.
>
> Actually, I made the point in the mini-summit that such a functionality will be
> useful, and I have several use cases, and two of them actually implemented
> with Zap. The main change from a regular, freeze-entire-container checkpoint
> is that one task - the checkpointer - will be allowed not to freeze. Since
> it will be doing the checkpoint itself, there is no concern about it not being
> frozen (after all, we freeze them so they don't change their state). I already
I had no doubt that self-checkpoint is feasible, since we are doing this in
Kerrighed (it's a signal that is handled at kernel-level only).
> implemented this in Zap and it proved quite useful. See this paper, for example:
> http://www.ncl.cs.columbia.edu/publications/sosp2007_dejaview.pdf
Nice paper :)
>
>>
>>> (3) if a given task wants to keep many checkpoints in memory (again, either
>>> permanently or shortly), it will have to keep, forever, a lot of open file
>>> descriptors.
>>
>> The only problem I see here is the limitation on the number of file descriptors.
>> Hm, hundreds of checkpoints in memory looks like memory wastage in some way.
>
> "640K ought to be enough for anybody." - Bill Gates, 1981 (actually, according
> to this page http://en.wikiquote.org/wiki/Talk:Bill_Gates, it may not have been
> him at all ...)
>
> Now seriously, I have at least one use case (the details weren't published yet).
So sad! We'll have to wait...
>
>>
>>> On the other hand, using an FD provide the advantage of a simple cleanup (FD
>>> closed -> checkpoint data discarded) and ridding us from the need to come up
>>> with a cleanup strategy.
>>
>> We would not get this for free unless we add data for this to the file
>> descriptor. Adding something like an inotify listener (only used by the kernel)
>> should also make it.
>
> Lol .. then we stick to CRID if we have to implement something anyway :)
My comment did not aim at saying "it's bad", and it actually didn't. Just giving
an idea on how to do it.
>
>>
>>>> Let's consider the three use cases of CRID you mentioned earlier:
>>>>
>>>> 1) Checkpointing in memory:
>>>> Actually, checkpointing in memory could also be done from userspace using tmpfs.
>>>> Again, I agree that this kind of optimization should be discussed later. I'm
>>>> just not convinced that this needs a CRID...
>>> See my comment above regarding tmpfs. You are right, however, in that we could
>>> use FD to tmpfs where the rest of the data (not pages) will be stored.
>>
>> See my comment above ;)
>>
>>>> 2) Reducing downtime of the checkpoint:
>>>> If reducing downtime is just a matter of avoiding disk accesses, tmpfs is again
>>>> a kind of solution. It even allows to swap if the checkpoint size is too big.
>>>> What kind of scenario (other than incremental checkpointing) do you envision
>>>> where multiple calls to sys_checkpoint() would use the same checkpoint object?
>>> Again, see the comment regarding tmpfs. The actual memory copy operation between
>>> the real pages and the space allocated in tmpfs can take substantial time for
>>> applications with large memory (compared to merely marking the pages COW, and
>>> amortizing the cost during regular execution of the application), besides the
>>> extra space overhead. Also, writing tmpfs incurs visible overhead when you care
>>> about milliseconds of downtime; I've seen that with Zap.
>>
>> Are those milliseconds related to pages or to the kernel structures also?
>
> It's a visible overhead. I can't remember exactly how much because once I saw
> it was expensive, I dropped that path. Even buffer allocation (page allocation
> in case of tmpfs) could become an annoyance when it comes to low downtime, so
> one optimization in Zap was to pre-allocate the buffers using a good estimate
> on their sizes based on past checkpoints.
>
> Finally, there are use-cases in which you'd like a really-super-ultra-fast
> checkpoint (e.g. in context), that is under a millisecond (like a partial
> fork, to some extent); you do feel the difference then.
>
>>
>>>> 3) Incremental checkpoint:
>>>> I agree that maintaining a fd alive (in a checkpointer daemon for instance) may
>>>> look restrictive, but I'm not sure that it is really needed to keep it alive
>>>> between consecutive incremental checkpoints. I'd really like to see incremental
>>>> checkpointing as an append operation to a checkpoint file. This way the file
>>> Why ? What's the advantage of having all data in a single file as opposed to
>>> multiple files ?
>>
>> - You do not have to look for the previous checkpoints using a to-be-defined
>> naming scheme, since they are all in the file.
>
> but if you *want* to look for a previous checkpoint -- you wanna return to an
> arbitrary checkpoint in the past ? now you need to look for it.
I think I already sketched how to do it.
>
>> - Userspace makes fewer errors when managing incremental checkpoints.
>
> have you implemented this ? did you experience issues in real life ? user
> space will need a way to manage all of it anyway in many aspects. This will
> be the last/least of the issues ...
No it was not implemented, and I'm not going to enter a discussion about the
weight of arguments whether they are backed by implementations or not. It just
becomes easier to create a mess with things depending on each other created as
separate, "freely" (userspace-decided)-named objects.
>
>>
>> - You can easily create new branches by just copying the file, restarting from
>> it, and adding incremental checkpoints to it. (Not sure this branch feature
>> is really interesting, but it sounds funny :))
>
> Using multiple files, you can create branches by adding hard-links (or soft-
> links) to previous files. Saves space, time, and - I'd argue - easier to
> understand and manage.
Again, no doubt about the feasibility with multiple files. I admit that this
also saves space since the common parts are shared.
>
> Branches features is really interesting, as a matter of fact; Again I refer
> you to the paper mentioned above.
>
>>
>>> Recall that the data can be streamed, so when you start to read a file you
>>> don't know a-priori how long is the checkpoint image, until you have parsed
>>> it all; So you can't easily find the beginning of the, say 15th checkpoint
>>> in that case.
>>
>> Good point: in append-only mode, we do not know that there are 15 checkpoints
>> until we reach the 15th one. Perhaps append-only is too restrictive for
>> incremental checkpoint. OTOH, do we really want to support a unique stream
>> having multiple checkpoints? Probably not. So rewrite and append looks like a
>> better option. An incremental checkpoint procedure could look like this:
>>
>> err = sys_checkpoint(base_fd, out_fd, ...)
>
> Re-write + append will end up being very costly (imagine you save the data
> on a network file system), both in time and (at least for some time) in
> space.
Hm, I'd bet that you have to read the previous checkpoints anyway, unless after
some time things differ so much that the oldest images are not needed anymore.
>
> Besides, this scheme begins to sound much more complex than a single file.
> Do you really gain so much from not having multiple files, one per checkpoint ?
Well, at least you are not limited by the number of open file descriptors
(assuming that, as you mentioned earlier, you pass an array of previous images
to compute the next incremental checkpoint).
>
>>
>> where:
>> - base_fd is a regular file containing the base checkpoint, or -1 if a full
>> checkpoint should be done. The checkpoint could actually also live in memory,
>> and the kernel should check that it matches the image pointed to by base_fd.
>> - out_fd is whatever file/socket/etc. on which we should dump the checkpoint. In
>> particular, out_fd can equal base_fd and should point to the beginning of the
>> file if it's a regular file.
>
> Excellent example. What if the checkpoint data is streamed over the network;
> so you cannot rewrite the file after it has been streamed... Or you will have
> to save the entire incremental history in memory :(
I'm not sure to have expressed myself well: as was explained later, streaming
output is ok for an incremental checkpoint, since you need the base checkpoint
anyway. Unless you have a solution to build an incremental checkpoint out of
streamed earlier checkpoints, I don't see what kind of limitation this would
introduce.
>
> The checkpoint - may, or may not live in memory for a long time. Usually not,
> by the way, for the usual case it doesn't really make sense to use up memory
> for nothing.
Definitely agreed.
>
>>
>> If base_fd is a valid file descriptor, sys_checkpoint() would do this:
>>
>> #1 check the validity of the checkpoint image (possibly compare with in-memory
>> checkpoint states),
>>
>> #2 (over)write the position of the next (coming) checkpoint on out_fd (see
>> explanations below) and its sequence number as well (this actually makes
>> sequence counters live in the checkpoint image),
>>
>> #3 write the contents of base_fd to out_fd, marking the records invalidated by
>> the current checkpoint on the fly (see explanations below),
>>
>> #4 write the new incremental checkpoint records.
>
> I truly don't think this scheme is simpler or easier to manage compared to
> a using multiple files; and I really wonder what is the big advantage of
> going through this non-trivial logic ?
>
>>
>> This assumes that a checkpoint image has a place in the header to tell where the
>> last checkpoint image is. Eventually, each record (task struct, vma, page, etc.)
>> should contain a field telling which later incremental checkpoint invalidates
>> it, so that we can restart from any intermediate checkpoint if we like.
>
> My experience is that you really need incremental for memory, but not that
> necessary for the rest of the state. So the way I did it is - whenever a
> vma is saved, if some of its pages are found in previous checkpoints, a
> pointer to where the page data resides is given (CRID, position) instead of
> the page contents.
So in the case I described, say we restart from checkpoint #7, the page would be
found at the first page record of same (mm,address) that is not invalidated by a
checkpoint having id <= 7.
I see where multiple files provide more performance however: you do not have to
read the whole history to restart. At least this is true for non-streamed
checkpoints. As soon as they are streamed, you can only hope that you won't need
data living at the end of the images.
>
>>
>> Moreover, each intermediate checkpoint would contain a pointer to the start of
>> the previous and the next one, so that any intermediate checkpoint can be easily
>> found. This actually makes step #2 and #3 modify the checkpoint image in place,
>> whenever based_fd and out_fd point to the same file. This disables streaming for
>> restarts from an intermediate checkpoint, but I don't think this is a real
>> issue, unless there are use cases outside live-migration?
>
> This is not quite possible to do when the data has been streamed through a
> socket, for example (can't rewrite); or expensive to do with a network file
> system.
Again, how do you build an incremental checkpoint out of streamed-only previous
checkpoints?
>
> Live migration is orthogonal to incremental checkpoint, they have nothing
> in common. There are use cases for restarting from an intermediate checkpoint
> like the paper I mentioned, as well as fault tolerance, debugging, forensics,
> and more.
I'm definitely sure that intermediate checkpoints are interesting. I was only
wondering if streaming was so interesting for them.
>
> "Streaming" also means, as I mentioned above, to the case where you send
> the data over a socket (even if not for a live migration, but to a daemon
> that would hold it in memory on another node, for example). In that media
> you cannot easily rewrite the file.
The point is that you need previous data when building an incremental
checkpoint, so you will read it at least. And since it was previously stored (in
memory or whatever), you can even get its size before actually reading it,
unless you checkpoint at such a rate that the previous checkpoint was not
completely sent when you start the next one. If a remote daemon should really
host the checkpoints, you can even tell the daemon which checkpoint to overwrite
with the new one.
>
>>
>>> Depending on the size of your checkpoint, a single file may eventually become
>>> very large in a short time. I have one system that takes a checkpoint every
>>> second of an entire user-desktop ...
>>>
>>> One single large file is harder to manage, parse, and inspect, even with
>>> proper user tools. If you wanted to change something inside (for whatever
>>> reasons), that would be difficult to do. Same goes for when you want to
>>> coalesce multiple checkpoints into a single checkpoint (e.g. to save space,
>>> or because you don't care about some of your past)
>>
>> Ok, this becomes more complex, but feasible I think (see above).
>
> Heh ... of course it is feasible. The question is which alternative is better ?
Definitely, and probably none of them alone ;)
>
>>
>> Coalescing checkpoints seems rather easy as soon as checkpoints records are
>> tagged with the first checkpoint number that invalidates them.
>>
>>> Ahh.. ok.. I stop here. This is not related to CRID vs. FD anymore :)
>>
>> You're right. Hopefully it is interesting, although a bit early to discuss :)
>
> lol .. I couldn't help it.
I could also have simply shut up and kept on lurking... but it was so tempting
to enter the discussion :)
>
>>
>>>> could contain the entire checkpoint history. On the other hand, you are not sure
>>>> that we could do incremental checkpoint this way, which justifies your need for
>>>> a CRID. Perhaps you have an example?
>>> Arguments given above. Note that even with multiple files we don't _need_
>>> CRID, they are merely helpful. Instead, the user could be required to provide
>>> the kernel with an array of file names, corresponding to checkpoint#0 (base),
>>> checkpoint#2, checkpoint#3 etc; In this case, the "incremental state" that
>>> is saved with checkpoint#4, is (a) that it is #4, and (b) for each part of
>>> state that is found in a previous checkpoint, a reference to the serial no.
>>> of that checkpoint is kept.
>>
>> See above for a solution based on a single file.
>>
>>> (The proposal for CRID was that instead of a serial number that starts from
>>> 0 with every full (base) checkpoint, we use the CRID).
>>>
>>>> Anyway, do not take this as an attack. I just want to be well convinced that
>>> On the contrary; your comments are definitely in place.
>>>
>>>> CRIDs are really needed, and are worth the effort of managing them cleanly.
>>>> Exposing them to userspace just scares me a bit.
>>> I'm not sure why is there an "effort of managing" them ? It's a simple
>>> atomic counter, that won't wrap around (use 64 bit if we wish). All in-memory
>>> checkpoint contexts will be (also) in global linked list and easily located
>>> there by their CRID.
>>
>> Ok, as long as no userspace task holds such IDs across reboot or migration. How
>> would you check this?
>
> Ahhhh.... once again: CRIDs do _not_ make sense across a reboot. Not in the
> kernel anyway. (For incremental, they can be used as hints, and userspace
> brains are needed there anyway). A CRID identifies a checkpoint _in memory_
> and goes away when the checkpoint is removed from memory (canceled, committed)
> or when the container goes away, or when the RAM goes away (e.g. reboot).
Again, I must have failed expressing myself well. I really understand that your
CRIDs have no sense across reboot or migration, and I do not want to give them
such sense. What annoys me is that userspace gets a CRID as a result of
sys_checkpoint(), and then can give it back to the kernel to write back the
checkpoint. IIUC, a correct userspace checkpointer would give this CRID to
the kernel to write the checkpoint (your step #5), and then would never give it
again to the kernel (or only if the kernel would keep it internally for later
incremental checkpoints). The problem is not well-behaving userspace apps. The
problem is: how does the kernel check that userspace does not give a crappy
CRID (actually the CRID of a checkpoint in an un-related container, it would
probably not hurt for CRIDs generated on another node/life mistakenly referring
to locally computed checkpoints of the same container)?
Ok, the answer is probably here: CRIDs are local to containers, and userspace
always gives them to the kernel with a reference on the container (whatever
struct it is based on).
>
> When I said "hints" for user space, I refer to two use cases actually. One
> is the incremental checkpoint where this CRID will be part of the header of
> the checkpoint file, and user space will have that number returned by the
> syscall and could use it (e.g. to name the files, but also to keep a record
> of when/what was checkpointed).
> Another is when we will add the capability of file-system snapshot, then
> we'll have a way to identify each snapshot (let's say there will be some
> identifier to each). Then user space could keep a table with the tuples:
> <time, filename, CRID, FSID> to keep track of checkpoint data (FSID stands
> for filesystem snapshot identifier).
Ok. I'd bet that userspace could figure out itself what is the sequence number
of the next checkpoint, but why not.
It's probably time to conclude: I am now convinced that CRIDs can be managed
correctly without userspace being able to crash everything. I'm not strongly
against incremental checkpoints having their own files, so I won't debate to
death on their advantages and drawbacks. You recognized (IIUC your words) the
feasibility of single files hosting chains of incremental checkpoints, so I will
consider myself satisfied with your proposal.
Anyway, other proposals are coming (eg the one from openvz), and things may
still move. So the discussion will probably come back in some way.
Thanks for the discussion (and I'm still interested in your answers to questions
left above).
Louis
>
>>
>>>> Btw, if we ever decide to use CRIDs, I'd propose to manage them in some
>>>> pseudo-filesystem, like SYSV IPC objects actually are.
>>> Eventually, yes ;)
>>>
>>>> Thanks,
>>>>
>>>> Louis
>>>>
>>> Thanks for the comments and stimulating the discussion.
>>
>> I should have had many more discussions like this during my PhD. Yours is going
>> to be definitely better than mine :)
>
> :)
>
> Oren.
>
>>
>> Thanks,
>>
>> Louis
>>
--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes
[-- Attachment #1.2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]
[-- Attachment #2: Type: text/plain, Size: 206 bytes --]
_______________________________________________
Containers mailing list
Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
https://lists.linux-foundation.org/mailman/listinfo/containers
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <20080801180038.GL22403-Hu8+6S1rdjywhHL9vcZdMVaTQe2KTcn/@public.gmane.org>
@ 2008-08-01 18:51 ` Oren Laadan
[not found] ` <48935B4D.7070302-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
0 siblings, 1 reply; 37+ messages in thread
From: Oren Laadan @ 2008-08-01 18:51 UTC (permalink / raw)
To: Louis.Rilling-aw0BnHfMbSpBDgjK7y7TUQ; +Cc: Linux Containers
Louis Rilling wrote:
> On Fri, Aug 01, 2008 at 10:15:26AM -0400, Oren Laadan wrote:
>>
>> Louis Rilling wrote:
>>> On Thu, Jul 31, 2008 at 03:12:32PM -0400, Oren Laadan wrote:
>>>> Louis Rilling wrote:
>>>>> On Thu, Jul 31, 2008 at 12:28:57PM -0400, Oren Laadan wrote:
>>>>>> Louis Rilling wrote:
>>>>>>> On Thu, Jul 31, 2008 at 11:09:54AM -0400, Oren Laadan wrote:
>>>>>>>> Louis Rilling wrote:
>>>>>>>>> On Wed, Jul 30, 2008 at 06:20:32PM -0400, Oren Laadan wrote:
>>>>>>>>>> Serge E. Hallyn wrote:
>>>>>>>>>>> Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
>>>>>>>>>>>> +int do_checkpoint(struct cr_ctx *ctx)
>>>>>>>>>>>> +{
>>>>>>>>>>>> + int ret;
>>>>>>>>>>>> +
>>>>>>>>>>>> + /* FIX: need to test whether container is checkpointable */
>>>>>>>>>>>> +
>>>>>>>>>>>> + ret = cr_write_hdr(ctx);
>>>>>>>>>>>> + if (!ret)
>>>>>>>>>>>> + ret = cr_write_task(ctx, current);
>>>>>>>>>>>> + if (!ret)
>>>>>>>>>>>> + ret = cr_write_tail(ctx);
>>>>>>>>>>>> +
>>>>>>>>>>>> + /* on success, return (unique) checkpoint identifier */
>>>>>>>>>>>> + if (!ret)
>>>>>>>>>>>> + ret = ctx->crid;
>>>>>>>>>>> Does this crid have a purpose?
>>>>>>>>>> yes, at least three; all are for the future, but important to set the
>>>>>>>>>> meaning of the return value of the syscall already now. The "crid" is
>>>>>>>>>> the CR-identifier that identifies the checkpoint. Every checkpoint is
>>>>>>>>>> assigned a unique number (using an atomic counter).
>>>>>>>>>>
>>>>>>>>>> 1) if a checkpoint is taken and kept in memory (instead of to a file) then
>>>>>>>>>> this will be the identifier with which the restart (or cleanup) would refer
>>>>>>>>>> to the (in memory) checkpoint image
>>>>>>>>>>
>>>>>>>>>> 2) to reduce downtime of the checkpoint, data will be aggregated on the
>>>>>>>>>> checkpoint context, as well as referenced to (cow-ed) pages. This data can
>>>>>>>>>> persist between calls to sys_checkpoint(), and the 'crid', again, will be
>>>>>>>>>> used to identify the (in-memory-to-be-dumped-to-storage) context.
>>>>>>>>>>
>>>>>>>>>> 3) for incremental checkpoint (where a successive checkpoint will only
>>>>>>>>>> save what has changed since the previous checkpoint) there will be a need
>>>>>>>>>> to identify the previous checkpoints (to be able to know where to take
>>>>>>>>>> data from during restart). Again, a 'crid' is handy.
>>>>>>>>>>
>>>>>>>>>> [in fact, for the 3rd use, it will make sense to write that number as
>>>>>>>>>> part of the checkpoint image header]
>>>>>>>>>>
>>>>>>>>>> Note that by doing so, a process that checkpoints itself (in its own
>>>>>>>>>> context), can use code that is similar to the logic of fork():
>>>>>>>>>>
>>>>>>>>>> ...
>>>>>>>>>> crid = checkpoint(...);
>>>>>>>>>> switch (crid) {
>>>>>>>>>> case -1:
>>>>>>>>>> perror("checkpoint failed");
>>>>>>>>>> break;
>>>>>>>>>> default:
>>>>>>>>>> fprintf(stderr, "checkpoint succeeded, CRID=%d\n", ret);
>>>>>>>>>> /* proceed with execution after checkpoint */
>>>>>>>>>> ...
>>>>>>>>>> break;
>>>>>>>>>> case 0:
>>>>>>>>>> fprintf(stderr, "returned after restart\n");
>>>>>>>>>> /* proceed with action required following a restart */
>>>>>>>>>> ...
>>>>>>>>>> break;
>>>>>>>>>> }
>>>>>>>>>> ...
>>>>>>>>> If I understand correctly, this crid can live for quite a long time. So many of
>>>>>>>>> them could be generated while some container would accumulate incremental
>>>>>>>>> checkpoints on, say crid 5, and possibly crid 5 could be reused for another
>>>>>>>>> unrelated checkpoint during that time. This brings the issue of allocating crids
>>>>>>>>> reliably (using something like a pidmap for instance). Moreover, if such ids are
>>>>>>>>> exposed to userspace, we need to remember which ones are allocated across
>>>>>>>>> reboots and migrations.
>>>>>>>>>
>>>>>>>>> I'm afraid that this becomes too complex...
>>>>>>>> And I'm afraid I didn't explain myself well. So let me rephrase:
>>>>>>>>
>>>>>>>> CRIDs are always _local_ to a specific node. The local CRID counter is
>>>>>>>> bumped (atomically) with each checkpoint attempt. The main use case is
>>>>>>>> for when the checkpoint is kept in memory either shortly (until it is
>>>>>>>> written back to disk) or for a longer time (use-cases that want to keep
>>>>>>>> it there). It only remains valid as long as the checkpoint image is
>>>>>>>> still in memory and has not been committed to storage/network. Think
>>>>>>>> of it as a way to identify the operation instance.
>>>>>>>>
>>>>>>>> So they can live quite a long time, but only as long as the original
>>>>>>>> node is still alive and the checkpoint is still kept in memory. They
>>>>>>>> are meaningless across reboots and migrations. I don't think a wrap
>>>>>>>> around is a concern, but we can use 64 bit if that is the case.
>>>>>>>>
>>>>>>>> Finally, the incremental checkpoint use-case: imagine a container that
>>>>>>>> is checkpointed regularly every minute. The first checkpoint will be
>>>>>>>> a full checkpoint, say CRID=1. The second will be incremental with
>>>>>>>> respect to the first, with CRID=2, and so on the third and the fourth.
>>>>>>>> Userspace could use these CRID to name the image files (for example,
>>>>>>>> app.img.CRID). Assume that we decide (big "if") that the convention is
>>>>>>>> that the last part of the filename must be the CRID, and if we decide
>>>>>>>> (another big "if") to save the CRID as part of the checkpoint image --
>>>>>>>> the part that describe the "incremental nature" of a new checkpoint.
>>>>>>>> (That part would specify where to get state that wasn't really saved
>>>>>>>> in the new checkpoint but instead can be retrieved from older ones).
>>>>>>>> If that was the case, then the logic in the kernel would be fairly simple
>>>>>>>> to find (and access) the actual files that hold the data. Note, that
>>>>>>>> in this case - the CRID are guaranteed to be unique per series of
>>>>>>>> incremental checkpoints, and incremental checkpoint is meaningless
>>>>>>>> across reboots (and we can require that across migration too).
>>>>>>> Letting the kernel guess where to find the missing data of an incremental
>>>>>>> checkpoint seems a bit hazardous indeed. What about just appending incremental
>>>>>>> checkpoints to the last full checkpoint file?
>>>>>> It isn't quite a "guess", it's like the kernel assumes that a kernel-helper
>>>>>> resides in some directory - it's a convention. I agree, though, that it may
>>>>>> not be the best method to do it.
>>>>>>
>>>>>> As for putting everything in a single file, I prefer not to do that, and it
>>>>>> may not even always be possible I believe.
>>>>>>
>>>>>> An incremental would include a section that describes how to find the missing
>>>>>> data from previous checkpoints, so it must have a way to identify a previous
>>>>>> checkpoint.
>>>>>>
>>>>>> One way is, like I suggested, to name them with this identifier, another would be,
>>>>>> for example, that the user provides a list of file-descriptors that match
>>>>>> the required identifiers. Other ways may be possible too.
>>>>>>
>>>>>> In any event, I think it is now a bit early to discuss the exact format and
>>>>>> logic, when we don't even have a simple checkpoint working :)
>>>>>>
>>>>>> Incremental checkpoint is one of a few reasons to use CRIDs, let us first
>>>>>> agree about CRIDs, and later, when we design incremental checkpoints, decide
>>>>>> on the technical details of incorporating this CRIDs.
>>>>>>
>>>>> Agreed, but since your point is to introduce CRIDs, I'd like to be convinced
>>>>> that they are needed :) At least I'd like to be convinced that they will not
>>>>> generate hard-to-manage side effects.
>>>>>
>>>>>> (Just to avoid confusion, an incremental checkpoint is _not_ a pre-copy or
>>>>>> live-migration: in a pre-copy, we repeatedly copy the state of the container
>>>>>> without freezing it until the delta is small enough, then we freeze and then
>>>>>> we checkpoint the remaining residues. All this activity belongs to a single
>>>>>> checkpoint. In incremental checkpoints, we talk about multiple checkpoints
>>>>>> that save only the delta with respect to their preceding checkpoint).
>>>>> Don't worry, I know what incremental checkpointing is.
>>>>>
>>>>>>>> We probably don't want to use something like a pid to identify the
>>>>>>>> checkpoint (while in memory), because we may have multiple checkpoints
>>>>>>>> in memory at a time (of the same container).
>>>>>>> Agreed.
>>>>>>>
>>>>>>>>> It would be way easier if the only (kernel-level) references to a checkpoint
>>>>>>>>> were pointers to its context. Ideally, the only reference would live in a
>>>>>>>>> 'struct container' and would be easily updated at restart-time.
>>>>>>>> Consider the following scenario of calls from user-space (which is
>>>>>>>> how I envision the checkpoint optimized for minimal downtime, in the
>>>>>>>> future):
>>>>>>>>
>>>>>>>> 1) while (syscall_to_do_precopy) <- do precopy until ready to
>>>>>>>> if (too_long_already) <- checkpoint or too long
>>>>>>>> break;
>>>>>>>>
>>>>>>>> 2) freeze_container();
>>>>>>>>
>>>>>>>> 3) crid = checkpoint(.., .., CR_CKPT_LAZY); <- checkpoint container
>>>>>>>> <- don't commit to disk
>>>>>>>> <- (minimize downtime)
>>>>>>>>
>>>>>>>> 4) unfreeze_container(); <- now can unfreeze container
>>>>>>>> <- already as soon as possible
>>>>>>>>
>>>>>>>> 5) ckpt_writeback(crid, fd); <- container is back running. we
>>>>>>>> <- can commit data to storage or
>>>>>>>> <- network in the background.
>>>>>>>>
>>>>>>>> #2 and #4 are done with freezer_cgroup()
>>>>>>>>
>>>>>>>> #1, #3 and #5 must be syscalls
>>>>>>>>
>>>>>>>> More specifically, syscall #5 must be able to refer to the result of syscall #3
>>>>>>>> (that is the CRID !). It is possible that another syscall #3 occur, on the same
>>>>>>>> container, between steps 4 and 5 ... but then that checkpoint will be assigned
>>>>>>>> another, unique CRID.
>>>>>>> Hm, assuming that, as proposed above, incremental checkpoints are stored in the
>>>>>>> same file as the ancestor full checkpoint, why not simply give fd as argument in
>>>>>>> #5? I'd expect that the kernel would associate the file descriptor to the
>>>>>>> checkpoint until it is finalized (written back, sent over the wire, etc.).
>>>>>> The above procedure, step 1-5 are for a _single_ checkpoint.
>>>>> This is what I understood.
>>>>>
>>>>>> Why would the kernel associate a file descriptor with the checkpoint until it
>>>>>> is finalized ? As far as I'm concerned, the checkpoint call in step 3 can go
>>>>>> without any FD. Also, what happens if there is another checkpoint, of the
>>>>>> same container, taken between steps 4 and 5, how would you tell the difference
>>>>>> or select which one goes in first ? Finally, keeping that FD alive between
>>>>>> multiple checkpoints would require the checkpointer (e.g. a daemon that will
>>>>>> periodically checkpoint) to keep it alive.
>>>>>>
>>>>>> I view it differently: a checkpoint held in memory is like a kernel resource,
>>>>>> and requires a handle/identifier for user space to refer to it. Like an IPC
>>>>>> object. Why tie that object to a specific file descriptor ?
>>>>>> The only exception I can see, is the need to tie it to a some process - the
>>>>>> checkpointer for instance, such that if that process dies without completing
>>>>>> the work, the checkpoint image in memory will be cleaned up.
>>>>>> That, however, still is problematic, because it will not allow you to use
>>>>>> different processes for different steps (above).
>>>>>>
>>>>>> Since we are not yet optimizing the checkpoint procedure, just building the
>>>>>> infrastructure, my goal is to convince that a CRID is a desired feature (and
>>>>>> I can certainly see how it will be used in various scenarios).
>>>>> Here is probably the source of the misunderstanding. I was assuming that step #3
>>>>> needed a file descriptor to dump the checkpoint progressively, but reading your
>>>>> first use-case more carefully might have avoided this misunderstanding :)
>>>> Even without the first use-case (checkpoint in memory), step 3 does not need
>>>> necessarily a file-descriptor to which data will be dumped, in the case of
>>>> said optimization. Consider a scenario with periodic checkpointing of a long
>>>> running application, where we would like to minimize the downtime of the
>>>> application due to each checkpoint. The idea is to do steps 1 and 3 entirely
>>>> in memory, keep the data in a buffer (see below comment about tmpfs). The
>>>> expensive operation of streaming the data to the file-descriptor is only
>>>> done in step 5.
>>>>
>>>> (In the case of checkpoint in memory - it is never written to a file. There
>>>> are various optimization to do there for fast restart for which putting the
>>>> data in a file doesn't make sense).
>>>>
>>>> As for using tmpfs -- so during step 3 the state of all tasks is saved; part
>>>> of it is headers, task data, signals etc, but mostly the memory content. For
>>>> as long as the checkpoint is kept in memory (either because it is meant to
>>>> stay there, or because it is not committed to the file-descriptor yet), there
>>>> is no reason to make a copy of each (dirty) page. On the contrary - the pages
>>>> will be marked COW and a reference will be kept, as part of the checkpoint
>>>> context. Sure, you can put the rest of the data in a file in tmpfs; but you
>>>> probably don't want to copy all the pages to a file in tmpfs - that would be
>>>> wasteful.
>>> I think that memory pages need not to be dumped in step #3. They can be kept
>>> just as you mentioned in COW state in the checkpoint context, and be really
>>> dumped only in step #5.
>>>
>>>>> Anyway, we can still give a fd to sys_checkpoint() which will identify the
>>>>> checkpoint for the remaining operations. It's up to userspace to show the
>>>>> difference between two checkpoints taken (roughly) at the same time. From the
>>>>> kernel point of view, a file descriptor is enough to make the difference.
>>>> That is indeed an option. I haven't given a lot of thought to this approach,
>>>> because in Zap I use CRIDs. Three points against this approach are that:
>>>>
>>>> (1) as I said, that would require that the file descriptor remains alive for
>>>> as long as we want to keep the checkpoint alive (in memory), and
>>> Not sure that this is so bad. The checkpointer can transfer the descriptor
>>> to some daemon using the file descriptor transfer feature of UNIX sockets, and
>>> then freely exit.
>> Uhh.. that's an evil feature to begin with :o
>> In any case, it requires that extra logic.
>>
>>>> (2) if the checkpoint is taken by a process from within the container, we
>>>> create a situation where a resource held by the process (an FD), is referring
>>>> to the checkpoint itself and at the same time also referred to by the
>>>> checkpoint (because it is part of the state of a process that is in the
>>>> container...). In particular this will necessitate some special case treatment
>>>> during the restart operation.
>>> Interesting case. This means that the checkpointer would be checkpointed while
>>> inside sys_checkpoint(), and would possibly try to writeback the checkpoint
>>> after restart (going to step #5 as if it was not restarted). So the special
>>> handling is already needed there, right? Like making sys_checkpoint() return an
>> Not quite. See my first reply to Serge earlier in this thread. sys_checkpoint()
>> returns one of three values: -1 for error, positive (non zero) number which is
>> the CRID on success, and 0 when it returns from restart. Logic is analogous to
>> a fork() syscall. No special handling, definitely not in kernel space.
>
> Sorry I had not these details in mind anymore.
>
> Returning 0 in case of a restart is what I called a special handling. You won't
> do this for the other tasks, so this is special. Since userspace must cope with
> it anyway, userspace can be clever enough to avoid using the fd on restart, or
> stupid enough to destroy its checkpoint after restart.
It's a different "special handling" :) In the case of a single task that wants
to checkpoint itself - there are no other tasks. In the case of a container -
there will be only a single task that calls sys_checkpoint(), so only that task
will either get the CRID or the 0 (or an error). The other tasks will resume
whatever it was that they were doing (lol, assuming of course restart works).
So this "special handling" ends up being a two-liner: setting the return
value of the syscall for the task that called sys_checkpoint() (well, actually
it will call sys_restart() to restart, and return from sys_checkpoint() with
a value of 0 ...).
If you use an FD, you will have to checkpoint that resource as part of the
checkpoint, and restore it as part of the restart. In doing so you'll need
to specially handle it, because it has a special meaning. I agree, of course,
that it is feasible.
>
>>> error upon restart. I'm not sure that the checkpoint fd should really need a
>>> special handling in the special case of self-checkpointing, because the
>>> checkpointer should probably not try to do anything with this checkpoint after a
>>> restart, unless it reopens the checkpoint file for appending new incremental
>>> checkpoints.
>>>
>>> Anyway, we are trying to solve an issue that was explicitly forbidden in
>>> previous discussions IIUC, because the whole container is assumed to be frozen
>>> before calling sys_checkpoint(), which means that the checkpointer should live
>>> outside of the container.
>> Actually, I made the point in the mini-summit that such a functionality will be
>> useful, and I have several use cases, and two of them actually implemented
>> with Zap. The main change from a regular, freeze-entire-container checkpoint
>> is that one task - the checkpointer - will be allowed not to freeze. Since
>> it will be doing the checkpoint itself, there is no concern about it not being
>> frozen (after all, we freeze them so they don't change their state). I already
>
> I had no doubt that self-checkpoint is feasible, since we are doing this in
> Kerrighed (it's a signal that is handled at kernel-level only).
>
>> implemented this in Zap and it proved quite useful. See this paper, for example:
>> http://www.ncl.cs.columbia.edu/publications/sosp2007_dejaview.pdf
>
> Nice paper :)
>
>>>> (3) if a given task wants to keep many checkpoints in memory (again, either
>>>> permanently or shortly), it will have to keep, forever, a lot of open file
>>>> descriptors.
>>> The only problem I see here is the limitation on the number of file descriptors.
>>> Hm, hundreds of checkpoints in memory looks like memory wastage in some way.
>> "640K ought to be enough for anybody." - Bill Gates, 1981 (actually, according
>> to this page http://en.wikiquote.org/wiki/Talk:Bill_Gates, it may not have been
>> him at all ...)
>>
>> Now seriously, I have at least one use case (the details weren't published yet).
>
> So sad! We'll have to wait...
>
>>>> On the other hand, using an FD provide the advantage of a simple cleanup (FD
>>>> closed -> checkpoint data discarded) and ridding us from the need to come up
>>>> with a cleanup strategy.
>>> We would not get this for free unless we add data for this to the file
>>> descriptor. Adding something like an inotify listener (only used by the kernel)
>>> should also make it.
>> Lol .. then we stick to CRID if we have to implement something anyway :)
>
> My comment did not aim at saying "it's bad", and it actually didn't. Just giving
> an idea on how to do it.
>
>>>>> Let's consider the three use cases of CRID you mentioned earlier:
>>>>>
>>>>> 1) Checkpointing in memory:
>>>>> Actually, checkpointing in memory could also be done from userspace using tmpfs.
>>>>> Again, I agree that this kind of optimization should be discussed later. I'm
>>>>> just not convinced that this needs a CRID...
>>>> See my comment above regarding tmpfs. You are right, however, in that we could
>>>> use FD to tmpfs where the rest of the data (not pages) will be stored.
>>> See my comment above ;)
>>>
>>>>> 2) Reducing downtime of the checkpoint:
>>>>> If reducing downtime is just a matter of avoiding disk accesses, tmpfs is again
>>>>> a kind of solution. It even allows to swap if the checkpoint size is too big.
>>>>> What kind of scenario (other than incremental checkpointing) do you envision
>>>>> where multiple calls to sys_checkpoint() would use the same checkpoint object?
>>>> Again, see the comment regarding tmpfs. The actual memory copy operation between
>>>> the real pages and the space allocated in tmpfs can take substantial time for
>>>> applications with large memory (compared to merely marking the pages COW, and
>>>> amortizing the cost during regular execution of the application), besides the
>>>> extra space overhead. Also, writing tmpfs incurs visible overhead when you care
>>>> about milliseconds of downtime; I've seen that with Zap.
>>> Are those milliseconds related to pages or to the kernel structures also?
>> It's a visible overhead. I can't remember exactly how much because once I saw
>> it was expensive, I dropped that path. Even buffer allocation (page allocation
>> in case of tmpfs) could become an annoyance when it comes to low downtime, so
>> one optimization in Zap was to pre-allocate the buffers using a good estimate
>> on their sizes based on past checkpoints.
>>
>> Finally, there are use-cases in which you'd like a really-super-ultra-fast
>> checkpoint (e.g. in context), that is under a millisecond (like a partial
>> fork, to some extent); you do feel the difference then.
>>
>>>>> 3) Incremental checkpoint:
>>>>> I agree that maintaining a fd alive (in a checkpointer daemon for instance) may
>>>>> look restrictive, but I'm not sure that it is really needed to keep it alive
>>>>> between consecutive incremental checkpoints. I'd really like to see incremental
>>>>> checkpointing as an append operation to a checkpoint file. This way the file
>>>> Why ? What's the advantage of having all data in a single file as opposed to
>>>> multiple files ?
>>> - You do not have to look for the previous checkpoints using a to-be-defined
>>> naming scheme, since they are all in the file.
>> but if you *want* to look for a previous checkpoint -- you wanna return to an
>> arbitrary checkpoint in the past ? now you need to look for it.
>
> I think I already sketched how to do it.
>
>>> - Userspace makes less errors when managing incremental checkpoints.
>> have you implemented this ? did you experience issues in real life ? user
>> space will need a way to manage all of it anyway in many aspects. This will
>> be the last/least of the issues ...
>
> No it was not implemented, and I'm not going to enter a discussion about the
> weight of arguments whether they are backed by implementations or not. It just
> becomes easier to create a mess with things depending on each other created as
> separate, "freely" (userspace-decided)-named objects.
If I were to write a user-space tool to handle this, I would keep each chain
of checkpoints (from "base" and on) in a separate subdir, for example. In fact,
that's how I did it :)
>>> - You can easily create new branches by just copying the file, restarting from
>>> it, and adding incremental checkpoints to it. (Not sure this branch feature
>>> is really interesting, but it sounds funny :))
>> Using multiple files, you can create branches by adding hard-links (or soft-
>> links) to previous files. Saves space, time, and - I'd argue - easier to
>> understand and manage.
>
> Again, no doubt about the feasibility with multiple files. I admit that this
> also saves space since the common parts are shared.
>
>> Branches features is really interesting, as a matter of fact; Again I refer
>> you to the paper mentioned above.
>>
>>>> Recall that the data can be streamed, so when you start to read a file you
>>>> don't know a-priori how long is the checkpoint image, until you have parsed
>>>> it all; So you can't easily find the beginning of the, say 15th checkpoint
>>>> in that case.
>>> Good point: in append-only mode, we do not know that there are 15 checkpoints
>>> until we reach the 15th one. Perhaps append-only is too restrictive for
>>> incremental checkpoint. OTOH, do we really want to support a unique stream
>>> having multiple checkpoints? Probably not. So rewrite and append looks like a
>>> better option. An incremental checkpoint procedure could look like this:
>>>
>>> err = sys_checkpoint(base_fd, out_fd, ...)
>> Re-write + append will end up being very costly (imagine you save the data
>> on a network file system), both in time and (at least for some time) in
>> space.
>
> Hm, I'd bet that you have to read the previous checkpoints anyway, unless after
> some time things differ so much that the oldest images are not needed anymore.
Read, yes. Not re-write. And you don't need to read all of them, but cherry-pick
the pieces of interest (as indicated in the "current" checkpoint image).
>
>> Besides, this scheme begins to sound much more complex than a single file.
>> Do you really gain so much from not having multiple files, one per checkpoint ?
>
> Well, at least you are not limited by the number of open file descriptors
> (assuming that, as you mentioned earlier, you pass an array of previous images
> to compute the next incremental checkpoint).
You aren't limited by the number of open files. User space could provide an array
of <CRID, pathname> (or <serial#, pathname>) to the kernel, the kernel will
access the files as necessary.
Uhh .. hold on: you need the array of previous checkpoint to _restart_ from
an incremental checkpoint. You don't care about it when you checkpoint: instead,
you keep track in memory of (1) what changed (e.g. which pages were touched),
and (2) where to find unmodified pages in previous checkpoints. You save this
information with each new checkpoint. The data structure to describe #2 is
dynamic and changes with the execution, and easily keeps track of when older
checkpoint images become irrelevant (because all the pages they hold have been
overwritten already).
>>> where:
>>> - base_fd is a regular file containing the base checkpoint, or -1 if a full
>>> checkpoint should be done. The checkpoint could actually also live in memory,
>>> and the kernel should check that it matches the image pointed to by base_fd.
>>> - out_fd is whatever file/socket/etc. on which we should dump the checkpoint. In
>>> particular, out_fd can equal base_fd and should point to the beginning of the
>>> file if it's a regular file.
>> Excellent example. What if the checkpoint data is streamed over the network;
>> so you cannot rewrite the file after it has been streamed... Or you will have
>> to save the entire incremental history in memory :(
>
> I'm not sure to have expressed myself well: as was explained later, streaming
> output is ok for an incremental checkpoint, since you need the base checkpoint
> anyway. Unless you have a solution to build an incremental checkpoint out of
> streamed earlier checkpoints, I don't see what kind of limitation this would
> introduce.
I suspect we need to clarify the terminology: by "streamed" I mean that
the format does not require seeks (going back and forth), so that it can be
sent over a socket and make sense. While this is useful for migration, it
does not imply a migration. Consider, for instance, if you want to store the
checkpoint elsewhere you transfer the data via a socket to a daemon.
I actually wasn't thinking of streaming a series of incremental checkpoints
(from base and on) to implement migration... I simply didn't have a use-case
for that :)
>> The checkpoint - may, or may not live in memory for a long time. Usually not,
>> by the way, for the usual case it doesn't really make sense to use up memory
>> for nothing.
>
> Definitely agreed.
>
>>> If base_fd is a valid file descriptor, sys_checkpoint() would do this:
>>>
>>> #1 check the validity of the checkpoint image (possibly compare with in-memory
>>> checkpoint states),
>>>
>>> #2 (over)write the position of the next (coming) checkpoint on out_fd (see
>>> explanations below) and its sequence number as well (this actually makes
>>> sequence counters live in the checkpoint image),
>>>
>>> #3 write the contents of base_fd to out_fd, marking the records invalidated by
>>> the current checkpoint on the fly (see explanations below),
>>>
>>> #4 write the new incremental checkpoint records.
>> I truly don't think this scheme is simpler or easier to manage compared to
>> a using multiple files; and I really wonder what is the big advantage of
>> going through this non-trivial logic ?
>>
>>> This assumes that a checkpoint image has a place in the header to tell where the
>>> last checkpoint image is. Eventually, each record (task struct, vma, page, etc.)
>>> should contain a field telling which later incremental checkpoint invalidates
>>> it, so that we can restart from any intermediate checkpoint if we like.
>> My experience is that you really need incremental for memory, but not that
>> necessary for the rest of the state. So the way I did it is - whenever a
>> vma is saved, if some of its pages are found in previous checkpoints, a
>> pointer to where the page data resides is given (CRID, position) instead of
>> the page contents.
>
> So in the case I described, say we restart from checkpoint #7, the page would be
> found at the first page record of same (mm,address) that is not invalidated by a
> checkpoint having id <= 7.
Ehhh... I'm confused with this. Invalidated by checkpoint having id <= 7 ? only
a later checkpoint can invalidate a page and provide a newer version of that
page.
So to restart from checkpoint #7, you first restart from checkpoint #7 *as is*.
At this point you'll have everything setup, except that some memory contents
(hopefully much, because that means you saved a lot by doing incremental) will
be incorrect, because they weren't actually saved with checkpoint #7. But
checkpoint #7 will also have a section that describes this remaining memory and
where it can be found, e.g many entries like this:
<mm_struct id, page addr, checkpoint image id, position in file>
Now the code will scan this array, and fetch the required pages from where
they are stored.
(As mentioned before, the data structure that describes this array will be
dynamically updated as applications modify their memory).
This, of course, assumes that an incremental restart is _not_ stream-able,
and that all the files (or the entire single file) is available and seek-able.
(Still, being able to stream the (regular) checkpoint/restart operation is one
of our goals).
> I see where multiple files provide more performance however: you do not have to
> read the whole history to restart. At least this is true for non-streamed
> checkpoints. As soon as they are streamed, you can only hope that you won't need
> data living at the end of the images.
Exactly.
>
>>> Moreover, each intermediate checkpoint would contain a pointer to the start of
>>> the previous and the next one, so that any intermediate checkpoint can be easily
>>> found. This actually makes step #2 and #3 modify the checkpoint image in place,
>>> whenever based_fd and out_fd point to the same file. This disables streaming for
>>> restarts from an intermediate checkpoint, but I don't think this is a real
>>> issue, unless there are use cases outside live-migration?
>> This is not quite possible to do when the data has been streamed through a
>> socket, for example (can't rewrite); or expensive to do with a network file
>> system.
>
> Again, how do you build an incremental checkpoint out of streamed-only previous
> checkpoints?
I hope the clarification above explains that what I meant by "data being
streamed" is that the file is not seek-able.
>
>> Live migration is orthogonal to incremental checkpoint, they have nothing
>> in common. There are use cases for restarting from an intermediate checkpoint
>> like the paper I mentioned, as well as fault tolerance, debugging, forensics,
>> and more.
>
> I'm definitely sure that intermediate checkpoints are interesting. I was only
> wondering if streaming was so interesting for them.
Not in the sense of streaming for migration :)
>
>> "Streaming" also means, as I mentioned above, to the case where you send
>> the data over a socket (even if not for a live migration, but to a daemon
>> that would hold it in memory on another node, for example). In that media
>> you cannot easily rewrite the file.
>
> The point is that you need previous data when building an incremental
> checkpoint, so you will read it at least. And since it was previously stored (in
The scheme that I described above and is implemented in Zap does not require
access to previous checkpoints when building a new incremental checkpoint.
Instead, you keep some data structure in the kernel that describes the pieces
that you need to carry with you (what pages were saved, and where; when a task
exits, the data describing its mm will be discarded, of course, and so on).
> memory or whatever), you can even get its size before actually reading it,
> unless you checkpoint at such a rate that the previous checkpoint was not
> completely sent when you start the next one. If a remote daemon should really
> host the checkpoints, you can even tell the daemon which checkpoint to overwrite
> with the new one.
>
>>>> Depending on the size of your checkpoint, a single file may eventually become
>>>> very large in a short time. I have one system that takes a checkpoint every
>>>> second of an entire user-desktop ...
>>>>
>>>> One single large file is harder to manage, parse, and inspect, even with
>>>> proper user tools. If you wanted to change something inside (for whatever
>>>> reasons), that would be difficult to do. Same goes for when you want to
>>>> coalesce multiple checkpoints into a single checkpoint (e.g. to save space,
>>>> or because you don't care about some of your past)
>>> Ok, this becomes more complex, but feasible I think (see above).
>> Heh ... of course it is feasible. The question is which alternative is better ?
>
> Definitely, and probably none of them alone ;)
>
>>> Coalescing checkpoints seems rather easy as soon as checkpoints records are
>>> tagged with the first checkpoint number that invalidates them.
>>>
>>>> Ahh.. ok.. I stop here. This is not related to CRID vs. FD anymore :)
>>> You're right. Hopefully it is interesting, although a bit early to discuss :)
>> lol .. I couldn't help it.
>
> I could also have simply shut up and kept on lurking... but it was so tempting
> to enter the discussion :)
>
>>>>> could contain the entire checkpoint history. On the other hand, you are not sure
>>>>> that we could do incremental checkpoint this way, which justifies your need for
>>>>> a CRID. Perhaps you have an example?
>>>> Arguments given above. Note that even with multiple files we don't _need_
>>>> CRID, they are merely helpful. Instead, the user could be required to provide
>>>> the kernel with an array of file names, corresponding to checkpoint#0 (base),
>>>> checkpoint#2, checkpoint#3 etc; In this case, the "incremental state" that
>>>> is saved with checkpoint#4, is (a) that it is #4, and (b) for each part of
>>>> state that is found in a previous checkpoint, a reference to the serial no.
>>>> of that checkpoint is kept.
>>> See above for a solution based on a single file.
>>>
>>>> (The proposal for CRID was that instead of a serial number that starts from
>>>> 0 with every full (base) checkpoint, we use the CRID).
>>>>
>>>>> Anyway, do not take this as an attack. I just want to be well convinced that
>>>> On the contrary; your comments are definitely in place.
>>>>
>>>>> CRIDs are really needed, and are worth the effort of managing them cleanly.
>>>>> Exposing them to userspace just scares me a bit.
>>>> I'm not sure why is there an "effort of managing" them ? It's a simple
>>>> atomic counter, that won't wrap around (use 64 bit if we wish). All in-memory
>>>> checkpoint contexts will be (also) in global linked list and easily located
>>>> there by their CRID.
>>> Ok, as long as no userspace task holds such IDs across reboot or migration. How
>>> would you check this?
>> Ahhhh.... once again: CRIDs do _not_ make sense across a reboot. Not in the
>> kernel anyway. (For incremental, they can be used as hints, and userspace
>> brains are needed there anyway). A CRID identifies a checkpoint _in memory_
>> and goes away when the checkpoint is removed from memory (canceled, committed)
>> or when the container goes away, or when the RAM goes away (e.g. reboot).
>
> Again, I must have failed expressing myself well. I really understand that your
> CRIDs have no sense across reboot or migration, and I do not want to give them
> such sense. What annoys me is that userspace gets a CRID as a result of
> sys_checkpoint(), and then can give it back to the kernel to write back the
> checkpoint. IIUC, a correct userspace checkpointer would give this CRID to
> the kernel to write the checkpoint (your step #5), and then would never give it
> again to the kernel (or only if the kernel would keep it internally for later
> incremental checkpoints). The problem is not well-behaving userspace apps. The
> problem is: how does the kernel check that userspace does not give a crappy
> CRID (actually the CRID of a checkpoint in an un-related container, it would
> probably not hurt for CRIDs generated on another node/life mistakenly referring
> to locally computed checkpoints of the same container)?
Excellent point. (same as with IPC identifiers ...)
> Ok, the answer is probably here: CRIDs are local to containers, and userspace
> always gives them to the kernel with a reference on the container (whatever
> struct it is based on).
Exactly.
>
>> When I said "hints" for user space, I refer to two use cases actually. One
>> is the incremental checkpoint where this CRID will be part of the header of
>> the checkpoint file, and user space will have that number returned by the
>> syscall and could use it (e.g. to name the files, but also to keep a record
>> of when/what was checkpointed).
>> Another is when we will add the capability of file-system snapshot, then
>> we'll have a way to identify each snapshot (let's say there will be some
>> identifier to each). Then user space could keep a table with the tuples:
>> <time, filename, CRID, FSID> to keep track of checkpoint data (FSID stands
>> for filesystem snapshot identifier).
>
> Ok. I'd bet that userspace could figure out itself what is the sequence number
> of the next checkpoint, but why not.
>
> It's probably time to conclude: I am now convinced that CRIDs can be managed
> correctly without userspace being able to crash everything. I'm not strongly
> against incremental checkpoints having their own files, so I won't debate to
> death on their advantages and drawbacks. You recognized (IIUC your words) the
> feasibility of single files hosting chains of incremental checkpoints, so I will
> consider myself satisfied with your proposal.
All agreed :)
>
> Anyway, other proposals are coming (eg the one from openvz), and things may
> still move. So the discussion will probably come back in some way.
>
> Thanks for the discussion (and I'm still interested in your answers to questions
> left above).
I tried my best.
Oren.
>
> Louis
>
>>>>> Btw, if we ever decide to use CRIDs, I'd propose to manage them in some
>>>>> pseudo-filesystem, like SYSV IPC objects actually are.
>>>> Eventually, yes ;)
>>>>
>>>>> Thanks,
>>>>>
>>>>> Louis
>>>>>
>>>> Thanks for the comments and stimulating the discussion.
>>> I should have had many more discussions like this during my PhD. Yours is going
>>> to be definitely better than mine :)
>> :)
>>
>> Oren.
>>
>>> Thanks,
>>>
>>> Louis
>>>
>
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <48935B4D.7070302-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2008-08-04 10:16 ` Louis Rilling
2008-08-05 2:37 ` Oren Laadan
0 siblings, 1 reply; 37+ messages in thread
From: Louis Rilling @ 2008-08-04 10:16 UTC (permalink / raw)
To: Oren Laadan; +Cc: Linux Containers
On Fri, Aug 01, 2008 at 02:51:57PM -0400, Oren Laadan wrote:
> Louis Rilling wrote:
>> On Fri, Aug 01, 2008 at 10:15:26AM -0400, Oren Laadan wrote:
>>> Louis Rilling wrote:
>>>> On Thu, Jul 31, 2008 at 03:12:32PM -0400, Oren Laadan wrote:
Cut the less interesting (IMHO at least) history to make Dave happier ;)
>>
>> Returning 0 in case of a restart is what I called a special handling. You won't
>> do this for the other tasks, so this is special. Since userspace must cope with
>> it anyway, userspace can be clever enough to avoid using the fd on restart, or
>> stupid enough to destroy its checkpoint after restart.
>
> It's a different "special handling" :) In the case of a single task that wants
> to checkpoint itself - there are no other tasks. In the case of a container -
> there will be only a single task that calls sys_checkpoint(), so only that task
> will either get the CRID or the 0 (or an error). The other tasks will resume
> whatever it was that they were doing (lol, assuming of course restart works).
>
> So this "special handling" ends up being a two-liner: setting the return
> value of the syscall for the task that called sys_checkpoint() (well, actually
> it will call sys_restart() to restart, and return from sys_checkpoint() with
> a value of 0 ...).
I knew it, since I actually saw it in the patches you sent last week.
>
> If you use an FD, you will have to checkpoint that resource as part of the
> checkpoint, and restore it as part of the restart. In doing so you'll need
> to specially handle it, because it has a special meaning. I agree, of course,
> that it is feasible.
>
>>>> - Userspace makes less errors when managing incremental checkpoints.
>>> have you implemented this ? did you experience issues in real life ? user
>>> space will need a way to manage all of it anyway in many aspects. This will
>>> be the last/least of the issues ...
>>
>> No it was not implemented, and I'm not going to enter a discussion about the
>> weight of arguments whether they are backed by implementations or not. It just
>> becomes easier to create a mess with things depending on each other created as
>> separate, "freely" (userspace-decided)-named objects.
>
> If I were to write a user-space tool to handle this, I would keep each chain
> of checkpoints (from "base" and on) in a separate subdir, for example. In fact,
> that's how I did it :)
This is intuitive indeed. Checkpoints are already organized in a similar way in
Kerrighed, except that a notion of application (transparent to applications)
replaces the notion of container, and the kernel decides where to put the
checkpoints and how they are named (I'm not saying that this is the best
way though).
>>> Besides, this scheme begins to sound much more complex than a single file.
>>> Do you really gain so much from not having multiple files, one per checkpoint ?
>>
>> Well, at least you are not limited by the number of open file descriptors
>> (assuming that, as you mentioned earlier, you pass an array of previous images
>> to compute the next incremental checkpoint).
>
> You aren't limited by the number of open files. User space could provide an array
> of <CRID, pathname> (or <serial#, pathname>) to the kernel, the kernel will
> access the files as necessary.
But the kernel itself would have to cope with this limit (even if it is
not enforced, just to avoid consuming too much resources), or close and
reopen files when needed...
>
> Uhh .. hold on: you need the array of previous checkpoint to _restart_ from
> an incremental checkpoint. You don't care about it when you checkpoint: instead,
> you keep track in memory of (1) what changed (e.g. which pages were touched),
> and (2) where to find unmodified pages in previous checkpoints. You save this
> information with each new checkpoint. The data structure to describe #2 is
> dynamic and changes with the execution, and easily keeps track of when older
> checkpoint images become irrelevant (because all the pages they hold have been
> overwritten already).
I see. I thought that you also intended to build incremental checkpoints
from previous checkpoints only, because even if this is not fast, this
saves storage space. I agree that if you always keep necessary metadata
in kernel memory, you don't need the previous images. Actually I don't
know any incremental checkpoint scheme not using such in-memory metadata
scheme. Which does not imply that other schemes are not relevant
though...
>
>
>>>> where:
>>>> - base_fd is a regular file containing the base checkpoint, or -1 if a full
>>>> checkpoint should be done. The checkpoint could actually also live in memory,
>>>> and the kernel should check that it matches the image pointed to by base_fd.
>>>> - out_fd is whatever file/socket/etc. on which we should dump the checkpoint. In
>>>> particular, out_fd can equal base_fd and should point to the beginning of the
>>>> file if it's a regular file.
>>> Excellent example. What if the checkpoint data is streamed over the network;
>>> so you cannot rewrite the file after it has been streamed... Or you will have
>>> to save the entire incremental history in memory :(
>>
>> I'm not sure to have expressed myself well: as was explained later, streaming
>> output is ok for an incremental checkpoint, since you need the base checkpoint
>> anyway. Unless you have a solution to build an incremental checkpoint out of
>> streamed earlier checkpoints, I don't see what kind of limitation this would
>> introduce.
>
> I suspect we need to clarify the terminology: by "streamed" I mean that
> the format does not require seeks (going back and forth), so that it can be
> sent over a socket and make sense. While this is useful for migration, it
> does not imply a migration. Consider, for instance, if you want to store the
> checkpoint elsewhere you transfer the data via a socket to a daemon.
My definition of "streaming" was exactly "non-seekable", not only for
migration.
>
> I actually wasn't thinking of streaming a series of incremental checkpoints
> (from base and on) to implement migration... I simply didn't have a use-case
> for that :)
This could be useful however. Since incremental checkpoint is faster
this could reduce down-time.
>>>> This assumes that a checkpoint image has a place in the header to tell where the
>>>> last checkpoint image is. Eventually, each record (task struct, vma, page, etc.)
>>>> should contain a field telling which later incremental checkpoint invalidates
>>>> it, so that we can restart from any intermediate checkpoint if we like.
>>> My experience is that you really need incremental for memory, but not that
>>> necessary for the rest of the state. So the way I did it is - whenever a
>>> vma is saved, if some of its pages are found in previous checkpoints, a
>>> pointer to where the page data resides is given (CRID, position) instead of
>>> the page contents.
>>
>> So in the case I described, say we restart from checkpoint #7, the page would be
>> found at the first page record of same (mm,address) that is not invalidated by a
>> checkpoint having id <= 7.
>
> Ehhh... I'm confused with this. Invalidated by checkpoint having id <= 7 ? only
> a later checkpoint can invalidate a page and provide a newer version of that
> page.
Sorry, I was not clear enough. I was talking about restarting from an
incremental checkpoint in the case where all the sequence of checkpoints
is stored in a single file. So I meant "not invalidated by a checkpoint
having an id <= 7, eg. 5". That is, when you restart from (possibly
intermediate) incremental checkpoint 7 and walk the file containing the sequence
of checkpoints, some records are invalidated by incremental checkpoints
having ids > 7 (eg checkpoint 9) and thus are part of checkpoint 7, some records
are not invalidated by any checkpoint yet and thus are also part of checkpoint
7, and the other records were invalidated by checkpoints having ids <= 7
(for instance 5, 3, etc.) and thus are not part of checkpoint 7.
>
> So to restart from checkpoint #7, you first restart from checkpoint #7 *as is*.
> At this point you'll have everything setup, except that some memory contents
> (hopefully much, because that means you saved a lot by doing incremental) will
> be incorrect, because they weren't actually saved with checkpoint #7. But
> checkpoint #7 will also have a section that describes this remaining memory and
> where it can be found, e.g many entries like this:
>
> <mm_struct id, page addr, checkpoint image id, position in file>
>
> Now the code will scan this array, and fetch the required pages from where
> they are stored.
>
> (As mentioned before, the data structure that describes this array will be
> dynamically updated as applications modify their memory).
>
> This, of course, assumes that an incremental restart is _not_ stream-able,
> and that all the files (or the entire single file) is available and seek-able.
> (Still, being able to stream the (regular) checkpoint/restart operation is one
> of our goals).
Ok. The single file approach, with records tagged with the first
checkpoint id not using them anymore (as mentioned again above), makes
incremental restarts streamable (pages that are part of the checkpoint
can be stored in an array until they are mapped), although this makes an
incremental restart read the whole checkpoint sequence.
In the multiple files approach, we could first restore all but missing
memory pages from the incremental checkpoint file, at the same time
record in a temporary array which pages are missing and where to find
them, sorting the entries by checkpoint file and location in the file,
and in a second pass read sequentially the needed checkpoint files and
fetch the needed pages. Since the array is sorted by location in the
files, this second pass would not need costly lookups in the array to
figure out whether a page is needed or not.
>>
>> Again, how do you build an incremental checkpoint out of streamed-only previous
>> checkpoints?
>
> I hope the clarification above explains that what I meant by "data being
> streamed" is that the file is not seek-able.
I hope that the clarification above explains what I expected from
incremental checkpoints :)
>
>>
>>> Live migration is orthogonal to incremental checkpoint, they have nothing
>>> in common. There are use cases for restarting from an intermediate checkpoint
>>> like the paper I mentioned, as well as fault tolerance, debugging, forensics,
>>> and more.
>>
>> I'm definitely sure that intermediate checkpoints are interesting. I was only
>> wondering if streaming was so interesting for them.
>
> Not in the sense of streaming for migration :)
But possibly in the sense of streaming from a remote store? Maybe
performance is not so critical in those cases?
>>
>> The point is that you need previous data when building an incremental
>> checkpoint, so you will read it at least. And since it was previously stored (in
>
> The scheme that I described above and is implemented in Zap does not require
> access to previous checkpoints when building a new incremental checkpoint.
> Instead, you keep some data structure in the kernel that describes the pieces
> that you need to carry with you (what pages were saved, and where; when a task
> exits, the data describing its mm will be discarded, of course, and so on).
This is because you probably decided that a mechanism in the kernel that saves
storage space was not interesting if it does not improve speed. As a
consequence you need to keep metadata in kernel memory in order to do
incremental checkpoint. Maybe saving storage space without considering
speed could equally be done from userspace with sort of checkpoint diff
tools that would create an incremental checkpoint 2' from two full
checkpoints 1 and 2.
Thanks,
Louis
--
Dr Louis Rilling Kerlabs - IRISA
Skype: louis.rilling Campus Universitaire de Beaulieu
Phone: (+33|0) 2 99 84 71 52 Avenue du General Leclerc
Fax: (+33|0) 2 99 84 71 71 35042 Rennes CEDEX - France
http://www.kerlabs.com/
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
2008-08-04 10:16 ` Louis Rilling
@ 2008-08-05 2:37 ` Oren Laadan
[not found] ` <4897BCE0.1080508-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
0 siblings, 1 reply; 37+ messages in thread
From: Oren Laadan @ 2008-08-05 2:37 UTC (permalink / raw)
To: Louis.Rilling-aw0BnHfMbSpBDgjK7y7TUQ; +Cc: Linux Containers
Louis Rilling wrote:
> On Fri, Aug 01, 2008 at 02:51:57PM -0400, Oren Laadan wrote:
>> Louis Rilling wrote:
>>> On Fri, Aug 01, 2008 at 10:15:26AM -0400, Oren Laadan wrote:
>>>> Louis Rilling wrote:
>>>>> On Thu, Jul 31, 2008 at 03:12:32PM -0400, Oren Laadan wrote:
>
> Cut the less interesting (IMHO at least) history to make Dave happier ;)
>
>>> Returning 0 in case of a restart is what I called a special handling. You won't
>>> do this for the other tasks, so this is special. Since userspace must cope with
>>> it anyway, userspace can be clever enough to avoid using the fd on restart, or
>>> stupid enough to destroy its checkpoint after restart.
>> It's a different "special handling" :) In the case of a single task that wants
>> to checkpoint itself - there are no other tasks. In the case of a container -
>> there will be only a single task that calls sys_checkpoint(), so only that task
>> will either get the CRID or the 0 (or an error). The other tasks will resume
>> whatever it was that they were doing (lol, assuming of course restart works).
>>
>> So this "special handling" ends up being a two-liner: setting the return
>> value of the syscall for the task that called sys_checkpoint() (well, actually
>> it will call sys_restart() to restart, and return from sys_checkpoint() with
>> a value of 0 ...).
>
> I knew it, since I actually saw it in the patches you sent last week.
>
>> If you use an FD, you will have to checkpoint that resource as part of the
>> checkpoint, and restore it as part of the restart. In doing so you'll need
>> to specially handle it, because it has a special meaning. I agree, of course,
>> that it is feasible.
>>
>
>>>>> - Userspace makes less errors when managing incremental checkpoints.
>>>> have you implemented this ? did you experience issues in real life ? user
>>>> space will need a way to manage all of it anyway in many aspects. This will
>>>> be the last/least of the issues ...
>>> No it was not implemented, and I'm not going to enter a discussion about the
>>> weight of arguments whether they are backed by implementations or not. It just
>>> becomes easier to create a mess with things depending on each other created as
>>> separate, "freely" (userspace-decided)-named objects.
>> If I were to write a user-space tool to handle this, I would keep each chain
>> of checkpoints (from "base" and on) in a separate subdir, for example. In fact,
>> that's how I did it :)
>
> This is intuitive indeed. Checkpoints are already organized in a similar way in
> Kerrighed, except that a notion of application (transparent to applications)
> replaces the notion of container, and the kernel decides where to put the
> checkpoints and how they are named (I'm not saying that this is the best
> way though).
>
>>>> Besides, this scheme begins to sound much more complex than a single file.
>>>> Do you really gain so much from not having multiple files, one per checkpoint ?
>>> Well, at least you are not limited by the number of open file descriptors
>>> (assuming that, as you mentioned earlier, you pass an array of previous images
>>> to compute the next incremental checkpoint).
>> You aren't limited by the number of open files. User space could provide an array
>> of <CRID, pathname> (or <serial#, pathname>) to the kernel, the kernel will
>> access the files as necessary.
>
> But the kernel itself would have to cope with this limit (even if it is
> not enforced, just to avoid consuming too much resources), or close and
> reopen files when needed...
You got it - close and reopen as needed, with an LRU policy to decide which open file
to close. My experience so far is that you rarely need more than 100 open files.
>
>> Uhh .. hold on: you need the array of previous checkpoint to _restart_ from
>> an incremental checkpoint. You don't care about it when you checkpoint: instead,
>> you keep track in memory of (1) what changed (e.g. which pages were touched),
>> and (2) where to find unmodified pages in previous checkpoints. You save this
>> information with each new checkpoint. The data structure to describe #2 is
>> dynamic and changes with the execution, and easily keeps track of when older
>> checkpoint images become irrelevant (because all the pages they hold have been
>> overwritten already).
>
> I see. I thought that you also intended to build incremental checkpoints
> from previous checkpoints only, because even if this is not fast, this
> saves storage space. I agree that if you always keep necessary metadata
> in kernel memory, you don't need the previous images. Actually I don't
> know any incremental checkpoint scheme not using such in-memory metadata
> scheme. Which does not imply that other schemes are not relevant
> though...
>
>>
>>>>> where:
>>>>> - base_fd is a regular file containing the base checkpoint, or -1 if a full
>>>>> checkpoint should be done. The checkpoint could actually also live in memory,
>>>>> and the kernel should check that it matches the image pointed to by base_fd.
>>>>> - out_fd is whatever file/socket/etc. on which we should dump the checkpoint. In
>>>>> particular, out_fd can equal base_fd and should point to the beginning of the
>>>>> file if it's a regular file.
>>>> Excellent example. What if the checkpoint data is streamed over the network;
>>>> so you cannot rewrite the file after it has been streamed... Or you will have
>>>> to save the entire incremental history in memory :(
>>> I'm not sure to have expressed myself well: as was explained later, streaming
>>> output is ok for an incremental checkpoint, since you need the base checkpoint
>>> anyway. Unless you have a solution to build an incremental checkpoint out of
>>> streamed earlier checkpoints, I don't see what kind of limitation this would
>>> introduce.
>> I suspect we need to clarify the terminology: by "streamed" I mean that
>> the format does not require seeks (going back and forth), so that it can be
>> sent over a socket and make sense. While this is useful for migration, it
>> does not imply a migration. Consider, for instance, if you want to store the
>> checkpoint elsewhere you transfer the data via a socket to a daemon.
>
> My definition of "streaming" was exactly "non-seekable", not only for
> migration.
>
>> I actually wasn't thinking of streaming a series of incremental checkpoints
>> (from base and on) to implement migration... I simply didn't have a use-case
>> for that :)
>
> This could be useful however. Since incremental checkpoint is faster
> this could reduce down-time.
Naturally incremental checkpoint reduces downtime; however since each checkpoint
is taken at a different time, they can be streamed -- transferred over the
network -- as they are taken. This gives more flexibility, and they can still, if
you wish, easily be transformed into a single long stream.
Actually, this is a good argument in favor of using multiple files: they are a
more flexible approach and can always be easily transformed to a single long
stream, while the reverse isn't so.
>
>>>>> This assumes that a checkpoint image has a place in the header to tell where the
>>>>> last checkpoint image is. Eventually, each record (task struct, vma, page, etc.)
>>>>> should contain a field telling which later incremental checkpoint invalidates
>>>>> it, so that we can restart from any intermediate checkpoint if we like.
>>>> My experience is that you really need incremental for memory, but not that
>>>> necessary for the rest of the state. So the way I did it is - whenever a
>>>> vma is saved, if some of its pages are found in previous checkpoints, a
>>>> pointer to where the page data resides is given (CRID, position) instead of
>>>> the page contents.
>>> So in the case I described, say we restart from checkpoint #7, the page would be
>>> found at the first page record of same (mm,address) that is not invalidated by a
>>> checkpoint having id <= 7.
>> Ehhh... I'm confused with this. Invalidated by checkpoint having id <= 7 ? only
>> a later checkpoint can invalidate a page and provide a newer version of that
>> page.
>
> Sorry, I was not clear enough. I was talking about restarting from an
> incremental checkpoint in the case where the whole sequence of checkpoints
> is stored in a single file. So I meant "not invalidated by a checkpoint
> having an id <= 7, eg. 5". That is, when you restart from (possibly
> intermediate) incremental checkpoint 7 and walk the file containing the sequence
> of checkpoints, some records are invalidated by incremental checkpoints
> having ids > 7 (eg checkpoint 9) and thus are part of checkpoint 7, some records
> are not invalidated by any checkpoint yet and thus are also part of checkpoint
> 7, and the other records were invalidated by checkpoints having ids <= 7
> (for instance 5, 3, etc.) and thus are not part of checkpoint 7.
>
>> So to restart from checkpoint #7, you first restart from checkpoint #7 *as is*.
>> At this point you'll have everything setup, except that some memory contents
>> (hopefully much, because that means you saved a lot by doing incremental) will
>> be incorrect, because they weren't actually saved with checkpoint #7. But
>> checkpoint #7 will also have a section that describes this remaining memory and
>> where it can be found, e.g many entries like this:
>>
>> <mm_struct id, page addr, checkpoint image id, position in file>
>>
>> Now the code will scan this array, and fetch the required pages from where
>> they are stored.
>>
>> (As mentioned before, the data structure that describes this array will be
>> dynamically updated as applications modify their memory).
>>
>> This, of course, assumes that an incremental restart is _not_ stream-able,
>> and that all the files (or the entire single file) is available and seek-able.
>> (Still, being able to stream the (regular) checkpoint/restart operation is one
>> of our goals).
>
> Ok. The single file approach, with records tagged with the first
> checkpoint id not using them anymore (as mentioned again above), makes
> incremental restarts streamable (pages that are part of the checkpoint
> can be stored in an array until they are mapped), although this makes an
> incremental restart read the whole checkpoint sequence.
>
> In the multiple files approach, we could first restore all but missing
> memory pages from the incremental checkpoint file, at the same time
> record in a temporary array which pages are missing and where to find
> them, sorting the entries by checkpoint file and location in the file,
> and in a second pass read sequentially the needed checkpoint files and
> fetch the needed pages. Since the array is sorted by location in the
> files, this second pass would not need costly lookups in the array to
> figure out whether a page is needed or not.
>
>>> Again, how do you build an incremental checkpoint out of streamed-only previous
>>> checkpoints?
>> I hope the clarification above explains that what I meant by "data being
>> steamed" is that the file is not seek-able.
streamed, that is :)
>
> I hope that the clarification above explains what I expected from
> incremental checkpoints :)
>
>>>> Live migration is orthogonal to incremental checkpoint, they have nothing
>>>> in common. There are use cases for restarting from an intermediate checkpoint
>>>> like the paper I mentioned, as well as fault tolerance, debugging, forensics,
>>>> and more.
>>> I'm definitely sure that intermediate checkpoints are interesting. I was only
>>> wondering if streaming was so interesting for them.
>> Not in the sense of streaming for migration :)
>
> But possibly in the sense of streaming from a remote store? Maybe
> performance is not so critical in those cases?
>
>>> The point is that you need previous data when building an incremental
>>> checkpoint, so you will read it at least. And since it was previously stored (in
>> The scheme that I described above and is implemented in Zap does not require
>> access to previous checkpoints when building a new incremental checkpoint.
>> Instead, you keep some data structure in the kernel that describes the pieces
>> that you need to carry with you (what pages were saved, and where; when a task
>> exits, the data describing its mm will be discarded, of course, and so on).
>
> This is because you probably decided that a mechanism in the kernel that saves
> storage space was not interesting if it does not improve speed. As a
> consequence you need to keep metadata in kernel memory in order to do
> incremental checkpoint. Maybe saving storage space without considering
> speed could equally be done from userspace with sort of checkpoint diff
> tools that would create an incremental checkpoint 2' from two full
> checkpoints 1 and 2.
Good point. In fact, the meta data is not only kept in memory, but also saved
with each incremental checkpoint (well, its version at checkpoint time), so
that restart would know where to find older data. So it is already transferred
to user space; we may as well provide the option to keep it only in user space.
Oren.
>
> Thanks,
>
> Louis
>
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <4897BCE0.1080508-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2008-08-05 3:51 ` Joseph Ruscio
[not found] ` <1FA56146-7C30-4C36-982D-A50AA8BC8392-ccALPSaRSA5Wk0Htik3J/w@public.gmane.org>
2008-08-05 9:32 ` Louis Rilling
1 sibling, 1 reply; 37+ messages in thread
From: Joseph Ruscio @ 2008-08-05 3:51 UTC (permalink / raw)
To: Oren Laadan; +Cc: Louis.Rilling-aw0BnHfMbSpBDgjK7y7TUQ, Linux Containers
On Aug 4, 2008, at 7:37 PM, Oren Laadan wrote:
>>>> The point is that you need previous data when building an
>>>> incremental
>>>> checkpoint, so you will read it at least. And since it was
>>>> previously stored (in
>>> The scheme that I described above and is implemented in Zap does
>>> not require
>>> access to previous checkpoints when building a new incremental
>>> checkpoint.
>>> Instead, you keep some data structure in the kernel that describes
>>> the pieces
>>> that you need to carry with you (what pages were saved, and where;
>>> when a task
>>> exits, the data describing its mm will be discarded, of course,
>>> and so on).
>>
>> This is because you probably decided that a mechanism in the kernel
>> that saves
>> storage space was not interesting if it does not improve speed. As a
>> consequence you need to keep metadata in kernel memory in order to do
>> incremental checkpoint. Maybe saving storage space without
>> considering
>> speed could equally be done from userspace with sort of checkpoint
>> diff
>> tools that would create an incremental checkpoint 2' from two full
>> checkpoints 1 and 2.
>
> Good point. In fact, the meta data is not only kept in memory, but
> also saved
> with each incremental checkpoint (well, its version at checkpoint
> time), so
> that restart would know where to find older data. So it is already
> transfered
> to user space; we may as well provide the option to keep it only in
> user space.
As somewhat of a tangent to this discussion, I've been giving some
thought to the general strategy we talked about during the summit. The
checkpointing solution we built at Evergrid sits completely in
userspace and is solely focused on checkpointing parallel codes (e.g.
MPI). That approach required us to virtualize a whole slew of
resources (e.g. PIDs) that will be far better supported in the kernel
through this effort. On the other hand, there isn't anything inherent
to checkpointing the memory in a process that requires it to be in a
kernel. During a restart, you can map and load the memory from the
checkpoint file in userspace as easily as in the kernel. Since the
cost of checkpointing HPC codes is fairly dominated by checkpointing
their large memory footprints, memory checkpointing is an area of
ongoing research with many different solutions.
It might be desirable for the checkpointing implementation to be
modular enough that a userspace application or library could select to
handle certain resources on their own. Memory is the primary one that
comes to mind.
-joe
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <1FA56146-7C30-4C36-982D-A50AA8BC8392-ccALPSaRSA5Wk0Htik3J/w@public.gmane.org>
@ 2008-08-05 9:19 ` Louis Rilling
2008-08-05 16:20 ` Oren Laadan
2008-08-05 16:23 ` Dave Hansen
1 sibling, 1 reply; 37+ messages in thread
From: Louis Rilling @ 2008-08-05 9:19 UTC (permalink / raw)
To: Joseph Ruscio; +Cc: Linux Containers
On Mon, Aug 04, 2008 at 08:51:37PM -0700, Joseph Ruscio wrote:
> As somewhat of a tangent to this discussion, I've been giving some
> thought to the general strategy we talked about during the summit. The
> checkpointing solution we built at Evergrid sits completely in userspace
> and is soley focused on checkpointing parallel codes (e.g. MPI). That
> approach required us to virtualize a whole slew of resources (e.g. PIDs)
> that will be far better supported in the kernel through this effort. On
> the other hand, there isn't anything inherent to checkpointing the memory
> in a process that requires it to be in a kernel. During a restart, you
> can map and load the memory from the checkpoint file in userspace as
> easily as in the kernel. Since the cost of checkpointing HPC codes is
Hmm, for unusual mappings this may be not so easy to reproduce from
userspace if binaries are statically linked. I agree that with
dynamically linked applications, LD_PRELOAD allows one to record the
actual memory mappings and restore them at restart.
> fairly dominated by checkpointing their large memory footprints, memory
> checkpointing is an area of ongoing research with many different
> solutions.
>
> It might be desirable for the checkpointing implementation to be modular
> enough that a userspace application or library could select to handle
> certain resources on their own. Memory is the primary one that comes to
> mind.
I definitely agree with you about this flexibility. Actually in
Kerrighed, during the next 3 years, we are going to study an API for
collaborative checkpoint/restart between kernel and userspace, in order to
allow such HPC apps to checkpoint huge memory efficiently (eg. when reaching
states where saving small parts is enough), or to rebuild their data from
partial/older states.
I hope that this study will bring useful ideas that could be applied to
containers as well.
Thanks,
Louis
--
Dr Louis Rilling Kerlabs - IRISA
Skype: louis.rilling Campus Universitaire de Beaulieu
Phone: (+33|0) 2 99 84 71 52 Avenue du General Leclerc
Fax: (+33|0) 2 99 84 71 71 35042 Rennes CEDEX - France
http://www.kerlabs.com/
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <4897BCE0.1080508-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2008-08-05 3:51 ` Joseph Ruscio
@ 2008-08-05 9:32 ` Louis Rilling
1 sibling, 0 replies; 37+ messages in thread
From: Louis Rilling @ 2008-08-05 9:32 UTC (permalink / raw)
To: Oren Laadan; +Cc: Linux Containers
On Mon, Aug 04, 2008 at 10:37:20PM -0400, Oren Laadan wrote:
> Louis Rilling wrote:
>> On Fri, Aug 01, 2008 at 02:51:57PM -0400, Oren Laadan wrote:
>>> Louis Rilling wrote:
>>>> On Fri, Aug 01, 2008 at 10:15:26AM -0400, Oren Laadan wrote:
>>> I actually wasn't thinking of streaming a series of incremental checkpoints
>>> (from base and on) to implement migration... I simply didn't have a use-case
>>> for that :)
>>
>> This could be useful however. Since incremental checkpoint is faster
>> this could reduce down-time.
>
> Naturally incremental checkpoint reduces downtime; however since each checkpoint
> is taken at a different time, they can be streamed -- transferred over the
> network -- as they are taken. This gives more flexibility and can still, if
> you wish, can easily be transformed to a single long stream.
>
> Actually, this is a good argument in favor of using multiple files: they are a
> more flexible approach and can always be easily transformed to a single long
> stream, while the reverse isn't so.
Yes, the reverse is just as easy: rebuilding a full checkpoint with a given id
#id consists simply of removing the records that are tagged as invalidated
by checkpoints having ids <= #id. This is actually what restart should
do :)
>>>> The point is that you need previous data when building an incremental
>>>> checkpoint, so you will read it at least. And since it was previously stored (in
>>> The scheme that I described above and is implemented in Zap does not require
>>> access to previous checkpoints when building a new incremental checkpoint.
>>> Instead, you keep some data structure in the kernel that describes the pieces
>>> that you need to carry with you (what pages were saved, and where; when a task
>>> exits, the data describing its mm will be discarded, of course, and so on).
>>
>> This is because you probably decided that a mechanism in the kernel that saves
>> storage space was not interesting if it does not improve speed. As a
>> consequence you need to keep metadata in kernel memory in order to do
>> incremental checkpoint. Maybe saving storage space without considering
>> speed could equally be done from userspace with sort of checkpoint diff
>> tools that would create an incremental checkpoint 2' from two full
>> checkpoints 1 and 2.
>
> Good point. In fact, the meta data is not only kept in memory, but also saved
> with each incremental checkpoint (well, its version at checkpoint time), so
> that restart would know where to find older data. So it is already transfered
> to user space; we may as well provide the option to keep it only in user space.
That is, userspace should give it back to the kernel before doing the
next incremental checkpoint?
Louis
--
Dr Louis Rilling Kerlabs - IRISA
Skype: louis.rilling Campus Universitaire de Beaulieu
Phone: (+33|0) 2 99 84 71 52 Avenue du General Leclerc
Fax: (+33|0) 2 99 84 71 71 35042 Rennes CEDEX - France
http://www.kerlabs.com/
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
2008-08-05 9:19 ` Louis Rilling
@ 2008-08-05 16:20 ` Oren Laadan
[not found] ` <48987DE7.3060408-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
0 siblings, 1 reply; 37+ messages in thread
From: Oren Laadan @ 2008-08-05 16:20 UTC (permalink / raw)
To: Louis.Rilling-aw0BnHfMbSpBDgjK7y7TUQ; +Cc: Joseph Ruscio, Linux Containers
Louis Rilling wrote:
> On Mon, Aug 04, 2008 at 08:51:37PM -0700, Joseph Ruscio wrote:
>> As somewhat of a tangent to this discussion, I've been giving some
>> thought to the general strategy we talked about during the summit. The
>> checkpointing solution we built at Evergrid sits completely in userspace
>> and is soley focused on checkpointing parallel codes (e.g. MPI). That
>> approach required us to virtualize a whole slew of resources (e.g. PIDs)
>> that will be far better supported in the kernel through this effort. On
>> the other hand, there isn't anything inherent to checkpointing the memory
>> in a process that requires it to be in a kernel. During a restart, you
>> can map and load the memory from the checkpoint file in userspace as
>> easily as in the kernel. Since the cost of checkpointing HPC codes is
>
> Hmm, for unusual mappings this may be not so easy to reproduce from
> userspace if binaries are statically linked. I agree that with
> dynamically linked applications, LD_PRELOAD allows one to record the
> actual memory mappings and restore them at restart.
I second that: unusual mapping can be hard to reproduce.
Besides, several important optimizations are difficult to do in user-space,
if at all possible:
* detecting sharing (unless the application itself gives the OS an advice -
more on this below); In the kernel, this is detected easily using the inode
that represents a shared memory region in SHMFS
* detecting (and restoring) COW sharing: process A forks process B, so at
least initially the private memory of both is the same via COW; this can be
optimized to save the memory of only one instead of both, and restore this
COW relationship on restart.
* reducing checkpoint downtime using the COW technique that I described at
the summit: when processes are frozen, mark all dirty pages COW and keep a
reference, and write-back the contents only after the container is unfrozen.
Eh... and, yes, live migration :)
>
>> fairly dominated by checkpointing their large memory footprints, memory
>> checkpointing is an area of ongoing research with many different
>> solutions.
>>
>> It might be desirable for the checkpointing implementation to be modular
>> enough that a userspace application or library could select to handle
>> certain resources on their own. Memory is the primary one that comes to
>> mind.
>
> I definitely agree with you about this flexibility. Actually in
> Kerrighed, during the next 3 years, we are going to study an API for
> collaborative checkpoint/restart between kernel and userspace, in order to
> allow such HPC apps to checkpoint huge memory efficiently (eg. when reaching
> states where saving small parts is enough), or to rebuild their data from
> partial/older states.
> I hope that this study will bring useful ideas that could be applied to
> containers as well.
Indeed it would add flexibility if an interface exists. One example is for
network connections in the case of a distributed MPI application, or if a
specific (otherwise unsupported for CR) device is involved.
As for memory, a clever way to hint the system about what parts of memory
are important, is to use something like an madvise() with a new flag, to
mark areas of interest/dis-interest. Throw in a mechanism to notify tasks
(who request to be notified) of an upcoming checkpoint, end of successful
checkpoint, and completion of a successful restart - and you've got it all.
Oren.
>
> Thanks,
>
> Louis
>
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <1FA56146-7C30-4C36-982D-A50AA8BC8392-ccALPSaRSA5Wk0Htik3J/w@public.gmane.org>
2008-08-05 9:19 ` Louis Rilling
@ 2008-08-05 16:23 ` Dave Hansen
2008-08-06 16:15 ` Joseph Ruscio
2008-08-08 17:20 ` Joseph Ruscio
1 sibling, 2 replies; 37+ messages in thread
From: Dave Hansen @ 2008-08-05 16:23 UTC (permalink / raw)
To: Joseph Ruscio; +Cc: Linux Containers, Louis.Rilling-aw0BnHfMbSpBDgjK7y7TUQ
On Mon, 2008-08-04 at 20:51 -0700, Joseph Ruscio wrote:
> It might be desirable for the checkpointing implementation to be
> modular enough that a userspace application or library could select to
> handle certain resources on their own. Memory is the primary one that
> comes to mind.
How would you propose making it modular?
-- Dave
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <48987DE7.3060408-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2008-08-06 15:41 ` Joseph Ruscio
[not found] ` <3A99F254-E9B3-484B-85B0-29023ADA04C4-ccALPSaRSA5Wk0Htik3J/w@public.gmane.org>
0 siblings, 1 reply; 37+ messages in thread
From: Joseph Ruscio @ 2008-08-06 15:41 UTC (permalink / raw)
To: Oren Laadan; +Cc: Louis.Rilling-aw0BnHfMbSpBDgjK7y7TUQ, Linux Containers
On Aug 5, 2008, at 9:20 AM, Oren Laadan wrote:
>
>
> Louis Rilling wrote:
>> On Mon, Aug 04, 2008 at 08:51:37PM -0700, Joseph Ruscio wrote:
>>> As somewhat of a tangent to this discussion, I've been giving some
>>> thought to the general strategy we talked about during the summit.
>>> The
>>> checkpointing solution we built at Evergrid sits completely in
>>> userspace
>>> and is soley focused on checkpointing parallel codes (e.g. MPI).
>>> That
>>> approach required us to virtualize a whole slew of resources (e.g.
>>> PIDs)
>>> that will be far better supported in the kernel through this
>>> effort. On
>>> the other hand, there isn't anything inherent to checkpointing the
>>> memory
>>> in a process that requires it to be in a kernel. During a restart,
>>> you
>>> can map and load the memory from the checkpoint file in userspace as
>>> easily as in the kernel. Since the cost of checkpointing HPC codes
>>> is
>>
>> Hmm, for unusual mappings this may be not so easy to reproduce from
>> userspace if binaries are statically linked. I agree that with
>> dynamically linked applications, LD_PRELOAD allows one to record the
>> actual memory mappings and restore them at restart.
>
> I second that: unusual mapping can be hard to reproduce.
>
> Besides, several important optimization are difficult to do in user-
> space,
> if at all possible:
>
> * detecting sharing (unless the application itself gives the OS an
> advice -
> more on this below); In the kernel, this is detected easily using
> the inode
> that represents a shared memory region in SHMFS
>
>
> * detecting (and restoring) COW sharing: process A forks process B,
> so at
> least initially the private memory of both is the same via COW; this
> can be
> optimized to save the memory of only one instead of both, and
> restore this
> COW relationship on restart.
Both of these are possible from userspace, but agreeably more
complicated. Also agree that statically linked binaries are not really
feasible in user-space.
> * reducing checkpoint downtime using the COW technique that I
> described at
> the summit: when processes are frozen, mark all dirty pages COW and
> keep a
> reference, and write-back the contents only after the container is
> unfrozen.
Our user-space implementation already has a complete concurrent (i.e.
COW) checkpointing implementation where the "freeze" period lasts only
the length of time it takes to mprotect() the allocated memory
regions. So I don't necessarily agree that these optimizations require
kernel access.
> Eh... and, yes, live migration :)
User-space live migration of a "batch" process e.g. one taking place
in an MPI job is quite trivial. User-space live migration of something
like a database is not that hard assuming you have a cooperative load
balancer or proxy on the front end.
I'm not advocating for implementing this in user-space. I am in
complete agreement that this effort should result in code that
completely checkpoints a Container in the kernel. My question was
whether there are situations where it would be advantageous for user-
space to have the option of instructing/hinting the kernel to ignore
certain resources that it would handle itself. Most of the use-cases
I'm thinking of come from the different styles of implementations I've
seen in the HPC space, where our implementation (and a lot of others)
are focused.
MPI codes require coordination between all the different processes
taking part to ensure that the checkpoints are globally consistent.
MPI implementations that run on hardware such as Infiniband would most
likely want the container checkpointing to ignore all of the pinned
memory associated with the RDMA operations so that the coordination
and recreation of MPI communicator state could be handled in user-
space. When working with inflexible process checkpointers, MPI
coordination routines often must completely teardown all communicator
state prior to invoking the checkpoint, and then recreate all the
communicators after the checkpoint. On very large scale jobs, this is
expensive.
As another example HPC applications can create local scratch files of
several GB in /tmp. It may not be necessary to migrate these files,
but if user-space has no way to mark a particular file, "local files",
or files in general as being ignored, then we'll have to copy these
during a migration or a checkpoint.
I don't suppose anyone is attending Linuxworld in San Francisco this
week? I'd be more than happy to grab a coffee and talk about some of
this. I stopped by the OpenVZ booth but none of the devs are around.
thanks,
Joe
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
2008-08-05 16:23 ` Dave Hansen
@ 2008-08-06 16:15 ` Joseph Ruscio
[not found] ` <FE4D936E-06F1-45D2-8E7C-85D87149BDC0-ccALPSaRSA5Wk0Htik3J/w@public.gmane.org>
2008-08-08 17:20 ` Joseph Ruscio
1 sibling, 1 reply; 37+ messages in thread
From: Joseph Ruscio @ 2008-08-06 16:15 UTC (permalink / raw)
To: Dave Hansen; +Cc: Linux Containers, Louis.Rilling-aw0BnHfMbSpBDgjK7y7TUQ
On Aug 5, 2008, at 9:23 AM, Dave Hansen wrote:
> On Mon, 2008-08-04 at 20:51 -0700, Joseph Ruscio wrote:
>> It might be desirable for the checkpointing implementation to be
>> modular enough that a userspace application or library could select
>> to
>> handle certain resources on their own. Memory is the primary one that
>> comes to mind.
>
> How would you propose making it modular?
>
> -- Dave
>
Well it seems to me that the initial focus here is in live migration
of traditional enterprise applications, e.g. databases, app-servers,
etc. I think this is the right focus given how much utility the
general enterprise is finding in capabilities like VMotion. Providing
this mobility to applications without the overhead of traditional VM's
would be very valuable.
On the other hand, I've been primarily focused on checkpointing large-
scale MPI jobs to provide fault tolerance, and that use-case is
somewhat different than the live-migration one. These checkpoints have
huge RAM footprints (in-core checkpointing is not an option), require
coordination across large numbers of servers, some number of open
files on an enormous parallel filesystem, and some scratch files open
on the local disk/ramdisk. They generally have very simple process
trees with one process per core, or one process with a thread for each
core.
To support these kinds of jobs, one would ideally instruct the
Container checkpointer to ignore network resources, dynamically
allocated private memory, and the contents of open files. You'd be
relying on the Container checkpointer to recreate processes, open file
descriptors, threads, thread synchronization primitives, IPC
mechanisms (including shm).
As far as the mechanism is concerned, I'd defer to the more
experienced kernel developers here. I assume that passing a bitmask of
flags as an argument into the checkpoint syscall would be frowned
upon, and anyways redundant, as it's unlikely that the mask would
change within a container from checkpoint to checkpoint. If each
container is going to have a CGroup filesystem directory, then we
could have a file(s) along the lines of /proc/sys/kernel/
randomize_va_space that turn features off for that Container. The
default settings after Container creation would be a complete in-
kernel checkpoint/migration.
thanks,
Joe
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <3A99F254-E9B3-484B-85B0-29023ADA04C4-ccALPSaRSA5Wk0Htik3J/w@public.gmane.org>
@ 2008-08-07 9:25 ` Louis Rilling
0 siblings, 0 replies; 37+ messages in thread
From: Louis Rilling @ 2008-08-07 9:25 UTC (permalink / raw)
To: Joseph Ruscio; +Cc: Linux Containers
[-- Attachment #1.1: Type: text/plain, Size: 3176 bytes --]
On Wed, Aug 06, 2008 at 08:41:10AM -0700, Joseph Ruscio wrote:
>
> On Aug 5, 2008, at 9:20 AM, Oren Laadan wrote:
>> Eh... and, yes, live migration :)
>
> User-space live migration of a "batch" process e.g. one taking place in
> an MPI job is quite trivial. User-space live migration of something like
> a database is not that hard assuming you have a cooperative load
> balancer or proxy on the front end.
Hm, this means modifying the MPI run-time, right? Especially the ones relying on
daemons on each node (like LAM implementation, and MPI2 specification IIRC).
Anyway, this is probably not an issue, since most high-end HPC systems come with
their own customized MPI implementation.
>
> I'm not advocating for implementing this in user-space. I am in complete
> agreement that this effort should result in code that completely
> checkpoints a Container in the kernel. My question was whether there are
> situations where it would be advantageous for user-space to have the
> option of instructing/hinting the kernel to ignore certain resources that
> it would handle itself. Most of the use-cases I'm thinking of come from
> the different styles of implementations I've seen in the HPC space, where
> our implementation (and a lot of others) are focused.
>
> MPI codes require coordination between all the different processes
> taking part to ensure that the checkpoints are globally consistent. MPI
> implementations that run on hardware such as Infiniband would most
> likely want the container checkpointing to ignore all of the pinned
> memory associated with the RDMA operations so that the coordination and
> recreation of MPI communicator state could be handled in user-space. When
> working with inflexible process checkpointers, MPI coordination routines
> often must completely tear down all communicator state prior to invoking
> the checkpoint, and then recreate all the communicators after the
> checkpoint. On very large scale jobs, this is expensive.
>
> As another example HPC applications can create local scratch files of
> several GB in /tmp. It may not be necessary to migrate these files, but
> if user-space has no way to mark a particular file, "local files", or
> files in general as being ignored, then we'll have to copy these during a
> migration or a checkpoint.
Definitely agree with you here. This is the kind of use-case we will study in
Kerrighed. (Actually the project is centered on supporting a petaflopic
application, with help from Kerrighed to tolerate failures).
>
> I don't suppose anyone is attending Linuxworld in San Francisco this
> week? I'd be more than happy to grab a coffee and talk about some of
> this. I stopped by the OpenVZ booth but none of the devs are around.
Not me, sorry :) However, whatever requirements you can describe are interesting
to us. They can surely help in designing a most useful checkpoint/restart
mechanism.
Thanks,
Louis
--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes
[-- Attachment #1.2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]
[-- Attachment #2: Type: text/plain, Size: 206 bytes --]
_______________________________________________
Containers mailing list
Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
https://lists.linux-foundation.org/mailman/listinfo/containers
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <FE4D936E-06F1-45D2-8E7C-85D87149BDC0-ccALPSaRSA5Wk0Htik3J/w@public.gmane.org>
@ 2008-08-07 9:29 ` Louis Rilling
0 siblings, 0 replies; 37+ messages in thread
From: Louis Rilling @ 2008-08-07 9:29 UTC (permalink / raw)
To: Joseph Ruscio; +Cc: Linux Containers, Dave Hansen
[-- Attachment #1.1: Type: text/plain, Size: 2891 bytes --]
On Wed, Aug 06, 2008 at 09:15:46AM -0700, Joseph Ruscio wrote:
>
> On Aug 5, 2008, at 9:23 AM, Dave Hansen wrote:
>
>> On Mon, 2008-08-04 at 20:51 -0700, Joseph Ruscio wrote:
>>> It might be desirable for the checkpointing implementation to be
>>> modular enough that a userspace application or library could select
>>> to
>>> handle certain resources on their own. Memory is the primary one that
>>> comes to mind.
>>
>> How would you propose making it modular?
>>
>> -- Dave
>>
>
>
> Well it seems to me that the initial focus here is in live migration of
> traditional enterprise applications, e.g. databases, app-servers, etc. I
> think this is the right focus given how much utility the general
> enterprise is finding in capabilities like VMotion. Providing this
> mobility to applications without the overhead of traditional VM's would
> be very valuable.
>
> On the other hand, I've been primarily focused on checkpointing large-
> scale MPI jobs to provide fault tolerance, and that use-case is somewhat
> different than the live-migration one. These checkpoints have huge RAM
> footprints (in-core checkpointing is not an option), require
> coordination across large numbers of servers, some number of open files
> on an enormous parallel filesystem, and some scratch files open on the
> local disk/ramdisk. They generally have very simple process trees with
> one process per core, or one process with a thread for each core.
>
> To support these kinds of jobs, one would ideally instruct the Container
> checkpointer to ignore network resources, dynamically allocated private
> memory, and the contents of open files. You'd be relying on the Container
> checkpointer to recreate processes, open file descriptors, threads,
> thread synchronization primitives, IPC mechanisms (including shm).
>
> As far as the mechanism is concerned, I'd defer to the more experienced
> kernel developers here. I assume that passing a bitmask of flags as an
> argument into the checkpoint syscall would be frowned upon, and anyways
> redundant, as it's unlikely that the mask would change within a container
> from checkpoint to checkpoint. If each container is going to have a
> CGroup filesystem directory, then we could have a file(s) along the lines
> of /proc/sys/kernel/randomize_va_space that turn features off for that
> Container. The default settings after Container creation would be a
> complete in-kernel checkpoint/migration.
Did you think about mechanisms/interfaces making the kernel's checkpointing
sub-system and the application/run-time interact to efficiently build the
checkpoint image and restart from it?
Louis
--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes
[-- Attachment #1.2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]
[-- Attachment #2: Type: text/plain, Size: 206 bytes --]
_______________________________________________
Containers mailing list
Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
https://lists.linux-foundation.org/mailman/listinfo/containers
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
2008-08-05 16:23 ` Dave Hansen
2008-08-06 16:15 ` Joseph Ruscio
@ 2008-08-08 17:20 ` Joseph Ruscio
[not found] ` <03CE5BD3-E84A-4617-93BC-722ECB846C63-ccALPSaRSA5Wk0Htik3J/w@public.gmane.org>
1 sibling, 1 reply; 37+ messages in thread
From: Joseph Ruscio @ 2008-08-08 17:20 UTC (permalink / raw)
To: Dave Hansen; +Cc: Linux Containers, Louis.Rilling-aw0BnHfMbSpBDgjK7y7TUQ
On Aug 5, 2008, at 9:23 AM, Dave Hansen wrote:
> How would you propose making it modular?
Dave,
What about re-using the madvise() interface for this? Adding a flag
along the lines of MADV_DONTCHECKPOINT? I could probably work up a
patch to Oren's that removes these from the checkpointed ranges if
people think that's feasible.
thanks,
Joe
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [RFC][PATCH 2/2] CR: handle a single task with private memory maps
[not found] ` <03CE5BD3-E84A-4617-93BC-722ECB846C63-ccALPSaRSA5Wk0Htik3J/w@public.gmane.org>
@ 2008-08-08 17:24 ` Dave Hansen
0 siblings, 0 replies; 37+ messages in thread
From: Dave Hansen @ 2008-08-08 17:24 UTC (permalink / raw)
To: Joseph Ruscio; +Cc: Linux Containers, Louis.Rilling-aw0BnHfMbSpBDgjK7y7TUQ
On Fri, 2008-08-08 at 10:20 -0700, Joseph Ruscio wrote:
> On Aug 5, 2008, at 9:23 AM, Dave Hansen wrote:
> > How would you propose making it modular?
> What about re-using the madvise() interface for this? Adding a flag
> along the lines of MADV_DONTCHECKPOINT? I could probably work up a
> patch to Oren's that removes these from the checkpointed ranges if
> people think that's feasible.
Seems reasonable, but I think it is jumping the gun a little bit. There
are plenty of features that will get us quicker, more efficient
checkpoints, but let's get *some* checkpointing in the kernel, first. :)
-- Dave
^ permalink raw reply [flat|nested] 37+ messages in thread
end of thread, other threads:[~2008-08-08 17:24 UTC | newest]
Thread overview: 37+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2008-07-30 3:27 [RFC][PATCH 2/2] CR: handle a single task with private memory maps Oren Laadan
[not found] ` <Pine.LNX.4.64.0807292325290.9868-CXF6herHY6ykSYb+qCZC/1i27PF6R63G9nwVQlTi/Pw@public.gmane.org>
2008-07-30 4:51 ` KOSAKI Motohiro
[not found] ` <20080730132257.9DF2.KOSAKI.MOTOHIRO-+CUm20s59erQFUHtdCDX3A@public.gmane.org>
2008-07-30 18:22 ` Oren Laadan
2008-07-30 20:58 ` Dave Hansen
2008-07-30 22:07 ` Serge E. Hallyn
[not found] ` <20080730220752.GA3518-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2008-07-30 22:20 ` Oren Laadan
[not found] ` <4890E930.9090204-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2008-07-31 13:57 ` Louis Rilling
[not found] ` <20080731135703.GC22403-Hu8+6S1rdjywhHL9vcZdMVaTQe2KTcn/@public.gmane.org>
2008-07-31 15:09 ` Oren Laadan
[not found] ` <4891D5C2.8090000-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2008-07-31 15:58 ` Louis Rilling
[not found] ` <20080731155856.GH22403-Hu8+6S1rdjywhHL9vcZdMVaTQe2KTcn/@public.gmane.org>
2008-07-31 16:28 ` Oren Laadan
[not found] ` <4891E849.1050701-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2008-07-31 17:50 ` Louis Rilling
[not found] ` <20080731175058.GI22403-Hu8+6S1rdjywhHL9vcZdMVaTQe2KTcn/@public.gmane.org>
2008-07-31 19:12 ` Oren Laadan
[not found] ` <48920EA0.1060608-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2008-08-01 10:26 ` Louis Rilling
[not found] ` <20080801102600.GJ22403-Hu8+6S1rdjywhHL9vcZdMVaTQe2KTcn/@public.gmane.org>
2008-08-01 14:15 ` Oren Laadan
[not found] ` <48931A7E.1040302-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2008-08-01 18:00 ` Louis Rilling
[not found] ` <20080801180038.GL22403-Hu8+6S1rdjywhHL9vcZdMVaTQe2KTcn/@public.gmane.org>
2008-08-01 18:51 ` Oren Laadan
[not found] ` <48935B4D.7070302-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2008-08-04 10:16 ` Louis Rilling
2008-08-05 2:37 ` Oren Laadan
[not found] ` <4897BCE0.1080508-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2008-08-05 3:51 ` Joseph Ruscio
[not found] ` <1FA56146-7C30-4C36-982D-A50AA8BC8392-ccALPSaRSA5Wk0Htik3J/w@public.gmane.org>
2008-08-05 9:19 ` Louis Rilling
2008-08-05 16:20 ` Oren Laadan
[not found] ` <48987DE7.3060408-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2008-08-06 15:41 ` Joseph Ruscio
[not found] ` <3A99F254-E9B3-484B-85B0-29023ADA04C4-ccALPSaRSA5Wk0Htik3J/w@public.gmane.org>
2008-08-07 9:25 ` Louis Rilling
2008-08-05 16:23 ` Dave Hansen
2008-08-06 16:15 ` Joseph Ruscio
[not found] ` <FE4D936E-06F1-45D2-8E7C-85D87149BDC0-ccALPSaRSA5Wk0Htik3J/w@public.gmane.org>
2008-08-07 9:29 ` Louis Rilling
2008-08-08 17:20 ` Joseph Ruscio
[not found] ` <03CE5BD3-E84A-4617-93BC-722ECB846C63-ccALPSaRSA5Wk0Htik3J/w@public.gmane.org>
2008-08-08 17:24 ` Dave Hansen
2008-08-05 9:32 ` Louis Rilling
2008-07-31 21:25 ` Serge E. Hallyn
[not found] ` <20080730161535.GB22403@hawkmoon.kerlabs.com>
[not found] ` <20080730161535.GB22403-Hu8+6S1rdjywhHL9vcZdMVaTQe2KTcn/@public.gmane.org>
2008-07-30 18:27 ` Oren Laadan
[not found] ` <4890B2A8.8010808-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2008-07-31 14:08 ` Louis Rilling
[not found] ` <20080731140844.GE22403-Hu8+6S1rdjywhHL9vcZdMVaTQe2KTcn/@public.gmane.org>
2008-07-31 14:44 ` Oren Laadan
-- strict thread matches above, loose matches on Subject: below --
2008-07-30 16:52 Serge E. Hallyn
[not found] ` <20080730165249.GA23802-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2008-07-30 17:40 ` Dave Hansen
2008-07-31 13:59 ` Louis Rilling
[not found] ` <20080731135910.GD22403-Hu8+6S1rdjywhHL9vcZdMVaTQe2KTcn/@public.gmane.org>
2008-07-31 14:14 ` Serge E. Hallyn
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.