- * [RFC v16][PATCH 01/43] c/r: extend arch_setup_additional_pages()
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
@ 2009-05-27 17:32 ` Oren Laadan
  2009-05-27 17:32 ` [RFC v16][PATCH 02/43] c/r: make file_pos_read/write() public Oren Laadan
                   ` (33 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
From: Alexey Dobriyan <adobriyan@gmail.com>
Add "start" argument, to request to map vDSO to a specific place,
and fail the operation if not.
This is useful for restart(2) to ensure that memory layout is restore
exactly as needed.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
 arch/powerpc/include/asm/elf.h     |    1 +
 arch/powerpc/kernel/vdso.c         |   11 ++++++++++-
 arch/s390/include/asm/elf.h        |    2 +-
 arch/s390/kernel/vdso.c            |   13 ++++++++++++-
 arch/sh/include/asm/elf.h          |    1 +
 arch/sh/kernel/vsyscall/vsyscall.c |    2 +-
 arch/x86/include/asm/elf.h         |    3 ++-
 arch/x86/vdso/vdso32-setup.c       |    9 +++++++--
 arch/x86/vdso/vma.c                |    9 +++++++--
 fs/binfmt_elf.c                    |    2 +-
 10 files changed, 43 insertions(+), 10 deletions(-)
diff --git a/arch/powerpc/include/asm/elf.h b/arch/powerpc/include/asm/elf.h
index d6b4a12..3946e01 100644
--- a/arch/powerpc/include/asm/elf.h
+++ b/arch/powerpc/include/asm/elf.h
@@ -271,6 +271,7 @@ extern int ucache_bsize;
 #define ARCH_HAS_SETUP_ADDITIONAL_PAGES
 struct linux_binprm;
 extern int arch_setup_additional_pages(struct linux_binprm *bprm,
+				       unsigned long start,
 				       int uses_interp);
 #define VDSO_AUX_ENT(a,b) NEW_AUX_ENT(a,b);
 
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index ad06d5c..48beff6 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -184,7 +184,8 @@ static void dump_vdso_pages(struct vm_area_struct * vma)
  * This is called from binfmt_elf, we create the special vma for the
  * vDSO and insert it into the mm struct tree
  */
-int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+int arch_setup_additional_pages(struct linux_binprm *bprm,
+				unsigned long start, int uses_interp)
 {
 	struct mm_struct *mm = current->mm;
 	struct page **vdso_pagelist;
@@ -211,6 +212,10 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 	vdso_base = VDSO32_MBASE;
 #endif
 
+	/* in case restart(2) mandates a specific location */
+	if (start)
+		vdso_base = start;
+
 	current->mm->context.vdso_base = 0;
 
 	/* vDSO has a problem and was disabled, just don't "enable" it for the
@@ -234,6 +239,10 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 		goto fail_mmapsem;
 	}
 
+	/* for restart(2), double check that we got we asked for */
+	if (start && vdso_base != start)
+		goto fail_mmapsem;
+
 	/*
 	 * our vma flags don't have VM_WRITE so by default, the process isn't
 	 * allowed to write those pages.
diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h
index 74d0bbb..54235bc 100644
--- a/arch/s390/include/asm/elf.h
+++ b/arch/s390/include/asm/elf.h
@@ -205,6 +205,6 @@ do {									    \
 struct linux_binprm;
 
 #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
-int arch_setup_additional_pages(struct linux_binprm *, int);
+int arch_setup_additional_pages(struct linux_binprm *, unsigned long, int);
 
 #endif
diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c
index 89b2e7f..bab43b3 100644
--- a/arch/s390/kernel/vdso.c
+++ b/arch/s390/kernel/vdso.c
@@ -182,7 +182,8 @@ static void vdso_init_cr5(void)
  * This is called from binfmt_elf, we create the special vma for the
  * vDSO and insert it into the mm struct tree
  */
-int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+int arch_setup_additional_pages(struct linux_binprm *bprm,
+				unsigned long start, int uses_interp)
 {
 	struct mm_struct *mm = current->mm;
 	struct page **vdso_pagelist;
@@ -213,6 +214,10 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 	vdso_pages = vdso32_pages;
 #endif
 
+	/* in case restart(2) mandates a specific location */
+	if (start)
+		vdso_base = start;
+
 	/*
 	 * vDSO has a problem and was disabled, just don't "enable" it for
 	 * the process
@@ -235,6 +240,12 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 		goto out_up;
 	}
 
+	/* for restart(2), double check that we got we asked for */
+	if (start && vdso_base != start) {
+		rc = -EINVAL;
+		goto out_up;
+	}
+
 	/*
 	 * our vma flags don't have VM_WRITE so by default, the process
 	 * isn't allowed to write those pages.
diff --git a/arch/sh/include/asm/elf.h b/arch/sh/include/asm/elf.h
index ccb1d93..6c27b1f 100644
--- a/arch/sh/include/asm/elf.h
+++ b/arch/sh/include/asm/elf.h
@@ -202,6 +202,7 @@ do {									\
 #define ARCH_HAS_SETUP_ADDITIONAL_PAGES
 struct linux_binprm;
 extern int arch_setup_additional_pages(struct linux_binprm *bprm,
+				       unsigned long start,
 				       int uses_interp);
 
 extern unsigned int vdso_enabled;
diff --git a/arch/sh/kernel/vsyscall/vsyscall.c b/arch/sh/kernel/vsyscall/vsyscall.c
index 3f7e415..64c70e5 100644
--- a/arch/sh/kernel/vsyscall/vsyscall.c
+++ b/arch/sh/kernel/vsyscall/vsyscall.c
@@ -59,7 +59,7 @@ int __init vsyscall_init(void)
 }
 
 /* Setup a VMA at program startup for the vsyscall page */
-int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+int arch_setup_additional_pages(struct linux_binprm *bprm, unsigned long start, int uses_interp)
 {
 	struct mm_struct *mm = current->mm;
 	unsigned long addr;
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 83c1bc8..a4398c8 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -336,9 +336,10 @@ struct linux_binprm;
 
 #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
 extern int arch_setup_additional_pages(struct linux_binprm *bprm,
+				       unsigned long start,
 				       int uses_interp);
 
-extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
+extern int syscall32_setup_pages(struct linux_binprm *, unsigned long start, int exstack);
 #define compat_arch_setup_additional_pages	syscall32_setup_pages
 
 extern unsigned long arch_randomize_brk(struct mm_struct *mm);
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 1241f11..9c72a23 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -310,7 +310,8 @@ int __init sysenter_setup(void)
 }
 
 /* Setup a VMA at program startup for the vsyscall page */
-int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+int arch_setup_additional_pages(struct linux_binprm *bprm,
+				unsigned long start, int uses_interp)
 {
 	struct mm_struct *mm = current->mm;
 	unsigned long addr;
@@ -331,13 +332,17 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 	if (compat)
 		addr = VDSO_HIGH_BASE;
 	else {
-		addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
+		addr = get_unmapped_area(NULL, start, PAGE_SIZE, 0, 0);
 		if (IS_ERR_VALUE(addr)) {
 			ret = addr;
 			goto up_fail;
 		}
 	}
 
+	/* for restart(2), double check that we got we asked for */
+	if (start && addr != start)
+		goto up_fail;
+
 	if (compat_uses_vma || !compat) {
 		/*
 		 * MAYWRITE to allow gdb to COW and set breakpoints
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 7133cdf..81fce6d 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -98,7 +98,8 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
 
 /* Setup a VMA at program startup for the vsyscall page.
    Not called for compat tasks */
-int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+int arch_setup_additional_pages(struct linux_binprm *bprm,
+				unsigned long start, int uses_interp)
 {
 	struct mm_struct *mm = current->mm;
 	unsigned long addr;
@@ -108,13 +109,17 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 		return 0;
 
 	down_write(&mm->mmap_sem);
-	addr = vdso_addr(mm->start_stack, vdso_size);
+	addr = start ? : vdso_addr(mm->start_stack, vdso_size);
 	addr = get_unmapped_area(NULL, addr, vdso_size, 0, 0);
 	if (IS_ERR_VALUE(addr)) {
 		ret = addr;
 		goto up_fail;
 	}
 
+	/* for restart(2), double check that we got we asked for */
+	if (start && addr != start)
+		goto up_fail;
+
 	ret = install_special_mapping(mm, addr, vdso_size,
 				      VM_READ|VM_EXEC|
 				      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 40381df..aa9a802 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -945,7 +945,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	set_binfmt(&elf_format);
 
 #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
-	retval = arch_setup_additional_pages(bprm, !!elf_interpreter);
+	retval = arch_setup_additional_pages(bprm, 0, !!elf_interpreter);
 	if (retval < 0) {
 		send_sig(SIGKILL, current, 0);
 		goto out;
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 02/43] c/r: make file_pos_read/write() public
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
  2009-05-27 17:32 ` [RFC v16][PATCH 01/43] c/r: extend arch_setup_additional_pages() Oren Laadan
@ 2009-05-27 17:32 ` Oren Laadan
  2009-05-27 17:32 ` [RFC v16][PATCH 03/43] c/r: create syscalls: sys_checkpoint, sys_restart Oren Laadan
                   ` (32 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
These two are used in the next patch when calling vfs_read/write()
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
 fs/read_write.c    |   10 ----------
 include/linux/fs.h |   10 ++++++++++
 2 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/fs/read_write.c b/fs/read_write.c
index 9d1e76b..ed63ea3 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -359,16 +359,6 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
 
 EXPORT_SYMBOL(vfs_write);
 
-static inline loff_t file_pos_read(struct file *file)
-{
-	return file->f_pos;
-}
-
-static inline void file_pos_write(struct file *file, loff_t pos)
-{
-	file->f_pos = pos;
-}
-
 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 {
 	struct file *file;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3b534e5..9c4348a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1546,6 +1546,16 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 				struct iovec *fast_pointer,
 				struct iovec **ret_pointer);
 
+static inline loff_t file_pos_read(struct file *file)
+{
+	return file->f_pos;
+}
+
+static inline void file_pos_write(struct file *file, loff_t pos)
+{
+	file->f_pos = pos;
+}
+
 extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
 extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
 extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 03/43] c/r: create syscalls: sys_checkpoint, sys_restart
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
  2009-05-27 17:32 ` [RFC v16][PATCH 01/43] c/r: extend arch_setup_additional_pages() Oren Laadan
  2009-05-27 17:32 ` [RFC v16][PATCH 02/43] c/r: make file_pos_read/write() public Oren Laadan
@ 2009-05-27 17:32 ` Oren Laadan
  2009-05-27 17:32 ` [RFC v16][PATCH 04/43] c/r: documentation Oren Laadan
                   ` (31 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
Create trivial sys_checkpoint and sys_restore system calls. They will
enable to checkpoint and restart an entire container, to and from a
checkpoint image file descriptor.
The syscalls take a pid, a file descriptor (for the image file) and
flags as arguments. The pid identifies the top-most (root) task in the
process tree, e.g. the container init: for sys_checkpoint the first
argument identifies the pid of the target container/subtree; for
sys_restart it will identify the pid of restarting root task.
A checkpoint, much like a process coredump, dumps the state of multiple
processes at once, including the state of the container. The checkpoint
image is written to (and read from) the file descriptor directly from
the kernel. This way the data is generated and then pushed out naturally
as resources and tasks are scanned to save their state. This is the
approach taken by, e.g., Zap and OpenVZ.
By using a return value and not a file descriptor, we can distinguish
between a return from checkpoint, a return from restart (in case of a
checkpoint that includes self, i.e. a task checkpointing its own
container, or itself), and an error condition, in a manner analogous
to a fork() call.
We don't use copy_from_user()/copy_to_user() because it requires
holding the entire image in user space, and does not make sense for
restart.  Also, we don't use a pipe, pseudo-fs file and the like,
because they work by generating data on demand as the user pulls it
(unless the entire image is buffered in the kernel) and would require
more complex logic.  They also would significantly complicate
checkpoint that includes self.
Changelog[v16]:
  - Change sys_restart() first argument to be 'pid_t pid'
Changelog[v14]:
  - Change CONFIG_CHEKCPOINT_RESTART to CONFIG_CHECKPOINT (Ingo)
  - Remove line 'def_bool n' (default is already 'n')
  - Add CHECKPOINT_SUPPORT in Kconfig (Nathan Lynch)
Changelog[v5]:
  - Config is 'def_bool n' by default
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
---
 arch/x86/Kconfig                   |    4 +++
 arch/x86/include/asm/unistd_32.h   |    2 +
 arch/x86/kernel/syscall_table_32.S |    2 +
 checkpoint/Kconfig                 |   14 ++++++++++++
 checkpoint/Makefile                |    5 ++++
 checkpoint/sys.c                   |   41 ++++++++++++++++++++++++++++++++++++
 include/linux/syscalls.h           |    2 +
 init/Kconfig                       |    2 +
 kernel/sys_ni.c                    |    4 +++
 9 files changed, 76 insertions(+), 0 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a6efe0a..2891a26 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -77,6 +77,10 @@ config STACKTRACE_SUPPORT
 config HAVE_LATENCYTOP_SUPPORT
 	def_bool y
 
+config CHECKPOINT_SUPPORT
+	bool
+	default y if X86_32
+
 config FAST_CMPXCHG_LOCAL
 	bool
 	default y
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 6e72d74..48557e1 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -340,6 +340,8 @@
 #define __NR_inotify_init1	332
 #define __NR_preadv		333
 #define __NR_pwritev		334
+#define __NR_checkpoint		335
+#define __NR_restart		336
 
 #ifdef __KERNEL__
 
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index ff5c873..e70b7ee 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -334,3 +334,5 @@ ENTRY(sys_call_table)
 	.long sys_inotify_init1
 	.long sys_preadv
 	.long sys_pwritev
+	.long sys_checkpoint
+	.long sys_restart
diff --git a/checkpoint/Kconfig b/checkpoint/Kconfig
new file mode 100644
index 0000000..1761b0a
--- /dev/null
+++ b/checkpoint/Kconfig
@@ -0,0 +1,14 @@
+# Architectures should define CHECKPOINT_SUPPORT when they have
+# implemented the hooks for processor state etc. needed by the
+# core checkpoint/restart code.
+
+config CHECKPOINT
+	bool "Enable checkpoint/restart (EXPERIMENTAL)"
+	depends on CHECKPOINT_SUPPORT && EXPERIMENTAL
+	help
+	  Application checkpoint/restart is the ability to save the
+	  state of a running application so that it can later resume
+	  its execution from the time at which it was checkpointed.
+
+	  Turning this option on will enable checkpoint and restart
+	  functionality in the kernel.
diff --git a/checkpoint/Makefile b/checkpoint/Makefile
new file mode 100644
index 0000000..8a32c6f
--- /dev/null
+++ b/checkpoint/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for linux checkpoint/restart.
+#
+
+obj-$(CONFIG_CHECKPOINT) += sys.o
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
new file mode 100644
index 0000000..9d4caff
--- /dev/null
+++ b/checkpoint/sys.c
@@ -0,0 +1,41 @@
+/*
+ *  Generic container checkpoint-restart
+ *
+ *  Copyright (C) 2008 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/syscalls.h>
+
+/**
+ * sys_checkpoint - checkpoint a container
+ * @pid: pid of the container init(1) process
+ * @fd: file to which dump the checkpoint image
+ * @flags: checkpoint operation flags
+ *
+ * Returns positive identifier on success, 0 when returning from restart
+ * or negative value on error
+ */
+SYSCALL_DEFINE3(checkpoint, pid_t, pid, int, fd, unsigned long, flags)
+{
+	return -ENOSYS;
+}
+
+/**
+ * sys_restart - restart a container
+ * @crid: checkpoint image identifier
+ * @fd: file from which read the checkpoint image
+ * @flags: restart operation flags
+ *
+ * Returns negative value on error, or otherwise returns in the realm
+ * of the original checkpoint
+ */
+SYSCALL_DEFINE3(restart, pid_t, pid, int, fd, unsigned long, flags)
+{
+	return -ENOSYS;
+}
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 3052084..5dc3f9a 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -752,6 +752,8 @@ asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int,
 			  size_t);
 asmlinkage long sys_pipe2(int __user *, int);
 asmlinkage long sys_pipe(int __user *);
+asmlinkage long sys_checkpoint(pid_t pid, int fd, unsigned long flags);
+asmlinkage long sys_restart(int crid, int fd, unsigned long flags);
 
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
diff --git a/init/Kconfig b/init/Kconfig
index 7be4d38..adb4260 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1042,6 +1042,8 @@ config SLOW_WORK
 
 	  See Documentation/slow-work.txt.
 
+source "checkpoint/Kconfig"
+
 endmenu		# General setup
 
 config HAVE_GENERIC_DMA_COHERENT
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 27dad29..e9e749d 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -175,3 +175,7 @@ cond_syscall(compat_sys_timerfd_settime);
 cond_syscall(compat_sys_timerfd_gettime);
 cond_syscall(sys_eventfd);
 cond_syscall(sys_eventfd2);
+
+/* checkpoint/restart */
+cond_syscall(sys_checkpoint);
+cond_syscall(sys_restart);
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 04/43] c/r: documentation
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (2 preceding siblings ...)
  2009-05-27 17:32 ` [RFC v16][PATCH 03/43] c/r: create syscalls: sys_checkpoint, sys_restart Oren Laadan
@ 2009-05-27 17:32 ` Oren Laadan
  2009-05-27 17:32 ` [RFC v16][PATCH 05/43] c/r: basic infrastructure for checkpoint/restart Oren Laadan
                   ` (30 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
Covers application checkpoint/restart, overall design, interfaces,
usage, shared objects, and and checkpoint image format.
Changelog[v16]:
  - Update documentation
  - Unify into readme.txt and usage.txt
Changelog[v14]:
  - Discard the 'h.parent' field
  - New image format (shared objects appear before they are referenced
    unless they are compound)
Changelog[v8]:
  - Split into multiple files in Documentation/checkpoint/...
  - Extend documentation, fix typos and comments from feedback
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
---
 Documentation/checkpoint/ckpt.c     |   32 ++++
 Documentation/checkpoint/readme.txt |  349 +++++++++++++++++++++++++++++++++++
 Documentation/checkpoint/rstr.c     |   20 ++
 Documentation/checkpoint/self.c     |   57 ++++++
 Documentation/checkpoint/test.c     |   48 +++++
 Documentation/checkpoint/usage.txt  |  192 +++++++++++++++++++
 checkpoint/sys.c                    |    2 +-
 7 files changed, 699 insertions(+), 1 deletions(-)
diff --git a/Documentation/checkpoint/ckpt.c b/Documentation/checkpoint/ckpt.c
new file mode 100644
index 0000000..094408c
--- /dev/null
+++ b/Documentation/checkpoint/ckpt.c
@@ -0,0 +1,32 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+int main(int argc, char *argv[])
+{
+	pid_t pid;
+	int ret;
+
+	if (argc != 2) {
+		printf("usage: ckpt PID\n");
+		exit(1);
+	}
+
+	pid = atoi(argv[1]);
+	if (pid <= 0) {
+		printf("invalid pid\n");
+		exit(1);
+	}
+
+	ret = syscall(__NR_checkpoint, pid, STDOUT_FILENO, 0);
+
+	if (ret < 0)
+		perror("checkpoint");
+	else
+		printf("checkpoint id %d\n", ret);
+
+	return (ret > 0 ? 0 : 1);
+}
+
diff --git a/Documentation/checkpoint/readme.txt b/Documentation/checkpoint/readme.txt
new file mode 100644
index 0000000..abc54c1
--- /dev/null
+++ b/Documentation/checkpoint/readme.txt
@@ -0,0 +1,349 @@
+
+	      Checkpoint-Restart support in the Linux kernel
+	==========================================================
+
+Copyright (C) 2008 Oren Laadan
+
+Author:		Oren Laadan <orenl@cs.columbia.edu>
+
+License:	The GNU Free Documentation License, Version 1.2
+		(dual licensed under the GPL v2)
+
+Reviewers:	Serge Hallyn <serue@us.ibm.com>
+		Dave Hansen <dave@linux.vnet.ibm.com>
+
+
+Introduction
+============
+
+Application checkpoint/restart [C/R] is the ability to save the state
+of a running application so that it can later resume its execution
+from the time at which it was checkpointed. An application can be
+migrated by checkpointing it on one machine and restarting it on
+another. C/R can provide many potential benefits:
+
+* Failure recovery: by rolling back to a previous checkpoint
+
+* Improved response time: by restarting applications from checkpoints
+  instead of from scratch.
+
+* Improved system utilization: by suspending long running CPU
+  intensive jobs and resuming them when load decreases.
+
+* Fault resilience: by migrating applications off faulty hosts.
+
+* Dynamic load balancing: by migrating applications to less loaded
+  hosts.
+
+* Improved service availability and administration: by migrating
+  applications before host maintenance so that they continue to run
+  with minimal downtime
+
+* Time-travel: by taking periodic checkpoints and restarting from
+  any previous checkpoint.
+
+Compared to hypervisor approaches, application C/R is more lightweight
+since it need only save the state associated with applications, while
+operating system data structures (e.g. buffer cache, drivers state
+and the like) are uninteresting.
+
+
+Overall design
+==============
+
+Checkpoint and restart are done in the kernel as much as possible.
+Two new system calls are introduced to provide C/R: sys_checkpoint()
+and sys_restart(). They both operate on a process tree (hierarchy),
+either a whole container or a subtree of a container.
+
+Checkpointing entire containers ensures that there are no dependencies
+on anything outside the container, which guarantees that a matching
+restart will succeed (assuming that the file system state remains
+consistent). However, it requires that users will always run the tasks
+that they wish to checkpoint inside containers. This is ideal for,
+e.g., private virtual servers and the like.
+
+In contrast, when checkpointing a subtree of a container it is up to
+the user to ensure that dependencies either don't exist or can be
+safely ignored. This is useful, for instance, for HPC scenarios or
+even a user that would like to periodically checkpoint a long-running
+batch job.
+
+An additional system call, a la madvise(), is planned, so that tasks
+can advise the kernel how to handle specific resources. For instance,
+a task could ask to skip a memory area at checkpoint to save space,
+or to use a preset file descriptor at restart instead of restoring it
+from the checkpoint image. It will provide the flexibility that is
+particularly useful to address the needs of a diverse crowd of users
+and use-cases.
+
+Syscall sys_checkpoint() is given a pid that indicates the top of the
+hierarchy, a file descriptor to store the image, and flags. The code
+serializes internal user- and kernel-state and writes it out to the
+file descriptor. The resulting image is stream-able. The processes are
+expected to be frozen for the duration of the checkpoint.
+
+In general, a checkpoint consists of 5 steps:
+1. Pre-dump
+2. Freeze the container/subtree
+3. Save tasks' and kernel state		<-- sys_checkpoint()
+4. Thaw (or kill) the container/subtree
+5. Post-dump
+
+Step 3 is done by calling sys_checkpoint(). Steps 1 and 5 are an
+optimization to reduce application downtime. In particular, "pre-dump"
+works before freezing the container, e.g. the pre-copy for live
+migration, and "post-dump" works after the container resumes
+execution, e.g. write-back the data to secondary storage.
+
+The kernel exports a relatively opaque 'blob' of data to userspace
+which can then be handed to the new kernel at restart time.  The
+'blob' contains data and state of select portions of kernel structures
+such as VMAs and mm_structs, as well as copies of the actual memory
+that the tasks use. Any changes in this blob's format between kernel
+revisions can be handled by an in-userspace conversion program.
+
+To restart, userspace first create a process hierarchy that matches
+that of the checkpoint, and each task calls sys_restart(). The syscall
+reads the saved kernel state from a file descriptor, and re-creates
+the resources that the tasks need to resume execution. The restart
+code is executed by each task that is restored in the new hierarchy to
+reconstruct its own state.
+
+In general, a restart consists of 3 steps:
+1. Create hierarchy
+2. Restore tasks' and kernel state	<-- sys_restart()
+3. Resume userspace (or freeze tasks)
+
+Because the process hierarchy, during restart in created in userspace,
+the restarting tasks have the flexibility to prepare before calling
+sys_restart().
+
+
+Checkpoint image format
+=======================
+
+The checkpoint image format is built of records that consist of a
+pre-header identifying its contents, followed by a payload. This
+format allow userspace tools to easily parse and skip through the
+image without requiring intimate knowledge of the data. It will also
+be handy to enable parallel checkpointing in the future where multiple
+threads interleave data from multiple processes into a single stream.
+
+The pre-header is defined by 'struct ckpt_hdr' as follows: @type
+identifies the type of the payload, @len tells its length in bytes
+including the pre-header.
+
+struct ckpt_hdr {
+	__s32 type;
+	__s32 len;
+};
+
+The pre-header must be the first component in all other headers. For
+instance, the task data is saved in 'struct ckpt_hdr_task', which
+looks something like this:
+
+struct ckpt_hdr_task {
+	struct ckpt_hdr h;
+	__u32 pid;
+	...
+};
+
+THE IMAGE FORMAT IS EXPECTED TO CHANGE over time as more features are
+supported, or as existing features change in the kernel and require to
+adjust their representation. Any such changes will be be handled by
+in-userspace conversion tools.
+
+The general format of the checkpoint image is as follows:
+1. Image header
+2. Task hierarchy
+3. Tasks' state
+4. Image trailer
+
+The image always begins with a general header that holds a magic
+number, an architecture identifier (little endian format), a format
+version number (@rev), followed by information about the kernel
+(currently version and UTS data). It also holds the time of the
+checkpoint and the flags given to sys_checkpoint(). This header is
+followed by an arch-specific header.
+
+The task hierarchy comes next so that userspace tools can read it
+early (even from a stream) and re-create the restarting tasks. This is
+basically an array of all checkpointed tasks, and their relationships
+(parent, siblings, threads, etc).
+
+Then the state of all tasks is saved, in the order that they appear in
+the tasks array above. For each state, we save data like task_struct,
+namespaces, open files, memory layout, memory contents, cpu state,
+signals and signal handlers, etc. For resources that are shared among
+multiple processes, we first checkpoint said resource (and only once),
+and in the task data we give a reference to it. More about shared
+resources below.
+
+Finally, the image always ends with a trailer that holds a (different)
+magic number, serving for sanity check.
+
+
+Shared objects
+==============
+
+Many resources may be shared by multiple tasks (e.g. file descriptors,
+memory address space, etc), or even have multiple references from
+other resources (e.g. a single inode that represents two ends of a
+pipe).
+
+Shared objects are tracked using a hash table (objhash) to ensure that
+they are only checkpointed or restored once. To handle a shared
+object, it is first looked up in the hash table, to determine if is
+the first encounter or a recurring appearance.  The hash table itself
+is not saved as part of the checkpoint image: it is constructed
+dynamically during both checkpoint and restart, and discarded at the
+end of the operation.
+
+During checkpoint, when a shared object is encountered for the first
+time, it is inserted to the hash table, indexed by its kernel address.
+It is assigned an identifier (@objref) in order of appearance, and
+then its state if saved. Subsequent lookups of that object in the hash
+will yield that entry, in which case only the @objref is saved, as
+opposed the entire state of the object.
+
+During restart, shared objects are indexed by their @objref as given
+during the checkpoint. On the first appearance of each shared object,
+a new resource will be created and its state restored from the image.
+Then the object is added to the hash table. Subsequent lookups of the
+same unique identifier in the hash table will yield that entry, and
+then the existing object instance is reused instead of creating
+a new one.
+
+The hash grabs a reference to each object that is inserted, and
+maintains this reference for the entire lifetime of the hash. Thus,
+it is always safe to reference an object that is stored in the hash.
+The hash is "one-way" in the sense that objects that are added are
+never deleted from the hash until the hash is discarded. This, in
+turn, happens only when the checkpoint (or restart) terminates.
+
+Shared objects are thus saved when they are first seen, and _before_
+the parent object that uses them. Therefore by the time the parent
+objects needs them, they should already be in the objhash. The one
+exception is when more than a single shared resource will be restarted
+at once (e.g. like the two ends of a pipe, or all the namespaces in an
+nsproxy). In this case the parent object is dumped first followed by
+the individual sub-resources).
+
+The checkpoint image is stream-able, meaning that restarting from it
+may not require lseek(). This is enforced at checkpoint time, by
+carefully selecting the order of shared objects, to respect the rule
+that an object is always saved before the objects that refers to it.
+
+
+Memory contents format
+======================
+
+The memory contents of a given memory address space (->mm) is dumped
+as a sequence of vma objects, represented by 'struct ckpt_hdr_vma'.
+This header details the vma properties, and a reference to a file
+(if file backed) or an inode (or shared memory) object.
+
+The vma header is followed by the actual contents - but only those
+pages that need to be saved, i.e. dirty pages. They are written in
+chunks of data, where each chunks contains a header that indicates
+that number of pages in the chunk, followed by an array of virtual
+addresses and then an array of actual page contents. The last chunk
+holds zero pages.
+
+To illustrate this, consider a single simple task with two vmas: one
+is file mapped with two dumped pages, and the other is anonymous with
+three dumped pages. The memory dump will look like this:
+
+	ckpt_hdr + ckpt_hdr_vma
+		ckpt_hdr_pgarr (nr_pages = 2)
+			addr1, addr2
+			page1, page2
+		ckpt_hdr_pgarr (nr_pages = 0)
+	ckpt_hdr + ckpt_hdr_vma
+		ckpt_hdr_pgarr (nr_pages = 3)
+		addr3, addr4, addr5
+		page3, page4, page5
+		ckpt_hdr_pgarr (nr_pages = 0)
+
+
+Error handling
+==============
+
+Both checkpoint and restart operations may fail due to a variety of
+reasons. Using a simple, single return value from the system call is
+insufficient to report the reason of a failure.
+
+Checkpoint - to provide informative status report upon failure, the
+checkpoint image may contain one (or more) error objects, 'struct
+ckpt_hdr_err'.  An error objects consists of a mandatory pre-header
+followed by a null character ('\0'), and then a string that describes
+the error. By default, if an error occurs, this will be the last
+object written to the checkpoint image.
+
+Upon failure, the caller can examine the image (e.g. with 'ckptinfo')
+and extract the detailed error message. The leading '\0' is useful if
+one wants to seek back from the end of the checkpoint image, instead
+of parsing the entire image separately.
+
+Restart - to be defined.
+
+
+Security
+========
+
+The main question is whether sys_checkpoint() and sys_restart()
+require privileged or unprivileged operation.
+
+Early versions checked capable(CAP_SYS_ADMIN) assuming that we would
+attempt to remove the need for privilege, so that all users could
+safely use it. Arnd Bergmann pointed out that it'd make more sense to
+let unprivileged users use them now, so that we'll be more careful
+about the security as patches roll in.
+
+Checkpoint: the main concern is whether a task that performs the
+checkpoint of another task has sufficient privileges to access its
+state. We address this by requiring that the checkpointer task will be
+able to ptrace the target task, by means of ptrace_may_access() with
+read mode.
+
+Restart: the main concern is that we may allow an unprivileged user to
+feed the kernel with random data. To this end, the restart works in a
+way that does not skip the usual security checks. Task credentials,
+i.e. euid, reuid, and LSM security contexts currently come from the
+caller, not the checkpoint image.  When restoration of credentials
+becomes supported, then definitely the ability of the task that calls
+sys_restore() to setresuid/setresgid to those values must be checked.
+
+Keeping the restart procedure to operate within the limits of the
+caller's credentials means that there various scenarios that cannot
+be supported. For instance, a setuid program that opened a protected
+log file and then dropped privileges will fail the restart, because
+the user won't have enough credentials to reopen the file. In these
+cases, we should probably treat restarting like inserting a kernel
+module: surely the user can cause havoc by providing incorrect data,
+but then again we must trust the root account.
+
+So that's why we don't want CAP_SYS_ADMIN required up-front. That way
+we will be forced to more carefully review each of those features.
+However, this can be controlled with a sysctl-variable.
+
+
+Kernel interfaces
+=================
+
+To checkpoint a vma, the 'struct vm_operations_struct' needs to
+provide a method ->checkpoint:
+  int checkpoint(struct ckpt_ctx *, struct vma_struct *)
+Restart requires a matching (exported) restore:
+  int restore(struct ckpt_ctx *, struct mm_struct *, struct ckpt_hdr_vma *)
+
+To checkpoint a vma, the 'struct file_operations' needs to provide
+a method ->checkpoint:
+  int checkpoint(struct ckpt_ctx *, struct file *)
+Restart requires a matching (exported) restore:
+  int restore(struct ckpt_ctx *, struct ckpt_hdr_file *)
+For most file systems, generic_file_{checkpoint,restore}() can be
+used.
+
+
diff --git a/Documentation/checkpoint/rstr.c b/Documentation/checkpoint/rstr.c
new file mode 100644
index 0000000..288209d
--- /dev/null
+++ b/Documentation/checkpoint/rstr.c
@@ -0,0 +1,20 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/syscall.h>
+
+int main(int argc, char *argv[])
+{
+	pid_t pid = getpid();
+	int ret;
+
+	ret = syscall(__NR_restart, pid, STDIN_FILENO, 0);
+	if (ret < 0)
+		perror("restart");
+
+	printf("should not reach here !\n");
+
+	return 0;
+}
+
diff --git a/Documentation/checkpoint/self.c b/Documentation/checkpoint/self.c
new file mode 100644
index 0000000..febb888
--- /dev/null
+++ b/Documentation/checkpoint/self.c
@@ -0,0 +1,57 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <math.h>
+#include <sys/syscall.h>
+
+#define OUTFILE  "/tmp/cr-test.out"
+
+int main(int argc, char *argv[])
+{
+	pid_t pid = getpid();
+	FILE *file;
+	int i, ret;
+	float a;
+
+	close(0);
+	close(2);
+
+	unlink(OUTFILE);
+	file = fopen(OUTFILE, "w+");
+	if (!file) {
+		perror("open");
+		exit(1);
+	}
+	if (dup2(0, 2) < 0) {
+		perror("dup2");
+		exit(1);
+	}
+
+	a = sqrt(2.53 * (getpid() / 1.21));
+
+	fprintf(file, "hello, world (%.2f)!\n", a);
+	fflush(file);
+
+	for (i = 0; i < 1000; i++) {
+		sleep(1);
+		/* make the fpu work ->  a = a + i/10  */
+		a = sqrt(a*a + 2*a*(i/10.0) + i*i/100.0);
+		fprintf(file, "count %d (%.2f)!\n", i, a);
+		fflush(file);
+
+		if (i == 2) {
+			ret = syscall(__NR_checkpoint, pid, STDOUT_FILENO, 0);
+			if (ret < 0) {
+				fprintf(file, "ckpt: %s\n", strerror(errno));
+				exit(2);
+			}
+			fprintf(file, "checkpoint ret: %d\n", ret);
+			fflush(file);
+		}
+	}
+
+	return 0;
+}
+
diff --git a/Documentation/checkpoint/test.c b/Documentation/checkpoint/test.c
new file mode 100644
index 0000000..1183655
--- /dev/null
+++ b/Documentation/checkpoint/test.c
@@ -0,0 +1,48 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <math.h>
+
+#define OUTFILE  "/tmp/cr-test.out"
+
+int main(int argc, char *argv[])
+{
+	FILE *file;
+	float a;
+	int i;
+
+	close(0);
+	close(1);
+	close(2);
+
+	unlink(OUTFILE);
+	file = fopen(OUTFILE, "w+");
+	if (!file) {
+		perror("open");
+		exit(1);
+	}
+	if (dup2(0, 2) < 0) {
+		perror("dup2");
+		exit(1);
+	}
+
+	a = sqrt(2.53 * (getpid() / 1.21));
+
+	fprintf(file, "hello, world (%.2f)!\n", a);
+	fflush(file);
+
+	for (i = 0; i < 1000; i++) {
+		sleep(1);
+		/* make the fpu work ->  a = a + i/10  */
+		a = sqrt(a*a + 2*a*(i/10.0) + i*i/100.0);
+		fprintf(file, "count %d (%.2f)!\n", i, a);
+		fflush(file);
+	}
+
+	fprintf(file, "world, hello (%.2f) !\n", a);
+	fflush(file);
+
+	return 0;
+}
+
diff --git a/Documentation/checkpoint/usage.txt b/Documentation/checkpoint/usage.txt
new file mode 100644
index 0000000..3432dc1
--- /dev/null
+++ b/Documentation/checkpoint/usage.txt
@@ -0,0 +1,192 @@
+
+	      How to use Checkpoint-Restart
+	=========================================
+
+
+API
+===
+
+The API consists of two new system calls:
+
+* int checkpoint(pid_t pid, int fd, unsigned long flag);
+
+ Checkpoint a (sub-)container whose root task is identified by @pid,
+ to the open file indicated by @fd. @flags may be on or more of:
+   - CHECKPOINT_SUBTREE : allow checkpoint of sub-container
+ (other value are not allowed).
+
+ Returns: a positive checkpoint identifier (ckptid) upon success, 0 if
+ it returns from a restart, and -1 if an error occurs. The ckptid will
+ uniquely identify a checkpoint image, for as long as the checkpoint
+ is kept in the kernel (e.g. if one wishes to keep a checkpoint, or a
+ partial checkpoint, residing in kernel memory).
+
+* int sys_restart(pid_t pid, int fd, unsigned long flags);
+
+ Restart a process hierarchy from a checkpoint image that is read from
+ the blob stored in the file indicated by @fd. The @flags' will have
+ future meaning (must be 0 for now). @pid indicates the root of the
+ hierarchy (may mean 'ckptid' to identify an in-kernel checkpoint
+ image, with some @flags in the future).
+
+ Returns: -1 if an error occurs, 0 on success when restarting from a
+ "self" checkpoint, and return value of system call at the time of the
+ checkpoint when restarting from an "external" checkpoint.
+
+ TODO: upon successful "external" restart, the container will end up
+ in a frozen state.
+
+
+Sysctl/proc
+===========
+
+/proc/sys/kernel/ckpt_unpriv_allowed		[default = 1]
+  controls whether c/r operation is allowed for unprivileged users
+
+
+Operation
+=========
+
+The granularity of a checkpoint usually is a process hierarchy. The
+'pid' argument is interpreted in the caller's pid namespace. So to
+checkpoint a container whose init task (pid 1 in that pidns) appears
+as pid 3497 the caller's pidns, the caller must use pid 3497. Passing
+pid 1 will attempt to checkpoint the caller's container, and if the
+caller isn't privileged and init is owned by root, it will fail.
+
+Unless the CHECKPOINT_SUBTREE flag is set, if the caller passes a pid
+which does not refer to a container's init task, then sys_checkpoint()
+would return -EINVAL.
+
+We assume that during checkpoint and restart the container state is
+quiescent. During checkpoint, this means that all affected tasks are
+frozen (or otherwise stopped). During restart, this means that all
+affected tasks are executing the sys_restart() call. In both cases, if
+there are other tasks possible sharing state with the container, they
+must not modify it during the operation. It is the responsibility of
+the caller to follow this requirement.
+
+If the assumption that all tasks are frozen and that there is no other
+sharing doesn't hold - then the results of the operation are undefined
+(just as, e.g. not calling execve() immediately after vfork() produces
+undefined results). In particular, either checkpoint will fail, or it
+may produce a checkpoint image that can't be restarted, or (unlikely)
+the restart may produce a container whose state does not match that of
+the original container.
+
+
+User tools
+==========
+
+* ckpt: a tool to perform a checkpoint of a container/subtree
+* mktree: a tool to restart a container/subtree
+* ckptinfo: a tool to examine a checkpoint image
+
+It is best to use the dedicated user tools for checkpoint and restart.
+
+If you insist, then here is a code snippet that illustrates how a
+checkpoint is initiated by a process inside a container - the logic is
+similar to fork():
+	...
+	ckptid = checkpoint(1, ...);
+	switch (crid) {
+	case -1:
+		perror("checkpoint failed");
+		break;
+	default:
+		fprintf(stderr, "checkpoint succeeded, CRID=%d\n", ret);
+		/* proceed with execution after checkpoint */
+		...
+		break;
+	case 0:
+		fprintf(stderr, "returned after restart\n");
+		/* proceed with action required following a restart */
+		...
+		break;
+	}
+	...
+
+And to initiate a restart, the process in an empty container can use
+logic similar to execve():
+	...
+	if (restart(pid, ...) < 0)
+		perror("restart failed");
+	/* only get here if restart failed */
+	...
+
+Note, that the code also supports "self" checkpoint, where a process
+can checkpoint itself. This mode does not capture the relationships of
+the task with other tasks, or any shared resources. It is useful for
+application that wish to be able to save and restore their state.
+They will either not use (or care about) shared resources, or they
+will be aware of the operations and adapt suitably after a restart.
+The code above can also be used for "self" checkpoint.
+
+
+You may find the following sample programs useful:
+
+* ckpt.c: accepts a 'pid' argument and checkpoint that task to stdout
+* rstr.c: restarts a checkpoint image from stdin
+* self.c: a simple test program doing self-checkpoint
+* test.c: a simple test program to checkpoint
+
+
+"External" checkpoint
+=====================
+
+To do "external" checkpoint, you need to first freeze that other task
+either using the freezer cgroup.
+
+Restart does not preserve the original PID yet, (because we haven't
+solved yet the fork-with-specific-pid issue). In a real scenario, you
+probably want to first create a new names space, and have the init
+task there call 'sys_restart()'.
+
+I tested it this way:
+	$ ./test &
+	[1] 3493
+
+	$ kill -STOP 3493
+	$ ./ckpt 3493 > ckpt.image
+
+	$ mv /tmp/cr-test.out /tmp/cr-test.out.orig
+	$ cp /tmp/cr-test.out.orig /tmp/cr-test.out
+
+	$ kill -CONT 3493
+
+	$ ./rstr < ckpt.image
+Now compare the output of the two output files.
+
+
+"Self checkpoint
+================
+
+To do "self" checkpoint, you can incorporate the code from ckpt.c into
+your application.
+
+Here is how to test the "self" checkpoint:
+	$ ./self > self.image &
+	[1] 3512
+
+	$ sleep 3
+	$ mv /tmp/cr-test.out /tmp/cr-test.out.orig
+	$ cp /tmp/cr-test.out.orig /tmp/cr-test.out
+
+	$ cat /tmp/cr-rest.out
+	hello, world (85.46)!
+	count 0 (85.46)!
+	count 1 (85.56)!
+	count 2 (85.76)!
+	count 3 (86.46)!
+
+	$ sed -i 's/count/xxxx/g' /tmp/cr-rest.out
+
+	$ ./rstr < self.image &
+Now compare the output of the two output files.
+
+Note how in test.c we close stdin, stdout, stderr - that's because
+currently we only support regular files (not ttys/ptys).
+
+If you check the output of ps, you'll see that "rstr" changed its name
+to "test" or "self", as expected.
+
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 9d4caff..881965b 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -1,7 +1,7 @@
 /*
  *  Generic container checkpoint-restart
  *
- *  Copyright (C) 2008 Oren Laadan
+ *  Copyright (C) 2008-2009 Oren Laadan
  *
  *  This file is subject to the terms and conditions of the GNU General Public
  *  License.  See the file COPYING in the main directory of the Linux
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 05/43] c/r: basic infrastructure for checkpoint/restart
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (3 preceding siblings ...)
  2009-05-27 17:32 ` [RFC v16][PATCH 04/43] c/r: documentation Oren Laadan
@ 2009-05-27 17:32 ` Oren Laadan
  2009-05-27 17:32 ` [RFC v16][PATCH 06/43] c/r: x86_32 support " Oren Laadan
                   ` (29 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
Add those interfaces, as well as helpers needed to easily manage the
file format. The code is roughly broken out as follows:
checkpoint/sys.c - user/kernel data transfer, as well as setup of the
  c/r context (a per-checkpoint data structure for housekeeping)
checkpoint/checkpoint.c - output wrappers and basic checkpoint handling
checkpoint/restart.c - input wrappers and basic restart handling
checkpoint/process.c - c/r of task data
For now, we can only checkpoint the 'current' task ("self" checkpoint),
and the 'pid' argument to the syscall is ignored.
Patches to add the per-architecture support as well as the actual
work to do the memory checkpoint follow in subsequent patches.
Changelog[v16]:
  - Split ctx->flags to ->uflags (user flags) and ->kflags (kernel flags)
  - Introduce __ckpt_write_err() and ckpt_write_err() to report errors
  - Allow @ptr == NULL to write (or read) header only without payload
  _ Introduce _ckpt_read_obj_type()
Changelog[v15]:
  - Replace header buffer in ckpt_ctx (hbuf,hpos) with kmalloc/kfree()
Changelog[v14]:
  - Cleanup interface to get/put hdr buffers
  - Merge checkpoint and restart code into a single file (per subsystem)
  - Take uts_sem around access to uts->{release,version,machine}
  - Embed ckpt_hdr in all ckpt_hdr_...., cleanup read/write helpers
  - Define sys_checkpoint(0,...) as asking for a self-checkpoint (Serge)
  - Revert use of 'pr_fmt' to avoid tainting whom includes us (Nathan Lynch)
  - Explicitly indicate length of UTS fields in header
  - Discard field 'h->parent' from ckpt_hdr
Changelog[v12]:
  - ckpt_kwrite/ckpt_kread() again use vfs_read(), vfs_write() (safer)
  - Split ckpt_write/ckpt_read() to two parts: _ckpt_write/read() helper
  - Befriend with sparse : explicit conversion to 'void __user *'
  - Redfine 'pr_fmt' instead of using special ckpt_debug()
Changelog[v10]:
  - add ckpt_write_buffer(), ckpt_read_buffer() and ckpt_read_buf_type()
  - force end-of-string in ckpt_read_string() (fix possible DoS)
Changelog[v9]:
  - ckpt_kwrite/ckpt_kread() use file->f_op->write() directly
  - Drop ckpt_uwrite/ckpt_uread() since they aren't used anywhere
Changelog[v6]:
  - Balance all calls to ckpt_hbuf_get() with matching ckpt_hbuf_put()
    (although it's not really needed)
Changelog[v5]:
  - Rename headers files s/ckpt/checkpoint/
Changelog[v2]:
  - Added utsname->{release,version,machine} to checkpoint header
  - Pad header structures to 64 bits to ensure compatibility
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
 Makefile                         |    2 +-
 checkpoint/Makefile              |    6 +-
 checkpoint/checkpoint.c          |  261 +++++++++++++++++++++++++++++++
 checkpoint/process.c             |   97 ++++++++++++
 checkpoint/restart.c             |  316 ++++++++++++++++++++++++++++++++++++++
 checkpoint/sys.c                 |  242 +++++++++++++++++++++++++++++-
 include/linux/checkpoint.h       |   84 ++++++++++
 include/linux/checkpoint_hdr.h   |  100 ++++++++++++
 include/linux/checkpoint_types.h |   43 +++++
 include/linux/magic.h            |    4 +
 lib/Kconfig.debug                |   13 ++
 11 files changed, 1164 insertions(+), 4 deletions(-)
diff --git a/Makefile b/Makefile
index 739fd34..7ad0ca8 100644
--- a/Makefile
+++ b/Makefile
@@ -646,7 +646,7 @@ export mod_strip_cmd
 
 
 ifeq ($(KBUILD_EXTMOD),)
-core-y		+= kernel/ mm/ fs/ ipc/ security/ crypto/ block/
+core-y		+= kernel/ mm/ fs/ ipc/ security/ crypto/ block/ checkpoint/
 
 vmlinux-dirs	:= $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
 		     $(core-y) $(core-m) $(drivers-y) $(drivers-m) \
diff --git a/checkpoint/Makefile b/checkpoint/Makefile
index 8a32c6f..99364cc 100644
--- a/checkpoint/Makefile
+++ b/checkpoint/Makefile
@@ -2,4 +2,8 @@
 # Makefile for linux checkpoint/restart.
 #
 
-obj-$(CONFIG_CHECKPOINT) += sys.o
+obj-$(CONFIG_CHECKPOINT) += \
+	sys.o \
+	checkpoint.o \
+	restart.o \
+	process.o
diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
new file mode 100644
index 0000000..56b690a
--- /dev/null
+++ b/checkpoint/checkpoint.c
@@ -0,0 +1,261 @@
+/*
+ *  Checkpoint logic and helpers
+ *
+ *  Copyright (C) 2008-2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DSYS
+
+#include <linux/version.h>
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/dcache.h>
+#include <linux/mount.h>
+#include <linux/utsname.h>
+#include <linux/magic.h>
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+
+/* unique checkpoint identifier (FIXME: should be per-container ?) */
+static atomic_t ctx_count = ATOMIC_INIT(0);
+
+/**
+ * ckpt_write_obj - write an object
+ * @ctx: checkpoint context
+ * @h: object descriptor
+ */
+int ckpt_write_obj(struct ckpt_ctx *ctx, struct ckpt_hdr *h)
+{
+	_ckpt_debug(CKPT_DRW, "type %d len %d\n", h->type, h->len);
+	return ckpt_kwrite(ctx, h, h->len);
+}
+
+/**
+ * ckpt_write_obj_type - write an object (from a pointer)
+ * @ctx: checkpoint context
+ * @ptr: buffer pointer
+ * @len: buffer size
+ * @type: desired type
+ *
+ * If @ptr is NULL, then write only the header (payload to follow)
+ */
+int ckpt_write_obj_type(struct ckpt_ctx *ctx, void *ptr, int len, int type)
+{
+	struct ckpt_hdr *h;
+	int ret;
+
+	h = ckpt_hdr_get(ctx, sizeof(*h));
+	if (!h)
+		return -ENOMEM;
+
+	h->type = type;
+	h->len = len + sizeof(*h);
+
+	_ckpt_debug(CKPT_DRW, "type %d len %d\n", h->type, h->len);
+	ret = ckpt_kwrite(ctx, h, sizeof(*h));
+	if (ret < 0)
+		goto out;
+	if (ptr)
+		ret = ckpt_kwrite(ctx, ptr, len);
+ out:
+	_ckpt_hdr_put(ctx, h, sizeof(*h));
+	return ret;
+}
+
+/**
+ * ckpt_write_buffer - write an object of type buffer
+ * @ctx: checkpoint context
+ * @ptr: buffer pointer
+ * @len: buffer size
+ */
+int ckpt_write_buffer(struct ckpt_ctx *ctx, void *ptr, int len)
+{
+	return ckpt_write_obj_type(ctx, ptr, len, CKPT_HDR_BUFFER);
+}
+
+/**
+ * ckpt_write_string - write an object of type string
+ * @ctx: checkpoint context
+ * @str: string pointer
+ * @len: string length
+ */
+int ckpt_write_string(struct ckpt_ctx *ctx, char *str, int len)
+{
+	return ckpt_write_obj_type(ctx, str, len, CKPT_HDR_STRING);
+}
+
+static void __ckpt_generate_err(struct ckpt_ctx *ctx, char *fmt, va_list ap)
+{
+	va_list aq;
+	char *str;
+	int len;
+
+	va_copy(aq, ap);
+
+	/*
+	 * prefix the error string with a '\0' to facilitate easy
+	 * backtrace to the beginning of the error message without
+	 * needing to parse the entire checkpoint image.
+	 */
+	ctx->err_string[0] = '\0';
+	str = &ctx->err_string[1];
+	len = vsnprintf(str, 255, fmt, ap) + 2;
+
+	if (len > 256) {
+		printk(KERN_NOTICE "c/r: error string truncated: ");
+		vprintk(fmt, aq);
+	}
+
+	va_end(aq);
+
+	ckpt_debug("c/r: checkpoint error: %s\n", str);
+}
+
+/**
+ * __ckpt_write_err - save an error string on the ctx->err_string
+ * @ctx: checkpoint context
+ * @fmt: error string format
+ * @...: error string arguments
+ *
+ * Use this during checkpoint to report while holding a spinlock
+ */
+void __ckpt_write_err(struct ckpt_ctx *ctx, char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	__ckpt_generate_err(ctx, fmt, ap);
+	va_end(ap);
+}
+
+/**
+ * ckpt_write_err - write an object describing an error
+ * @ctx: checkpoint context
+ * @fmt: error string format
+ * @...: error string arguments
+ *
+ * If @fmt is null, the string in the ctx->err_string will be used (and freed)
+ */
+int ckpt_write_err(struct ckpt_ctx *ctx, char *fmt, ...)
+{
+	va_list ap;
+	char *str;
+	int len, ret = 0;
+
+	if (fmt) {
+		va_start(ap, fmt);
+		__ckpt_generate_err(ctx, fmt, ap);
+		va_end(ap);
+	}
+
+	str = ctx->err_string;
+	len = strlen(str + 1) + 2;	/* leading and trailing '\0' */
+
+	if (len == 0)	/* empty error string */
+		return 0;
+
+	ret = ckpt_write_obj_type(ctx, NULL, 0, CKPT_HDR_ERROR);
+	if (!ret)
+		ret = ckpt_write_string(ctx, str, len);
+	if (ret < 0)
+		printk(KERN_NOTICE "c/r: error string unsaved (%d): %s\n",
+		       ret, str + 1);
+
+	str[1] = '\0';
+	return ret;
+}
+
+/***********************************************************************
+ * Checkpoint
+ */
+
+/* write the checkpoint header */
+static int checkpoint_write_header(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_header *h;
+	struct new_utsname *uts;
+	struct timeval ktv;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_HEADER);
+	if (!h)
+		return -ENOMEM;
+
+	do_gettimeofday(&ktv);
+	uts = utsname();
+
+	h->magic = CHECKPOINT_MAGIC_HEAD;
+	h->major = (LINUX_VERSION_CODE >> 16) & 0xff;
+	h->minor = (LINUX_VERSION_CODE >> 8) & 0xff;
+	h->patch = (LINUX_VERSION_CODE) & 0xff;
+
+	h->rev = CHECKPOINT_VERSION;
+
+	h->uflags = ctx->uflags;
+	h->time = ktv.tv_sec;
+
+	h->uts_release_len = sizeof(uts->release);
+	h->uts_version_len = sizeof(uts->version);
+	h->uts_machine_len = sizeof(uts->machine);
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0)
+		return ret;
+
+	down_read(&uts_sem);
+	ret = ckpt_write_buffer(ctx, uts->release, sizeof(uts->release));
+	if (ret < 0)
+		goto up;
+	ret = ckpt_write_buffer(ctx, uts->version, sizeof(uts->version));
+	if (ret < 0)
+		goto up;
+	ret = ckpt_write_buffer(ctx, uts->machine, sizeof(uts->machine));
+ up:
+	up_read(&uts_sem);
+	return ret;
+}
+
+/* write the checkpoint trailer */
+static int checkpoint_write_tail(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_tail *h;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TAIL);
+	if (!h)
+		return -ENOMEM;
+
+	h->magic = CHECKPOINT_MAGIC_TAIL;
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+int do_checkpoint(struct ckpt_ctx *ctx, pid_t pid)
+{
+	int ret;
+
+	ret = checkpoint_write_header(ctx);
+	if (ret < 0)
+		goto out;
+	ret = checkpoint_task(ctx, current);
+	if (ret < 0)
+		goto out;
+	ret = checkpoint_write_tail(ctx);
+	if (ret < 0)
+		goto out;
+
+	/* on success, return (unique) checkpoint identifier */
+	ctx->crid = atomic_inc_return(&ctx_count);
+	ret = ctx->crid;
+ out:
+	return ret;
+}
diff --git a/checkpoint/process.c b/checkpoint/process.c
new file mode 100644
index 0000000..c2b7564
--- /dev/null
+++ b/checkpoint/process.c
@@ -0,0 +1,97 @@
+/*
+ *  Checkpoint task structure
+ *
+ *  Copyright (C) 2008-2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DSYS
+
+#include <linux/sched.h>
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+
+/***********************************************************************
+ * Checkpoint
+ */
+
+/* dump the task_struct of a given task */
+static int checkpoint_task_struct(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct ckpt_hdr_task *h;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TASK);
+	if (!h)
+		return -ENOMEM;
+
+	h->state = t->state;
+	h->exit_state = t->exit_state;
+	h->exit_code = t->exit_code;
+	h->exit_signal = t->exit_signal;
+
+	h->task_comm_len = TASK_COMM_LEN;
+
+	/* FIXME: save remaining relevant task_struct fields */
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0)
+		return ret;
+
+	return ckpt_write_string(ctx, t->comm, TASK_COMM_LEN);
+}
+
+/* dump the entire state of a given task */
+int checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	int ret;
+
+	ret = checkpoint_task_struct(ctx, t);
+	ckpt_debug("task %d\n", ret);
+
+	return ret;
+}
+
+/***********************************************************************
+ * Restart
+ */
+
+/* read the task_struct into the current task */
+static int restore_task_struct(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_task *h;
+	struct task_struct *t = current;
+	int ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TASK);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	ret = -EINVAL;
+	if (h->task_comm_len > TASK_COMM_LEN)
+		goto out;
+
+	memset(t->comm, 0, TASK_COMM_LEN);
+	ret = _ckpt_read_string(ctx, t->comm, h->task_comm_len);
+
+	/* FIXME: restore remaining relevant task_struct fields */
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+/* read the entire state of the current task */
+int restore_task(struct ckpt_ctx *ctx)
+{
+	int ret;
+
+	ret = restore_task_struct(ctx);
+	ckpt_debug("task %d\n", ret);
+
+	return ret;
+}
diff --git a/checkpoint/restart.c b/checkpoint/restart.c
new file mode 100644
index 0000000..a6f73d3
--- /dev/null
+++ b/checkpoint/restart.c
@@ -0,0 +1,316 @@
+/*
+ *  Restart logic and helpers
+ *
+ *  Copyright (C) 2008-2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DSYS
+
+#include <linux/version.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/magic.h>
+#include <linux/utsname.h>
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+
+/**
+ * _ckpt_read_obj - read an object (ckpt_hdr followed by payload)
+ * @ctx: checkpoint context
+ * @h: desired ckpt_hdr
+ * @ptr: desired buffer
+ * @len: desired payload length (if 0, flexible)
+ * @max: maximum payload length
+ *
+ * If @ptr is NULL, then read only the header (payload to follow)
+ */
+static int _ckpt_read_obj(struct ckpt_ctx *ctx, struct ckpt_hdr *h,
+			void *ptr, int len, int max)
+{
+	int ret;
+
+	ret = ckpt_kread(ctx, h, sizeof(*h));
+	if (ret < 0)
+		return ret;
+	_ckpt_debug(CKPT_DRW, "type %d len %d(%d,%d)\n",
+		    h->type, h->len, len, max);
+	if (h->len < sizeof(*h))
+		return -EINVAL;
+	/* if len specified, enforce, else if maximum specified, enforce */
+	if ((len && h->len != len) || (!len && max && h->len > max))
+		return -EINVAL;
+
+	if (ptr)
+		ret = ckpt_kread(ctx, ptr, h->len - sizeof(struct ckpt_hdr));
+	return ret;
+}
+
+/**
+ * _ckpt_read_nbuffer - read an object of type buffer (variable length)
+ * @ctx: checkpoint context
+ * @ptr: provided buffer
+ * @len: buffer length
+ *
+ * If @ptr is NULL, then read only the header (payload to follow)
+ * Returns: actual buffer length (bounded by @len)
+ */
+int _ckpt_read_nbuffer(struct ckpt_ctx *ctx, void *ptr, int len)
+{
+	struct ckpt_hdr h;
+	int ret;
+
+	BUG_ON(!len);
+
+	len += sizeof(struct ckpt_hdr);
+	ret = _ckpt_read_obj(ctx, &h, ptr, 0, len);
+	if (ret < 0)
+		return ret;
+	_ckpt_debug(CKPT_DRW, "type %d len %d\n", h.type, h.len);
+	if (h.type != CKPT_HDR_BUFFER)
+		return -EINVAL;
+	return h.len;
+}
+
+/**
+ * _ckpt_read_obj_type - read an object of some type (set length)
+ * @ctx: checkpoint context
+ * @ptr: provided buffer
+ * @len: buffer length
+ * @type: buffer type
+ *
+ * If @ptr is NULL, then read only the header (payload to follow)
+ */
+int _ckpt_read_obj_type(struct ckpt_ctx *ctx, void *ptr, int len, int type)
+{
+	struct ckpt_hdr h;
+	int ret;
+
+	len += sizeof(struct ckpt_hdr);
+	ret = _ckpt_read_obj(ctx, &h, ptr, len, len);
+	if (ret < 0)
+		return ret;
+	if (h.type != type)
+		return -EINVAL;
+	return 0;
+}
+
+/**
+ * _ckpt_read_buffer - read an object of type buffer (set length)
+ * @ctx: checkpoint context
+ * @ptr: provided buffer
+ * @len: buffer length
+ *
+ * If @ptr is NULL, then read only the header (payload to follow)
+ */
+int _ckpt_read_buffer(struct ckpt_ctx *ctx, void *ptr, int len)
+{
+	BUG_ON(!len);
+	return _ckpt_read_obj_type(ctx, ptr, len, CKPT_HDR_BUFFER);
+}
+
+/**
+ * _ckpt_read_string - read an object of type string (set length)
+ * @ctx: checkpoint context
+ * @ptr: provided buffer
+ * @len: string length
+ *
+ * If @ptr is NULL, then read only the header (payload to follow)
+ */
+int _ckpt_read_string(struct ckpt_ctx *ctx, void *ptr, int len)
+{
+	int ret;
+
+	BUG_ON(!len);
+
+	ret = _ckpt_read_obj_type(ctx, ptr, len, CKPT_HDR_STRING);
+	if (ret < 0)
+		return ret;
+	if (ptr)
+		((char *) ptr)[len - 1] = '\0';	/* always play it safe */
+	return 0;
+}
+
+/**
+ * ckpt_read_obj - allocate and read an object (ckpt_hdr followed by payload)
+ * @ctx: checkpoint context
+ * @h: object descriptor
+ * @len: desired payload length (if 0, flexible)
+ * @max: maximum payload length
+ *
+ * Return: new buffer allocated on success, error pointer otherwise
+ */
+static void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max)
+{
+	struct ckpt_hdr hh;
+	struct ckpt_hdr *h;
+	int ret;
+
+	ret = ckpt_kread(ctx, &hh, sizeof(hh));
+	if (ret < 0)
+		return ERR_PTR(ret);
+	_ckpt_debug(CKPT_DRW, "type %d len %d(%d,%d)\n",
+		    hh.type, hh.len, len, max);
+	if (hh.len < sizeof(*h))
+		return ERR_PTR(-EINVAL);
+	/* if len specified, enforce, else if maximum specified, enforce */
+	if ((len && hh.len != len) || (!len && max && hh.len > max))
+		return ERR_PTR(-EINVAL);
+
+	h = ckpt_hdr_get(ctx, hh.len);
+	if (!h)
+		return ERR_PTR(-ENOMEM);
+
+	*h = hh;	/* yay ! */
+
+	ret = ckpt_kread(ctx, (h + 1), hh.len - sizeof(struct ckpt_hdr));
+	if (ret < 0) {
+		ckpt_hdr_put(ctx, h);
+		h = ERR_PTR(ret);
+	}
+
+	return h;
+}
+
+/**
+ * ckpt_read_obj_type - allocate and read an object of some type
+ * @ctx: checkpoint context
+ * @len: desired object length
+ * @type: desired object type
+ *
+ * Return: new buffer allocated on success, error pointer otherwise
+ */
+void *ckpt_read_obj_type(struct ckpt_ctx *ctx, int len, int type)
+{
+	struct ckpt_hdr *h;
+
+	BUG_ON(!len);
+
+	h = ckpt_read_obj(ctx, len, len);
+	if (IS_ERR(h))
+		return h;
+
+	if (h->type != type) {
+		ckpt_hdr_put(ctx, h);
+		h = ERR_PTR(-EINVAL);
+	}
+
+	return h;
+}
+
+/**
+ * ckpt_read_buf_type - allocate and read an object of some type (flxible)
+ * @ctx: checkpoint context
+ * @len: maximum object length
+ * @type: desired object type
+ *
+ * This differs from ckpt_read_obj_type() in that the length of the
+ * incoming object is flexible (up to the maximum specified by @len),
+ * as determined by the ckpt_hdr data.
+ *
+ * Return: new buffer allocated on success, error pointer otherwise
+ */
+void *ckpt_read_buf_type(struct ckpt_ctx *ctx, int len, int type)
+{
+	struct ckpt_hdr *h;
+
+	BUG_ON(!len);
+
+	h = ckpt_read_obj(ctx, 0, len);
+	if (IS_ERR(h))
+		return h;
+
+	if (h->type != type) {
+		ckpt_hdr_put(ctx, h);
+		h = ERR_PTR(-EINVAL);
+	}
+
+	return h;
+}
+
+/***********************************************************************
+ * Restart
+ */
+
+/* read the checkpoint header */
+static int restore_read_header(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_header *h;
+	struct new_utsname *uts = NULL;
+	int ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_HEADER);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	ret = -EINVAL;
+	if (h->magic != CHECKPOINT_MAGIC_HEAD ||
+	    h->rev != CHECKPOINT_VERSION ||
+	    h->major != ((LINUX_VERSION_CODE >> 16) & 0xff) ||
+	    h->minor != ((LINUX_VERSION_CODE >> 8) & 0xff) ||
+	    h->patch != ((LINUX_VERSION_CODE) & 0xff))
+		goto out;
+	if (h->uflags)
+		goto out;
+	if (h->uts_release_len != sizeof(uts->release) ||
+	    h->uts_version_len != sizeof(uts->version) ||
+	    h->uts_machine_len != sizeof(uts->machine))
+		goto out;
+
+	ret = -ENOMEM;
+	uts = kmalloc(sizeof(*uts), GFP_KERNEL);
+	if (!uts)
+		goto out;
+
+	ctx->oflags = h->uflags;
+
+	/* FIX: verify compatibility of release, version and machine */
+	ret = _ckpt_read_buffer(ctx, uts->release, sizeof(uts->release));
+	if (ret < 0)
+		goto out;
+	ret = _ckpt_read_buffer(ctx, uts->version, sizeof(uts->version));
+	if (ret < 0)
+		goto out;
+	ret = _ckpt_read_buffer(ctx, uts->machine, sizeof(uts->machine));
+ out:
+	kfree(uts);
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+/* read the checkpoint trailer */
+static int restore_read_tail(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_tail *h;
+	int ret = 0;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TAIL);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	if (h->magic != CHECKPOINT_MAGIC_TAIL)
+		ret = -EINVAL;
+
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+int do_restart(struct ckpt_ctx *ctx, pid_t pid)
+{
+	int ret;
+
+	ret = restore_read_header(ctx);
+	if (ret < 0)
+		return ret;
+	ret = restore_task(ctx);
+	if (ret < 0)
+		return ret;
+	ret = restore_read_tail(ctx);
+
+	/* on success, adjust the return value if needed [TODO] */
+	return ret;
+}
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 881965b..2ad9722 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -8,9 +8,192 @@
  *  distribution for more details.
  */
 
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DSYS
+
 #include <linux/sched.h>
 #include <linux/kernel.h>
 #include <linux/syscalls.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/uaccess.h>
+#include <linux/capability.h>
+#include <linux/checkpoint.h>
+
+/*
+ * Helpers to write(read) from(to) kernel space to(from) the checkpoint
+ * image file descriptor (similar to how a core-dump is performed).
+ *
+ *   ckpt_kwrite() - write a kernel-space buffer to the checkpoint image
+ *   ckpt_kread() - read from the checkpoint image to a kernel-space buffer
+ */
+
+static inline int _ckpt_kwrite(struct file *file, void *addr, int count)
+{
+	void __user *uaddr = (__force void __user *) addr;
+	ssize_t nwrite;
+	int nleft;
+
+	for (nleft = count; nleft; nleft -= nwrite) {
+		loff_t pos = file_pos_read(file);
+		nwrite = vfs_write(file, uaddr, nleft, &pos);
+		file_pos_write(file, pos);
+		if (nwrite < 0) {
+			if (nwrite == -EAGAIN)
+				nwrite = 0;
+			else
+				return nwrite;
+		}
+		uaddr += nwrite;
+	}
+	return 0;
+}
+
+int ckpt_kwrite(struct ckpt_ctx *ctx, void *addr, int count)
+{
+	mm_segment_t fs;
+	int ret;
+
+	fs = get_fs();
+	set_fs(KERNEL_DS);
+	ret = _ckpt_kwrite(ctx->file, addr, count);
+	set_fs(fs);
+
+	ctx->total += count;
+	return ret;
+}
+
+static inline int _ckpt_kread(struct file *file, void *addr, int count)
+{
+	void __user *uaddr = (__force void __user *) addr;
+	ssize_t nread;
+	int nleft;
+
+	for (nleft = count; nleft; nleft -= nread) {
+		loff_t pos = file_pos_read(file);
+		nread = vfs_read(file, uaddr, nleft, &pos);
+		file_pos_write(file, pos);
+		if (nread <= 0) {
+			if (nread == -EAGAIN) {
+				nread = 0;
+				continue;
+			} else if (nread == 0)
+				nread = -EPIPE;		/* unexecpted EOF */
+			return nread;
+		}
+		uaddr += nread;
+	}
+	return 0;
+}
+
+int ckpt_kread(struct ckpt_ctx *ctx, void *addr, int count)
+{
+	mm_segment_t fs;
+	int ret;
+
+	fs = get_fs();
+	set_fs(KERNEL_DS);
+	ret = _ckpt_kread(ctx->file , addr, count);
+	set_fs(fs);
+
+	ctx->total += count;
+	return ret;
+}
+
+/**
+ * ckpt_hdr_get - get a hdr of certain size
+ * @ctx: checkpoint context
+ * @len: desired length
+ *
+ * Returns pointer to header
+ */
+void *ckpt_hdr_get(struct ckpt_ctx *ctx, int len)
+{
+	return kzalloc(len, GFP_KERNEL);
+}
+
+/**
+ * _ckpt_hdr_put - free a hdr allocated with ckpt_hdr_get
+ * @ctx: checkpoint context
+ * @ptr: header to free
+ * @len: header length
+ *
+ * (requiring 'ptr' makes it easily interchangable with kmalloc/kfree
+ */
+void _ckpt_hdr_put(struct ckpt_ctx *ctx, void *ptr, int len)
+{
+	kfree(ptr);
+}
+
+/**
+ * ckpt_hdr_put - free a hdr allocated with ckpt_hdr_get
+ * @ctx: checkpoint context
+ * @ptr: header to free
+ *
+ * It is assumed that @ptr begins with a 'struct ckpt_hdr'.
+ */
+void ckpt_hdr_put(struct ckpt_ctx *ctx, void *ptr)
+{
+	struct ckpt_hdr *h = (struct ckpt_hdr *) ptr;
+	_ckpt_hdr_put(ctx, ptr, h->len);
+}
+
+/**
+ * ckpt_hdr_get_type - get a hdr of certain size
+ * @ctx: checkpoint context
+ * @len: number of bytes to reserve
+ *
+ * Returns pointer to reserved space on hbuf
+ */
+void *ckpt_hdr_get_type(struct ckpt_ctx *ctx, int len, int type)
+{
+	struct ckpt_hdr *h;
+
+	h = ckpt_hdr_get(ctx, len);
+	if (!h)
+		return NULL;
+
+	h->type = type;
+	h->len = len;
+	return h;
+}
+
+
+/*
+ * Helpers to manage c/r contexts: allocated for each checkpoint and/or
+ * restart operation, and persists until the operation is completed.
+ */
+
+static void ckpt_ctx_free(struct ckpt_ctx *ctx)
+{
+	if (ctx->file)
+		fput(ctx->file);
+	kfree(ctx);
+}
+
+static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
+				       unsigned long kflags)
+{
+	struct ckpt_ctx *ctx;
+	int err;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return ERR_PTR(-ENOMEM);
+
+	ctx->uflags = uflags;
+	ctx->kflags = kflags;
+
+	err = -EBADF;
+	ctx->file = fget(fd);
+	if (!ctx->file)
+		goto err;
+
+	return ctx;
+ err:
+	ckpt_ctx_free(ctx);
+	return ERR_PTR(err);
+}
 
 /**
  * sys_checkpoint - checkpoint a container
@@ -23,7 +206,26 @@
  */
 SYSCALL_DEFINE3(checkpoint, pid_t, pid, int, fd, unsigned long, flags)
 {
-	return -ENOSYS;
+	struct ckpt_ctx *ctx;
+	int ret;
+
+	/* no flags for now */
+	if (flags)
+		return -EINVAL;
+
+	if (pid == 0)
+		pid = task_pid_vnr(current);
+	ctx = ckpt_ctx_alloc(fd, flags, CKPT_CTX_CHECKPOINT);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = do_checkpoint(ctx, pid);
+
+	if (!ret)
+		ret = ctx->crid;
+
+	ckpt_ctx_free(ctx);
+	return ret;
 }
 
 /**
@@ -37,5 +239,41 @@ SYSCALL_DEFINE3(checkpoint, pid_t, pid, int, fd, unsigned long, flags)
  */
 SYSCALL_DEFINE3(restart, pid_t, pid, int, fd, unsigned long, flags)
 {
-	return -ENOSYS;
+	struct ckpt_ctx *ctx;
+	int ret;
+
+	/* no flags for now */
+	if (flags)
+		return -EINVAL;
+
+	ctx = ckpt_ctx_alloc(fd, flags, CKPT_CTX_RESTART);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = do_restart(ctx, pid);
+
+	ckpt_ctx_free(ctx);
+	return ret;
+}
+
+
+/* 'ckpt_debug_level' controls the verbosity level of c/r code */
+#ifdef CONFIG_CHECKPOINT_DEBUG
+
+/* FIX: allow to change during runtime */
+unsigned long __read_mostly ckpt_debug_level = CKPT_DDEFAULT;
+
+static __init int ckpt_debug_setup(char *s)
+{
+	long val, ret;
+
+	ret = strict_strtoul(s, 10, &val);
+	if (ret < 0)
+		return ret;
+	ckpt_debug_level = val;
+	return 0;
 }
+
+__setup("ckpt_debug=", ckpt_debug_setup);
+
+#endif /* CONFIG_CHECKPOINT_DEBUG */
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
new file mode 100644
index 0000000..14da928
--- /dev/null
+++ b/include/linux/checkpoint.h
@@ -0,0 +1,84 @@
+#ifndef _LINUX_CHECKPOINT_H_
+#define _LINUX_CHECKPOINT_H_
+/*
+ *  Generic checkpoint-restart
+ *
+ *  Copyright (C) 2008-2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#include <linux/checkpoint_types.h>
+#include <linux/checkpoint_hdr.h>
+
+extern int ckpt_kwrite(struct ckpt_ctx *ctx, void *buf, int count);
+extern int ckpt_kread(struct ckpt_ctx *ctx, void *buf, int count);
+
+extern void _ckpt_hdr_put(struct ckpt_ctx *ctx, void *ptr, int n);
+extern void ckpt_hdr_put(struct ckpt_ctx *ctx, void *ptr);
+extern void *ckpt_hdr_get(struct ckpt_ctx *ctx, int n);
+extern void *ckpt_hdr_get_type(struct ckpt_ctx *ctx, int n, int type);
+
+extern int ckpt_write_obj(struct ckpt_ctx *ctx, struct ckpt_hdr *h);
+extern int ckpt_write_obj_type(struct ckpt_ctx *ctx,
+			       void *ptr, int len, int type);
+extern int ckpt_write_buffer(struct ckpt_ctx *ctx, void *ptr, int len);
+extern int ckpt_write_string(struct ckpt_ctx *ctx, char *str, int len);
+extern void __ckpt_write_err(struct ckpt_ctx *ctx, char *fmt, ...);
+extern int ckpt_write_err(struct ckpt_ctx *ctx, char *fmt, ...);
+
+extern int _ckpt_read_obj_type(struct ckpt_ctx *ctx,
+			       void *ptr, int len, int type);
+extern int _ckpt_read_buffer(struct ckpt_ctx *ctx, void *ptr, int len);
+extern int _ckpt_read_string(struct ckpt_ctx *ctx, void *ptr, int len);
+extern void *ckpt_read_obj_type(struct ckpt_ctx *ctx, int len, int type);
+extern void *ckpt_read_buf_type(struct ckpt_ctx *ctx, int len, int type);
+
+extern int do_checkpoint(struct ckpt_ctx *ctx, pid_t pid);
+extern int do_restart(struct ckpt_ctx *ctx, pid_t pid);
+
+/* task */
+extern int checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t);
+extern int restore_task(struct ckpt_ctx *ctx);
+
+
+/* debugging flags */
+#define CKPT_DBASE	0x1		/* anything */
+#define CKPT_DSYS	0x2		/* generic (system) */
+#define CKPT_DRW	0x4		/* image read/write */
+
+#define CKPT_DDEFAULT	0x7		/* default debug level */
+
+#ifndef CKPT_DFLAG
+#define CKPT_DFLAG	0x0		/* nothing */
+#endif
+
+#ifdef CONFIG_CHECKPOINT_DEBUG
+extern unsigned long ckpt_debug_level;
+
+/* use this to select a specific debug level */
+#define _ckpt_debug(level, fmt, args...)			\
+	do {						\
+		if (ckpt_debug_level & (level))		\
+			pr_debug("[%d:c/r:%s] " fmt,	\
+				task_pid_vnr(current),	\
+				 __func__, ## args);	\
+	} while (0)
+
+/*
+ * CKPT_DBASE is the base flags, doesn't change
+ * CKPT_DFLAG is to be redfined in each source file
+ */
+#define ckpt_debug(fmt, args...)  \
+	_ckpt_debug(CKPT_DBASE | CKPT_DFLAG, fmt, ## args)
+
+#else
+
+#define _ckpt_debug(level, fmt, args...)	do { } while (0)
+#define ckpt_debug(fmt, args...)		do { } while (0)
+
+#endif /* CONFIG_CHECKPOINT_DEBUG */
+
+#endif /* _LINUX_CHECKPOINT_H_ */
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
new file mode 100644
index 0000000..c9a1653
--- /dev/null
+++ b/include/linux/checkpoint_hdr.h
@@ -0,0 +1,100 @@
+#ifndef _CHECKPOINT_CKPT_HDR_H_
+#define _CHECKPOINT_CKPT_HDR_H_
+/*
+ *  Generic container checkpoint-restart
+ *
+ *  Copyright (C) 2008-2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#include <linux/types.h>
+#include <linux/utsname.h>
+
+/*
+ * To maintain compatibility between 32-bit and 64-bit architecture flavors,
+ * keep data 64-bit aligned: use padding for structure members, and use
+ * __attribute__((aligned (8))) for the entire structure.
+ *
+ * Quoting Arnd Bergmann:
+ *   "This structure has an odd multiple of 32-bit members, which means
+ *   that if you put it into a larger structure that also contains 64-bit
+ *   members, the larger structure may get different alignment on x86-32
+ *   and x86-64, which you might want to avoid. I can't tell if this is
+ *   an actual problem here. ... In this case, I'm pretty sure that
+ *   sizeof(ckpt_hdr_task) on x86-32 is different from x86-64, since it
+ *   will be 32-bit aligned on x86-32."
+ */
+
+/*
+ * header format: 'struct ckpt_hdr' must prefix all other headers. Therfore
+ * when a header is passed around, the information about it (type, size)
+ * is readily available.
+ */
+struct ckpt_hdr {
+	__u32 type;
+	__u32 len;
+} __attribute__((aligned(8)));
+
+/* header types */
+enum {
+	CKPT_HDR_HEADER = 1,
+	CKPT_HDR_BUFFER,
+	CKPT_HDR_STRING,
+
+	CKPT_HDR_TASK = 101,
+
+	CKPT_HDR_TAIL = 9001,
+
+	CKPT_HDR_ERROR = 9999,
+};
+
+/* checkpoint image header */
+struct ckpt_hdr_header {
+	struct ckpt_hdr h;
+	__u64 magic;
+
+	__u16 _padding;
+
+	__u16 major;
+	__u16 minor;
+	__u16 patch;
+	__u16 rev;
+
+	__u16 uts_release_len;
+	__u16 uts_version_len;
+	__u16 uts_machine_len;
+
+	__u64 time;	/* when checkpoint taken */
+	__u64 uflags;	/* uflags from checkpoint */
+
+	/*
+	 * the header is followed by three strings:
+	 *   char release[__NEW_UTS_LEN];
+	 *   char version[__NEW_UTS_LEN];
+	 *   char machine[__NEW_UTS_LEN];
+	 */
+} __attribute__((aligned(8)));
+
+
+/* checkpoint image trailer */
+struct ckpt_hdr_tail {
+	struct ckpt_hdr h;
+	__u64 magic;
+} __attribute__((aligned(8)));
+
+
+/* task data */
+struct ckpt_hdr_task {
+	struct ckpt_hdr h;
+	__u32 state;
+	__u32 exit_state;
+	__u32 exit_code;
+	__u32 exit_signal;
+
+	__u32 task_comm_len;
+} __attribute__((aligned(8)));
+
+#endif /* _CHECKPOINT_CKPT_HDR_H_ */
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
new file mode 100644
index 0000000..2b8d59f
--- /dev/null
+++ b/include/linux/checkpoint_types.h
@@ -0,0 +1,43 @@
+#ifndef _LINUX_CHECKPOINT_TYPES_H_
+#define _LINUX_CHECKPOINT_TYPES_H_
+/*
+ *  Generic checkpoint-restart
+ *
+ *  Copyright (C) 2008-2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#define CHECKPOINT_VERSION  1
+
+
+#ifdef __KERNEL__
+
+struct ckpt_ctx {
+	int crid;		/* unique checkpoint id */
+
+	pid_t root_pid;		/* container identifier */
+
+	unsigned long kflags;	/* kerenl flags */
+	unsigned long uflags;	/* user flags */
+	unsigned long oflags;	/* restart: uflags from checkpoint */
+
+	struct file *file;	/* input/output file */
+	int total;		/* total read/written */
+
+	char err_string[256];	/* checkpoint: error string */
+};
+
+/* ckpt_ctx: kflags */
+#define CKPT_CTX_CHECKPOINT_BIT		1
+#define CKPT_CTX_RESTART_BIT		2
+
+#define CKPT_CTX_CHECKPOINT	(1 << CKPT_CTX_CHECKPOINT_BIT)
+#define CKPT_CTX_RESTART	(1 << CKPT_CTX_RESTART_BIT)
+
+#endif /* __KERNEL__ */
+
+
+#endif /* _LINUX_CHECKPOINT_TYPES_H_ */
diff --git a/include/linux/magic.h b/include/linux/magic.h
index 5b4e28b..73a9d02 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -50,4 +50,8 @@
 #define INOTIFYFS_SUPER_MAGIC	0x2BAD1DEA
 
 #define STACK_END_MAGIC		0x57AC6E9D
+
+#define CHECKPOINT_MAGIC_HEAD  0x00feed0cc0a2d200LL
+#define CHECKPOINT_MAGIC_TAIL  0x002d2a0cc0deef00LL
+
 #endif /* __LINUX_MAGIC_H__ */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 6cdcf38..ab795a6 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -961,6 +961,19 @@ config DMA_API_DEBUG
 	  This option causes a performance degredation.  Use only if you want
 	  to debug device drivers. If unsure, say N.
 
+config CHECKPOINT_DEBUG
+	bool "Checkpoint/restart debugging (EXPERIMENTAL)"
+	depends on CHECKPOINT
+	default y
+	help
+	  This options turns on the debugging output of checkpoint/restart.
+	  The level of verbosity is controlled by 'ckpt_debug_level' and can
+	  be set at boot time with "ckpt_debug=" option.
+
+	  Turning this option off will reduce the size of the c/r code. If
+	  turned on, it is unlikely to incur visible overhead if the debug
+	  level is set to zero.
+
 source "samples/Kconfig"
 
 source "lib/Kconfig.kgdb"
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 06/43] c/r: x86_32 support for checkpoint/restart
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (4 preceding siblings ...)
  2009-05-27 17:32 ` [RFC v16][PATCH 05/43] c/r: basic infrastructure for checkpoint/restart Oren Laadan
@ 2009-05-27 17:32 ` Oren Laadan
  2009-05-27 17:32 ` [RFC v16][PATCH 07/43] c/r: infrastructure for shared objects Oren Laadan
                   ` (28 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
Add logic to save and restore architecture specific state, including
thread-specific state, CPU registers and FPU state.
In addition, architecture capabilities are saved in an architecure
specific extension of the header (ckpt_hdr_head_arch); Currently this
includes only FPU capabilities.
Currently only x86-32 is supported.
TODO: validate input values for regs, debug regs, and TLS on restart.
Changelog[v16]:
  - All objects are preceded by ckpt_hdr (TLS and xstate_buf)
  - Add architecture identifier to main header
Changelog[v14]:
  - Use new interface ckpt_hdr_get/put()
  - Embed struct ckpt_hdr in struct ckpt_hdr...
  - Remove preempt_disable/enable() around init_fpu() and fix leak
  - Revert change to pr_debug(), back to ckpt_debug()
  - Move code related to task_struct to checkpoint/process.c
Changelog[v12]:
  - A couple of missed calls to ckpt_hbuf_put()
  - Replace obsolete ckpt_debug() with pr_debug()
Changelog[v9]:
  - Add arch-specific header that details architecture capabilities;
    split FPU restore to send capabilities only once.
  - Test for zero TLS entries in ckpt_write_thread()
  - Fix asm/checkpoint_hdr.h so it can be included from user-space
Changelog[v7]:
  - Fix save/restore state of FPU
Changelog[v5]:
  - Remove preempt_disable() when restoring debug registers
Changelog[v4]:
  - Fix header structure alignment
Changelog[v2]:
  - Pad header structures to 64 bits to ensure compatibility
  - Follow Dave Hansen's refactoring of the original post
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
 arch/x86/include/asm/checkpoint_hdr.h |  110 +++++++++
 arch/x86/mm/Makefile                  |    2 +
 arch/x86/mm/checkpoint.c              |  431 +++++++++++++++++++++++++++++++++
 checkpoint/checkpoint.c               |    7 +-
 checkpoint/process.c                  |   20 ++-
 checkpoint/restart.c                  |    6 +
 include/linux/checkpoint.h            |   10 +
 include/linux/checkpoint_hdr.h        |   12 +-
 8 files changed, 594 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/checkpoint_hdr.h b/arch/x86/include/asm/checkpoint_hdr.h
new file mode 100644
index 0000000..362b499
--- /dev/null
+++ b/arch/x86/include/asm/checkpoint_hdr.h
@@ -0,0 +1,110 @@
+#ifndef __ASM_X86_CKPT_HDR_H
+#define __ASM_X86_CKPT_HDR_H
+/*
+ *  Checkpoint/restart - architecture specific headers x86
+ *
+ *  Copyright (C) 2008-2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#include <linux/types.h>
+#include <linux/checkpoint_hdr.h>
+
+/*
+ * To maintain compatibility between 32-bit and 64-bit architecture flavors,
+ * keep data 64-bit aligned: use padding for structure members, and use
+ * __attribute__((aligned (8))) for the entire structure.
+ *
+ * Quoting Arnd Bergmann:
+ *   "This structure has an odd multiple of 32-bit members, which means
+ *   that if you put it into a larger structure that also contains 64-bit
+ *   members, the larger structure may get different alignment on x86-32
+ *   and x86-64, which you might want to avoid. I can't tell if this is
+ *   an actual problem here. ... In this case, I'm pretty sure that
+ *   sizeof(ckpt_hdr_task) on x86-32 is different from x86-64, since it
+ *   will be 32-bit aligned on x86-32."
+ */
+
+/* i387 structure seen from kernel/userspace */
+#ifdef __KERNEL__
+#include <asm/processor.h>
+#else
+#include <sys/user.h>
+#endif
+
+#ifndef CONFIG_X86_64
+#define CKPT_ARCH_ID	CKPT_ARCH_X86_32
+#endif
+
+/* arch dependent header types */
+enum {
+	CKPT_HDR_THREAD_TLS = 201,
+	CKPT_HDR_CPU_FPU,
+};
+
+struct ckpt_hdr_header_arch {
+	struct ckpt_hdr h;
+	/* FIXME: add HAVE_HWFP */
+	__u16 has_fxsr;
+	__u16 has_xsave;
+	__u16 xstate_size;
+	__u16 _pading;
+} __attribute__((aligned(8)));
+
+struct ckpt_hdr_thread {
+	struct ckpt_hdr h;
+	/* FIXME: restart blocks */
+	__u16 gdt_entry_tls_entries;
+	__u16 sizeof_tls_array;
+	__u16 ntls;	/* number of TLS entries to follow */
+} __attribute__((aligned(8)));
+
+struct ckpt_hdr_cpu {
+	struct ckpt_hdr h;
+	/* see struct pt_regs (x86-64) */
+	__u64 r15;
+	__u64 r14;
+	__u64 r13;
+	__u64 r12;
+	__u64 bp;
+	__u64 bx;
+	__u64 r11;
+	__u64 r10;
+	__u64 r9;
+	__u64 r8;
+	__u64 ax;
+	__u64 cx;
+	__u64 dx;
+	__u64 si;
+	__u64 di;
+	__u64 orig_ax;
+	__u64 ip;
+	__u64 cs;
+	__u64 flags;
+	__u64 sp;
+	__u64 ss;
+
+	/* segment registers */
+	__u64 ds;
+	__u64 es;
+	__u64 fs;
+	__u64 gs;
+
+	/* debug registers */
+	__u64 debugreg0;
+	__u64 debugreg1;
+	__u64 debugreg2;
+	__u64 debugreg3;
+	__u64 debugreg6;
+	__u64 debugreg7;
+
+	__u32 uses_debug;
+	__u32 used_math;
+
+	/* thread_xstate contents follow (if used_math) */
+} __attribute__((aligned(8)));
+
+#endif /* __ASM_X86_CKPT_HDR__H */
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index fdd30d0..7d894a5 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -19,3 +19,5 @@ obj-$(CONFIG_K8_NUMA)		+= k8topology_64.o
 obj-$(CONFIG_ACPI_NUMA)		+= srat_$(BITS).o
 
 obj-$(CONFIG_MEMTEST)		+= memtest.o
+
+obj-$(CONFIG_CHECKPOINT)	+= checkpoint.o
diff --git a/arch/x86/mm/checkpoint.c b/arch/x86/mm/checkpoint.c
new file mode 100644
index 0000000..f54fe80
--- /dev/null
+++ b/arch/x86/mm/checkpoint.c
@@ -0,0 +1,431 @@
+/*
+ *  Checkpoint/restart - architecture specific support for x86
+ *
+ *  Copyright (C) 2008-2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DSYS
+
+#include <asm/desc.h>
+#include <asm/i387.h>
+
+#include <asm/checkpoint_hdr.h>
+#include <linux/checkpoint.h>
+
+/**************************************************************************
+ * Checkpoint
+ */
+
+/* dump the thread_struct of a given task */
+int checkpoint_thread(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct ckpt_hdr_thread *h;
+	struct thread_struct *thread;
+	struct desc_struct *desc;
+	int ntls = 0;
+	int n, ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_THREAD);
+	if (!h)
+		return -ENOMEM;
+
+	thread = &t->thread;
+
+	/* calculate no. of TLS entries that follow */
+	desc = thread->tls_array;
+	for (n = GDT_ENTRY_TLS_ENTRIES; n > 0; n--, desc++) {
+		if (desc->a || desc->b)
+			ntls++;
+	}
+
+	h->gdt_entry_tls_entries = GDT_ENTRY_TLS_ENTRIES;
+	h->sizeof_tls_array = sizeof(thread->tls_array);
+	h->ntls = ntls;
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0)
+		return ret;
+
+	ckpt_debug("ntls %d\n", ntls);
+	if (ntls == 0)
+		return 0;
+
+	/*
+	 * For simplicity dump the entire array, cherry-pick upon restart
+	 * FIXME: the TLS descriptors in the GDT should be called out and
+	 * not tied to the in-kernel representation.
+	 */
+	ret = ckpt_write_obj_type(ctx, thread->tls_array,
+				  sizeof(thread->tls_array),
+				  CKPT_HDR_THREAD_TLS);
+
+	/* IGNORE RESTART BLOCKS FOR NOW ... */
+
+	return ret;
+}
+
+#ifndef CONFIG_X86_64
+
+static void save_cpu_regs(struct ckpt_hdr_cpu *h, struct task_struct *t)
+{
+	struct thread_struct *thread = &t->thread;
+	struct pt_regs *regs = task_pt_regs(t);
+
+	h->bp = regs->bp;
+	h->bx = regs->bx;
+	h->ax = regs->ax;
+	h->cx = regs->cx;
+	h->dx = regs->dx;
+	h->si = regs->si;
+	h->di = regs->di;
+	h->orig_ax = regs->orig_ax;
+	h->ip = regs->ip;
+	h->cs = regs->cs;
+	h->flags = regs->flags;
+	h->sp = regs->sp;
+	h->ss = regs->ss;
+
+	h->ds = regs->ds;
+	h->es = regs->es;
+
+	/*
+	 * for checkpoint in process context (from within a container)
+	 * the GS and FS registers should be saved from the hardware;
+	 * otherwise they are already sabed on the thread structure
+	 */
+	if (t == current) {
+		savesegment(gs, h->gs);
+		savesegment(fs, h->fs);
+	} else {
+		h->gs = thread->gs;
+		h->fs = thread->fs;
+	}
+
+	/*
+	 * for checkpoint in process context (from within a container),
+	 * the actual syscall is taking place at this very moment; so
+	 * we (optimistically) subtitute the future return value (0) of
+	 * this syscall into the orig_eax, so that upon restart it will
+	 * succeed (or it will endlessly retry checkpoint...)
+	 */
+	if (t == current) {
+		BUG_ON(h->orig_ax < 0);
+		h->ax = 0;
+	}
+}
+
+static void save_cpu_debug(struct ckpt_hdr_cpu *h, struct task_struct *t)
+{
+	struct thread_struct *thread = &t->thread;
+
+	/* debug regs */
+
+	/*
+	 * for checkpoint in process context (from within a container),
+	 * get the actual registers; otherwise get the saved values.
+	 */
+
+	if (t == current) {
+		get_debugreg(h->debugreg0, 0);
+		get_debugreg(h->debugreg1, 1);
+		get_debugreg(h->debugreg2, 2);
+		get_debugreg(h->debugreg3, 3);
+		get_debugreg(h->debugreg6, 6);
+		get_debugreg(h->debugreg7, 7);
+	} else {
+		h->debugreg0 = thread->debugreg0;
+		h->debugreg1 = thread->debugreg1;
+		h->debugreg2 = thread->debugreg2;
+		h->debugreg3 = thread->debugreg3;
+		h->debugreg6 = thread->debugreg6;
+		h->debugreg7 = thread->debugreg7;
+	}
+
+	h->uses_debug = !!(task_thread_info(t)->flags & TIF_DEBUG);
+}
+
+static void save_cpu_fpu(struct ckpt_hdr_cpu *h, struct task_struct *t)
+{
+	h->used_math = tsk_used_math(t) ? 1 : 0;
+}
+
+static int checkpoint_cpu_fpu(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct ckpt_hdr *h;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, xstate_size + sizeof(*h),
+			      CKPT_HDR_CPU_FPU);
+	if (!h)
+		return -ENOMEM;
+
+	/* i387 + MMU + SSE logic */
+	preempt_disable();	/* needed it (t == current) */
+
+	/*
+	 * normally, no need to unlazy_fpu(), since TS_USEDFPU flag
+	 * was cleared when task was context-switched out...
+	 * except if we are in process context, in which case we do
+	 */
+	if (t == current && (task_thread_info(t)->status & TS_USEDFPU))
+		unlazy_fpu(current);
+
+	/*
+	 * For simplicity dump the entire structure.
+	 * FIX: need to be deliberate about what registers we are
+	 * dumping for traceability and compatibility.
+	 */
+	memcpy(h + 1, t->thread.xstate, xstate_size);
+	preempt_enable();	/* needed if (t == current) */
+
+	ret = ckpt_write_obj(ctx, h);
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+#endif	/* !CONFIG_X86_64 */
+
+/* dump the cpu state and registers of a given task */
+int checkpoint_cpu(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct ckpt_hdr_cpu *h;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_CPU);
+	if (!h)
+		return -ENOMEM;
+
+	save_cpu_regs(h, t);
+	save_cpu_debug(h, t);
+	save_cpu_fpu(h, t);
+
+	ckpt_debug("math %d debug %d\n", h->used_math, h->uses_debug);
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	if (ret < 0)
+		goto out;
+
+	if (h->used_math)
+		ret = checkpoint_cpu_fpu(ctx, t);
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+int checkpoint_write_header_arch(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_header_arch *h;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_HEADER_ARCH);
+	if (!h)
+		return -ENOMEM;
+
+	/* FPU capabilities */
+	h->has_fxsr = cpu_has_fxsr;
+	h->has_xsave = cpu_has_xsave;
+	h->xstate_size = xstate_size;
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+/**************************************************************************
+ * Restart
+ */
+
+/* read the thread_struct into the current task */
+int restore_thread(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_thread *h;
+	struct task_struct *t = current;
+	struct thread_struct *thread = &t->thread;
+	int ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_THREAD);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	ckpt_debug("ntls %d\n", h->ntls);
+
+	ret = -EINVAL;
+	if (h->gdt_entry_tls_entries != GDT_ENTRY_TLS_ENTRIES ||
+	    h->sizeof_tls_array != sizeof(thread->tls_array) ||
+	    h->ntls > GDT_ENTRY_TLS_ENTRIES)
+		goto out;
+
+	if (h->ntls > 0) {
+		struct ckpt_hdr *hh;
+		int size, cpu;
+
+		/*
+		 * restore TLS by hand: why convert to struct user_desc if
+		 * sys_set_thread_entry() will convert it back ?
+		 */
+
+		size = sizeof(struct desc_struct) * GDT_ENTRY_TLS_ENTRIES;
+		hh = ckpt_read_obj_type(ctx, size + sizeof(*hh),
+					CKPT_HDR_THREAD_TLS);
+		if (IS_ERR(hh)) {
+			ret = PTR_ERR(hh);
+			goto out;
+		}
+
+		/* TODO: ADD SANITY CHECKS TO VERIFY VALIDITY OF VALUES */
+		cpu = get_cpu();
+		memcpy(thread->tls_array, hh + 1, size);
+		load_TLS(thread, cpu);
+		put_cpu();
+
+		ckpt_hdr_put(ctx, hh);
+	}
+
+	ret = 0;
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+#ifndef CONFIG_X86_64
+
+static int load_cpu_regs(struct ckpt_hdr_cpu *h, struct task_struct *t)
+{
+	struct thread_struct *thread = &t->thread;
+	struct pt_regs *regs = task_pt_regs(t);
+
+	/* TODO: ADD SANITY CHECKS TO VERIFY VALIDITY OF VALUES */
+	regs->bx = h->bx;
+	regs->cx = h->cx;
+	regs->dx = h->dx;
+	regs->si = h->si;
+	regs->di = h->di;
+	regs->bp = h->bp;
+	regs->ax = h->ax;
+	regs->ds = h->ds;
+	regs->es = h->es;
+	regs->orig_ax = h->orig_ax;
+	regs->ip = h->ip;
+	regs->cs = h->cs;
+	regs->flags = h->flags;
+	regs->sp = h->sp;
+	regs->ss = h->ss;
+
+	thread->gs = h->gs;
+	thread->fs = h->fs;
+	loadsegment(gs, h->gs);
+	loadsegment(fs, h->fs);
+
+	return 0;
+}
+
+static int load_cpu_debug(struct ckpt_hdr_cpu *h, struct task_struct *t)
+{
+	/* TODO: ADD SANITY CHECKS TO VERIFY VALIDITY OF VALUES */
+	if (h->uses_debug) {
+		set_debugreg(h->debugreg0, 0);
+		set_debugreg(h->debugreg1, 1);
+		/* ignore 4, 5 */
+		set_debugreg(h->debugreg2, 2);
+		set_debugreg(h->debugreg3, 3);
+		set_debugreg(h->debugreg6, 6);
+		set_debugreg(h->debugreg7, 7);
+	}
+
+	return 0;
+}
+
+static int load_cpu_fpu(struct ckpt_hdr_cpu *h, struct task_struct *t)
+{
+	preempt_disable();
+
+	__clear_fpu(t);		/* in case we used FPU in user mode */
+
+	if (!h->used_math)
+		clear_used_math();
+
+	preempt_enable();
+	return 0;
+}
+
+static int restore_cpu_fpu(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct ckpt_hdr *h;
+	int ret;
+
+	/* init_fpu() eventually also calls set_used_math() */
+	ret = init_fpu(current);
+	if (ret < 0)
+		return ret;
+
+	h = ckpt_read_obj_type(ctx, xstate_size + sizeof(*h),
+			       CKPT_HDR_CPU_FPU);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	memcpy(t->thread.xstate, h + 1, xstate_size);
+
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+#endif	/* !CONFIG_X86_64 */
+
+/* read the cpu state and registers for the current task */
+int restore_cpu(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_cpu *h;
+	struct task_struct *t = current;
+	int ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_CPU);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	ckpt_debug("math %d debug %d\n", h->used_math, h->uses_debug);
+
+	ret = load_cpu_regs(h, t);
+	if (ret < 0)
+		goto out;
+	ret = load_cpu_debug(h, t);
+	if (ret < 0)
+		goto out;
+	ret = load_cpu_fpu(h, t);
+	if (ret < 0)
+		goto out;
+
+	if (h->used_math)
+		ret = restore_cpu_fpu(ctx, t);
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+int restore_read_header_arch(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_header_arch *h;
+	int ret = 0;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_HEADER_ARCH);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	/* FIX: verify compatibility of architecture features */
+
+	/* verify FPU capabilities */
+	if (h->has_fxsr != cpu_has_fxsr ||
+	    h->has_xsave != cpu_has_xsave ||
+	    h->xstate_size != xstate_size)
+		ret = -EINVAL;
+
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index 56b690a..409c78b 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -190,6 +190,8 @@ static int checkpoint_write_header(struct ckpt_ctx *ctx)
 	do_gettimeofday(&ktv);
 	uts = utsname();
 
+	h->arch_id = cpu_to_le16(CKPT_ARCH_ID);  /* see asm/checkpoitn.h */
+
 	h->magic = CHECKPOINT_MAGIC_HEAD;
 	h->major = (LINUX_VERSION_CODE >> 16) & 0xff;
 	h->minor = (LINUX_VERSION_CODE >> 8) & 0xff;
@@ -219,7 +221,10 @@ static int checkpoint_write_header(struct ckpt_ctx *ctx)
 	ret = ckpt_write_buffer(ctx, uts->machine, sizeof(uts->machine));
  up:
 	up_read(&uts_sem);
-	return ret;
+	if (ret < 0)
+		return ret;
+
+	return checkpoint_write_header_arch(ctx);
 }
 
 /* write the checkpoint trailer */
diff --git a/checkpoint/process.c b/checkpoint/process.c
index c2b7564..6cab717 100644
--- a/checkpoint/process.c
+++ b/checkpoint/process.c
@@ -53,7 +53,15 @@ int checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
 
 	ret = checkpoint_task_struct(ctx, t);
 	ckpt_debug("task %d\n", ret);
-
+	if (ret < 0)
+		goto out;
+	ret = checkpoint_thread(ctx, t);
+	ckpt_debug("thread %d\n", ret);
+	if (ret < 0)
+		goto out;
+	ret = checkpoint_cpu(ctx, t);
+	ckpt_debug("cpu %d\n", ret);
+ out:
 	return ret;
 }
 
@@ -92,6 +100,14 @@ int restore_task(struct ckpt_ctx *ctx)
 
 	ret = restore_task_struct(ctx);
 	ckpt_debug("task %d\n", ret);
-
+	if (ret < 0)
+		goto out;
+	ret = restore_thread(ctx);
+	ckpt_debug("thread %d\n", ret);
+	if (ret < 0)
+		goto out;
+	ret = restore_cpu(ctx);
+	ckpt_debug("cpu %d\n", ret);
+ out:
 	return ret;
 }
diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index a6f73d3..e839538 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -248,6 +248,8 @@ static int restore_read_header(struct ckpt_ctx *ctx)
 		return PTR_ERR(h);
 
 	ret = -EINVAL;
+	if (le16_to_cpu(h->arch_id) != CKPT_ARCH_ID)
+		goto out;
 	if (h->magic != CHECKPOINT_MAGIC_HEAD ||
 	    h->rev != CHECKPOINT_VERSION ||
 	    h->major != ((LINUX_VERSION_CODE >> 16) & 0xff) ||
@@ -276,6 +278,10 @@ static int restore_read_header(struct ckpt_ctx *ctx)
 	if (ret < 0)
 		goto out;
 	ret = _ckpt_read_buffer(ctx, uts->machine, sizeof(uts->machine));
+	if (ret < 0)
+		goto out;
+
+	ret = restore_read_header_arch(ctx);
  out:
 	kfree(uts);
 	ckpt_hdr_put(ctx, h);
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 14da928..6247114 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -12,6 +12,7 @@
 
 #include <linux/checkpoint_types.h>
 #include <linux/checkpoint_hdr.h>
+#include <asm/checkpoint_hdr.h>
 
 extern int ckpt_kwrite(struct ckpt_ctx *ctx, void *buf, int count);
 extern int ckpt_kread(struct ckpt_ctx *ctx, void *buf, int count);
@@ -43,6 +44,15 @@ extern int do_restart(struct ckpt_ctx *ctx, pid_t pid);
 extern int checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t);
 extern int restore_task(struct ckpt_ctx *ctx);
 
+/* arch hooks */
+extern int checkpoint_write_header_arch(struct ckpt_ctx *ctx);
+extern int checkpoint_thread(struct ckpt_ctx *ctx, struct task_struct *t);
+extern int checkpoint_cpu(struct ckpt_ctx *ctx, struct task_struct *t);
+
+extern int restore_read_header_arch(struct ckpt_ctx *ctx);
+extern int restore_thread(struct ckpt_ctx *ctx);
+extern int restore_cpu(struct ckpt_ctx *ctx);
+
 
 /* debugging flags */
 #define CKPT_DBASE	0x1		/* anything */
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index c9a1653..a0c576c 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -41,22 +41,32 @@ struct ckpt_hdr {
 /* header types */
 enum {
 	CKPT_HDR_HEADER = 1,
+	CKPT_HDR_HEADER_ARCH,
 	CKPT_HDR_BUFFER,
 	CKPT_HDR_STRING,
 
 	CKPT_HDR_TASK = 101,
+	CKPT_HDR_THREAD,
+	CKPT_HDR_CPU,
+
+	/* 201-299: reserved for arch-dependent */
 
 	CKPT_HDR_TAIL = 9001,
 
 	CKPT_HDR_ERROR = 9999,
 };
 
+/* architecture */
+enum {
+	CKPT_ARCH_X86_32 = 1,
+};
+
 /* checkpoint image header */
 struct ckpt_hdr_header {
 	struct ckpt_hdr h;
 	__u64 magic;
 
-	__u16 _padding;
+	__u16 arch_id;
 
 	__u16 major;
 	__u16 minor;
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 07/43] c/r: infrastructure for shared objects
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (5 preceding siblings ...)
  2009-05-27 17:32 ` [RFC v16][PATCH 06/43] c/r: x86_32 support " Oren Laadan
@ 2009-05-27 17:32 ` Oren Laadan
  2009-05-27 17:32 ` [RFC v16][PATCH 08/43] c/r: introduce '->checkpoint()' method in 'struct file_operations' Oren Laadan
                   ` (27 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
The state of shared objects is saved once. On the first encounter, the
state is dumped and the object is assigned a unique identifier (objref)
and also stored in a hash table (indexed by its physical kernel address).
>From then on the object will be found in the hash and only its identifier
is saved.
On restart the identifier is looked up in the hash table; if not found
then the state is read, the object is created, and added to the hash
table (this time indexed by its identifier). Otherwise, the object in
the hash table is used.
The hash is "one-way": objects added to it are never deleted until the
hash it discarded. The hash is discarded at the end of checkpoint or
restart, whether successful or not.
The hash keeps a reference to every object that is added to it, matching
the object's type, and maintains this reference during its lifetime.
Therefore, it is always safe to use an object that is stored in the hash.
Changelog[v16]:
  - Introduce ckpt_obj_lookup() to find an object by its ptr
Changelog[v14]:
  - Introduce 'struct ckpt_obj_ops' to better modularize shared objs.
  - Replace long 'switch' statements with table lookups and callbacks.
  - Introduce checkpoint_obj() and restart_obj() helpers
  - Shared objects now dumped/saved right before they are referenced
  - Cleanup interface of shared objects
Changelog[v13]:
  - Use hash_long() with 'unsigned long' cast to support 64bit archs
    (Nathan Lynch <ntl@pobox.com>)
Changelog[v11]:
  - Doc: be explicit about grabbing a reference and object lifetime
Changelog[v4]:
  - Fix calculation of hash table size
Changelog[v3]:
  - Use standard hlist_... for hash table
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
 checkpoint/Makefile              |    1 +
 checkpoint/objhash.c             |  397 ++++++++++++++++++++++++++++++++++++++
 checkpoint/restart.c             |   46 +++++
 checkpoint/sys.c                 |    7 +
 include/linux/checkpoint.h       |   15 ++
 include/linux/checkpoint_hdr.h   |   14 ++
 include/linux/checkpoint_types.h |    2 +
 7 files changed, 482 insertions(+), 0 deletions(-)
diff --git a/checkpoint/Makefile b/checkpoint/Makefile
index 99364cc..5aa6a75 100644
--- a/checkpoint/Makefile
+++ b/checkpoint/Makefile
@@ -4,6 +4,7 @@
 
 obj-$(CONFIG_CHECKPOINT) += \
 	sys.o \
+	objhash.o \
 	checkpoint.o \
 	restart.o \
 	process.o
diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
new file mode 100644
index 0000000..82b4618
--- /dev/null
+++ b/checkpoint/objhash.c
@@ -0,0 +1,397 @@
+/*
+ *  Checkpoint-restart - object hash infrastructure to manage shared objects
+ *
+ *  Copyright (C) 2008-2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DOBJ
+
+#include <linux/kernel.h>
+#include <linux/hash.h>
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+
+struct ckpt_obj;
+struct ckpt_obj_ops;
+
+/* object operations */
+struct ckpt_obj_ops {
+	char *obj_name;
+	enum obj_type obj_type;
+	void (*ref_drop)(void *ptr);
+	int (*ref_grab)(void *ptr);
+	int (*checkpoint)(struct ckpt_ctx *ctx, void *ptr);
+	void *(*restore)(struct ckpt_ctx *ctx);
+};
+
+struct ckpt_obj {
+	int objref;
+	void *ptr;
+	struct ckpt_obj_ops *ops;
+	struct hlist_node hash;
+};
+
+struct ckpt_obj_hash {
+	struct hlist_head *head;
+	int next_free_objref;
+};
+
+/* helper grab/drop functions: */
+
+static void obj_no_drop(void *ptr)
+{
+	return;
+}
+
+static int obj_no_grab(void *ptr)
+{
+	return 0;
+}
+
+static struct ckpt_obj_ops ckpt_obj_ops[] = {
+	/* ignored object */
+	{
+		.obj_name = "IGNORED",
+		.obj_type = CKPT_OBJ_IGNORE,
+		.ref_drop = obj_no_drop,
+		.ref_grab = obj_no_grab,
+	},
+};
+
+
+#define CKPT_OBJ_HASH_NBITS  10
+#define CKPT_OBJ_HASH_TOTAL  (1UL << CKPT_OBJ_HASH_NBITS)
+
+static void obj_hash_clear(struct ckpt_obj_hash *obj_hash)
+{
+	struct hlist_head *h = obj_hash->head;
+	struct hlist_node *n, *t;
+	struct ckpt_obj *obj;
+	int i;
+
+	for (i = 0; i < CKPT_OBJ_HASH_TOTAL; i++) {
+		hlist_for_each_entry_safe(obj, n, t, &h[i], hash) {
+			obj->ops->ref_drop(obj->ptr);
+			kfree(obj);
+		}
+	}
+}
+
+void ckpt_obj_hash_free(struct ckpt_ctx *ctx)
+{
+	struct ckpt_obj_hash *obj_hash = ctx->obj_hash;
+
+	if (obj_hash) {
+		obj_hash_clear(obj_hash);
+		kfree(obj_hash->head);
+		kfree(ctx->obj_hash);
+		ctx->obj_hash = NULL;
+	}
+}
+
+int ckpt_obj_hash_alloc(struct ckpt_ctx *ctx)
+{
+	struct ckpt_obj_hash *obj_hash;
+	struct hlist_head *head;
+
+	obj_hash = kzalloc(sizeof(*obj_hash), GFP_KERNEL);
+	if (!obj_hash)
+		return -ENOMEM;
+	head = kzalloc(CKPT_OBJ_HASH_TOTAL * sizeof(*head), GFP_KERNEL);
+	if (!head) {
+		kfree(obj_hash);
+		return -ENOMEM;
+	}
+
+	obj_hash->head = head;
+	obj_hash->next_free_objref = 1;
+
+	ctx->obj_hash = obj_hash;
+	return 0;
+}
+
+static struct ckpt_obj *obj_find_by_ptr(struct ckpt_ctx *ctx, void *ptr)
+{
+	struct hlist_head *h;
+	struct hlist_node *n;
+	struct ckpt_obj *obj;
+
+	h = &ctx->obj_hash->head[hash_long((unsigned long) ptr,
+					   CKPT_OBJ_HASH_NBITS)];
+	hlist_for_each_entry(obj, n, h, hash)
+		if (obj->ptr == ptr)
+			return obj;
+	return NULL;
+}
+
+static struct ckpt_obj *obj_find_by_objref(struct ckpt_ctx *ctx, int objref)
+{
+	struct hlist_head *h;
+	struct hlist_node *n;
+	struct ckpt_obj *obj;
+
+	h = &ctx->obj_hash->head[hash_long((unsigned long) objref,
+					   CKPT_OBJ_HASH_NBITS)];
+	hlist_for_each_entry(obj, n, h, hash)
+		if (obj->objref == objref)
+			return obj;
+	return NULL;
+}
+
+/**
+ * ckpt_obj_new - add an object to the obj_hash
+ * @ctx: checkpoint context
+ * @ptr: pointer to object
+ * @objref: object unique id
+ * @ops: object operations
+ *
+ * Returns: objref
+ *
+ * Add the object to the obj_hash. If @objref is zero, assign a unique
+ * object id and use @ptr as a hash key [checkpoint]. Else use @objref
+ * as a key [restart].
+ */
+static int obj_new(struct ckpt_ctx *ctx, void *ptr, int objref,
+		   struct ckpt_obj_ops *ops)
+{
+	struct ckpt_obj *obj;
+	int i, ret;
+
+	obj = kmalloc(sizeof(*obj), GFP_KERNEL);
+	if (!obj)
+		return -ENOMEM;
+
+	obj->ptr = ptr;
+	obj->ops = ops;
+
+	if (objref) {
+		/* use @obj->objref to index (restart) */
+		obj->objref = objref;
+		i = hash_long((unsigned long) objref, CKPT_OBJ_HASH_NBITS);
+	} else {
+		/* use @obj->ptr to index, assign objref (checkpoint) */
+		obj->objref = ctx->obj_hash->next_free_objref++;;
+		i = hash_long((unsigned long) ptr, CKPT_OBJ_HASH_NBITS);
+	}
+
+	ret = ops->ref_grab(obj->ptr);
+	if (ret < 0)
+		kfree(obj);
+	else
+		hlist_add_head(&obj->hash, &ctx->obj_hash->head[i]);
+
+	return (ret < 0 ? ret : obj->objref);
+}
+
+/**
+* ckpt_obj_lookup - lookup object (by pointer) in objhash
+* @ctx: checkpoint context
+* @ptr: pointer to object
+* @type: object type
+*
+* Look up the object pointed to by @ptr in the hash table
+*
+* [This is used during checkpoint].
+*
+* Return: objref (or zero if not found)
+*/
+int ckpt_obj_lookup(struct ckpt_ctx *ctx, void *ptr, enum obj_type type)
+{
+	struct ckpt_obj *obj;
+
+	obj = obj_find_by_ptr(ctx, ptr);
+	BUG_ON(obj && obj->ops->obj_type != type);
+	if (obj)
+		ckpt_debug("%s objref %d\n", obj->ops->obj_name, obj->objref);
+	return obj ? obj->objref : 0;
+}
+
+/**
+* ckpt_obj_lookup_add - lookup object and add if not in obj_hash
+* @ctx: checkpoint context
+* @ptr: pointer to object
+* @type: object type
+* @first: [output] first encoutner (added to table)
+*
+* Look up the object pointed to by @ptr in the hash table. If it isn't
+* already found there, add the object, and allocate a unique object
+* id. Grab a reference to every object that is added, and maintain the
+* reference until the entire hash is freed.
+*
+* [This is used during checkpoint].
+*
+* Return: objref
+*/
+int ckpt_obj_lookup_add(struct ckpt_ctx *ctx, void *ptr,
+			enum obj_type type, int *first)
+{
+	struct ckpt_obj_ops *ops = &ckpt_obj_ops[type];
+	struct ckpt_obj *obj;
+	int objref;
+
+	obj = obj_find_by_ptr(ctx, ptr);
+	if (!obj) {
+		objref = obj_new(ctx, ptr, 0, ops);
+		if (objref < 0)
+			return objref;
+		*first = 1;
+	} else if (obj->ops->obj_type != type) {   /* sanity check */
+		return -EINVAL;
+	} else {
+		objref = obj->objref;
+		*first = 0;
+	}
+
+	ckpt_debug("%s objref %d first %d\n", ops->obj_name, objref, *first);
+	return objref;
+}
+
+/**
+ * checkpoint_obj - if not already in hash, add object and checkpoint
+ * @ctx: checkpoint context
+ * @ptr: pointer to object
+ * @type: object type
+ *
+ * Look up the object pointed to by @ptr in the hash table. If it
+ * isn't already there, then add the object to the table, allocate a
+ * fresh unique id (objref) and save the object's state, and grab a
+ * reference to every object that is added. (Maintain the reference
+ * until the entire hash is free).
+ *
+ * [This is used during checkpoint].
+ *
+ * Returns: objref
+ */
+int checkpoint_obj(struct ckpt_ctx *ctx, void *ptr, enum obj_type type)
+{
+	struct ckpt_obj_ops *ops = &ckpt_obj_ops[type];
+	struct ckpt_hdr_objref *h;
+	struct ckpt_obj *obj;
+	int objref, ret = 0;
+
+	/* make sure we don't change this accidentally */
+	BUG_ON(ops->obj_type != type);
+
+	obj = obj_find_by_ptr(ctx, ptr);
+	if (obj) {
+		BUG_ON(obj->ops->obj_type != type);
+		return obj->objref;
+	}
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_OBJREF);
+	if (!h)
+		return -ENOMEM;
+
+	objref = obj_new(ctx, ptr, 0, ops);
+	if (objref < 0)
+		goto out;
+
+	h->objtype = type;
+	h->objref = objref;
+	ret = ckpt_write_obj(ctx, &h->h);
+	if (ret < 0)
+		goto out;
+
+	/* invoke callback to actually dump the state */
+	if (ops->checkpoint)
+		ret = ops->checkpoint(ctx, ptr);
+ out:
+	ckpt_hdr_put(ctx, h);
+	return (ret < 0 ? ret : objref);
+}
+
+/**
+ * restore_obj - read in and restore a (first seen) shared object
+ * @ctx: checkpoint context
+ * @h: ckpt_hdr of shared object
+ *
+ * Read in the header payload (struct ckpt_hdr_objref). Lookup the
+ * object to verify it isn't there.  Then restore the object's state
+ * and add it to the objash. No need to explicitly grab a reference -
+ * we hold the initial instance of this object. (Object maintained
+ * until the entire hash is free).
+ *
+ * [This is used during restart].
+ */
+int restore_obj(struct ckpt_ctx *ctx, struct ckpt_hdr_objref *h)
+{
+	struct ckpt_obj_ops *ops;
+	void *ptr = NULL;
+	int ret;
+
+	ckpt_debug("len %d ref %d type %d\n", h->h.len, h->objref, h->objtype);
+	if (obj_find_by_objref(ctx, h->objref))
+		return -EINVAL;
+
+	if (h->objtype >= CKPT_OBJ_MAX)
+		return -EINVAL;
+
+	ops = &ckpt_obj_ops[h->objtype];
+	BUG_ON(ops->obj_type != h->objtype);
+
+	if (ops->restore)
+		ptr = ops->restore(ctx);
+	if (IS_ERR(ptr))
+		return PTR_ERR(ptr);
+
+	ret = obj_new(ctx, ptr, h->objref, ops);
+	/*
+	 * Drop an extra reference to the object returned by ops->restore:
+	 * On success, this clears the extra reference taken by obj_new(),
+	 * and on failure, this cleans up the object itself.
+	 */
+	ops->ref_drop(ptr);
+	if (ret < 0)
+		ops->ref_drop(ptr);
+
+	return ret;
+}
+
+/**
+ * ckpt_obj_insert - add an object with a given objref to obj_hash
+ * @ctx: checkpoint context
+ * @ptr: pointer to object
+ * @objref: unique object id
+ * @type: object type
+ *
+ * Add the object pointer to by @ptr and identified by unique object id
+ * @objref to the hash table (indexed by @objref).  Grab a reference to
+ * every object added, and maintain it until the entire hash is freed.
+ *
+ * [This is used during restart].
+ */
+int ckpt_obj_insert(struct ckpt_ctx *ctx, void *ptr, int objref,
+		    enum obj_type type)
+{
+	struct ckpt_obj_ops *ops = &ckpt_obj_ops[type];
+
+	ckpt_debug("%s objref %d\n", ops->obj_name, objref);
+	return obj_new(ctx, ptr, objref, ops);
+}
+
+/**
+ * ckpt_obj_fetch - fetch an object by its identifier
+ * @ctx: checkpoint context
+ * @objref: object id
+ * @type: object type
+ *
+ * Lookup the objref identifier by @objref in the hash table. Return
+ * an error not found.
+ *
+ * [This is used during restart].
+ */
+void *ckpt_obj_fetch(struct ckpt_ctx *ctx, int objref, enum obj_type type)
+{
+	struct ckpt_obj *obj;
+
+	obj = obj_find_by_objref(ctx, objref);
+	if (!obj)
+		return ERR_PTR(-EINVAL);
+	ckpt_debug("%s ref %d\n", obj->ops->obj_name, obj->objref);
+	return (obj->ops->obj_type == type ? obj->ptr : ERR_PTR(-ENOMSG));
+}
diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index e839538..ce52e30 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -20,6 +20,34 @@
 #include <linux/checkpoint_hdr.h>
 
 /**
+ * _ckpt_read_objref - dispatch handling of a shared object
+ * @ctx: checkpoint context
+ * @hh: objrect descriptor
+ */
+static int _ckpt_read_objref(struct ckpt_ctx *ctx, struct ckpt_hdr *hh)
+{
+	struct ckpt_hdr *h;
+	int ret;
+
+	h = ckpt_hdr_get(ctx, hh->len);
+	if (!h)
+		return -ENOMEM;
+
+	*h = *hh;	/* yay ! */
+
+	_ckpt_debug(CKPT_DOBJ, "shared len %d type %d\n", h->len, h->type);
+	ret = ckpt_kread(ctx, (h + 1), hh->len - sizeof(struct ckpt_hdr));
+	if (ret < 0)
+		goto out;
+
+	ret = restore_obj(ctx, (struct ckpt_hdr_objref *) h);
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+
+/**
  * _ckpt_read_obj - read an object (ckpt_hdr followed by payload)
  * @ctx: checkpoint context
  * @h: desired ckpt_hdr
@@ -34,6 +62,7 @@ static int _ckpt_read_obj(struct ckpt_ctx *ctx, struct ckpt_hdr *h,
 {
 	int ret;
 
+ again:
 	ret = ckpt_kread(ctx, h, sizeof(*h));
 	if (ret < 0)
 		return ret;
@@ -41,7 +70,15 @@ static int _ckpt_read_obj(struct ckpt_ctx *ctx, struct ckpt_hdr *h,
 		    h->type, h->len, len, max);
 	if (h->len < sizeof(*h))
 		return -EINVAL;
+
 	/* if len specified, enforce, else if maximum specified, enforce */
+	if (h->type == CKPT_HDR_OBJREF) {
+		ret = _ckpt_read_objref(ctx, h);
+		if (ret < 0)
+			return ret;
+		goto again;
+	}
+
 	if ((len && h->len != len) || (!len && max && h->len > max))
 		return -EINVAL;
 
@@ -150,6 +187,7 @@ static void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max)
 	struct ckpt_hdr *h;
 	int ret;
 
+ again:
 	ret = ckpt_kread(ctx, &hh, sizeof(hh));
 	if (ret < 0)
 		return ERR_PTR(ret);
@@ -157,6 +195,14 @@ static void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max)
 		    hh.type, hh.len, len, max);
 	if (hh.len < sizeof(*h))
 		return ERR_PTR(-EINVAL);
+
+	if (hh.type == CKPT_HDR_OBJREF) {
+		ret = _ckpt_read_objref(ctx, &hh);
+		if (ret < 0)
+			return ERR_PTR(ret);
+		goto again;
+	}
+
 	/* if len specified, enforce, else if maximum specified, enforce */
 	if ((len && hh.len != len) || (!len && max && hh.len > max))
 		return ERR_PTR(-EINVAL);
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 2ad9722..c8a260d 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -168,6 +168,9 @@ static void ckpt_ctx_free(struct ckpt_ctx *ctx)
 {
 	if (ctx->file)
 		fput(ctx->file);
+
+	ckpt_obj_hash_free(ctx);
+
 	kfree(ctx);
 }
 
@@ -189,6 +192,10 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
 	if (!ctx->file)
 		goto err;
 
+	err = -ENOMEM;
+	if (ckpt_obj_hash_alloc(ctx) < 0)
+		goto err;
+
 	return ctx;
  err:
 	ckpt_ctx_free(ctx);
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 6247114..9f65a81 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -32,11 +32,25 @@ extern int ckpt_write_err(struct ckpt_ctx *ctx, char *fmt, ...);
 
 extern int _ckpt_read_obj_type(struct ckpt_ctx *ctx,
 			       void *ptr, int len, int type);
+extern int _ckpt_read_nbuffer(struct ckpt_ctx *ctx, void *ptr, int len);
 extern int _ckpt_read_buffer(struct ckpt_ctx *ctx, void *ptr, int len);
 extern int _ckpt_read_string(struct ckpt_ctx *ctx, void *ptr, int len);
 extern void *ckpt_read_obj_type(struct ckpt_ctx *ctx, int len, int type);
 extern void *ckpt_read_buf_type(struct ckpt_ctx *ctx, int len, int type);
 
+/* obj_hash */
+extern void ckpt_obj_hash_free(struct ckpt_ctx *ctx);
+extern int ckpt_obj_hash_alloc(struct ckpt_ctx *ctx);
+
+extern int restore_obj(struct ckpt_ctx *ctx, struct ckpt_hdr_objref *h);
+extern int checkpoint_obj(struct ckpt_ctx *ctx, void *ptr, enum obj_type type);
+extern void *ckpt_obj_fetch(struct ckpt_ctx *ctx, int objref,
+			    enum obj_type type);
+extern int ckpt_obj_lookup_add(struct ckpt_ctx *ctx, void *ptr,
+			       enum obj_type type, int *first);
+extern int ckpt_obj_insert(struct ckpt_ctx *ctx, void *ptr, int objref,
+			   enum obj_type type);
+
 extern int do_checkpoint(struct ckpt_ctx *ctx, pid_t pid);
 extern int do_restart(struct ckpt_ctx *ctx, pid_t pid);
 
@@ -58,6 +72,7 @@ extern int restore_cpu(struct ckpt_ctx *ctx);
 #define CKPT_DBASE	0x1		/* anything */
 #define CKPT_DSYS	0x2		/* generic (system) */
 #define CKPT_DRW	0x4		/* image read/write */
+#define CKPT_DOBJ	0x8		/* shared objects */
 
 #define CKPT_DDEFAULT	0x7		/* default debug level */
 
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index a0c576c..195e44b 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -44,6 +44,7 @@ enum {
 	CKPT_HDR_HEADER_ARCH,
 	CKPT_HDR_BUFFER,
 	CKPT_HDR_STRING,
+	CKPT_HDR_OBJREF,
 
 	CKPT_HDR_TASK = 101,
 	CKPT_HDR_THREAD,
@@ -61,6 +62,19 @@ enum {
 	CKPT_ARCH_X86_32 = 1,
 };
 
+/* shared objrects (objref) */
+struct ckpt_hdr_objref {
+	struct ckpt_hdr h;
+	__u32 objtype;
+	__s32 objref;
+} __attribute__((aligned(8)));
+
+/* shared objects types */
+enum obj_type {
+	CKPT_OBJ_IGNORE = 0,
+	CKPT_OBJ_MAX
+};
+
 /* checkpoint image header */
 struct ckpt_hdr_header {
 	struct ckpt_hdr h;
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index 2b8d59f..c1032fa 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -27,6 +27,8 @@ struct ckpt_ctx {
 	struct file *file;	/* input/output file */
 	int total;		/* total read/written */
 
+	struct ckpt_obj_hash *obj_hash;	/* repository for shared objects */
+
 	char err_string[256];	/* checkpoint: error string */
 };
 
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 08/43] c/r: introduce '->checkpoint()' method in 'struct file_operations'
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (6 preceding siblings ...)
  2009-05-27 17:32 ` [RFC v16][PATCH 07/43] c/r: infrastructure for shared objects Oren Laadan
@ 2009-05-27 17:32 ` Oren Laadan
  2009-05-27 17:32 ` [RFC v16][PATCH 09/43] c/r: dump open file descriptors Oren Laadan
                   ` (26 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
While we assume all normal files and directories can be checkpointed,
there are, as usual in the VFS, specialized places that will always
need an ability to override these defaults. Although we could do this
completely in the checkpoint code, that would bitrot quickly.
This adds a new 'file_operations' function for checkpointing a file.
It is assumed that there should be a dirt-simple way to make something
(un)checkpointable that fits in with current code.
As you can see in the ext[234] patches down the road, all that we have
to do to make something simple be supported is add a single "generic"
f_op entry.
Also introduce vfs_fcntl() so that it can be called from restart (see
patch adding restart of files).
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
 fs/fcntl.c                       |   21 +++++++++++++--------
 include/linux/checkpoint_types.h |    2 ++
 include/linux/fs.h               |    6 ++++++
 3 files changed, 21 insertions(+), 8 deletions(-)
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 1ad7031..17020a9 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -337,6 +337,18 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 	return err;
 }
 
+int vfs_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp)
+{
+	int err;
+
+	err = security_file_fcntl(filp, cmd, arg);
+	if (err)
+		goto out;
+	err = do_fcntl(fd, cmd, arg, filp);
+ out:
+	return err;
+}
+
 SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
 {	
 	struct file *filp;
@@ -346,14 +358,7 @@ SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
 	if (!filp)
 		goto out;
 
-	err = security_file_fcntl(filp, cmd, arg);
-	if (err) {
-		fput(filp);
-		return err;
-	}
-
-	err = do_fcntl(fd, cmd, arg, filp);
-
+	err = vfs_fcntl(fd, cmd, arg, filp);
  	fput(filp);
 out:
 	return err;
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index c1032fa..9c14034 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -15,6 +15,8 @@
 
 #ifdef __KERNEL__
 
+#include <linux/sched.h>
+
 struct ckpt_ctx {
 	int crid;		/* unique checkpoint id */
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9c4348a..60d9229 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -8,6 +8,7 @@
 
 #include <linux/limits.h>
 #include <linux/ioctl.h>
+#include <linux/checkpoint_types.h>
 
 /*
  * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
@@ -1082,6 +1083,8 @@ struct file_lock {
 
 #include <linux/fcntl.h>
 
+extern int vfs_fcntl(int fd, unsigned cmd, unsigned long arg, struct file *fp);
+
 extern void send_sigio(struct fown_struct *fown, int fd, int band);
 
 /* fs/sync.c */
@@ -1508,6 +1511,7 @@ struct file_operations {
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
 	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
 	int (*setlease)(struct file *, long, struct file_lock **);
+	int (*checkpoint)(struct ckpt_ctx *, struct file *);
 };
 
 struct inode_operations {
@@ -2306,6 +2310,8 @@ void inode_sub_bytes(struct inode *inode, loff_t bytes);
 loff_t inode_get_bytes(struct inode *inode);
 void inode_set_bytes(struct inode *inode, loff_t bytes);
 
+#define generic_file_checkpoint NULL
+
 extern int vfs_readdir(struct file *, filldir_t, void *);
 
 extern int vfs_stat(char __user *, struct kstat *);
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 09/43] c/r: dump open file descriptors
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (7 preceding siblings ...)
  2009-05-27 17:32 ` [RFC v16][PATCH 08/43] c/r: introduce '->checkpoint()' method in 'struct file_operations' Oren Laadan
@ 2009-05-27 17:32 ` Oren Laadan
  2009-05-27 17:32 ` [RFC v16][PATCH 10/43] c/r: restore " Oren Laadan
                   ` (25 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
Dump the file table with 'struct ckpt_hdr_file_table, followed by all
open file descriptors. Because the 'struct file' corresponding to an
fd can be shared, they are assigned an objref and registered in the
object hash. A reference to the 'file *' is kept for as long as it
lives in the hash (the hash is only cleaned up at the end of the
checkpoint).
Also provide generic_checkpoint_file() and generic_restore_file()
which is good for normal files and directories. It does not support
yet unlinked files or directories.
Changelog[v16]:
  - Reorder patch (move earlier in series)
  - Handle shared files_struct objects
Changelog[v14]:
  - File objects are dumped/restored prior to the first reference
  - Introduce a per file-type restore() callback
  - Use struct file_operations->checkpoint()
  - Put code for generic file descriptors in a separate function
  - Use one CKPT_FILE_GENERIC for both regular files and dirs
  - Revert change to pr_debug(), back to ckpt_debug()
  - Use only unsigned fields in checkpoint headers
  - Rename:  ckpt_write_files() => checkpoint_fd_table()
  - Rename:  ckpt_write_fd_data() => checkpoint_file()
  - Discard field 'h->parent'
Changelog[v12]:
  - Replace obsolete ckpt_debug() with pr_debug()
Changelog[v11]:
  - Discard handling of opened symlinks (there is no such thing)
  - ckpt_scan_fds() retries from scratch if hits size limits
Changelog[v9]:
  - Fix a couple of leaks in ckpt_write_files()
  - Drop useless kfree from ckpt_scan_fds()
Changelog[v8]:
  - initialize 'coe' to workaround gcc false warning
Changelog[v6]:
  - Balance all calls to ckpt_hbuf_get() with matching ckpt_hbuf_put()
    (even though it's not really needed)
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
 checkpoint/Makefile              |    3 +-
 checkpoint/checkpoint.c          |   25 +++
 checkpoint/files.c               |  311 ++++++++++++++++++++++++++++++++++++++
 checkpoint/objhash.c             |   40 +++++
 checkpoint/process.c             |   28 ++++
 checkpoint/sys.c                 |    1 +
 include/linux/checkpoint.h       |   14 ++-
 include/linux/checkpoint_hdr.h   |   49 ++++++
 include/linux/checkpoint_types.h |    8 +
 include/linux/fs.h               |    4 +
 10 files changed, 481 insertions(+), 2 deletions(-)
diff --git a/checkpoint/Makefile b/checkpoint/Makefile
index 5aa6a75..1d0c058 100644
--- a/checkpoint/Makefile
+++ b/checkpoint/Makefile
@@ -7,4 +7,5 @@ obj-$(CONFIG_CHECKPOINT) += \
 	objhash.o \
 	checkpoint.o \
 	restart.o \
-	process.o
+	process.o \
+	files.o
diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index 409c78b..a346b7e 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -15,6 +15,7 @@
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/file.h>
+#include <linux/fs_struct.h>
 #include <linux/dcache.h>
 #include <linux/mount.h>
 #include <linux/utsname.h>
@@ -244,10 +245,34 @@ static int checkpoint_write_tail(struct ckpt_ctx *ctx)
 	return ret;
 }
 
+/* setup checkpoint-specific parts of ctx */
+static int init_checkpoint_ctx(struct ckpt_ctx *ctx, pid_t pid)
+{
+	struct fs_struct *fs;
+
+	ctx->root_pid = pid;
+
+	/*
+	 * assume checkpointer is in container's root vfs
+	 * FIXME: this works for now, but will change with real containers
+	 */
+
+	fs = current->fs;
+	read_lock(&fs->lock);
+	ctx->fs_mnt = fs->root;
+	path_get(&ctx->fs_mnt);
+	read_unlock(&fs->lock);
+
+	return 0;
+}
+
 int do_checkpoint(struct ckpt_ctx *ctx, pid_t pid)
 {
 	int ret;
 
+	ret = init_checkpoint_ctx(ctx, pid);
+	if (ret < 0)
+		goto out;
 	ret = checkpoint_write_header(ctx);
 	if (ret < 0)
 		goto out;
diff --git a/checkpoint/files.c b/checkpoint/files.c
new file mode 100644
index 0000000..d10dfb6
--- /dev/null
+++ b/checkpoint/files.c
@@ -0,0 +1,311 @@
+/*
+ *  Checkpoint file descriptors
+ *
+ *  Copyright (C) 2008-2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DFILE
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+
+
+/**************************************************************************
+ * Checkpoint
+ */
+
+/**
+ * fill_fname - return pathname of a given file
+ * @path: path name
+ * @root: relative root
+ * @buf: buffer for pathname
+ * @len: buffer length (in) and pathname length (out)
+ */
+static char *fill_fname(struct path *path, struct path *root,
+			char *buf, int *len)
+{
+	struct path tmp = *root;
+	char *fname;
+
+	BUG_ON(!buf);
+	spin_lock(&dcache_lock);
+	fname = __d_path(path, &tmp, buf, *len);
+	spin_unlock(&dcache_lock);
+	if (IS_ERR(fname))
+		return fname;
+	*len = (buf + (*len) - fname);
+	/*
+	 * FIX: if __d_path() changed these, it must have stepped out of
+	 * init's namespace. Since currently we require a unified namespace
+	 * within the container: simply fail.
+	 */
+	if (tmp.mnt != root->mnt || tmp.dentry != root->dentry)
+		fname = ERR_PTR(-EBADF);
+
+	return fname;
+}
+
+/**
+ * dump_fname - write a file name
+ * @ctx: checkpoint context
+ * @path: path name
+ * @root: relative root
+ */
+static int dump_fname(struct ckpt_ctx *ctx,
+		      struct path *path, struct path *root)
+{
+	char *buf, *fname;
+	int ret, flen;
+
+	/*
+	 * FIXME: we can optimize and save memory (and storage) if we
+	 * share strings (through objhash) and reference them instead
+	 */
+
+	flen = PATH_MAX;
+	buf = kmalloc(flen, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	fname = fill_fname(path, root, buf, &flen);
+	if (!IS_ERR(fname))
+		ret = ckpt_write_obj_type(ctx, fname, flen,
+					  CKPT_HDR_FILE_NAME);
+	else
+		ret = PTR_ERR(fname);
+
+	kfree(buf);
+	return ret;
+}
+
+#define CKPT_DEFAULT_FDTABLE  256		/* an initial guess */
+
+/**
+ * scan_fds - scan file table and construct array of open fds
+ * @files: files_struct pointer
+ * @fdtable: (output) array of open fds
+ *
+ * Returns the number of open fds found, and also the file table
+ * array via *fdtable. The caller should free the array.
+ *
+ * The caller must validate the file descriptors collected in the
+ * array before using them, e.g. by using fcheck_files(), in case
+ * the task's fdtable changes in the meantime.
+ */
+static int scan_fds(struct files_struct *files, int **fdtable)
+{
+	struct fdtable *fdt;
+	int *fds = NULL;
+	int i = 0, n = 0;
+	int tot = CKPT_DEFAULT_FDTABLE;
+
+	/*
+	 * We assume that all tasks possibly sharing the file table are
+	 * frozen (or we are a single process and we checkpoint ourselves).
+	 * Therefore, we can safely proceed after krealloc() from where we
+	 * left off. Otherwise the file table may be modified by another
+	 * task after we scan it. The behavior is this case is undefined,
+	 * and either checkpoint or restart will likely fail.
+	 */
+ retry:
+	fds = krealloc(fds, tot * sizeof(*fds), GFP_KERNEL);
+	if (!fds)
+		return -ENOMEM;
+
+	spin_lock(&files->file_lock);
+	rcu_read_lock();
+	fdt = files_fdtable(files);
+	for (/**/; i < fdt->max_fds; i++) {
+		if (!fcheck_files(files, i))
+			continue;
+		if (n == tot) {
+			spin_unlock(&files->file_lock);
+			rcu_read_unlock();
+			tot *= 2;	/* won't overflow: kmalloc will fail */
+			goto retry;
+		}
+		fds[n++] = i;
+	}
+	rcu_read_unlock();
+	spin_unlock(&files->file_lock);
+
+	*fdtable = fds;
+	return n;
+}
+
+int checkpoint_file_common(struct ckpt_ctx *ctx, struct file *file,
+			   struct ckpt_hdr_file *h)
+{
+	h->f_flags = file->f_flags;
+	h->f_mode = file->f_mode;
+	h->f_pos = file->f_pos;
+	h->f_version = file->f_version;
+
+	/* FIX: need also file->uid, file->gid, file->f_owner, etc */
+
+	return 0;
+}
+
+int generic_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
+{
+	struct ckpt_hdr_file_generic *h;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
+	if (!h)
+		return -ENOMEM;
+
+	/*
+	 * FIXME: when we'll add support for unlinked files/dirs, we'll
+	 * need to distinguish between unlinked filed and unlinked dirs.
+	 */
+	h->common.f_type = CKPT_FILE_GENERIC;
+
+	ret = checkpoint_file_common(ctx, file, &h->common);
+	if (ret < 0)
+		goto out;
+	ret = ckpt_write_obj(ctx, &h->common.h);
+	if (ret < 0)
+		goto out;
+	ret = dump_fname(ctx, &file->f_path, &ctx->fs_mnt);
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+EXPORT_SYMBOL(generic_file_checkpoint);
+
+/* checkpoint callback for file pointer */
+int checkpoint_file(struct ckpt_ctx *ctx, void *ptr)
+{
+	struct file *file = (struct file *) ptr;
+
+	if (!file->f_op->checkpoint)
+		return -EBADF;
+	return file->f_op->checkpoint(ctx, file);
+}
+
+/**
+ * ckpt_write_file_desc - dump the state of a given file descriptor
+ * @ctx: checkpoint context
+ * @files: files_struct pointer
+ * @fd: file descriptor
+ *
+ * Saves the state of the file descriptor; looks up the actual file
+ * pointer in the hash table, and if found saves the matching objref,
+ * otherwise calls ckpt_write_file to dump the file pointer too.
+ */
+static int checkpoint_file_desc(struct ckpt_ctx *ctx,
+				struct files_struct *files, int fd)
+{
+	struct ckpt_hdr_file_desc *h;
+	struct file *file = NULL;
+	struct fdtable *fdt;
+	int objref, ret;
+	int coe = 0;	/* avoid gcc warning */
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE_DESC);
+	if (!h)
+		return -ENOMEM;
+
+	rcu_read_lock();
+	fdt = files_fdtable(files);
+	file = fcheck_files(files, fd);
+	if (file) {
+		coe = FD_ISSET(fd, fdt->close_on_exec);
+		get_file(file);
+	}
+	rcu_read_unlock();
+
+	/* sanity check (although this shouldn't happen) */
+	ret = -EBADF;
+	if (!file)
+		goto out;
+
+	/*
+	 * if seen first time, this will add 'file' to the objhash, keep
+	 * a reference to it, dump its state while at it.
+	 */
+	objref = checkpoint_obj(ctx, file, CKPT_OBJ_FILE);
+	ckpt_debug("fd %d objref %d file %p coe %d)\n", fd, objref, file, coe);
+	if (objref < 0) {
+		ret = objref;
+		goto out;
+	}
+
+	h->fd_objref = objref;
+	h->fd_descriptor = fd;
+	h->fd_close_on_exec = coe;
+
+	ret = ckpt_write_obj(ctx, &h->h);
+out:
+	ckpt_hdr_put(ctx, h);
+	if (file)
+		fput(file);
+	return ret;
+}
+
+static int do_checkpoint_file_table(struct ckpt_ctx *ctx,
+				    struct files_struct *files)
+{
+	struct ckpt_hdr_file_table *h;
+	int *fdtable = NULL;
+	int nfds, n, ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE_TABLE);
+	if (!h)
+		return -ENOMEM;
+
+	nfds = scan_fds(files, &fdtable);
+	if (nfds < 0) {
+		ret = nfds;
+		goto out;
+	}
+
+	h->fdt_nfds = nfds;
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0)
+		goto out;
+
+	ckpt_debug("nfds %d\n", nfds);
+	for (n = 0; n < nfds; n++) {
+		ret = checkpoint_file_desc(ctx, files, fdtable[n]);
+		if (ret < 0)
+			break;
+	}
+ out:
+	kfree(fdtable);
+	return ret;
+}
+
+/* checkpoint callback for file table */
+int checkpoint_file_table(struct ckpt_ctx *ctx, void *ptr)
+{
+	return do_checkpoint_file_table(ctx, (struct files_struct *) ptr);
+}
+
+/* checkpoint wrapper for file table */
+int checkpoint_obj_file_table(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct files_struct *files;
+	int objref;
+
+	files = get_files_struct(t);
+	if (!files)
+		return -EBUSY;
+	objref = checkpoint_obj(ctx, files, CKPT_OBJ_FILE_TABLE);
+	put_files_struct(files);
+
+	return objref;
+}
diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
index 82b4618..ea15958 100644
--- a/checkpoint/objhash.c
+++ b/checkpoint/objhash.c
@@ -13,6 +13,8 @@
 
 #include <linux/kernel.h>
 #include <linux/hash.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
 
@@ -53,6 +55,28 @@ static int obj_no_grab(void *ptr)
 	return 0;
 }
 
+static int obj_file_table_grab(void *ptr)
+{
+	atomic_inc(&((struct files_struct *) ptr)->count);
+	return 0;
+}
+
+static void obj_file_table_drop(void *ptr)
+{
+	put_files_struct((struct files_struct *) ptr);
+}
+
+static int obj_file_grab(void *ptr)
+{
+	get_file((struct file *) ptr);
+	return 0;
+}
+
+static void obj_file_drop(void *ptr)
+{
+	fput((struct file *) ptr);
+}
+
 static struct ckpt_obj_ops ckpt_obj_ops[] = {
 	/* ignored object */
 	{
@@ -61,6 +85,22 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = {
 		.ref_drop = obj_no_drop,
 		.ref_grab = obj_no_grab,
 	},
+	/* files_struct object */
+	{
+		.obj_name = "FILE_TABLE",
+		.obj_type = CKPT_OBJ_FILE_TABLE,
+		.ref_drop = obj_file_table_drop,
+		.ref_grab = obj_file_table_grab,
+		.checkpoint = checkpoint_file_table,
+	},
+	/* file object */
+	{
+		.obj_name = "FILE",
+		.obj_type = CKPT_OBJ_FILE,
+		.ref_drop = obj_file_drop,
+		.ref_grab = obj_file_grab,
+		.checkpoint = checkpoint_file,
+	},
 };
 
 
diff --git a/checkpoint/process.c b/checkpoint/process.c
index 6cab717..4c922b6 100644
--- a/checkpoint/process.c
+++ b/checkpoint/process.c
@@ -46,6 +46,30 @@ static int checkpoint_task_struct(struct ckpt_ctx *ctx, struct task_struct *t)
 	return ckpt_write_string(ctx, t->comm, TASK_COMM_LEN);
 }
 
+static int checkpoint_task_objs(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct ckpt_hdr_task_objs *h;
+	int files_objref;
+	int ret;
+
+	files_objref = checkpoint_obj_file_table(ctx, t);
+	ckpt_debug("files: objref %d\n", files_objref);
+	if (files_objref < 0) {
+		ckpt_write_err(ctx, "task %d (%s), files_struct: %d",
+			       task_pid_vnr(t), t->comm, files_objref);
+		return files_objref;
+	}
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TASK_OBJS);
+	if (!h)
+		return -ENOMEM;
+	h->files_objref = files_objref;
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
 /* dump the entire state of a given task */
 int checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
 {
@@ -55,6 +79,10 @@ int checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
 	ckpt_debug("task %d\n", ret);
 	if (ret < 0)
 		goto out;
+	ret = checkpoint_task_objs(ctx, t);
+	ckpt_debug("shared %d\n", ret);
+	if (ret < 0)
+		goto out;
 	ret = checkpoint_thread(ctx, t);
 	ckpt_debug("thread %d\n", ret);
 	if (ret < 0)
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index c8a260d..c2d3d90 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -170,6 +170,7 @@ static void ckpt_ctx_free(struct ckpt_ctx *ctx)
 		fput(ctx->file);
 
 	ckpt_obj_hash_free(ctx);
+	path_put(&ctx->fs_mnt);
 
 	kfree(ctx);
 }
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 9f65a81..d170988 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -67,14 +67,26 @@ extern int restore_read_header_arch(struct ckpt_ctx *ctx);
 extern int restore_thread(struct ckpt_ctx *ctx);
 extern int restore_cpu(struct ckpt_ctx *ctx);
 
+/* file table */
+extern int checkpoint_obj_file_table(struct ckpt_ctx *ctx,
+				     struct task_struct *t);
+extern int checkpoint_file_table(struct ckpt_ctx *ctx, void *ptr);
+
+/* files */
+extern int checkpoint_file(struct ckpt_ctx *ctx, void *ptr);
+
+extern int checkpoint_file_common(struct ckpt_ctx *ctx, struct file *file,
+				  struct ckpt_hdr_file *h);
+
 
 /* debugging flags */
 #define CKPT_DBASE	0x1		/* anything */
 #define CKPT_DSYS	0x2		/* generic (system) */
 #define CKPT_DRW	0x4		/* image read/write */
 #define CKPT_DOBJ	0x8		/* shared objects */
+#define CKPT_DFILE	0x10		/* files and filesystem */
 
-#define CKPT_DDEFAULT	0x7		/* default debug level */
+#define CKPT_DDEFAULT	0x17		/* default debug level */
 
 #ifndef CKPT_DFLAG
 #define CKPT_DFLAG	0x0		/* nothing */
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 195e44b..5b4fc52 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -47,11 +47,17 @@ enum {
 	CKPT_HDR_OBJREF,
 
 	CKPT_HDR_TASK = 101,
+	CKPT_HDR_TASK_OBJS,
 	CKPT_HDR_THREAD,
 	CKPT_HDR_CPU,
 
 	/* 201-299: reserved for arch-dependent */
 
+	CKPT_HDR_FILE_TABLE = 301,
+	CKPT_HDR_FILE_DESC,
+	CKPT_HDR_FILE_NAME,
+	CKPT_HDR_FILE,
+
 	CKPT_HDR_TAIL = 9001,
 
 	CKPT_HDR_ERROR = 9999,
@@ -72,6 +78,8 @@ struct ckpt_hdr_objref {
 /* shared objects types */
 enum obj_type {
 	CKPT_OBJ_IGNORE = 0,
+	CKPT_OBJ_FILE_TABLE,
+	CKPT_OBJ_FILE,
 	CKPT_OBJ_MAX
 };
 
@@ -121,4 +129,45 @@ struct ckpt_hdr_task {
 	__u32 task_comm_len;
 } __attribute__((aligned(8)));
 
+/* task's shared resources */
+struct ckpt_hdr_task_objs {
+	struct ckpt_hdr h;
+	__s32 files_objref;
+} __attribute__((aligned(8)));
+
+/* file system */
+struct ckpt_hdr_file_table {
+	struct ckpt_hdr h;
+	__s32 fdt_nfds;
+} __attribute__((aligned(8)));
+
+/* file descriptors */
+struct ckpt_hdr_file_desc {
+	struct ckpt_hdr h;
+	__s32 fd_objref;
+	__s32 fd_descriptor;
+	__u32 fd_close_on_exec;
+} __attribute__((aligned(8)));
+
+enum file_type {
+	CKPT_FILE_IGNORE = 0,
+	CKPT_FILE_GENERIC,
+	CKPT_FILE_MAX
+};
+
+/* file objects */
+struct ckpt_hdr_file {
+	struct ckpt_hdr h;
+	__u32 f_type;
+	__u32 f_mode;
+	__u32 f_flags;
+	__u32 _padding;
+	__u64 f_pos;
+	__u64 f_version;
+} __attribute__((aligned(8)));
+
+struct ckpt_hdr_file_generic {
+	struct ckpt_hdr_file common;
+} __attribute__((aligned(8)));
+
 #endif /* _CHECKPOINT_CKPT_HDR_H_ */
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index 9c14034..067d579 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -14,6 +14,12 @@
 
 
 #ifdef __KERNEL__
+struct ckpt_ctx;
+struct ckpt_hdr_file;
+
+#include <linux/list.h>
+#include <linux/path.h>
+#include <linux/fs.h>
 
 #include <linux/sched.h>
 
@@ -31,6 +37,8 @@ struct ckpt_ctx {
 
 	struct ckpt_obj_hash *obj_hash;	/* repository for shared objects */
 
+	struct path fs_mnt;     /* container root (FIXME) */
+
 	char err_string[256];	/* checkpoint: error string */
 };
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 60d9229..e64f892 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2310,7 +2310,11 @@ void inode_sub_bytes(struct inode *inode, loff_t bytes);
 loff_t inode_get_bytes(struct inode *inode);
 void inode_set_bytes(struct inode *inode, loff_t bytes);
 
+#ifdef CONFIG_CHECKPOINT
+extern int generic_file_checkpoint(struct ckpt_ctx *ctx, struct file *file);
+#else
 #define generic_file_checkpoint NULL
+#endif
 
 extern int vfs_readdir(struct file *, filldir_t, void *);
 
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 10/43] c/r: restore open file descriptors
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (8 preceding siblings ...)
  2009-05-27 17:32 ` [RFC v16][PATCH 09/43] c/r: dump open file descriptors Oren Laadan
@ 2009-05-27 17:32 ` Oren Laadan
  2009-05-27 17:32 ` [RFC v16][PATCH 11/43] c/r: add generic '->checkpoint' f_op to ext fses Oren Laadan
                   ` (24 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
For each fd read 'struct ckpt_hdr_file_desc' and lookup objref in the
hash table; If not found in the hash table, (first occurence), read in
'struct ckpt_hdr_file', create a new file and register in the hash.
Otherwise attach the file pointer from the hash as an FD.
Changelog[v16]:
  - Reorder patch (move earlier in series)
  - Handle shared files_struct objects
Changelog[v14]:
  - Introduce a per file-type restore() callback
  - Revert change to pr_debug(), back to ckpt_debug()
  - Rename:  restore_files() => restore_fd_table()
  - Rename:  ckpt_read_fd_data() => restore_file()
  - Check whether calls to ckpt_hbuf_get() fail
  - Discard field 'hh->parent'
Changelog[v12]:
  - Replace obsolete ckpt_debug() with pr_debug()
Changelog[v6]:
  - Balance all calls to ckpt_hbuf_get() with matching ckpt_hbuf_put()
    (even though it's not really needed)
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
 checkpoint/files.c         |  285 ++++++++++++++++++++++++++++++++++++++++++++
 checkpoint/objhash.c       |    2 +
 checkpoint/process.c       |   20 +++
 checkpoint/restart.c       |    9 ++
 include/linux/checkpoint.h |    5 +
 5 files changed, 321 insertions(+), 0 deletions(-)
diff --git a/checkpoint/files.c b/checkpoint/files.c
index d10dfb6..d7583d3 100644
--- a/checkpoint/files.c
+++ b/checkpoint/files.c
@@ -16,6 +16,8 @@
 #include <linux/sched.h>
 #include <linux/file.h>
 #include <linux/fdtable.h>
+#include <linux/fsnotify.h>
+#include <linux/syscalls.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
 
@@ -309,3 +311,286 @@ int checkpoint_obj_file_table(struct ckpt_ctx *ctx, struct task_struct *t)
 
 	return objref;
 }
+
+/**************************************************************************
+ * Restart
+ */
+
+/**
+ * read_open_fname - read a file name and open a file
+ * @ctx: checkpoint context
+ * @flags: file flags
+ * @mode: file mode
+ */
+static struct file *read_open_fname(struct ckpt_ctx *ctx, int flags, int mode)
+{
+	struct ckpt_hdr *h;
+	struct file *file;
+	char *fname;
+
+	h = ckpt_read_buf_type(ctx, PATH_MAX, CKPT_HDR_FILE_NAME);
+	if (IS_ERR(h))
+		return (struct file *) h;
+	fname = (char *) (h + 1);
+	ckpt_debug("fname '%s' flags %#x mode %#x\n", fname, flags, mode);
+
+	file = filp_open(fname, flags, mode);
+	ckpt_hdr_put(ctx, h);
+	return file;
+}
+
+static int close_all_fds(struct files_struct *files)
+{
+	int *fdtable;
+	int nfds;
+
+	nfds = scan_fds(files, &fdtable);
+	if (nfds < 0)
+		return nfds;
+	while (nfds--)
+		sys_close(fdtable[nfds]);
+	kfree(fdtable);
+	return 0;
+}
+
+/**
+ * attach_file - attach a lonely file ptr to a file descriptor
+ * @file: lonely file pointer
+ */
+static int attach_file(struct file *file)
+{
+	int fd = get_unused_fd_flags(0);
+
+	if (fd >= 0) {
+		get_file(file);
+		fsnotify_open(file->f_path.dentry);
+		fd_install(fd, file);
+	}
+	return fd;
+}
+
+#define CKPT_SETFL_MASK  \
+	(O_APPEND | O_NONBLOCK | O_NDELAY | FASYNC | O_DIRECT | O_NOATIME)
+
+int restore_file_common(struct ckpt_ctx *ctx, struct file *file,
+			struct ckpt_hdr_file *h)
+{
+	int ret;
+
+	/* FIX: need to restore uid, gid, owner etc */
+
+	/* safe to set 1st arg (fd) to 0, as command is F_SETFL */
+	ret = vfs_fcntl(0, F_SETFL, h->f_flags & CKPT_SETFL_MASK, file);
+	if (ret < 0)
+		goto out;
+
+	ret = vfs_llseek(file, h->f_pos, SEEK_SET);
+	if (ret == -ESPIPE)	/* ignore error on non-seekable files */
+		ret = 0;
+ out:
+	return ret;
+}
+
+static struct file *generic_file_restore(struct ckpt_ctx *ctx,
+					 struct ckpt_hdr_file *ptr)
+{
+	struct file *file;
+	int ret;
+
+	if (ptr->h.type != CKPT_HDR_FILE  ||
+	    ptr->h.len != sizeof(*ptr) || ptr->f_type != CKPT_FILE_GENERIC)
+		return ERR_PTR(-EINVAL);
+
+	file = read_open_fname(ctx, ptr->f_flags, ptr->f_mode);
+	if (IS_ERR(file))
+		return file;
+
+	ret = restore_file_common(ctx, file, ptr);
+	if (ret < 0) {
+		fput(file);
+		file = ERR_PTR(ret);
+	}
+	return file;
+}
+
+struct restore_file_ops {
+	char *file_name;
+	enum file_type file_type;
+	struct file * (*restore) (struct ckpt_ctx *ctx,
+				  struct ckpt_hdr_file *ptr);
+};
+
+static struct restore_file_ops restore_file_ops[] = {
+	/* ignored file */
+	{
+		.file_name = "IGNORE",
+		.file_type = CKPT_FILE_IGNORE,
+		.restore = NULL,
+	},
+	/* regular file/directory */
+	{
+		.file_name = "GENERIC",
+		.file_type = CKPT_FILE_GENERIC,
+		.restore = generic_file_restore,
+	},
+};
+
+static struct file *do_restore_file(struct ckpt_ctx *ctx)
+{
+	struct restore_file_ops *ops;
+	struct ckpt_hdr_file *h;
+	struct file *file = ERR_PTR(-EINVAL);
+
+	/*
+	 * All 'struct ckpt_hdr_file_...' begin with ckpt_hdr_file,
+	 * but the actual object depends on the file type. The length
+	 * should never be more than page.
+	 */
+	h = ckpt_read_buf_type(ctx, PAGE_SIZE, CKPT_HDR_FILE);
+	if (IS_ERR(h))
+		return (struct file *) h;
+	ckpt_debug("flags %#x mode %#x type %d\n",
+		 h->f_flags, h->f_mode, h->f_type);
+
+	if (h->f_type >= CKPT_FILE_MAX)
+		goto out;
+
+	ops = &restore_file_ops[h->f_type];
+	BUG_ON(ops->file_type != h->f_type);
+
+	if (ops->restore)
+		file = ops->restore(ctx, h);
+ out:
+	ckpt_hdr_put(ctx, h);
+	return file;
+}
+
+/* restore callback for file pointer */
+void *restore_file(struct ckpt_ctx *ctx)
+{
+	return (void *) do_restore_file(ctx);
+}
+
+/**
+ * ckpt_read_file_desc - restore the state of a given file descriptor
+ * @ctx: checkpoint context
+ *
+ * Restores the state of a file descriptor; looks up the objref (in the
+ * header) in the hash table, and if found picks the matching file and
+ * use it; otherwise calls restore_file to restore the file too.
+ */
+static int restore_file_desc(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_file_desc *h;
+	struct file *file;
+	int newfd, ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_FILE_DESC);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+	ckpt_debug("ref %d fd %d c.o.e %d\n",
+		 h->fd_objref, h->fd_descriptor, h->fd_close_on_exec);
+
+	ret = -EINVAL;
+	if (h->fd_objref <= 0 || h->fd_descriptor < 0)
+		goto out;
+
+	file = ckpt_obj_fetch(ctx, h->fd_objref, CKPT_OBJ_FILE);
+	if (IS_ERR(file)) {
+		ret = PTR_ERR(file);
+		goto out;
+	}
+
+	newfd = attach_file(file);
+	if (newfd < 0) {
+		ret = newfd;
+		goto out;
+	}
+
+	ckpt_debug("newfd got %d wanted %d\n", newfd, h->fd_descriptor);
+
+	/* reposition if newfd isn't desired fd */
+	if (newfd != h->fd_descriptor) {
+		ret = sys_dup2(newfd, h->fd_descriptor);
+		if (ret < 0)
+			goto out;
+		sys_close(newfd);
+	}
+
+	if (h->fd_close_on_exec)
+		set_close_on_exec(h->fd_descriptor, 1);
+
+	ret = 0;
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+/* restore callback for file table */
+static struct files_struct *do_restore_file_table(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_file_table *h;
+	struct files_struct *files;
+	int i, ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_FILE_TABLE);
+	if (IS_ERR(h))
+		return (struct files_struct *) h;
+
+	ckpt_debug("nfds %d\n", h->fdt_nfds);
+
+	ret = -EMFILE;
+	if (h->fdt_nfds < 0 || h->fdt_nfds > sysctl_nr_open)
+		goto out;
+
+	/*
+	 * We assume that restarting tasks, as created in user-space,
+	 * have distinct files_struct objects each. If not, we need to
+	 * call dup_fd() to make sure we don't overwrite an already
+	 * restored one.
+	 */
+
+	/* point of no return -- close all file descriptors */
+	ret = close_all_fds(current->files);
+	if (ret < 0)
+		goto out;
+
+	for (i = 0; i < h->fdt_nfds; i++) {
+		ret = restore_file_desc(ctx);
+		if (ret < 0)
+			break;
+	}
+ out:
+	ckpt_hdr_put(ctx, h);
+	if (!ret) {
+		files = current->files;
+		atomic_inc(&files->count);
+	} else {
+		files = ERR_PTR(ret);
+	}
+	return files;
+}
+
+void *restore_file_table(struct ckpt_ctx *ctx)
+{
+	return (void *) do_restore_file_table(ctx);
+}
+
+int restore_obj_file_table(struct ckpt_ctx *ctx, int files_objref)
+{
+	struct files_struct *files;
+
+	files = ckpt_obj_fetch(ctx, files_objref, CKPT_OBJ_FILE_TABLE);
+	if (IS_ERR(files))
+		return PTR_ERR(files);
+
+	if (files != current->files) {
+		task_lock(current);
+		put_files_struct(current->files);
+		current->files = files;
+		task_unlock(current);
+		atomic_inc(&files->count);
+	}
+
+	return 0;
+}
diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
index ea15958..f5c9f7c 100644
--- a/checkpoint/objhash.c
+++ b/checkpoint/objhash.c
@@ -92,6 +92,7 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = {
 		.ref_drop = obj_file_table_drop,
 		.ref_grab = obj_file_table_grab,
 		.checkpoint = checkpoint_file_table,
+		.restore = restore_file_table,
 	},
 	/* file object */
 	{
@@ -100,6 +101,7 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = {
 		.ref_drop = obj_file_drop,
 		.ref_grab = obj_file_grab,
 		.checkpoint = checkpoint_file,
+		.restore = restore_file,
 	},
 };
 
diff --git a/checkpoint/process.c b/checkpoint/process.c
index 4c922b6..5fd1573 100644
--- a/checkpoint/process.c
+++ b/checkpoint/process.c
@@ -121,6 +121,22 @@ static int restore_task_struct(struct ckpt_ctx *ctx)
 	return ret;
 }
 
+static int restore_task_objs(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_task_objs *h;
+	int ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TASK_OBJS);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	ret = restore_obj_file_table(ctx, h->files_objref);
+	ckpt_debug("file_table: ret %d (%p)\n", ret, current->files);
+
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
 /* read the entire state of the current task */
 int restore_task(struct ckpt_ctx *ctx)
 {
@@ -130,6 +146,10 @@ int restore_task(struct ckpt_ctx *ctx)
 	ckpt_debug("task %d\n", ret);
 	if (ret < 0)
 		goto out;
+	ret = restore_task_objs(ctx);
+	ckpt_debug("shared %d\n", ret);
+	if (ret < 0)
+		goto out;
 	ret = restore_thread(ctx);
 	ckpt_debug("thread %d\n", ret);
 	if (ret < 0)
diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index ce52e30..d3d6c5e 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -351,10 +351,19 @@ static int restore_read_tail(struct ckpt_ctx *ctx)
 	return ret;
 }
 
+/* setup restart-specific parts of ctx */
+static int init_restart_ctx(struct ckpt_ctx *ctx)
+{
+	return 0;
+}
+
 int do_restart(struct ckpt_ctx *ctx, pid_t pid)
 {
 	int ret;
 
+	ret = init_restart_ctx(ctx);
+	if (ret < 0)
+		return ret;
 	ret = restore_read_header(ctx);
 	if (ret < 0)
 		return ret;
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index d170988..bb14a66 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -70,13 +70,18 @@ extern int restore_cpu(struct ckpt_ctx *ctx);
 /* file table */
 extern int checkpoint_obj_file_table(struct ckpt_ctx *ctx,
 				     struct task_struct *t);
+extern int restore_obj_file_table(struct ckpt_ctx *ctx, int files_objref);
 extern int checkpoint_file_table(struct ckpt_ctx *ctx, void *ptr);
+extern void *restore_file_table(struct ckpt_ctx *ctx);
 
 /* files */
 extern int checkpoint_file(struct ckpt_ctx *ctx, void *ptr);
+extern void *restore_file(struct ckpt_ctx *ctx);
 
 extern int checkpoint_file_common(struct ckpt_ctx *ctx, struct file *file,
 				  struct ckpt_hdr_file *h);
+extern int restore_file_common(struct ckpt_ctx *ctx, struct file *file,
+			       struct ckpt_hdr_file *h);
 
 
 /* debugging flags */
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 11/43] c/r: add generic '->checkpoint' f_op to ext fses
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (9 preceding siblings ...)
  2009-05-27 17:32 ` [RFC v16][PATCH 10/43] c/r: restore " Oren Laadan
@ 2009-05-27 17:32 ` Oren Laadan
  2009-05-27 17:32 ` [RFC v16][PATCH 12/43] c/r: add generic '->checkpoint()' f_op to simple devices Oren Laadan
                   ` (23 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan
From: Dave Hansen <dave@linux.vnet.ibm.com>
This marks ext[234] as being checkpointable.  There will be many
more to do this to, but this is a start.
Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
---
 fs/ext2/dir.c  |    1 +
 fs/ext2/file.c |    2 ++
 fs/ext3/dir.c  |    1 +
 fs/ext3/file.c |    1 +
 fs/ext4/dir.c  |    1 +
 fs/ext4/file.c |    1 +
 6 files changed, 7 insertions(+), 0 deletions(-)
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 2999d72..4f1dd79 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -721,4 +721,5 @@ const struct file_operations ext2_dir_operations = {
 	.compat_ioctl	= ext2_compat_ioctl,
 #endif
 	.fsync		= ext2_sync_file,
+	.checkpoint	= generic_file_checkpoint,
 };
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 45ed071..e1731c5 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -58,6 +58,7 @@ const struct file_operations ext2_file_operations = {
 	.fsync		= ext2_sync_file,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
+	.checkpoint	= generic_file_checkpoint,
 };
 
 #ifdef CONFIG_EXT2_FS_XIP
@@ -73,6 +74,7 @@ const struct file_operations ext2_xip_file_operations = {
 	.open		= generic_file_open,
 	.release	= ext2_release_file,
 	.fsync		= ext2_sync_file,
+	.checkpoint	= generic_file_checkpoint,
 };
 #endif
 
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 3d724a9..54b05d2 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -48,6 +48,7 @@ const struct file_operations ext3_dir_operations = {
 #endif
 	.fsync		= ext3_sync_file,	/* BKL held */
 	.release	= ext3_release_dir,
+	.checkpoint	= generic_file_checkpoint,
 };
 
 
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 5b49704..a421e07 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -126,6 +126,7 @@ const struct file_operations ext3_file_operations = {
 	.fsync		= ext3_sync_file,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
+	.checkpoint	= generic_file_checkpoint,
 };
 
 const struct inode_operations ext3_file_inode_operations = {
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index b647899..2787fdb 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -48,6 +48,7 @@ const struct file_operations ext4_dir_operations = {
 #endif
 	.fsync		= ext4_sync_file,
 	.release	= ext4_release_dir,
+	.checkpoint	= generic_file_checkpoint,
 };
 
 
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 588af8c..c2dab33 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -161,6 +161,7 @@ const struct file_operations ext4_file_operations = {
 	.fsync		= ext4_sync_file,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
+	.checkpoint	= generic_file_checkpoint,
 };
 
 const struct inode_operations ext4_file_inode_operations = {
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 12/43] c/r: add generic '->checkpoint()' f_op to simple devices
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (10 preceding siblings ...)
  2009-05-27 17:32 ` [RFC v16][PATCH 11/43] c/r: add generic '->checkpoint' f_op to ext fses Oren Laadan
@ 2009-05-27 17:32 ` Oren Laadan
  2009-05-27 17:32 ` [RFC v16][PATCH 14/43] c/r: dump memory address space (private memory) Oren Laadan
                   ` (22 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
* /dev/null
* /dev/zero
* /dev/random
* /dev/urandom
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
 drivers/char/mem.c    |    2 ++
 drivers/char/random.c |    2 ++
 2 files changed, 4 insertions(+), 0 deletions(-)
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 8f05c38..bfde41f 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -797,6 +797,7 @@ static const struct file_operations null_fops = {
 	.read		= read_null,
 	.write		= write_null,
 	.splice_write	= splice_write_null,
+	.checkpoint	= generic_file_checkpoint,
 };
 
 #ifdef CONFIG_DEVPORT
@@ -813,6 +814,7 @@ static const struct file_operations zero_fops = {
 	.read		= read_zero,
 	.write		= write_zero,
 	.mmap		= mmap_zero,
+	.checkpoint	= generic_file_checkpoint,
 };
 
 /*
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 8c74448..211ca70 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1164,6 +1164,7 @@ const struct file_operations random_fops = {
 	.poll  = random_poll,
 	.unlocked_ioctl = random_ioctl,
 	.fasync = random_fasync,
+	.checkpoint = generic_file_checkpoint,
 };
 
 const struct file_operations urandom_fops = {
@@ -1171,6 +1172,7 @@ const struct file_operations urandom_fops = {
 	.write = random_write,
 	.unlocked_ioctl = random_ioctl,
 	.fasync = random_fasync,
+	.checkpoint = generic_file_checkpoint,
 };
 
 /***************************************************************
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 14/43] c/r: dump memory address space (private memory)
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (11 preceding siblings ...)
  2009-05-27 17:32 ` [RFC v16][PATCH 12/43] c/r: add generic '->checkpoint()' f_op to simple devices Oren Laadan
@ 2009-05-27 17:32 ` Oren Laadan
  2009-05-27 17:32 ` [RFC v16][PATCH 17/43] c/r: dump anonymous- and file-mapped- shared memory Oren Laadan
                   ` (21 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
For each vma, there is a 'struct ckpt_vma'; Then comes the actual
contents, in one or more chunk: each chunk begins with a header that
specifies how many pages it holds, then the virtual addresses of all
the dumped pages in that chunk, followed by the actual contents of all
dumped pages. A header with zero number of pages marks the end of the
contents.  Then comes the next vma and so on.
To checkpoint a vma, call the ops->checkpoint() method of that vma.
Normally the per-vma function will invoke generic_vma_checkpoint()
which first writes the vma description, followed by the specific
logic to dump the contents of the pages.
Currently for private mapped memory we save the pathname of the file
that is mapped (restart will use it to re-open it and then map it).
Later we change that to reference a file object.
Changelog[v16]:
  - Precede vaddrs/pages with a buffer header
  - Checkpoint mm->exe_file
  - Handle shared task->mm
Changelog[v14]:
  - Modify the ops->checkpoint method to be much more powerful
  - Improve support for VDSO (with special_mapping checkpoint callback)
  - Save new field 'vdso' in mm_context
  - Revert change to pr_debug(), back to ckpt_debug()
  - Check whether calls to ckpt_hbuf_get() fail
  - Discard field 'h->parent'
Changelog[v13]:
  - pgprot_t is an abstract type; use the proper accessor (fix for
    64-bit powerpc (Nathan Lynch <ntl@pobox.com>)
Changelog[v12]:
  - Hide pgarr management inside ckpt_private_vma_fill_pgarr()
  - Fix management of pgarr chain reset and alloc/expand: keep empty
    pgarr in a pool chain
  - Replace obsolete ckpt_debug() with pr_debug()
Changelog[v11]:
  - Copy contents of 'init->fs->root' instead of pointing to them.
  - Add missing test for VM_MAYSHARE when dumping memory
Changelog[v10]:
  - Acquire dcache_lock around call to __d_path() in ckpt_fill_name()
Changelog[v9]:
  - Introduce ckpt_ctx_checkpoint() for checkpoint-specific ctx setup
  - Test if __d_path() changes mnt/dentry (when crossing filesystem
    namespace boundary). for now ckpt_fill_fname() fails the checkpoint.
Changelog[v7]:
  - Fix argument given to kunmap_atomic() in memory dump/restore
Changelog[v6]:
  - Balance all calls to ckpt_hbuf_get() with matching ckpt_hbuf_put()
    (even though it's not really needed)
Changelog[v5]:
  - Improve memory dump code (following Dave Hansen's comments)
  - Change dump format (and code) to allow chunks of <vaddrs, pages>
    instead of one long list of each
  - Fix use of follow_page() to avoid faulting in non-present pages
Changelog[v4]:
  - Use standard list_... for ckpt_pgarr
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
 arch/x86/include/asm/checkpoint_hdr.h |    8 +
 arch/x86/mm/checkpoint.c              |   32 ++
 checkpoint/Makefile                   |    3 +-
 checkpoint/memory.c                   |  633 +++++++++++++++++++++++++++++++++
 checkpoint/objhash.c                  |   19 +
 checkpoint/process.c                  |   10 +
 checkpoint/sys.c                      |    4 +
 include/linux/checkpoint.h            |   26 ++-
 include/linux/checkpoint_hdr.h        |   47 +++
 include/linux/checkpoint_types.h      |    3 +
 mm/filemap.c                          |   28 ++
 mm/mmap.c                             |   31 ++
 12 files changed, 842 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/checkpoint_hdr.h b/arch/x86/include/asm/checkpoint_hdr.h
index 362b499..cf90170 100644
--- a/arch/x86/include/asm/checkpoint_hdr.h
+++ b/arch/x86/include/asm/checkpoint_hdr.h
@@ -43,6 +43,7 @@
 enum {
 	CKPT_HDR_THREAD_TLS = 201,
 	CKPT_HDR_CPU_FPU,
+	CKPT_HDR_MM_CONTEXT_LDT,
 };
 
 struct ckpt_hdr_header_arch {
@@ -107,4 +108,11 @@ struct ckpt_hdr_cpu {
 	/* thread_xstate contents follow (if used_math) */
 } __attribute__((aligned(8)));
 
+struct ckpt_hdr_mm_context {
+	struct ckpt_hdr h;
+	__u64 vdso;
+	__u32 ldt_entry_size;
+	__u32 nldt;
+} __attribute__((aligned(8)));
+
 #endif /* __ASM_X86_CKPT_HDR__H */
diff --git a/arch/x86/mm/checkpoint.c b/arch/x86/mm/checkpoint.c
index f54fe80..dc4fbb4 100644
--- a/arch/x86/mm/checkpoint.c
+++ b/arch/x86/mm/checkpoint.c
@@ -14,6 +14,7 @@
 #include <asm/desc.h>
 #include <asm/i387.h>
 
+#include <linux/checkpoint_types.h>
 #include <asm/checkpoint_hdr.h>
 #include <linux/checkpoint.h>
 
@@ -239,6 +240,37 @@ int checkpoint_write_header_arch(struct ckpt_ctx *ctx)
 	return ret;
 }
 
+/* dump the mm->context state */
+int checkpoint_mm_context(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+	struct ckpt_hdr_mm_context *h;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_MM_CONTEXT);
+	if (!h)
+		return -ENOMEM;
+
+	mutex_lock(&mm->context.lock);
+
+	h->vdso = (unsigned long) mm->context.vdso;
+	h->ldt_entry_size = LDT_ENTRY_SIZE;
+	h->nldt = mm->context.size;
+
+	ckpt_debug("nldt %d vdso %#llx\n", h->nldt, h->vdso);
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0)
+		goto out;
+
+	ret = ckpt_write_obj_type(ctx, mm->context.ldt,
+				  mm->context.size * LDT_ENTRY_SIZE,
+				  CKPT_HDR_MM_CONTEXT_LDT);
+ out:
+	mutex_unlock(&mm->context.lock);
+	return ret;
+}
+
 /**************************************************************************
  * Restart
  */
diff --git a/checkpoint/Makefile b/checkpoint/Makefile
index 1d0c058..f56a7d6 100644
--- a/checkpoint/Makefile
+++ b/checkpoint/Makefile
@@ -8,4 +8,5 @@ obj-$(CONFIG_CHECKPOINT) += \
 	checkpoint.o \
 	restart.o \
 	process.o \
-	files.o
+	files.o \
+	memory.o
diff --git a/checkpoint/memory.c b/checkpoint/memory.c
new file mode 100644
index 0000000..6d1a3cd
--- /dev/null
+++ b/checkpoint/memory.c
@@ -0,0 +1,633 @@
+/*
+ *  Checkpoint/restart memory contents
+ *
+ *  Copyright (C) 2008-2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DMEM
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/mm_types.h>
+#include <linux/proc_fs.h>
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+
+/*
+ * page-array chains: each ckpt_pgarr describes a set of <struct page *,vaddr>
+ * tuples (where vaddr is the virtual address of a page in a particular mm).
+ * Specifically, we use separate arrays so that all vaddrs can be written
+ * and read at once.
+ */
+
+struct ckpt_pgarr {
+	unsigned long *vaddrs;
+	struct page **pages;
+	unsigned int nr_used;
+	struct list_head list;
+};
+
+#define CKPT_PGARR_TOTAL  (PAGE_SIZE / sizeof(void *))
+#define CKPT_PGARR_BATCH  (16 * CKPT_PGARR_TOTAL)
+
+static inline int pgarr_is_full(struct ckpt_pgarr *pgarr)
+{
+	return (pgarr->nr_used == CKPT_PGARR_TOTAL);
+}
+
+static inline int pgarr_nr_free(struct ckpt_pgarr *pgarr)
+{
+	return CKPT_PGARR_TOTAL - pgarr->nr_used;
+}
+
+/*
+ * utilities to alloc, free, and handle 'struct ckpt_pgarr' (page-arrays)
+ * (common to ckpt_mem.c and rstr_mem.c).
+ *
+ * The checkpoint context structure has two members for page-arrays:
+ *   ctx->pgarr_list: list head of populated page-array chain
+ *   ctx->pgarr_pool: list head of empty page-array pool chain
+ *
+ * During checkpoint (and restart) the chain tracks the dirty pages (page
+ * pointer and virtual address) of each MM. For a particular MM, these are
+ * always added to the head of the page-array chain (ctx->pgarr_list).
+ * Before the next chunk of pages, the chain is reset (by dereferencing
+ * all pages) but not freed; instead, empty descsriptors are kept in pool.
+ *
+ * The head of the chain page-array ("current") advances as necessary. When
+ * it gets full, a new page-array descriptor is pushed in front of it. The
+ * new descriptor is taken from first empty descriptor (if one exists, for
+ * instance, after a chain reset), or allocated on-demand.
+ *
+ * When dumping the data, the chain is traversed in reverse order.
+ */
+
+/* return first page-array in the chain */
+static inline struct ckpt_pgarr *pgarr_first(struct ckpt_ctx *ctx)
+{
+	if (list_empty(&ctx->pgarr_list))
+		return NULL;
+	return list_first_entry(&ctx->pgarr_list, struct ckpt_pgarr, list);
+}
+
+/* return (and detach) first empty page-array in the pool, if exists */
+static inline struct ckpt_pgarr *pgarr_from_pool(struct ckpt_ctx *ctx)
+{
+	struct ckpt_pgarr *pgarr;
+
+	if (list_empty(&ctx->pgarr_pool))
+		return NULL;
+	pgarr = list_first_entry(&ctx->pgarr_pool, struct ckpt_pgarr, list);
+	list_del(&pgarr->list);
+	return pgarr;
+}
+
+/* release pages referenced by a page-array */
+static void pgarr_release_pages(struct ckpt_pgarr *pgarr)
+{
+	ckpt_debug("total pages %d\n", pgarr->nr_used);
+	/*
+	 * both checkpoint and restart use 'nr_used', however we only
+	 * collect pages during checkpoint; in restart we simply return
+	 * because pgarr->pages remains NULL.
+	 */
+	if (pgarr->pages) {
+		struct page **pages = pgarr->pages;
+		int nr = pgarr->nr_used;
+
+		while (nr--)
+			page_cache_release(pages[nr]);
+	}
+
+	pgarr->nr_used = 0;
+}
+
+/* free a single page-array object */
+static void pgarr_free_one(struct ckpt_pgarr *pgarr)
+{
+	pgarr_release_pages(pgarr);
+	kfree(pgarr->pages);
+	kfree(pgarr->vaddrs);
+	kfree(pgarr);
+}
+
+/* free the chains of page-arrays (populated and empty pool) */
+void ckpt_pgarr_free(struct ckpt_ctx *ctx)
+{
+	struct ckpt_pgarr *pgarr, *tmp;
+
+	list_for_each_entry_safe(pgarr, tmp, &ctx->pgarr_list, list) {
+		list_del(&pgarr->list);
+		pgarr_free_one(pgarr);
+	}
+
+	list_for_each_entry_safe(pgarr, tmp, &ctx->pgarr_pool, list) {
+		list_del(&pgarr->list);
+		pgarr_free_one(pgarr);
+	}
+}
+
+/* allocate a single page-array object */
+static struct ckpt_pgarr *pgarr_alloc_one(unsigned long flags)
+{
+	struct ckpt_pgarr *pgarr;
+
+	pgarr = kzalloc(sizeof(*pgarr), GFP_KERNEL);
+	if (!pgarr)
+		return NULL;
+	pgarr->vaddrs = kmalloc(CKPT_PGARR_TOTAL * sizeof(unsigned long),
+				GFP_KERNEL);
+	if (!pgarr->vaddrs)
+		goto nomem;
+
+	/* pgarr->pages is needed only for checkpoint */
+	if (flags & CKPT_CTX_CHECKPOINT) {
+		pgarr->pages = kmalloc(CKPT_PGARR_TOTAL *
+				       sizeof(struct page *), GFP_KERNEL);
+		if (!pgarr->pages)
+			goto nomem;
+	}
+
+	return pgarr;
+ nomem:
+	pgarr_free_one(pgarr);
+	return NULL;
+}
+
+/* pgarr_current - return the next available page-array in the chain
+ * @ctx: checkpoint context
+ *
+ * Returns the first page-array in the list that has space. Otherwise,
+ * try the next page-array after the last non-empty one, and move it to
+ * the front of the chain. Extends the list if none has space.
+ */
+static struct ckpt_pgarr *pgarr_current(struct ckpt_ctx *ctx)
+{
+	struct ckpt_pgarr *pgarr;
+
+	pgarr = pgarr_first(ctx);
+	if (pgarr && !pgarr_is_full(pgarr))
+		return pgarr;
+
+	pgarr = pgarr_from_pool(ctx);
+	if (!pgarr)
+		pgarr = pgarr_alloc_one(ctx->kflags);
+	if (!pgarr)
+		return NULL;
+
+	list_add(&pgarr->list, &ctx->pgarr_list);
+	return pgarr;
+}
+
+/* reset the page-array chain (dropping page references if necessary) */
+static void pgarr_reset_all(struct ckpt_ctx *ctx)
+{
+	struct ckpt_pgarr *pgarr;
+
+	list_for_each_entry(pgarr, &ctx->pgarr_list, list)
+		pgarr_release_pages(pgarr);
+	list_splice_init(&ctx->pgarr_list, &ctx->pgarr_pool);
+}
+
+/**************************************************************************
+ * Checkpoint
+ *
+ * Checkpoint is outside the context of the checkpointee, so one cannot
+ * simply read pages from user-space. Instead, we scan the address space
+ * of the target to cherry-pick pages of interest. Selected pages are
+ * enlisted in a page-array chain (attached to the checkpoint context).
+ * To save their contents, each page is mapped to kernel memory and then
+ * dumped to the file descriptor.
+ */
+
+
+/**
+ * private_follow_page - return page pointer for dirty pages
+ * @vma - target vma
+ * @addr - page address
+ *
+ * Looks up the page that correspond to the address in the vma, and
+ * returns the page if it was modified (and grabs a reference to it),
+ * or otherwise returns NULL (or error).
+ */
+static struct page *consider_private_page(struct vm_area_struct *vma,
+					  unsigned long addr)
+{
+	struct page *page;
+
+	/*
+	 * simplified version of get_user_pages(): already have vma,
+	 * only need FOLL_ANON, and (for now) ignore fault stats.
+	 *
+	 * follow_page() will return NULL if the page is not present
+	 * (swapped), ZERO_PAGE(0) if the pte wasn't allocated, and
+	 * the actual page pointer otherwise.
+	 *
+	 * FIXME: consolidate with get_user_pages()
+	 */
+
+	cond_resched();
+	while (!(page = follow_page(vma, addr, FOLL_ANON | FOLL_GET))) {
+		int ret;
+
+		/* the page is swapped out - bring it in (optimize ?) */
+		ret = handle_mm_fault(vma->vm_mm, vma, addr, 0);
+		if (ret & VM_FAULT_ERROR) {
+			if (ret & VM_FAULT_OOM)
+				return ERR_PTR(-ENOMEM);
+			else if (ret & VM_FAULT_SIGBUS)
+				return ERR_PTR(-EFAULT);
+			else
+				BUG();
+			break;
+		}
+		cond_resched();
+	}
+
+	if (IS_ERR(page))
+		return page;
+
+	/*
+	 * Only care about dirty pages: either anonymous non-zero pages,
+	 * or file-backed COW (copy-on-write) pages that were modified.
+	 * A clean COW page is not interesting because its contents are
+	 * identical to the backing file; ignore such pages.
+	 * A file-backed broken COW is identified by its page_mapping()
+	 * being unset (NULL) because the page will no longer be mapped
+	 * to the original file after having been modified.
+	 */
+	if (page == ZERO_PAGE(0)) {
+		/* this is the zero page: ignore */
+		page_cache_release(page);
+		page = NULL;
+	} else if (vma->vm_file && (page_mapping(page) != NULL)) {
+		/* file backed clean cow: ignore */
+		page_cache_release(page);
+		page = NULL;
+	}
+
+	return page;
+}
+
+/**
+ * vma_fill_pgarr - fill a page-array with addr/page tuples
+ * @ctx - checkpoint context
+ * @vma - vma to scan
+ * @start - start address (updated)
+ *
+ * Returns the number of pages collected
+ */
+static int vma_fill_pgarr(struct ckpt_ctx *ctx,
+			  struct vm_area_struct *vma,
+			  unsigned long *start)
+{
+	unsigned long end = vma->vm_end;
+	unsigned long addr = *start;
+	struct ckpt_pgarr *pgarr;
+	int nr_used;
+	int cnt = 0;
+
+	/* this function is only for private memory (anon or file-mapped) */
+	BUG_ON(vma->vm_flags & (VM_SHARED | VM_MAYSHARE));
+
+	do {
+		pgarr = pgarr_current(ctx);
+		if (!pgarr)
+			return -ENOMEM;
+
+		nr_used = pgarr->nr_used;
+
+		while (addr < end) {
+			struct page *page;
+
+			page = consider_private_page(vma, addr);
+			if (IS_ERR(page))
+				return PTR_ERR(page);
+
+			if (page) {
+				_ckpt_debug(CKPT_DPAGE,
+					    "got page %#lx\n", addr);
+				pgarr->pages[pgarr->nr_used] = page;
+				pgarr->vaddrs[pgarr->nr_used] = addr;
+				pgarr->nr_used++;
+			}
+
+			addr += PAGE_SIZE;
+
+			if (pgarr_is_full(pgarr))
+				break;
+		}
+
+		cnt += pgarr->nr_used - nr_used;
+
+	} while ((cnt < CKPT_PGARR_BATCH) && (addr < end));
+
+	*start = addr;
+	return cnt;
+}
+
+/* dump contents of a pages: use kmap_atomic() to avoid TLB flush */
+static int checkpoint_dump_page(struct ckpt_ctx *ctx,
+				struct page *page, char *buf)
+{
+	void *ptr;
+
+	ptr = kmap_atomic(page, KM_USER1);
+	memcpy(buf, ptr, PAGE_SIZE);
+	kunmap_atomic(ptr, KM_USER1);
+
+	return ckpt_kwrite(ctx, buf, PAGE_SIZE);
+}
+
+/**
+ * vma_dump_pages - dump pages listed in the ctx page-array chain
+ * @ctx - checkpoint context
+ * @total - total number of pages
+ *
+ * First dump all virtual addresses, followed by the contents of all pages
+ */
+static int vma_dump_pages(struct ckpt_ctx *ctx, int total)
+{
+	struct ckpt_pgarr *pgarr;
+	void *buf;
+	int i, ret = 0;
+
+	if (!total)
+		return 0;
+
+	i =  total * (sizeof(unsigned long) + PAGE_SIZE);
+	ret = ckpt_write_obj_type(ctx, NULL, i, CKPT_HDR_BUFFER);
+	if (ret < 0)
+		return ret;
+
+	list_for_each_entry_reverse(pgarr, &ctx->pgarr_list, list) {
+		ret = ckpt_kwrite(ctx, pgarr->vaddrs,
+				  pgarr->nr_used * sizeof(unsigned long));
+		if (ret < 0)
+			return ret;
+	}
+
+	buf = (void *) __get_free_page(GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	list_for_each_entry_reverse(pgarr, &ctx->pgarr_list, list) {
+		for (i = 0; i < pgarr->nr_used; i++) {
+			ret = checkpoint_dump_page(ctx, pgarr->pages[i], buf);
+			if (ret < 0)
+				goto out;
+		}
+	}
+ out:
+	free_page((unsigned long) buf);
+	return ret;
+}
+
+/**
+ * checkpoint_memory_contents - dump contents of a VMA with private memory
+ * @ctx - checkpoint context
+ * @vma - vma to scan
+ *
+ * Collect lists of pages that needs to be dumped, and corresponding
+ * virtual addresses into ctx->pgarr_list page-array chain. Then dump
+ * the addresses, followed by the page contents.
+ */
+static int checkpoint_memory_contents(struct ckpt_ctx *ctx,
+				      struct vm_area_struct *vma)
+{
+	struct ckpt_hdr_pgarr *h;
+	unsigned long addr, end;
+	int cnt, ret;
+
+	addr = vma->vm_start;
+	end = vma->vm_end;
+
+	/*
+	 * Work iteratively, collecting and dumping at most CKPT_PGARR_BATCH
+	 * in each round. Each iterations is divided into two steps:
+	 *
+	 * (1) scan: scan through the PTEs of the vma to collect the pages
+	 * to dump (later we'll also make them COW), while keeping a list
+	 * of pages and their corresponding addresses on ctx->pgarr_list.
+	 *
+	 * (2) dump: write out a header specifying how many pages, followed
+	 * by the addresses of all pages in ctx->pgarr_list, followed by
+	 * the actual contents of all pages. (Then, release the references
+	 * to the pages and reset the page-array chain).
+	 *
+	 * (This split makes the logic simpler by first counting the pages
+	 * that need saving. More importantly, it allows for a future
+	 * optimization that will reduce application downtime by deferring
+	 * the actual write-out of the data to after the application is
+	 * allowed to resume execution).
+	 *
+	 * After dumping the entire contents, conclude with a header that
+	 * specifies 0 pages to mark the end of the contents.
+	 */
+
+	while (addr < end) {
+		cnt = vma_fill_pgarr(ctx, vma, &addr);
+		if (cnt == 0)
+			break;
+		else if (cnt < 0)
+			return cnt;
+
+		ckpt_debug("collected %d pages\n", cnt);
+
+		h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_PGARR);
+		if (!h)
+			return -ENOMEM;
+
+		h->nr_pages = cnt;
+		ret = ckpt_write_obj(ctx, &h->h);
+		ckpt_hdr_put(ctx, h);
+		if (ret < 0)
+			return ret;
+
+		ret = vma_dump_pages(ctx, cnt);
+		if (ret < 0)
+			return ret;
+
+		pgarr_reset_all(ctx);
+	}
+
+	/* mark end of contents with header saying "0" pages */
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_PGARR);
+	if (!h)
+		return -ENOMEM;
+	h->nr_pages = 0;
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+/**
+ * generic_vma_checkpoint - dump metadata of vma
+ * @ctx: checkpoint context
+ * @vma: vma object
+ * @type: vma type
+ * @vma_objref: vma objref
+ */
+int generic_vma_checkpoint(struct ckpt_ctx *ctx, struct vm_area_struct *vma,
+			   enum vma_type type, int vma_objref)
+{
+	struct ckpt_hdr_vma *h;
+	int ret;
+
+	ckpt_debug("vma %#lx-%#lx flags %#lx type %d\n",
+		 vma->vm_start, vma->vm_end, vma->vm_flags, type);
+
+	if (vma->vm_flags & CKPT_VMA_NOT_SUPPORTED) {
+		pr_warning("c/r: unsupported VMA %#lx\n", vma->vm_flags);
+		return -ENOSYS;
+	}
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_VMA);
+	if (!h)
+		return -ENOMEM;
+
+	h->vma_type = type;
+	h->vma_objref = vma_objref;
+	h->vm_start = vma->vm_start;
+	h->vm_end = vma->vm_end;
+	h->vm_page_prot = pgprot_val(vma->vm_page_prot);
+	h->vm_flags = vma->vm_flags;
+	h->vm_pgoff = vma->vm_pgoff;
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+/**
+ * private_vma_checkpoint - dump contents of private (anon, file) vma
+ * @ctx: checkpoint context
+ * @vma: vma object
+ * @type: vma type
+ * @vma_objref: vma objref
+ */
+int private_vma_checkpoint(struct ckpt_ctx *ctx,
+			   struct vm_area_struct *vma,
+			   enum vma_type type, int vma_objref)
+{
+	int ret;
+
+	BUG_ON(vma->vm_flags & (VM_SHARED | VM_MAYSHARE));
+
+	ret = generic_vma_checkpoint(ctx, vma, type, vma_objref);
+	if (ret < 0)
+		goto out;
+	ret = checkpoint_memory_contents(ctx, vma);
+ out:
+	return ret;
+}
+
+/**
+ * anonymous_checkpoint - dump contents of private-anonymous vma
+ * @ctx: checkpoint context
+ * @vma: vma object
+ */
+static int anonymous_checkpoint(struct ckpt_ctx *ctx,
+				struct vm_area_struct *vma)
+{
+	/* should be private anonymous ... verify that this is the case */
+	if (vma->vm_flags & CKPT_VMA_NOT_SUPPORTED) {
+		pr_warning("c/r: unsupported VMA %#lx\n", vma->vm_flags);
+		return -ENOSYS;
+	}
+
+	BUG_ON(vma->vm_file);
+
+	return private_vma_checkpoint(ctx, vma, CKPT_VMA_ANON, 0);
+}
+
+static int do_checkpoint_mm(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+	struct ckpt_hdr_mm *h;
+	struct vm_area_struct *vma;
+	int exe_objref = 0;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_MM);
+	if (!h)
+		return -ENOMEM;
+
+	down_read(&mm->mmap_sem);
+
+	/* FIX: need also mm->flags */
+
+	h->start_code = mm->start_code;
+	h->end_code = mm->end_code;
+	h->start_data = mm->start_data;
+	h->end_data = mm->end_data;
+	h->start_brk = mm->start_brk;
+	h->brk = mm->brk;
+	h->start_stack = mm->start_stack;
+	h->arg_start = mm->arg_start;
+	h->arg_end = mm->arg_end;
+	h->env_start = mm->env_start;
+	h->env_end = mm->env_end;
+
+	h->map_count = mm->map_count;
+
+	/* checkpoint the ->exe_file */
+	if (mm->exe_file) {
+		exe_objref = checkpoint_obj(ctx, mm->exe_file, CKPT_OBJ_FILE);
+		if (exe_objref < 0) {
+			ret = exe_objref;
+			goto out;
+		}
+		h->exe_objref = exe_objref;
+	}
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	if (ret < 0)
+		goto out;
+
+	/* write the vma's */
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		ckpt_debug("vma %#lx-%#lx flags %#lx\n",
+			 vma->vm_start, vma->vm_end, vma->vm_flags);
+		if (!vma->vm_ops)
+			ret = anonymous_checkpoint(ctx, vma);
+		else if (vma->vm_ops->checkpoint)
+			ret = (*vma->vm_ops->checkpoint)(ctx, vma);
+		else
+			ret = -ENOSYS;
+		if (ret < 0)
+			goto out;
+	}
+
+	ret = checkpoint_mm_context(ctx, mm);
+ out:
+	ckpt_hdr_put(ctx, h);
+	up_read(&mm->mmap_sem);
+	return ret;
+}
+
+int checkpoint_mm(struct ckpt_ctx *ctx, void *ptr)
+{
+	return do_checkpoint_mm(ctx, (struct mm_struct *) ptr);
+}
+
+int checkpoint_obj_mm(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct mm_struct *mm;
+	int objref;
+
+	mm = get_task_mm(t);
+	objref = checkpoint_obj(ctx, mm, CKPT_OBJ_MM);
+	mmput(mm);
+
+	return objref;
+}
diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
index f5c9f7c..4113c7d 100644
--- a/checkpoint/objhash.c
+++ b/checkpoint/objhash.c
@@ -77,6 +77,17 @@ static void obj_file_drop(void *ptr)
 	fput((struct file *) ptr);
 }
 
+static int obj_mm_grab(void *ptr)
+{
+	atomic_inc(&((struct mm_struct *) ptr)->mm_users);
+	return 0;
+}
+
+static void obj_mm_drop(void *ptr)
+{
+	mmput((struct mm_struct *) ptr);
+}
+
 static struct ckpt_obj_ops ckpt_obj_ops[] = {
 	/* ignored object */
 	{
@@ -103,6 +114,14 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = {
 		.checkpoint = checkpoint_file,
 		.restore = restore_file,
 	},
+	/* mm object */
+	{
+		.obj_name = "MM",
+		.obj_type = CKPT_OBJ_MM,
+		.ref_drop = obj_mm_drop,
+		.ref_grab = obj_mm_grab,
+		.checkpoint = checkpoint_mm,
+	},
 };
 
 
diff --git a/checkpoint/process.c b/checkpoint/process.c
index 5fd1573..a0b1bf9 100644
--- a/checkpoint/process.c
+++ b/checkpoint/process.c
@@ -50,6 +50,7 @@ static int checkpoint_task_objs(struct ckpt_ctx *ctx, struct task_struct *t)
 {
 	struct ckpt_hdr_task_objs *h;
 	int files_objref;
+	int mm_objref;
 	int ret;
 
 	files_objref = checkpoint_obj_file_table(ctx, t);
@@ -60,10 +61,19 @@ static int checkpoint_task_objs(struct ckpt_ctx *ctx, struct task_struct *t)
 		return files_objref;
 	}
 
+	mm_objref = checkpoint_obj_mm(ctx, t);
+	ckpt_debug("mm: objref %d\n", mm_objref);
+	if (mm_objref < 0) {
+		ckpt_write_err(ctx, "task %d (%s), mm_struct: %d",
+			       task_pid_vnr(t), t->comm, mm_objref);
+		return mm_objref;
+	}
+
 	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TASK_OBJS);
 	if (!h)
 		return -ENOMEM;
 	h->files_objref = files_objref;
+	h->mm_objref = mm_objref;
 	ret = ckpt_write_obj(ctx, &h->h);
 	ckpt_hdr_put(ctx, h);
 
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index c2d3d90..7bf70e4 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -171,6 +171,7 @@ static void ckpt_ctx_free(struct ckpt_ctx *ctx)
 
 	ckpt_obj_hash_free(ctx);
 	path_put(&ctx->fs_mnt);
+	ckpt_pgarr_free(ctx);
 
 	kfree(ctx);
 }
@@ -188,6 +189,9 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
 	ctx->uflags = uflags;
 	ctx->kflags = kflags;
 
+	INIT_LIST_HEAD(&ctx->pgarr_list);
+	INIT_LIST_HEAD(&ctx->pgarr_pool);
+
 	err = -EBADF;
 	ctx->file = fget(fd);
 	if (!ctx->file)
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index bb14a66..30c22f6 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -62,6 +62,7 @@ extern int restore_task(struct ckpt_ctx *ctx);
 extern int checkpoint_write_header_arch(struct ckpt_ctx *ctx);
 extern int checkpoint_thread(struct ckpt_ctx *ctx, struct task_struct *t);
 extern int checkpoint_cpu(struct ckpt_ctx *ctx, struct task_struct *t);
+extern int checkpoint_mm_context(struct ckpt_ctx *ctx, struct mm_struct *mm);
 
 extern int restore_read_header_arch(struct ckpt_ctx *ctx);
 extern int restore_thread(struct ckpt_ctx *ctx);
@@ -83,6 +84,27 @@ extern int checkpoint_file_common(struct ckpt_ctx *ctx, struct file *file,
 extern int restore_file_common(struct ckpt_ctx *ctx, struct file *file,
 			       struct ckpt_hdr_file *h);
 
+/* memory */
+extern void ckpt_pgarr_free(struct ckpt_ctx *ctx);
+
+extern int generic_vma_checkpoint(struct ckpt_ctx *ctx,
+				  struct vm_area_struct *vma,
+				  enum vma_type type,
+				  int vma_objref);
+extern int private_vma_checkpoint(struct ckpt_ctx *ctx,
+				  struct vm_area_struct *vma,
+				  enum vma_type type,
+				  int vma_objref);
+
+extern int checkpoint_obj_mm(struct ckpt_ctx *ctx, struct task_struct *t);
+extern int checkpoint_mm(struct ckpt_ctx *ctx, void *ptr);
+
+#define CKPT_VMA_NOT_SUPPORTED					\
+	(VM_SHARED | VM_MAYSHARE | VM_IO | VM_HUGETLB |		\
+	 VM_NONLINEAR | VM_PFNMAP | VM_RESERVED | VM_NORESERVE	\
+	 | VM_HUGETLB | VM_NONLINEAR | VM_MAPPED_COPY |		\
+	 VM_INSERTPAGE | VM_MIXEDMAP | VM_SAO)
+
 
 /* debugging flags */
 #define CKPT_DBASE	0x1		/* anything */
@@ -90,8 +112,10 @@ extern int restore_file_common(struct ckpt_ctx *ctx, struct file *file,
 #define CKPT_DRW	0x4		/* image read/write */
 #define CKPT_DOBJ	0x8		/* shared objects */
 #define CKPT_DFILE	0x10		/* files and filesystem */
+#define CKPT_DMEM	0x20		/* memory state */
+#define CKPT_DPAGE	0x40		/* memory pages */
 
-#define CKPT_DDEFAULT	0x17		/* default debug level */
+#define CKPT_DDEFAULT	0x37		/* default debug level */
 
 #ifndef CKPT_DFLAG
 #define CKPT_DFLAG	0x0		/* nothing */
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 5b4fc52..f8a4173 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -58,6 +58,11 @@ enum {
 	CKPT_HDR_FILE_NAME,
 	CKPT_HDR_FILE,
 
+	CKPT_HDR_MM = 401,
+	CKPT_HDR_VMA,
+	CKPT_HDR_PGARR,
+	CKPT_HDR_MM_CONTEXT,
+
 	CKPT_HDR_TAIL = 9001,
 
 	CKPT_HDR_ERROR = 9999,
@@ -80,6 +85,7 @@ enum obj_type {
 	CKPT_OBJ_IGNORE = 0,
 	CKPT_OBJ_FILE_TABLE,
 	CKPT_OBJ_FILE,
+	CKPT_OBJ_MM,
 	CKPT_OBJ_MAX
 };
 
@@ -133,6 +139,7 @@ struct ckpt_hdr_task {
 struct ckpt_hdr_task_objs {
 	struct ckpt_hdr h;
 	__s32 files_objref;
+	__s32 mm_objref;
 } __attribute__((aligned(8)));
 
 /* file system */
@@ -170,4 +177,44 @@ struct ckpt_hdr_file_generic {
 	struct ckpt_hdr_file common;
 } __attribute__((aligned(8)));
 
+/* memory layout */
+struct ckpt_hdr_mm {
+	struct ckpt_hdr h;
+	__u32 map_count;
+	__s32 exe_objref;
+
+	__u64 start_code, end_code, start_data, end_data;
+	__u64 start_brk, brk, start_stack;
+	__u64 arg_start, arg_end, env_start, env_end;
+} __attribute__((aligned(8)));
+
+/* vma subtypes */
+enum vma_type {
+	CKPT_VMA_IGNORE = 0,
+	CKPT_VMA_VDSO,		/* special vdso vma */
+	CKPT_VMA_ANON,		/* private anonymous */
+	CKPT_VMA_FILE,		/* private mapped file */
+	CKPT_VMA_MAX
+};
+
+/* vma descriptor */
+struct ckpt_hdr_vma {
+	struct ckpt_hdr h;
+	__u32 vma_type;
+	__s32 vma_objref;	/* objref of backing file */
+
+	__u64 vm_start;
+	__u64 vm_end;
+	__u64 vm_page_prot;
+	__u64 vm_flags;
+	__u64 vm_pgoff;
+} __attribute__((aligned(8)));
+
+/* page array */
+struct ckpt_hdr_pgarr {
+	struct ckpt_hdr h;
+	__u64 nr_pages;		/* number of pages to saved */
+} __attribute__((aligned(8)));
+
+
 #endif /* _CHECKPOINT_CKPT_HDR_H_ */
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index 067d579..c30d7bf 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -40,6 +40,9 @@ struct ckpt_ctx {
 	struct path fs_mnt;     /* container root (FIXME) */
 
 	char err_string[256];	/* checkpoint: error string */
+
+	struct list_head pgarr_list;	/* page array to dump VMA contents */
+	struct list_head pgarr_pool;	/* pool of empty page arrays chain */
 };
 
 /* ckpt_ctx: kflags */
diff --git a/mm/filemap.c b/mm/filemap.c
index 379ff0b..fdeb4b7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -36,6 +36,10 @@
 #include <linux/mm_inline.h> /* for page_is_file_cache() */
 #include "internal.h"
 
+#include <linux/checkpoint_types.h>
+#include <linux/checkpoint_hdr.h>
+#include <linux/checkpoint.h>
+
 /*
  * FIXME: remove all knowledge of the buffer layer from the core VM
  */
@@ -1625,8 +1629,32 @@ page_not_uptodate:
 }
 EXPORT_SYMBOL(filemap_fault);
 
+#ifdef CONFIG_CHECKPOINT
+static int filemap_checkpoint(struct ckpt_ctx *ctx, struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+	int vma_objref;
+
+	if (vma->vm_flags & CKPT_VMA_NOT_SUPPORTED) {
+		pr_warning("c/r: unsupported VMA %#lx\n", vma->vm_flags);
+		return -ENOSYS;
+	}
+
+	BUG_ON(!file);
+
+	vma_objref = checkpoint_obj(ctx, file, CKPT_OBJ_FILE);
+	if (vma_objref < 0)
+		return vma_objref;
+
+	return private_vma_checkpoint(ctx, vma, CKPT_VMA_FILE, vma_objref);
+}
+#endif /* CONFIG_CHECKPOINT */
+
 struct vm_operations_struct generic_file_vm_ops = {
 	.fault		= filemap_fault,
+#ifdef CONFIG_CHECKPOINT
+	.checkpoint	= filemap_checkpoint,
+#endif
 };
 
 /* This is used for a general mmap of a disk file */
diff --git a/mm/mmap.c b/mm/mmap.c
index 6b7b1a9..003354b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -34,6 +34,10 @@
 #include <asm/tlb.h>
 #include <asm/mmu_context.h>
 
+#include <linux/checkpoint_types.h>
+#include <linux/checkpoint_hdr.h>
+#include <linux/checkpoint.h>
+
 #include "internal.h"
 
 #ifndef arch_mmap_check
@@ -2264,9 +2268,36 @@ static void special_mapping_close(struct vm_area_struct *vma)
 {
 }
 
+#ifdef CONFIG_CHECKPOINT
+static int special_mapping_checkpoint(struct ckpt_ctx *ctx,
+				      struct vm_area_struct *vma)
+{
+	const char *name;
+
+	/*
+	 * FIX:
+	 * Currently, we only handle VDSO/vsyscall special handling.
+	 * Even that, is very basic - we just skip the contents and
+	 * hope for the best in terms of compatilibity upon restart.
+	 */
+
+	if (vma->vm_flags & CKPT_VMA_NOT_SUPPORTED)
+		return -ENOSYS;
+
+	name = arch_vma_name(vma);
+	if (!name || strcmp(name, "[vdso]"))
+		return -ENOSYS;
+
+	return generic_vma_checkpoint(ctx, vma, CKPT_VMA_VDSO, 0);
+}
+#endif /* CONFIG_CHECKPOINT */
+
 static struct vm_operations_struct special_mapping_vmops = {
 	.close = special_mapping_close,
 	.fault = special_mapping_fault,
+#ifdef CONFIG_CHECKPOINT
+	.checkpoint = special_mapping_checkpoint,
+#endif
 };
 
 /*
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 17/43] c/r: dump anonymous- and file-mapped- shared memory
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (12 preceding siblings ...)
  2009-05-27 17:32 ` [RFC v16][PATCH 14/43] c/r: dump memory address space (private memory) Oren Laadan
@ 2009-05-27 17:32 ` Oren Laadan
  2009-05-27 17:32 ` [RFC v16][PATCH 19/43] c/r: external checkpoint of a task other than ourself Oren Laadan
                   ` (20 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
We now handle anonymous and file-mapped shared memory. Support for IPC
shared memory requires support for IPC first. We extend ckpt_write_vma()
to detect shared memory VMAs and handle it separately than private
memory.
There is not much to do for file-mapped shared memory, except to force
msync() on the region to ensure that the file system is consistent
with the checkpoint image. Use our internal type CKPT_VMA_SHM_FILE.
Anonymous shared memory is always backed by inode in shmem filesystem.
We use that inode to look up the VMA in the objhash and register it if
not found (on first encounter). In this case, the type of the VMA is
CKPT_VMA_SHM_ANON, and we dump the contents. On the other hand, if it is
found there, we must have already saved it before, so we change the
type to CKPT_VMA_SHM_ANON_SKIP and skip it.
To dump the contents of a shmem VMA, we loop through the pages of the
inode in the shmem filesystem, and dump the contents of each dirty
(allocated) page - unallocated pages must be clean.
Note that we save the original size of a shmem VMA because it may have
been re-mapped partially. The format itself remains like with private
VMAs, except that instead of addresses we record _indices_ (page nr)
into the backing inode.
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
 checkpoint/memory.c            |  143 +++++++++++++++++++++++++++++++++++----
 checkpoint/objhash.c           |   29 ++++++++
 include/linux/checkpoint.h     |   15 +++--
 include/linux/checkpoint_hdr.h |    8 ++
 mm/filemap.c                   |   39 +++++++++++-
 mm/mmap.c                      |    2 +-
 mm/shmem.c                     |   33 +++++++++
 7 files changed, 246 insertions(+), 23 deletions(-)
diff --git a/checkpoint/memory.c b/checkpoint/memory.c
index 99bafaa..2b73abc 100644
--- a/checkpoint/memory.c
+++ b/checkpoint/memory.c
@@ -21,6 +21,7 @@
 #include <linux/pagemap.h>
 #include <linux/mm_types.h>
 #include <linux/proc_fs.h>
+#include <linux/swap.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
 
@@ -281,6 +282,54 @@ static struct page *consider_private_page(struct vm_area_struct *vma,
 }
 
 /**
+ * consider_shared_page - return page pointer for dirty pages
+ * @ino - inode of shmem object
+ * @idx - page index in shmem object
+ *
+ * Looks up the page that corresponds to the index in the shmem object,
+ * and returns the page if it was modified (and grabs a reference to it),
+ * or otherwise returns NULL (or error).
+ */
+static struct page *consider_shared_page(struct inode *ino, unsigned long idx)
+{
+	struct page *page = NULL;
+	int ret;
+
+	/*
+	 * Inspired by do_shmem_file_read(): very simplified version.
+	 *
+	 * FIXME: consolidate with do_shmem_file_read()
+	 */
+
+	ret = shmem_getpage(ino, idx, &page, SGP_READ, NULL);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	/*
+	 * Only care about dirty pages; shmem_getpage() only returns
+	 * pages that have been allocated, so they must be dirty. The
+	 * pages returned are locked and referenced.
+	 */
+
+	if (page) {
+		unlock_page(page);
+		/*
+		 * If users can be writing to this page using arbitrary
+		 * virtual addresses, take care about potential aliasing
+		 * before reading the page on the kernel side.
+		 */
+		if (mapping_writably_mapped(ino->i_mapping))
+			flush_dcache_page(page);
+		/*
+		 * Mark the page accessed if we read the beginning.
+		 */
+		mark_page_accessed(page);
+	}
+
+	return page;
+}
+
+/**
  * vma_fill_pgarr - fill a page-array with addr/page tuples
  * @ctx - checkpoint context
  * @vma - vma to scan
@@ -289,17 +338,16 @@ static struct page *consider_private_page(struct vm_area_struct *vma,
  * Returns the number of pages collected
  */
 static int vma_fill_pgarr(struct ckpt_ctx *ctx,
-			  struct vm_area_struct *vma,
-			  unsigned long *start)
+			  struct vm_area_struct *vma, struct inode *inode,
+			  unsigned long *start, unsigned long end)
 {
-	unsigned long end = vma->vm_end;
 	unsigned long addr = *start;
 	struct ckpt_pgarr *pgarr;
 	int nr_used;
 	int cnt = 0;
 
 	/* this function is only for private memory (anon or file-mapped) */
-	BUG_ON(vma->vm_flags & (VM_SHARED | VM_MAYSHARE));
+	BUG_ON(inode && vma);
 
 	do {
 		pgarr = pgarr_current(ctx);
@@ -311,7 +359,11 @@ static int vma_fill_pgarr(struct ckpt_ctx *ctx,
 		while (addr < end) {
 			struct page *page;
 
-			page = consider_private_page(vma, addr);
+			if (vma)
+				page = consider_private_page(vma, addr);
+			else
+				page = consider_shared_page(inode, addr);
+
 			if (IS_ERR(page))
 				return PTR_ERR(page);
 
@@ -323,7 +375,10 @@ static int vma_fill_pgarr(struct ckpt_ctx *ctx,
 				pgarr->nr_used++;
 			}
 
-			addr += PAGE_SIZE;
+			if (vma)
+				addr += PAGE_SIZE;
+			else
+				addr++;
 
 			if (pgarr_is_full(pgarr))
 				break;
@@ -395,23 +450,32 @@ static int vma_dump_pages(struct ckpt_ctx *ctx, int total)
 }
 
 /**
- * checkpoint_memory_contents - dump contents of a VMA with private memory
+ * checkpoint_memory_contents - dump contents of a memory region
  * @ctx - checkpoint context
- * @vma - vma to scan
+ * @vma - vma to scan (--or--)
+ * @inode - inode to scan
  *
  * Collect lists of pages that needs to be dumped, and corresponding
  * virtual addresses into ctx->pgarr_list page-array chain. Then dump
  * the addresses, followed by the page contents.
  */
 static int checkpoint_memory_contents(struct ckpt_ctx *ctx,
-				      struct vm_area_struct *vma)
+				      struct vm_area_struct *vma,
+				      struct inode *inode)
 {
 	struct ckpt_hdr_pgarr *h;
 	unsigned long addr, end;
 	int cnt, ret;
 
-	addr = vma->vm_start;
-	end = vma->vm_end;
+	BUG_ON(vma && inode);
+
+	if (vma) {
+		addr = vma->vm_start;
+		end = vma->vm_end;
+	} else {
+		addr = 0;
+		end = PAGE_ALIGN(i_size_read(inode)) >> PAGE_CACHE_SHIFT;
+	}
 
 	/*
 	 * Work iteratively, collecting and dumping at most CKPT_PGARR_BATCH
@@ -437,7 +501,7 @@ static int checkpoint_memory_contents(struct ckpt_ctx *ctx,
 	 */
 
 	while (addr < end) {
-		cnt = vma_fill_pgarr(ctx, vma, &addr);
+		cnt = vma_fill_pgarr(ctx, vma, inode, &addr, end);
 		if (cnt == 0)
 			break;
 		else if (cnt < 0)
@@ -481,7 +545,7 @@ static int checkpoint_memory_contents(struct ckpt_ctx *ctx,
  * @vma_objref: vma objref
  */
 int generic_vma_checkpoint(struct ckpt_ctx *ctx, struct vm_area_struct *vma,
-			   enum vma_type type, int vma_objref)
+			   enum vma_type type, int vma_objref, int ino_objref)
 {
 	struct ckpt_hdr_vma *h;
 	int ret;
@@ -500,6 +564,13 @@ int generic_vma_checkpoint(struct ckpt_ctx *ctx, struct vm_area_struct *vma,
 
 	h->vma_type = type;
 	h->vma_objref = vma_objref;
+	h->ino_objref = ino_objref;
+
+	if (vma->vm_file)
+		h->ino_size = i_size_read(vma->vm_file->f_dentry->d_inode);
+	else
+		h->ino_size = 0;
+
 	h->vm_start = vma->vm_start;
 	h->vm_end = vma->vm_end;
 	h->vm_page_prot = pgprot_val(vma->vm_page_prot);
@@ -527,10 +598,37 @@ int private_vma_checkpoint(struct ckpt_ctx *ctx,
 
 	BUG_ON(vma->vm_flags & (VM_SHARED | VM_MAYSHARE));
 
-	ret = generic_vma_checkpoint(ctx, vma, type, vma_objref);
+	ret = generic_vma_checkpoint(ctx, vma, type, vma_objref, 0);
+	if (ret < 0)
+		goto out;
+	ret = checkpoint_memory_contents(ctx, vma, NULL);
+ out:
+	return ret;
+}
+
+/**
+ * shmem_vma_checkpoint - dump contents of private (anon, file) vma
+ * @ctx: checkpoint context
+ * @vma: vma object
+ * @type: vma type
+ * @objref: vma object id
+ */
+int shmem_vma_checkpoint(struct ckpt_ctx *ctx, struct vm_area_struct *vma,
+			 enum vma_type type, int ino_objref)
+{
+	struct file *file = vma->vm_file;
+	int ret;
+
+	ckpt_debug("type %d, ino_ref %d\n", type, ino_objref);
+	BUG_ON(!(vma->vm_flags & (VM_SHARED | VM_MAYSHARE)));
+	BUG_ON(!file);
+
+	ret = generic_vma_checkpoint(ctx, vma, type, 0, ino_objref);
 	if (ret < 0)
 		goto out;
-	ret = checkpoint_memory_contents(ctx, vma);
+	if (type == CKPT_VMA_SHM_ANON_SKIP)
+		goto out;
+	ret = checkpoint_memory_contents(ctx, NULL, file->f_dentry->d_inode);
  out:
 	return ret;
 }
@@ -929,6 +1027,21 @@ static struct restore_vma_ops restore_vma_ops[] = {
 		.vma_type = CKPT_VMA_FILE,
 		.restore = filemap_restore,
 	},
+	/* anonymous shared */
+	{
+		.vma_name = "ANON SHARED",
+		.vma_type = CKPT_VMA_SHM_ANON,
+	},
+	/* anonymous shared (skipped) */
+	{
+		.vma_name = "ANON SHARED (skip)",
+		.vma_type = CKPT_VMA_SHM_ANON_SKIP,
+	},
+	/* file-mapped shared */
+	{
+		.vma_name = "FILE SHARED",
+		.vma_type = CKPT_VMA_SHM_FILE,
+	},
 };
 
 /**
diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
index 79325d9..ff9388d 100644
--- a/checkpoint/objhash.c
+++ b/checkpoint/objhash.c
@@ -43,6 +43,16 @@ struct ckpt_obj_hash {
 	int next_free_objref;
 };
 
+int checkpoint_bad(struct ckpt_ctx *ctx, void *ptr)
+{
+	BUG();
+}
+
+void *restore_bad(struct ckpt_ctx *ctx)
+{
+	return ERR_PTR(-EINVAL);
+}
+
 /* helper grab/drop functions: */
 
 static void obj_no_drop(void *ptr)
@@ -55,6 +65,16 @@ static int obj_no_grab(void *ptr)
 	return 0;
 }
 
+static int obj_inode_grab(void *ptr)
+{
+	return igrab((struct inode *) ptr) ? 0 : -EBADF;
+}
+
+static void obj_inode_drop(void *ptr)
+{
+	iput((struct inode *) ptr);
+}
+
 static int obj_file_table_grab(void *ptr)
 {
 	atomic_inc(&((struct files_struct *) ptr)->count);
@@ -96,6 +116,15 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = {
 		.ref_drop = obj_no_drop,
 		.ref_grab = obj_no_grab,
 	},
+	/* inode object */
+	{
+		.obj_name = "INODE",
+		.obj_type = CKPT_OBJ_INODE,
+		.ref_drop = obj_inode_drop,
+		.ref_grab = obj_inode_grab,
+		.checkpoint = checkpoint_bad,	/* no c/r at inode level */
+		.restore = restore_bad,		/* no c/r at inode level */
+	},
 	/* files_struct object */
 	{
 		.obj_name = "FILE_TABLE",
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index a22eb65..18b4941 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -91,11 +91,15 @@ extern void ckpt_pgarr_free(struct ckpt_ctx *ctx);
 extern int generic_vma_checkpoint(struct ckpt_ctx *ctx,
 				  struct vm_area_struct *vma,
 				  enum vma_type type,
-				  int vma_objref);
+				  int vma_objref, int ino_objref);
 extern int private_vma_checkpoint(struct ckpt_ctx *ctx,
 				  struct vm_area_struct *vma,
 				  enum vma_type type,
 				  int vma_objref);
+extern int shmem_vma_checkpoint(struct ckpt_ctx *ctx,
+				struct vm_area_struct *vma,
+				enum vma_type type,
+				int ino_objref);
 
 extern int checkpoint_obj_mm(struct ckpt_ctx *ctx, struct task_struct *t);
 extern int restore_obj_mm(struct ckpt_ctx *ctx, int mm_objref);
@@ -106,11 +110,10 @@ extern int private_vma_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
 			       struct file *file, struct ckpt_hdr_vma *h);
 
 
-#define CKPT_VMA_NOT_SUPPORTED					\
-	(VM_SHARED | VM_MAYSHARE | VM_IO | VM_HUGETLB |		\
-	 VM_NONLINEAR | VM_PFNMAP | VM_RESERVED | VM_NORESERVE	\
-	 | VM_HUGETLB | VM_NONLINEAR | VM_MAPPED_COPY |		\
-	 VM_INSERTPAGE | VM_MIXEDMAP | VM_SAO)
+#define CKPT_VMA_NOT_SUPPORTED						\
+	(VM_IO | VM_HUGETLB | VM_NONLINEAR | VM_PFNMAP |		\
+	 VM_RESERVED | VM_NORESERVE | VM_HUGETLB | VM_NONLINEAR |	\
+	 VM_MAPPED_COPY | VM_INSERTPAGE | VM_MIXEDMAP | VM_SAO)
 
 
 /* debugging flags */
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 671dcab..6ab3c8b 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -83,6 +83,7 @@ struct ckpt_hdr_objref {
 /* shared objects types */
 enum obj_type {
 	CKPT_OBJ_IGNORE = 0,
+	CKPT_OBJ_INODE,
 	CKPT_OBJ_FILE_TABLE,
 	CKPT_OBJ_FILE,
 	CKPT_OBJ_MM,
@@ -138,6 +139,7 @@ struct ckpt_hdr_task {
 /* task's shared resources */
 struct ckpt_hdr_task_objs {
 	struct ckpt_hdr h;
+
 	__s32 files_objref;
 	__s32 mm_objref;
 } __attribute__((aligned(8)));
@@ -194,6 +196,9 @@ enum vma_type {
 	CKPT_VMA_VDSO,		/* special vdso vma */
 	CKPT_VMA_ANON,		/* private anonymous */
 	CKPT_VMA_FILE,		/* private mapped file */
+	CKPT_VMA_SHM_ANON,	/* shared anonymous */
+	CKPT_VMA_SHM_ANON_SKIP,	/* shared anonymous (skip contents) */
+	CKPT_VMA_SHM_FILE,	/* shared mapped file, only msync */
 	CKPT_VMA_MAX
 };
 
@@ -202,6 +207,9 @@ struct ckpt_hdr_vma {
 	struct ckpt_hdr h;
 	__u32 vma_type;
 	__s32 vma_objref;	/* objref of backing file */
+	__s32 ino_objref;	/* objref of shared segment */
+	__u32 _padding;
+	__u64 ino_size;		/* size of shared segment */
 
 	__u64 vm_start;
 	__u64 vm_end;
diff --git a/mm/filemap.c b/mm/filemap.c
index 86da5d5..0d28481 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1634,6 +1634,8 @@ static int filemap_checkpoint(struct ckpt_ctx *ctx, struct vm_area_struct *vma)
 {
 	struct file *file = vma->vm_file;
 	int vma_objref;
+	int ino_objref;
+	int first, ret;
 
 	if (vma->vm_flags & CKPT_VMA_NOT_SUPPORTED) {
 		pr_warning("c/r: unsupported VMA %#lx\n", vma->vm_flags);
@@ -1646,7 +1648,42 @@ static int filemap_checkpoint(struct ckpt_ctx *ctx, struct vm_area_struct *vma)
 	if (vma_objref < 0)
 		return vma_objref;
 
-	return private_vma_checkpoint(ctx, vma, CKPT_VMA_FILE, vma_objref);
+	if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
+		/*
+		 * Citing mmap(2): "Updates to the mapping are visible
+		 * to other processes that map this file, and are
+		 * carried through to the underlying file. The file
+		 * may not actually be updated until msync(2) or
+		 * munmap(2) is called"
+		 *
+		 * Citing msync(2): "Without use of this call there is
+		 * no guarantee that changes are written back before
+		 * munmap(2) is called."
+		 *
+		 * Force msync for region of shared mapped files, to
+		 * ensure that that the file system is consistent with
+		 * the checkpoint image.  (inspired by sys_msync).
+		 */
+
+		ino_objref = ckpt_obj_lookup_add(ctx, file->f_dentry->d_inode,
+					       CKPT_OBJ_INODE, &first);
+		if (ino_objref < 0)
+			return ino_objref;
+
+		if (first) {
+			ret = vfs_fsync(file, file->f_path.dentry, 0);
+			if (ret < 0)
+				return ret;
+		}
+
+		ret = generic_vma_checkpoint(ctx, vma, CKPT_VMA_SHM_FILE,
+					     vma_objref, ino_objref);
+	} else {
+		ret = private_vma_checkpoint(ctx, vma, CKPT_VMA_FILE,
+					     vma_objref);
+	}
+
+	return ret;
 }
 
 int filemap_restore(struct ckpt_ctx *ctx,
diff --git a/mm/mmap.c b/mm/mmap.c
index e8e9124..0820a9b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2296,7 +2296,7 @@ static int special_mapping_checkpoint(struct ckpt_ctx *ctx,
 	if (!name || strcmp(name, "[vdso]"))
 		return -ENOSYS;
 
-	return generic_vma_checkpoint(ctx, vma, CKPT_VMA_VDSO, 0);
+	return generic_vma_checkpoint(ctx, vma, CKPT_VMA_VDSO, 0, 0);
 }
 
 int special_mapping_restore(struct ckpt_ctx *ctx,
diff --git a/mm/shmem.c b/mm/shmem.c
index f260336..d349c10 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -31,6 +31,10 @@
 #include <linux/swap.h>
 #include <linux/ima.h>
 
+#include <linux/checkpoint_types.h>
+#include <linux/checkpoint_hdr.h>
+#include <linux/checkpoint.h>
+
 static struct vfsmount *shm_mnt;
 
 #ifdef CONFIG_SHMEM
@@ -2381,6 +2385,32 @@ static void shmem_destroy_inode(struct inode *inode)
 	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
 }
 
+#ifdef CONFIG_CHECKPOINT
+static int shmem_checkpoint(struct ckpt_ctx *ctx, struct vm_area_struct *vma)
+{
+	enum vma_type vma_type;
+	int ino_objref;
+	int first;
+
+	/* should be private anonymous ... verify that this is the case */
+	if (vma->vm_flags & CKPT_VMA_NOT_SUPPORTED) {
+		pr_warning("c/r: unsupported VMA %#lx\n", vma->vm_flags);
+		return -ENOSYS;
+	}
+
+	BUG_ON(!vma->vm_file);
+
+	ino_objref = ckpt_obj_lookup_add(ctx, vma->vm_file->f_dentry->d_inode,
+					 CKPT_OBJ_INODE, &first);
+	if (ino_objref < 0)
+		return ino_objref;
+
+	vma_type = (first ? CKPT_VMA_SHM_ANON : CKPT_VMA_SHM_ANON_SKIP);
+
+	return shmem_vma_checkpoint(ctx, vma, vma_type, ino_objref);
+}
+#endif /* CONFIG_CHECKPOINT */
+
 static void init_once(void *foo)
 {
 	struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
@@ -2496,6 +2526,9 @@ static struct vm_operations_struct shmem_vm_ops = {
 	.set_policy     = shmem_set_policy,
 	.get_policy     = shmem_get_policy,
 #endif
+#ifdef CONFIG_CHECKPOINT
+	.checkpoint	= shmem_checkpoint,
+#endif
 };
 
 
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 19/43] c/r: external checkpoint of a task other than ourself
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (13 preceding siblings ...)
  2009-05-27 17:32 ` [RFC v16][PATCH 17/43] c/r: dump anonymous- and file-mapped- shared memory Oren Laadan
@ 2009-05-27 17:32 ` Oren Laadan
  2009-05-27 21:19   ` Alexey Dobriyan
  2009-05-27 17:32 ` [RFC v16][PATCH 20/43] c/r: export functionality used in next patch for restart-blocks Oren Laadan
                   ` (19 subsequent siblings)
  34 siblings, 1 reply; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
Now we can do "external" checkpoint, i.e. act on another task.
sys_checkpoint() now looks up the target pid (in our namespace) and
checkpoints that corresponding task. That task should be the root of
a container, unless CHECKPOINT_SUBTREE flag is given.
sys_restart() remains the same, as the restart is always done in the
context of the restarting task.
Changelog[v16]:
  - Use CHECKPOINT_SUBTREE to allow subtree (partial container)
Changelog[v14]:
  - Refuse non-self checkpoint if target task isn't frozen
Changelog[v12]:
  - Replace obsolete ckpt_debug() with pr_debug()
Changelog[v11]:
  - Copy contents of 'init->fs->root' instead of pointing to them
Changelog[v10]:
  - Grab vfs root of container init, rather than current process
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
 checkpoint/checkpoint.c          |   79 +++++++++++++++++++++++++++++++++++++-
 checkpoint/restart.c             |    4 +-
 checkpoint/sys.c                 |    6 +++
 include/linux/checkpoint_types.h |    2 +
 4 files changed, 87 insertions(+), 4 deletions(-)
diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index a346b7e..086f2d9 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -12,6 +12,9 @@
 #define CKPT_DFLAG  CKPT_DSYS
 
 #include <linux/version.h>
+#include <linux/sched.h>
+#include <linux/freezer.h>
+#include <linux/ptrace.h>
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/file.h>
@@ -245,23 +248,95 @@ static int checkpoint_write_tail(struct ckpt_ctx *ctx)
 	return ret;
 }
 
+static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	if (t->state == TASK_DEAD) {
+		pr_warning("c/r: task %d is TASK_DEAD\n", task_pid_vnr(t));
+		return -EAGAIN;
+	}
+
+	if (!ptrace_may_access(t, PTRACE_MODE_READ)) {
+		__ckpt_write_err(ctx, "access to task %d (%s) denied",
+				 task_pid_vnr(t), t->comm);
+		return -EPERM;
+	}
+
+	/* verify that the task is frozen (unless self) */
+	if (t != current && !frozen(t)) {
+		__ckpt_write_err(ctx, "task %d (%s) is not frozen",
+				 task_pid_vnr(t), t->comm);
+		return -EBUSY;
+	}
+
+	/* FIX: add support for ptraced tasks */
+	if (task_ptrace(t)) {
+		__ckpt_write_err(ctx, "task %d (%s) is ptraced",
+				 task_pid_vnr(t), t->comm);
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+static int get_container(struct ckpt_ctx *ctx, pid_t pid)
+{
+	struct task_struct *task = NULL;
+	struct nsproxy *nsproxy = NULL;
+	int ret;
+
+	ctx->root_pid = pid;
+
+	read_lock(&tasklist_lock);
+	task = find_task_by_vpid(pid);
+	if (task)
+		get_task_struct(task);
+	read_unlock(&tasklist_lock);
+
+	if (!task)
+		return -ESRCH;
+
+	ret = may_checkpoint_task(ctx, task);
+	if (ret) {
+		ckpt_write_err(ctx, NULL);
+		put_task_struct(task);
+		return ret;
+	}
+
+	rcu_read_lock();
+	nsproxy = task_nsproxy(task);
+	get_nsproxy(nsproxy);
+	rcu_read_unlock();
+
+	ctx->root_task = task;
+	ctx->root_nsproxy = nsproxy;
+
+	return 0;
+}
+
 /* setup checkpoint-specific parts of ctx */
 static int init_checkpoint_ctx(struct ckpt_ctx *ctx, pid_t pid)
 {
 	struct fs_struct *fs;
+	int ret;
 
 	ctx->root_pid = pid;
 
+	ret = get_container(ctx, pid);
+	if (ret < 0)
+		return ret;
+
 	/*
 	 * assume checkpointer is in container's root vfs
 	 * FIXME: this works for now, but will change with real containers
 	 */
 
-	fs = current->fs;
+	task_lock(ctx->root_task);
+	fs = ctx->root_task->fs;
 	read_lock(&fs->lock);
 	ctx->fs_mnt = fs->root;
 	path_get(&ctx->fs_mnt);
 	read_unlock(&fs->lock);
+	task_unlock(ctx->root_task);
 
 	return 0;
 }
@@ -276,7 +351,7 @@ int do_checkpoint(struct ckpt_ctx *ctx, pid_t pid)
 	ret = checkpoint_write_header(ctx);
 	if (ret < 0)
 		goto out;
-	ret = checkpoint_task(ctx, current);
+	ret = checkpoint_task(ctx, ctx->root_task);
 	if (ret < 0)
 		goto out;
 	ret = checkpoint_write_tail(ctx);
diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index d3d6c5e..ca33539 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -352,7 +352,7 @@ static int restore_read_tail(struct ckpt_ctx *ctx)
 }
 
 /* setup restart-specific parts of ctx */
-static int init_restart_ctx(struct ckpt_ctx *ctx)
+static int init_restart_ctx(struct ckpt_ctx *ctx, pid_t pid)
 {
 	return 0;
 }
@@ -361,7 +361,7 @@ int do_restart(struct ckpt_ctx *ctx, pid_t pid)
 {
 	int ret;
 
-	ret = init_restart_ctx(ctx);
+	ret = init_restart_ctx(ctx, pid);
 	if (ret < 0)
 		return ret;
 	ret = restore_read_header(ctx);
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 7bf70e4..c809120 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -12,6 +12,7 @@
 #define CKPT_DFLAG  CKPT_DSYS
 
 #include <linux/sched.h>
+#include <linux/nsproxy.h>
 #include <linux/kernel.h>
 #include <linux/syscalls.h>
 #include <linux/fs.h>
@@ -173,6 +174,11 @@ static void ckpt_ctx_free(struct ckpt_ctx *ctx)
 	path_put(&ctx->fs_mnt);
 	ckpt_pgarr_free(ctx);
 
+	if (ctx->root_nsproxy)
+		put_nsproxy(ctx->root_nsproxy);
+	if (ctx->root_task)
+		put_task_struct(ctx->root_task);
+
 	kfree(ctx);
 }
 
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index a0ea5f6..4369f90 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -28,6 +28,8 @@ struct ckpt_ctx {
 	int crid;		/* unique checkpoint id */
 
 	pid_t root_pid;		/* container identifier */
+	struct task_struct *root_task;	/* container root task */
+	struct nsproxy *root_nsproxy;	/* container root nsproxy */
 
 	unsigned long kflags;	/* kerenl flags */
 	unsigned long uflags;	/* user flags */
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * Re: [RFC v16][PATCH 19/43] c/r: external checkpoint of a task other than ourself
  2009-05-27 17:32 ` [RFC v16][PATCH 19/43] c/r: external checkpoint of a task other than ourself Oren Laadan
@ 2009-05-27 21:19   ` Alexey Dobriyan
  2009-05-27 22:32     ` Oren Laadan
  0 siblings, 1 reply; 50+ messages in thread
From: Alexey Dobriyan @ 2009-05-27 21:19 UTC (permalink / raw)
  To: Oren Laadan
  Cc: Andrew Morton, Linus Torvalds, containers, linux-kernel, linux-mm,
	linux-api, Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov
On Wed, May 27, 2009 at 01:32:45PM -0400, Oren Laadan wrote:
> Now we can do "external" checkpoint, i.e. act on another task.
> +static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
> +{
> +	if (t->state == TASK_DEAD) {
> +		pr_warning("c/r: task %d is TASK_DEAD\n", task_pid_vnr(t));
> +		return -EAGAIN;
> +	}
> +
> +	if (!ptrace_may_access(t, PTRACE_MODE_READ)) {
> +		__ckpt_write_err(ctx, "access to task %d (%s) denied",
> +				 task_pid_vnr(t), t->comm);
> +		return -EPERM;
> +	}
> +
> +	/* verify that the task is frozen (unless self) */
> +	if (t != current && !frozen(t)) {
> +		__ckpt_write_err(ctx, "task %d (%s) is not frozen",
> +				 task_pid_vnr(t), t->comm);
> +		return -EBUSY;
> +	}
> +
> +	/* FIX: add support for ptraced tasks */
> +	if (task_ptrace(t)) {
> +		__ckpt_write_err(ctx, "task %d (%s) is ptraced",
> +				 task_pid_vnr(t), t->comm);
> +		return -EBUSY;
> +	}
> +
> +	return 0;
> +}
> +
> +static int get_container(struct ckpt_ctx *ctx, pid_t pid)
> +{
> +	struct task_struct *task = NULL;
> +	struct nsproxy *nsproxy = NULL;
> +	int ret;
> +
> +	ctx->root_pid = pid;
> +
> +	read_lock(&tasklist_lock);
> +	task = find_task_by_vpid(pid);
> +	if (task)
> +		get_task_struct(task);
> +	read_unlock(&tasklist_lock);
> +
> +	if (!task)
> +		return -ESRCH;
> +
> +	ret = may_checkpoint_task(ctx, task);
> +	if (ret) {
> +		ckpt_write_err(ctx, NULL);
> +		put_task_struct(task);
> +		return ret;
> +	}
> +
> +	rcu_read_lock();
> +	nsproxy = task_nsproxy(task);
> +	get_nsproxy(nsproxy);
Will oops if init is multi-threaded and thread group leader exited
(nsproxy = NULL). I need to think what to do, too.
> +	rcu_read_unlock();
> +
> +	ctx->root_task = task;
> +	ctx->root_nsproxy = nsproxy;
> +
> +	return 0;
> +}
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 50+ messages in thread
- * Re: [RFC v16][PATCH 19/43] c/r: external checkpoint of a task other than ourself
  2009-05-27 21:19   ` Alexey Dobriyan
@ 2009-05-27 22:32     ` Oren Laadan
       [not found]       ` <Pine.LNX.4.64.0905271831030.7284-CXF6herHY6ykSYb+qCZC/1i27PF6R63G9nwVQlTi/Pw@public.gmane.org>
  0 siblings, 1 reply; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 22:32 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: Andrew Morton, Linus Torvalds, containers, linux-kernel, linux-mm,
	linux-api, Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov
On Thu, 28 May 2009, Alexey Dobriyan wrote:
> On Wed, May 27, 2009 at 01:32:45PM -0400, Oren Laadan wrote:
> > Now we can do "external" checkpoint, i.e. act on another task.
> 
> > +static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
> > +{
> > +	if (t->state == TASK_DEAD) {
> > +		pr_warning("c/r: task %d is TASK_DEAD\n", task_pid_vnr(t));
> > +		return -EAGAIN;
> > +	}
> > +
> > +	if (!ptrace_may_access(t, PTRACE_MODE_READ)) {
> > +		__ckpt_write_err(ctx, "access to task %d (%s) denied",
> > +				 task_pid_vnr(t), t->comm);
> > +		return -EPERM;
> > +	}
> > +
> > +	/* verify that the task is frozen (unless self) */
> > +	if (t != current && !frozen(t)) {
> > +		__ckpt_write_err(ctx, "task %d (%s) is not frozen",
> > +				 task_pid_vnr(t), t->comm);
> > +		return -EBUSY;
> > +	}
> > +
> > +	/* FIX: add support for ptraced tasks */
> > +	if (task_ptrace(t)) {
> > +		__ckpt_write_err(ctx, "task %d (%s) is ptraced",
> > +				 task_pid_vnr(t), t->comm);
> > +		return -EBUSY;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static int get_container(struct ckpt_ctx *ctx, pid_t pid)
> > +{
> > +	struct task_struct *task = NULL;
> > +	struct nsproxy *nsproxy = NULL;
> > +	int ret;
> > +
> > +	ctx->root_pid = pid;
> > +
> > +	read_lock(&tasklist_lock);
> > +	task = find_task_by_vpid(pid);
> > +	if (task)
> > +		get_task_struct(task);
> > +	read_unlock(&tasklist_lock);
> > +
> > +	if (!task)
> > +		return -ESRCH;
> > +
> > +	ret = may_checkpoint_task(ctx, task);
> > +	if (ret) {
> > +		ckpt_write_err(ctx, NULL);
> > +		put_task_struct(task);
> > +		return ret;
> > +	}
> > +
> > +	rcu_read_lock();
> > +	nsproxy = task_nsproxy(task);
> > +	get_nsproxy(nsproxy);
> 
> Will oops if init is multi-threaded and thread group leader exited
> (nsproxy = NULL). I need to think what to do, too.
ood catch. Since all threads share same nsproxy (except those
who exits.. duh) we can test for this case, and get the nsproxy
from any of the other threads, something like this (untested):
diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index afc7300..b303876 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -522,9 +522,33 @@ static int get_container(struct ckpt_ctx *ctx, pid_t pid)
 
 	rcu_read_lock();
 	nsproxy = task_nsproxy(task);
-	get_nsproxy(nsproxy);
+	if (nsproxy)
+		get_nsproxy(nsproxy);
 	rcu_read_unlock();
 
+	/*
+	 * If we hit a zombie thread-group-leader, nsproxy will be NULL,
+	 * and we instead grab it from one of the other threads.
+	 */
+	if (!nsproxy) {
+		struct task_struct *p = next_thread(task);
+
+		BUG_ON(task->state != TASK_DEAD);
+		read_lock(&tasklist_lock);
+		while (p != task && !task_nsproxy(p))
+			p = next_thread(p);
+		nsproxy = get_nsproxy(p);
+		if (nsproxy)
+			get_nsproxy(nsproxy);
+		read_unlock(&tasklist_lock);
+	}
+
+	/* still not ... too bad ... */
+	if (!nsproxy) {
+		put_task_struct(task);
+		return -ESRCH;
+	}
+
 	ctx->root_task = task;
 	ctx->root_nsproxy = nsproxy;
 	ctx->root_init = is_container_init(task);
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
 
 
- * [RFC v16][PATCH 20/43] c/r: export functionality used in next patch for restart-blocks
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (14 preceding siblings ...)
  2009-05-27 17:32 ` [RFC v16][PATCH 19/43] c/r: external checkpoint of a task other than ourself Oren Laadan
@ 2009-05-27 17:32 ` Oren Laadan
  2009-05-27 17:32 ` [RFC v16][PATCH 21/43] c/r: restart-blocks Oren Laadan
                   ` (18 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
To support c/r of restart-blocks (system call that need to be
restarted because they were interrupted but there was no userspace
visible side-effect), export restart-block callbacks for poll()
and futex() syscalls.
More details on c/r of restart-blocks and how it works in the
following patch.
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
Acked-by: Serge Hallyn <serue@us.ibm.com>
---
 fs/select.c                  |    2 +-
 include/linux/futex.h        |   10 ++++++++++
 include/linux/poll.h         |    3 +++
 include/linux/posix-timers.h |    2 ++
 kernel/futex.c               |   11 +----------
 kernel/posix-timers.c        |    2 +-
 6 files changed, 18 insertions(+), 12 deletions(-)
diff --git a/fs/select.c b/fs/select.c
index 0fe0e14..e64ddc6 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -833,7 +833,7 @@ out_fds:
 	return err;
 }
 
-static long do_restart_poll(struct restart_block *restart_block)
+long do_restart_poll(struct restart_block *restart_block)
 {
 	struct pollfd __user *ufds = restart_block->poll.ufds;
 	int nfds = restart_block->poll.nfds;
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 3bf5bb5..dd0e06b 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -130,6 +130,16 @@ extern int
 handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi);
 
 /*
+ * In case we must use restart_block to restart a futex_wait,
+ * we encode in the 'flags' shared capability
+ */
+#define FLAGS_SHARED		0x01
+#define FLAGS_CLOCKRT		0x02
+
+/* referenced by checkpoint/restart */
+extern long futex_wait_restart(struct restart_block *restart);
+
+/*
  * Futexes are matched on equal values of this key.
  * The key type depends on whether it's a shared or private mapping.
  * Don't rearrange members without looking at hash_futex().
diff --git a/include/linux/poll.h b/include/linux/poll.h
index 8c24ef8..97f95a7 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -131,6 +131,9 @@ extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 
 extern int poll_select_set_timeout(struct timespec *to, long sec, long nsec);
 
+/* used by checkpoint/restart */
+extern long do_restart_poll(struct restart_block *restart_block);
+
 #endif /* KERNEL */
 
 #endif /* _LINUX_POLL_H */
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 4f71bf4..3d0e946 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -119,4 +119,6 @@ long clock_nanosleep_restart(struct restart_block *restart_block);
 
 void update_rlimit_cpu(unsigned long rlim_new);
 
+int invalid_clockid(const clockid_t which_clock);
+
 #endif
diff --git a/kernel/futex.c b/kernel/futex.c
index d546b2d..f405c73 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1113,15 +1113,6 @@ handle_fault:
 	goto retry;
 }
 
-/*
- * In case we must use restart_block to restart a futex_wait,
- * we encode in the 'flags' shared capability
- */
-#define FLAGS_SHARED		0x01
-#define FLAGS_CLOCKRT		0x02
-
-static long futex_wait_restart(struct restart_block *restart);
-
 static int futex_wait(u32 __user *uaddr, int fshared,
 		      u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
 {
@@ -1286,7 +1277,7 @@ out:
 }
 
 
-static long futex_wait_restart(struct restart_block *restart)
+long futex_wait_restart(struct restart_block *restart)
 {
 	u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
 	int fshared = 0;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 052ec4d..589aed2 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -205,7 +205,7 @@ static int no_timer_create(struct k_itimer *new_timer)
 /*
  * Return nonzero if we know a priori this clockid_t value is bogus.
  */
-static inline int invalid_clockid(const clockid_t which_clock)
+int invalid_clockid(const clockid_t which_clock)
 {
 	if (which_clock < 0)	/* CPU clock, posix_cpu_* will check it */
 		return 0;
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 21/43] c/r: restart-blocks
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (15 preceding siblings ...)
  2009-05-27 17:32 ` [RFC v16][PATCH 20/43] c/r: export functionality used in next patch for restart-blocks Oren Laadan
@ 2009-05-27 17:32 ` Oren Laadan
  2009-05-27 17:32 ` [RFC v16][PATCH 22/43] c/r: checkpoint multiple processes Oren Laadan
                   ` (17 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
(Paraphrasing what's said this message:
http://lists.openwall.net/linux-kernel/2007/12/05/64)
Restart blocks are callbacks used cause a system call to be restarted
with the arguments specified in the system call restart block. It is
useful for system call that are not idempotent, i.e. the argument(s)
might be a relative timeout, where some adjustments are required when
restarting the system call. It relies on the system call itself to set
up its restart point and the argument save area.  They are rare: an
actual signal would turn that it an EINTR. The only case that should
ever trigger this is some kernel action that interrupts the system
call, but does not actually result in any user-visible state changes -
like freeze and thaw.
So restart blocks are about time remaining for the system call to
sleep/wait. Generally in c/r, there are two possible time models that
we can follow: absolute, relative. Here, I chose to save the relative
timeout, measured from the beginning of the checkpoint. The time when
the checkpoint (and restart) begin is also saved. This information is
sufficient to restart in either model (absolute or negative).
Which model to use should eventually be a per application choice (and
possible configurable via cradvise() or some sort). For now, we adopt
the relative model, namely, at restart the timeout is set relative to
the beginning of the restart.
To checkpoint, we check if a task has a valid restart block, and if so
we save the *remaining* time that is has to wait/sleep, and the type
of the restart block.
To restart, we fill in the data required at the proper place in the
thread information. If the system call return an error (which is
possibly an -ERESTARTSYS eg), we not only use that error as our own
return value, but also arrange for the task to execute the signal
handler (by faking a signal). The handler, in turn, already has the
code to handle these restart request gracefully.
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
 arch/x86/include/asm/checkpoint_hdr.h |    1 -
 arch/x86/mm/checkpoint.c              |   10 +-
 checkpoint/checkpoint.c               |    1 +
 checkpoint/process.c                  |  226 +++++++++++++++++++++++++++++++++
 checkpoint/restart.c                  |   35 +++++-
 checkpoint/sys.c                      |    1 +
 include/linux/checkpoint.h            |    4 +
 include/linux/checkpoint_hdr.h        |   22 +++
 include/linux/checkpoint_types.h      |    3 +
 9 files changed, 293 insertions(+), 10 deletions(-)
diff --git a/arch/x86/include/asm/checkpoint_hdr.h b/arch/x86/include/asm/checkpoint_hdr.h
index cf90170..ee23df9 100644
--- a/arch/x86/include/asm/checkpoint_hdr.h
+++ b/arch/x86/include/asm/checkpoint_hdr.h
@@ -57,7 +57,6 @@ struct ckpt_hdr_header_arch {
 
 struct ckpt_hdr_thread {
 	struct ckpt_hdr h;
-	/* FIXME: restart blocks */
 	__u16 gdt_entry_tls_entries;
 	__u16 sizeof_tls_array;
 	__u16 ntls;	/* number of TLS entries to follow */
diff --git a/arch/x86/mm/checkpoint.c b/arch/x86/mm/checkpoint.c
index c781416..7cd7494 100644
--- a/arch/x86/mm/checkpoint.c
+++ b/arch/x86/mm/checkpoint.c
@@ -63,13 +63,9 @@ int checkpoint_thread(struct ckpt_ctx *ctx, struct task_struct *t)
 	 * FIXME: the TLS descriptors in the GDT should be called out and
 	 * not tied to the in-kernel representation.
 	 */
-	ret = ckpt_write_obj_type(ctx, thread->tls_array,
-				  sizeof(thread->tls_array),
-				  CKPT_HDR_THREAD_TLS);
-
-	/* IGNORE RESTART BLOCKS FOR NOW ... */
-
-	return ret;
+	return ckpt_write_obj_type(ctx, thread->tls_array,
+				   sizeof(thread->tls_array),
+				   CKPT_HDR_THREAD_TLS);
 }
 
 #ifndef CONFIG_X86_64
diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index 086f2d9..3999d80 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -23,6 +23,7 @@
 #include <linux/mount.h>
 #include <linux/utsname.h>
 #include <linux/magic.h>
+#include <linux/hrtimer.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
 
diff --git a/checkpoint/process.c b/checkpoint/process.c
index 3ce82cb..876be3e 100644
--- a/checkpoint/process.c
+++ b/checkpoint/process.c
@@ -12,6 +12,9 @@
 #define CKPT_DFLAG  CKPT_DSYS
 
 #include <linux/sched.h>
+#include <linux/posix-timers.h>
+#include <linux/futex.h>
+#include <linux/poll.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
 
@@ -80,6 +83,116 @@ static int checkpoint_task_objs(struct ckpt_ctx *ctx, struct task_struct *t)
 	return ret;
 }
 
+/* dump the task_struct of a given task */
+int checkpoint_restart_block(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct ckpt_hdr_restart_block *h;
+	struct restart_block *restart_block;
+	long (*fn)(struct restart_block *);
+	s64 base, expire = 0;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_RESTART_BLOCK);
+	if (!h)
+		return -ENOMEM;
+
+	base = ktime_to_ns(ctx->ktime_begin);
+	restart_block = &task_thread_info(t)->restart_block;
+	fn = restart_block->fn;
+
+	/* FIX: enumerate clockid_t so we're immune to changes */
+
+	if (fn == do_no_restart_syscall) {
+
+		h->function_type = CKPT_RESTART_BLOCK_NONE;
+		ckpt_debug("restart_block: non\n");
+
+	} else if (fn == hrtimer_nanosleep_restart) {
+
+		h->function_type = CKPT_RESTART_BLOCK_HRTIMER_NANOSLEEP;
+		h->arg_0 = restart_block->nanosleep.index;
+		h->arg_1 = (unsigned long) restart_block->nanosleep.rmtp;
+		expire = restart_block->nanosleep.expires;
+		ckpt_debug("restart_block: hrtimer expire %lld now %lld\n",
+			 expire, base);
+
+	} else if (fn == posix_cpu_nsleep_restart) {
+		struct timespec ts;
+
+		h->function_type = CKPT_RESTART_BLOCK_POSIX_CPU_NANOSLEEP;
+		h->arg_0 = restart_block->arg0;
+		h->arg_1 = restart_block->arg1;
+		ts.tv_sec = restart_block->arg2;
+		ts.tv_nsec = restart_block->arg3;
+		expire = timespec_to_ns(&ts);
+		ckpt_debug("restart_block: posix_cpu expire %lld now %lld\n",
+			 expire, base);
+
+#ifdef CONFIG_COMPAT
+	} else if (fn == compat_nanosleep_restart) {
+
+		h->function_type = CKPT_RESTART_BLOCK_NANOSLEEP;
+		h->arg_0 = restart_block->nanosleep.index;
+		h->arg_1 = (unsigned long)restart_block->nanosleep.rmtp;
+		h->arg_2 = (unsigned long)restart_block->nanosleep.compat_rmtp;
+		expire = restart_block->nanosleep.expires;
+		ckpt_debug("restart_block: compat expire %lld now %lld\n",
+			 expire, base);
+
+	} else if (fn == compat_clock_nanosleep_restart) {
+
+		h->function_type = CKPT_RESTART_BLOCK_COMPAT_CLOCK_NANOSLEEP;
+		h->arg_0 = restart_block->nanosleep.index;
+		h->arg_1 = (unsigned long)restart_block->nanosleep.rmtp;
+		h->arg_2 = (unsigned long)restart_block->nanosleep.compat_rmtp;
+		expire = restart_block->nanosleep.expires;
+		ckpt_debug("restart_block: compat_clock expire %lld now %lld\n",
+			 expire, base);
+
+#endif
+	} else if (fn == futex_wait_restart) {
+
+		h->function_type = CKPT_RESTART_BLOCK_FUTEX;
+		h->arg_0 = (unsigned long) restart_block->futex.uaddr;
+		h->arg_1 = restart_block->futex.val;
+		h->arg_2 = restart_block->futex.flags;
+		h->arg_3 = restart_block->futex.bitset;
+		expire = restart_block->futex.time;
+		ckpt_debug("restart_block: futex expire %lld now %lld\n",
+			 expire, base);
+
+	} else if (fn == do_restart_poll) {
+		struct timespec ts;
+
+		h->function_type = CKPT_RESTART_BLOCK_POLL;
+		h->arg_0 = (unsigned long) restart_block->poll.ufds;
+		h->arg_1 = restart_block->poll.nfds;
+		h->arg_2 = restart_block->poll.has_timeout;
+		ts.tv_sec = restart_block->poll.tv_sec;
+		ts.tv_nsec = restart_block->poll.tv_nsec;
+		expire = timespec_to_ns(&ts);
+		ckpt_debug("restart_block: poll expire %lld now %lld\n",
+			 expire, base);
+
+	} else {
+
+		BUG();
+
+	}
+
+	/* common to all restart blocks: */
+	h->arg_4 = (base < expire ? expire - base : 0);
+
+	ckpt_debug("restart_block: args %#llx %#llx %#llx %#llx %#llx\n",
+		 h->arg_0, h->arg_1, h->arg_2, h->arg_3, h->arg_4);
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+
+	ckpt_debug("restart_block ret %d\n", ret);
+	return ret;
+}
+
 /* dump the entire state of a given task */
 int checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
 {
@@ -97,6 +210,10 @@ int checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
 	ckpt_debug("thread %d\n", ret);
 	if (ret < 0)
 		goto out;
+	ret = checkpoint_restart_block(ctx, t);
+	ckpt_debug("restart-blocks %d\n", ret);
+	if (ret < 0)
+		goto out;
 	ret = checkpoint_cpu(ctx, t);
 	ckpt_debug("cpu %d\n", ret);
  out:
@@ -150,6 +267,111 @@ static int restore_task_objs(struct ckpt_ctx *ctx)
 	return ret;
 }
 
+int restore_restart_block(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_restart_block *h;
+	struct restart_block restart_block;
+	struct timespec ts;
+	clockid_t clockid;
+	s64 expire;
+	int ret = 0;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_RESTART_BLOCK);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	expire = ktime_to_ns(ctx->ktime_begin) + h->arg_4;
+	restart_block.fn = NULL;
+
+	ckpt_debug("restart_block: expire %lld begin %lld\n",
+		 expire, ktime_to_ns(ctx->ktime_begin));
+	ckpt_debug("restart_block: args %#llx %#llx %#llx %#llx %#llx\n",
+		 h->arg_0, h->arg_1, h->arg_2, h->arg_3, h->arg_4);
+
+	switch (h->function_type) {
+	case CKPT_RESTART_BLOCK_NONE:
+		restart_block.fn = do_no_restart_syscall;
+		break;
+	case CKPT_RESTART_BLOCK_HRTIMER_NANOSLEEP:
+		clockid = h->arg_0;
+		if (clockid < 0 || invalid_clockid(clockid))
+			break;
+		restart_block.fn = hrtimer_nanosleep_restart;
+		restart_block.nanosleep.index = clockid;
+		restart_block.nanosleep.rmtp =
+			(struct timespec __user *) (unsigned long) h->arg_1;
+		restart_block.nanosleep.expires = expire;
+		break;
+	case CKPT_RESTART_BLOCK_POSIX_CPU_NANOSLEEP:
+		clockid = h->arg_0;
+		if (clockid < 0 || invalid_clockid(clockid))
+			break;
+		restart_block.fn = posix_cpu_nsleep_restart;
+		restart_block.arg0 = clockid;
+		restart_block.arg1 = h->arg_1;
+		ts = ns_to_timespec(expire);
+		restart_block.arg2 = ts.tv_sec;
+		restart_block.arg3 = ts.tv_nsec;
+		break;
+#ifdef CONFIG_COMPAT
+	case CKPT_RESTART_BLOCK_COMPAT_NANOSLEEP:
+		clockid = h->arg_0;
+		if (clockid < 0 || invalid_clockid(clockid))
+			break;
+		restart_block.fn = compat_nanosleep_restart;
+		restart_block.nanosleep.index = clockid;
+		restart_block.nanosleep.rmtp =
+			(struct timespec __user *) (unsigned long) h->arg_1;
+		restart_block.nanosleep.compat_rmtp =
+			(struct compat_timespec __user *)
+				(unsigned long) h->arg_2;
+		resatrt_block.nanosleep.expires = expire;
+		break;
+	case CKPT_RESTART_BLOCK_COMPAT_CLOCK_NANOSLEEP:
+		clockid = h->arg_0;
+		if (clockid < 0 || invalid_clockid(clockid))
+			break;
+		restart_block.fn = compat_clock_nanosleep_restart;
+		restart_block.nanosleep.index = clockid;
+		restart_block.nanosleep.rmtp =
+			(struct timespec __user *) (unsigned long) h->arg_1;
+		restart_block.nanosleep.compat_rmtp =
+			(struct compat_timespec __user *)
+				(unsigned long) h->arg_2;
+		resatrt_block.nanosleep.expires = expire;
+		break;
+#endif
+	case CKPT_RESTART_BLOCK_FUTEX:
+		restart_block.fn = futex_wait_restart;
+		restart_block.futex.uaddr = (u32 *) (unsigned long) h->arg_0;
+		restart_block.futex.val = h->arg_1;
+		restart_block.futex.flags = h->arg_2;
+		restart_block.futex.bitset = h->arg_3;
+		restart_block.futex.time = expire;
+		break;
+	case CKPT_RESTART_BLOCK_POLL:
+		restart_block.fn = do_restart_poll;
+		restart_block.poll.ufds =
+			(struct pollfd __user *) (unsigned long) h->arg_0;
+		restart_block.poll.nfds = h->arg_1;
+		restart_block.poll.has_timeout = h->arg_2;
+		ts = ns_to_timespec(expire);
+		restart_block.poll.tv_sec = ts.tv_sec;
+		restart_block.poll.tv_nsec = ts.tv_nsec;
+		break;
+	default:
+		break;
+	}
+
+	if (restart_block.fn)
+		task_thread_info(current)->restart_block = restart_block;
+	else
+		ret = -EINVAL;
+
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
 /* read the entire state of the current task */
 int restore_task(struct ckpt_ctx *ctx)
 {
@@ -167,6 +389,10 @@ int restore_task(struct ckpt_ctx *ctx)
 	ckpt_debug("thread %d\n", ret);
 	if (ret < 0)
 		goto out;
+	ret = restore_restart_block(ctx);
+	ckpt_debug("restart-blocks %d\n", ret);
+	if (ret < 0)
+		goto out;
 	ret = restore_cpu(ctx);
 	ckpt_debug("cpu %d\n", ret);
  out:
diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index ca33539..e5067a9 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -16,6 +16,8 @@
 #include <linux/file.h>
 #include <linux/magic.h>
 #include <linux/utsname.h>
+#include <asm/syscall.h>
+#include <linux/elf.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
 
@@ -357,6 +359,34 @@ static int init_restart_ctx(struct ckpt_ctx *ctx, pid_t pid)
 	return 0;
 }
 
+static int restore_retval(void)
+{
+	struct pt_regs *regs = task_pt_regs(current);
+	int ret = 0;
+
+	/*
+	 * The retval should be either zero if the checkpointed task
+	 * had been in user-space when frozen, or the retval from the
+	 * syscall that had been interrupted then.
+	 *
+	 * In the latter, if the syscall succeeded (perhaps partially)
+	 * then the retval is non-negative. If it failed, the error
+	 * may be one of -ERESTART... gang, interpreted in the signal
+	 * handling code. In restart it must happen, too.
+	 *
+	 * To force execution of the signal handler now, too, we fake
+	 * a signal to ourselves (a la freeze/thaw) when ret < 0.
+	 */
+
+	/* were we from a system call?  if so, get old error/retval */
+	if (syscall_get_nr(current, regs) >= 0)
+		ret = syscall_get_error(current, regs);
+	/* old error ?  if so, make sure signal handling kicks in */
+	if (ret < 0)
+		set_tsk_thread_flag(current, TIF_SIGPENDING);
+	return ret;
+}
+
 int do_restart(struct ckpt_ctx *ctx, pid_t pid)
 {
 	int ret;
@@ -371,7 +401,8 @@ int do_restart(struct ckpt_ctx *ctx, pid_t pid)
 	if (ret < 0)
 		return ret;
 	ret = restore_read_tail(ctx);
+	if (ret < 0)
+		return ret;
 
-	/* on success, adjust the return value if needed [TODO] */
-	return ret;
+	return restore_retval();
 }
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index c809120..5b91a39 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -194,6 +194,7 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
 
 	ctx->uflags = uflags;
 	ctx->kflags = kflags;
+	ctx->ktime_begin = ktime_get();
 
 	INIT_LIST_HEAD(&ctx->pgarr_list);
 	INIT_LIST_HEAD(&ctx->pgarr_pool);
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 169aa70..e1204cf 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -69,6 +69,10 @@ extern int restore_thread(struct ckpt_ctx *ctx);
 extern int restore_cpu(struct ckpt_ctx *ctx);
 extern int restore_mm_context(struct ckpt_ctx *ctx, struct mm_struct *mm);
 
+extern int checkpoint_restart_block(struct ckpt_ctx *ctx,
+				    struct task_struct *t);
+extern int restore_restart_block(struct ckpt_ctx *ctx);
+
 /* file table */
 extern int checkpoint_obj_file_table(struct ckpt_ctx *ctx,
 				     struct task_struct *t);
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 6ab3c8b..939f4e3 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -48,6 +48,7 @@ enum {
 
 	CKPT_HDR_TASK = 101,
 	CKPT_HDR_TASK_OBJS,
+	CKPT_HDR_RESTART_BLOCK,
 	CKPT_HDR_THREAD,
 	CKPT_HDR_CPU,
 
@@ -144,6 +145,27 @@ struct ckpt_hdr_task_objs {
 	__s32 mm_objref;
 } __attribute__((aligned(8)));
 
+/* restart blocks */
+struct ckpt_hdr_restart_block {
+	struct ckpt_hdr h;
+	__u64 function_type;
+	__u64 arg_0;
+	__u64 arg_1;
+	__u64 arg_2;
+	__u64 arg_3;
+	__u64 arg_4;
+} __attribute__((aligned(8)));
+
+enum restart_block_type {
+	CKPT_RESTART_BLOCK_NONE = 1,
+	CKPT_RESTART_BLOCK_HRTIMER_NANOSLEEP,
+	CKPT_RESTART_BLOCK_POSIX_CPU_NANOSLEEP,
+	CKPT_RESTART_BLOCK_COMPAT_NANOSLEEP,
+	CKPT_RESTART_BLOCK_COMPAT_CLOCK_NANOSLEEP,
+	CKPT_RESTART_BLOCK_POLL,
+	CKPT_RESTART_BLOCK_FUTEX
+};
+
 /* file system */
 struct ckpt_hdr_file_table {
 	struct ckpt_hdr h;
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index 4369f90..72052ef 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -21,12 +21,15 @@ struct ckpt_hdr_vma;
 #include <linux/list.h>
 #include <linux/path.h>
 #include <linux/fs.h>
+#include <linux/ktime.h>
 
 #include <linux/sched.h>
 
 struct ckpt_ctx {
 	int crid;		/* unique checkpoint id */
 
+	ktime_t ktime_begin;	/* checkpoint start time */
+
 	pid_t root_pid;		/* container identifier */
 	struct task_struct *root_task;	/* container root task */
 	struct nsproxy *root_nsproxy;	/* container root nsproxy */
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 22/43] c/r: checkpoint multiple processes
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (16 preceding siblings ...)
  2009-05-27 17:32 ` [RFC v16][PATCH 21/43] c/r: restart-blocks Oren Laadan
@ 2009-05-27 17:32 ` Oren Laadan
  2009-05-27 17:32 ` [RFC v16][PATCH 23/43] c/r: restart " Oren Laadan
                   ` (16 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
Checkpointing of multiple processes works by recording the tasks tree
structure below a given "root" task. The root task is expected to be a
container init, and then an entire container is checkpointed. However,
passing CHECKPOINT_SUBTREE to checkpoint(2) relaxes this requirement
and allows to checkpoint a subtree of processes from the root task.
For a given root task, do a DFS scan of the tasks tree and collect them
into an array (keeping a reference to each task). Using DFS simplifies
the recreation of tasks either in user space or kernel space. For each
task collected, test if it can be checkpointed, and save its pid, tgid,
and ppid.
The actual work is divided into two passes: a first scan counts the
tasks, then memory is allocated and a second scan fills the array.
Whether checkpoints and restarts require CAP_SYS_ADMIN is determined
by sysctl 'ckpt_unpriv_allowed': if 1, then regular permission checks
are intended to prevent privilege escalation, however if 0 it prevents
unprivileged users from exploiting any privilege escalation bugs.
The logic is suitable for creation of processes during restart either
in userspace or by the kernel.
Currently we ignore threads and zombies.
Changelog[v16]:
  - CHECKPOINT_SUBTREE flags allows subtree (not whole container)
  - sysctl variable 'ckpt_unpriv_allowed' controls needed privileges
Changelog[v14]:
  - Refuse non-self checkpoint if target task isn't frozen
  - Refuse checkpoint (for now) if task is ptraced
  - Revert change to pr_debug(), back to ckpt_debug()
  - Use only unsigned fields in checkpoint headers
  - Check retval of ckpt_tree_count_tasks() in ckpt_build_tree()
  - Discard 'h.parent' field
  - Check whether calls to ckpt_hbuf_get() fail
  - Disallow threads or siblings to container init
Changelog[v13]:
  - Release tasklist_lock in error path in ckpt_tree_count_tasks()
  - Use separate index for 'tasks_arr' and 'hh' in ckpt_write_pids()
Changelog[v12]:
  - Replace obsolete ckpt_debug() with pr_debug()
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
 checkpoint/checkpoint.c          |  237 ++++++++++++++++++++++++++++++++++++--
 checkpoint/restart.c             |    2 +-
 checkpoint/sys.c                 |   33 +++++-
 include/linux/checkpoint_hdr.h   |   16 +++-
 include/linux/checkpoint_types.h |   16 ++-
 kernel/sysctl.c                  |   17 +++
 6 files changed, 305 insertions(+), 16 deletions(-)
diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index 3999d80..92f219e 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -249,8 +249,27 @@ static int checkpoint_write_tail(struct ckpt_ctx *ctx)
 	return ret;
 }
 
+/* dump all tasks in ctx->tasks_arr[] */
+static int checkpoint_all_tasks(struct ckpt_ctx *ctx)
+{
+	int n, ret = 0;
+
+	for (n = 0; n < ctx->nr_tasks; n++) {
+		ckpt_debug("dumping task #%d\n", n);
+		ret = checkpoint_task(ctx, ctx->tasks_arr[n]);
+		if (ret < 0)
+			break;
+	}
+
+	return ret;
+}
+
 static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
 {
+	struct task_struct *root = ctx->root_task;
+
+	ckpt_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns));
+
 	if (t->state == TASK_DEAD) {
 		pr_warning("c/r: task %d is TASK_DEAD\n", task_pid_vnr(t));
 		return -EAGAIN;
@@ -276,14 +295,211 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
 		return -EBUSY;
 	}
 
+	/*
+	 * FIX: for now, disallow siblings of container init created
+	 * via CLONE_PARENT (unclear if they will remain possible)
+	 */
+	if (ctx->root_init && t != root && t->tgid != root->tgid &&
+	    t->real_parent == root->real_parent) {
+		__ckpt_write_err(ctx, "task %d (%s) is sibling of root",
+				 task_pid_vnr(t), t->comm);
+		return -EINVAL;
+	}
+
+	/* FIX: change this when namespaces are added */
+	if (task_nsproxy(t) != ctx->root_nsproxy)
+		return -EPERM;
+
+	return 0;
+}
+
+#define CKPT_HDR_PIDS_CHUNK	256
+
+static int checkpoint_pids(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_pids *h;
+	struct pid_namespace *ns;
+	struct task_struct *task;
+	struct task_struct **tasks_arr;
+	int nr_tasks, n, pos = 0, ret = 0;
+
+	ns = ctx->root_nsproxy->pid_ns;
+	tasks_arr = ctx->tasks_arr;
+	nr_tasks = ctx->nr_tasks;
+	BUG_ON(nr_tasks <= 0);
+
+	ret = ckpt_write_obj_type(ctx, NULL,
+				  sizeof(*h) * nr_tasks,
+				  CKPT_HDR_BUFFER);
+	if (ret < 0)
+		return ret;
+
+	h = ckpt_hdr_get(ctx, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
+	if (!h)
+		return -ENOMEM;
+
+	do {
+		rcu_read_lock();
+		for (n = 0; n < min(nr_tasks, CKPT_HDR_PIDS_CHUNK); n++) {
+			task = tasks_arr[pos];
+
+			h[n].vpid = task_pid_nr_ns(task, ns);
+			h[n].vtgid = task_tgid_nr_ns(task, ns);
+			h[n].vpgid = task_pgrp_nr_ns(task, ns);
+			h[n].vsid = task_session_nr_ns(task, ns);
+			h[n].vppid = task_tgid_nr_ns(task->real_parent, ns);
+			ckpt_debug("task[%d]: vpid %d vtgid %d parent %d\n",
+				   pos, h[n].vpid, h[n].vtgid, h[n].vppid);
+			pos++;
+		}
+		rcu_read_unlock();
+
+		n = min(nr_tasks, CKPT_HDR_PIDS_CHUNK);
+		ret = ckpt_kwrite(ctx, h, n * sizeof(*h));
+		if (ret < 0)
+			break;
+
+		nr_tasks -= n;
+	} while (nr_tasks > 0);
+
+	_ckpt_hdr_put(ctx, h, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
+	return ret;
+}
+
+/* count number of tasks in tree (and optionally fill pid's in array) */
+static int tree_count_tasks(struct ckpt_ctx *ctx)
+{
+	struct task_struct *root;
+	struct task_struct *task;
+	struct task_struct *parent;
+	struct task_struct **tasks_arr = ctx->tasks_arr;
+	int nr_tasks = ctx->nr_tasks;
+	int nr = 0;
+	int ret;
+
+	read_lock(&tasklist_lock);
+
+	/* we hold the lock, so root_task->real_parent can't change */
+	task = ctx->root_task;
+	if (ctx->root_init) {
+		/* container-init: start from container parent */
+		parent = task->real_parent;
+		root = parent;
+	} else {
+		/* non-container-init: start from root task and down */
+		parent = NULL;
+		root = task;
+	}
+
+	/* count tasks via DFS scan of the tree */
+	while (1) {
+		/* is this task cool ? */
+		ret = may_checkpoint_task(ctx, task);
+		if (ret < 0) {
+			nr = ret;
+			break;
+		}
+		if (tasks_arr) {
+			/* unlikely... but if so then try again later */
+			if (nr == nr_tasks) {
+				nr = -EAGAIN; /* cleanup in ckpt_ctx_free() */
+				break;
+			}
+			tasks_arr[nr] = task;
+			get_task_struct(task);
+		}
+		nr++;
+		/* if has children - proceed with child */
+		if (!list_empty(&task->children)) {
+			parent = task;
+			task = list_entry(task->children.next,
+					  struct task_struct, sibling);
+			continue;
+		}
+		while (task != root) {
+			/* if has sibling - proceed with sibling */
+			if (!list_is_last(&task->sibling, &parent->children)) {
+				task = list_entry(task->sibling.next,
+						  struct task_struct, sibling);
+				break;
+			}
+
+			/* else, trace back to parent and proceed */
+			task = parent;
+			parent = parent->real_parent;
+		}
+		if (task == root)
+			break;
+	}
+
+	read_unlock(&tasklist_lock);
+
+	if (nr < 0)
+		ckpt_write_err(ctx, NULL);
+	return nr;
+}
+
+/*
+ * build_tree - scan the tasks tree in DFS order and fill in array
+ * @ctx: checkpoint context
+ *
+ * Using DFS order simplifies the restart logic to re-create the tasks.
+ *
+ * On success, ctx->tasks_arr will be allocated and populated with all
+ * tasks (reference taken), and ctx->nr_tasks will hold the total count.
+ * The array is cleaned up by ckpt_ctx_free().
+ */
+static int build_tree(struct ckpt_ctx *ctx)
+{
+	int n, m;
+
+	/* count tasks (no side effects) */
+	n = tree_count_tasks(ctx);
+	if (n < 0)
+		return n;
+
+	ctx->nr_tasks = n;
+	ctx->tasks_arr = kzalloc(n * sizeof(*ctx->tasks_arr), GFP_KERNEL);
+	if (!ctx->tasks_arr)
+		return -ENOMEM;
+
+	/* count again (now will fill array) */
+	m = tree_count_tasks(ctx);
+
+	/* unlikely, but ... (cleanup in ckpt_ctx_free) */
+	if (m < 0)
+		return m;
+	else if (m != n)
+		return -EBUSY;
+
 	return 0;
 }
 
+/* dump the array that describes the tasks tree */
+static int checkpoint_tree(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_tree *h;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TREE);
+	if (!h)
+		return -ENOMEM;
+
+	h->nr_tasks = ctx->nr_tasks;
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0)
+		return ret;
+
+	ret = checkpoint_pids(ctx);
+	return ret;
+}
+
 static int get_container(struct ckpt_ctx *ctx, pid_t pid)
 {
 	struct task_struct *task = NULL;
 	struct nsproxy *nsproxy = NULL;
-	int ret;
 
 	ctx->root_pid = pid;
 
@@ -296,13 +512,6 @@ static int get_container(struct ckpt_ctx *ctx, pid_t pid)
 	if (!task)
 		return -ESRCH;
 
-	ret = may_checkpoint_task(ctx, task);
-	if (ret) {
-		ckpt_write_err(ctx, NULL);
-		put_task_struct(task);
-		return ret;
-	}
-
 	rcu_read_lock();
 	nsproxy = task_nsproxy(task);
 	get_nsproxy(nsproxy);
@@ -310,6 +519,10 @@ static int get_container(struct ckpt_ctx *ctx, pid_t pid)
 
 	ctx->root_task = task;
 	ctx->root_nsproxy = nsproxy;
+	ctx->root_init = is_container_init(task);
+
+	if (!(ctx->uflags & CHECKPOINT_SUBTREE) && !ctx->root_init)
+		return -EINVAL;  /* cleanup by ckpt_ctx_free() */
 
 	return 0;
 }
@@ -349,10 +562,16 @@ int do_checkpoint(struct ckpt_ctx *ctx, pid_t pid)
 	ret = init_checkpoint_ctx(ctx, pid);
 	if (ret < 0)
 		goto out;
+	ret = build_tree(ctx);
+	if (ret < 0)
+		goto out;
 	ret = checkpoint_write_header(ctx);
 	if (ret < 0)
 		goto out;
-	ret = checkpoint_task(ctx, ctx->root_task);
+	ret = checkpoint_tree(ctx);
+	if (ret < 0)
+		goto out;
+	ret = checkpoint_all_tasks(ctx);
 	if (ret < 0)
 		goto out;
 	ret = checkpoint_write_tail(ctx);
diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index e5067a9..8b8229e 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -304,7 +304,7 @@ static int restore_read_header(struct ckpt_ctx *ctx)
 	    h->minor != ((LINUX_VERSION_CODE >> 8) & 0xff) ||
 	    h->patch != ((LINUX_VERSION_CODE) & 0xff))
 		goto out;
-	if (h->uflags)
+	if (h->uflags & ~CHECKPOINT_USER_FLAGS)
 		goto out;
 	if (h->uts_release_len != sizeof(uts->release) ||
 	    h->uts_version_len != sizeof(uts->version) ||
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 5b91a39..46eadf5 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -22,6 +22,14 @@
 #include <linux/checkpoint.h>
 
 /*
+ * ckpt_unpriv_allowed - sysctl controlled, do not allow checkpoints or
+ * restarts unless caller has CAP_SYS_ADMIN, if 0 (prevent unprivileged
+ * useres from expoitling any privilege escalation bugs). If it is 1,
+ * then regular permissions checks are intended to do the job.
+ */
+int ckpt_unpriv_allowed = 1;	/* default: allow */
+
+/*
  * Helpers to write(read) from(to) kernel space to(from) the checkpoint
  * image file descriptor (similar to how a core-dump is performed).
  *
@@ -165,6 +173,19 @@ void *ckpt_hdr_get_type(struct ckpt_ctx *ctx, int len, int type)
  * restart operation, and persists until the operation is completed.
  */
 
+static void task_arr_free(struct ckpt_ctx *ctx)
+{
+	int n;
+
+	for (n = 0; n < ctx->nr_tasks; n++) {
+		if (ctx->tasks_arr[n]) {
+			put_task_struct(ctx->tasks_arr[n]);
+			ctx->tasks_arr[n] = NULL;
+		}
+	}
+	kfree(ctx->tasks_arr);
+}
+
 static void ckpt_ctx_free(struct ckpt_ctx *ctx)
 {
 	if (ctx->file)
@@ -174,6 +195,9 @@ static void ckpt_ctx_free(struct ckpt_ctx *ctx)
 	path_put(&ctx->fs_mnt);
 	ckpt_pgarr_free(ctx);
 
+	if (ctx->tasks_arr)
+		task_arr_free(ctx);
+
 	if (ctx->root_nsproxy)
 		put_nsproxy(ctx->root_nsproxy);
 	if (ctx->root_task)
@@ -228,10 +252,12 @@ SYSCALL_DEFINE3(checkpoint, pid_t, pid, int, fd, unsigned long, flags)
 	struct ckpt_ctx *ctx;
 	int ret;
 
-	/* no flags for now */
-	if (flags)
+	if (flags & ~CHECKPOINT_USER_FLAGS)
 		return -EINVAL;
 
+	if (!ckpt_unpriv_allowed && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	if (pid == 0)
 		pid = task_pid_vnr(current);
 	ctx = ckpt_ctx_alloc(fd, flags, CKPT_CTX_CHECKPOINT);
@@ -265,6 +291,9 @@ SYSCALL_DEFINE3(restart, pid_t, pid, int, fd, unsigned long, flags)
 	if (flags)
 		return -EINVAL;
 
+	if (!ckpt_unpriv_allowed && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	ctx = ckpt_ctx_alloc(fd, flags, CKPT_CTX_RESTART);
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 939f4e3..36328e1 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -46,7 +46,8 @@ enum {
 	CKPT_HDR_STRING,
 	CKPT_HDR_OBJREF,
 
-	CKPT_HDR_TASK = 101,
+	CKPT_HDR_TREE = 101,
+	CKPT_HDR_TASK,
 	CKPT_HDR_TASK_OBJS,
 	CKPT_HDR_RESTART_BLOCK,
 	CKPT_HDR_THREAD,
@@ -125,6 +126,19 @@ struct ckpt_hdr_tail {
 	__u64 magic;
 } __attribute__((aligned(8)));
 
+/* task tree */
+struct ckpt_hdr_tree {
+	struct ckpt_hdr h;
+	__s32 nr_tasks;
+} __attribute__((aligned(8)));
+
+struct ckpt_hdr_pids {
+	__s32 vpid;
+	__s32 vppid;
+	__s32 vtgid;
+	__s32 vpgid;
+	__s32 vsid;
+} __attribute__((aligned(8)));
 
 /* task data */
 struct ckpt_hdr_task {
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index 72052ef..d5db5c9 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -13,6 +13,9 @@
 #define CHECKPOINT_VERSION  1
 
 
+#define CHECKPOINT_SUBTREE	0x1
+
+
 #ifdef __KERNEL__
 struct ckpt_ctx;
 struct ckpt_hdr_file;
@@ -30,9 +33,10 @@ struct ckpt_ctx {
 
 	ktime_t ktime_begin;	/* checkpoint start time */
 
-	pid_t root_pid;		/* container identifier */
-	struct task_struct *root_task;	/* container root task */
-	struct nsproxy *root_nsproxy;	/* container root nsproxy */
+	int root_init;		/* is root a container init ? */
+	pid_t root_pid;		/* (container) root identifier */
+	struct task_struct *root_task;	/* (container) root task */
+	struct nsproxy *root_nsproxy;	/* (container) root nsproxy */
 
 	unsigned long kflags;	/* kerenl flags */
 	unsigned long uflags;	/* user flags */
@@ -41,6 +45,9 @@ struct ckpt_ctx {
 	struct file *file;	/* input/output file */
 	int total;		/* total read/written */
 
+	struct task_struct **tasks_arr;	/* array of all tasks in container */
+	int nr_tasks;			/* size of tasks array */
+
 	struct ckpt_obj_hash *obj_hash;	/* repository for shared objects */
 
 	struct path fs_mnt;     /* container root (FIXME) */
@@ -58,6 +65,9 @@ struct ckpt_ctx {
 #define CKPT_CTX_CHECKPOINT	(1 << CKPT_CTX_CHECKPOINT_BIT)
 #define CKPT_CTX_RESTART	(1 << CKPT_CTX_RESTART_BIT)
 
+/* ckpt ctx: uflags */
+#define CHECKPOINT_USER_FLAGS		CHECKPOINT_SUBTREE
+
 #endif /* __KERNEL__ */
 
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b2970d5..c476f7c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -194,6 +194,10 @@ int sysctl_legacy_va_layout;
 extern int prove_locking;
 extern int lock_stat;
 
+#ifdef CONFIG_CHECKPOINT
+extern int ckpt_unpriv_allowed;
+#endif
+
 /* The default sysctl tables: */
 
 static struct ctl_table root_table[] = {
@@ -912,6 +916,19 @@ static struct ctl_table kern_table[] = {
 		.child		= slow_work_sysctls,
 	},
 #endif
+#ifdef CONFIG_CHECKPOINT
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "ckpt_unpriv_allowed",
+		.data		= &ckpt_unpriv_allowed,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+#endif
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 23/43] c/r: restart multiple processes
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (17 preceding siblings ...)
  2009-05-27 17:32 ` [RFC v16][PATCH 22/43] c/r: checkpoint multiple processes Oren Laadan
@ 2009-05-27 17:32 ` Oren Laadan
       [not found]   ` <1243445589-32388-24-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  2009-05-27 17:32 ` [RFC v16][PATCH 24/43] c/r: detect resource leaks for whole-container checkpoint Oren Laadan
                   ` (15 subsequent siblings)
  34 siblings, 1 reply; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
Restarting of multiple processes expects all restarting tasks to call
sys_restart(). Once inside the system call, each task will restart
itself at the same order that they were saved. The internals of the
syscall will take care of in-kernel synchronization bewteen tasks.
This patch does _not_ create the task tree in the kernel. Instead it
assumes that all tasks are created in some way and then invoke the
restart syscall. You can use the userspace mktree.c program to do
that.
The init task (*) has a special role: it allocates the restart context
(ctx), and coordinates the operation. In particular, it first waits
until all participating tasks enter the kernel, and provides them the
common restart context. Once everyone in ready, it begins to restart
itself.
In contrast, the other tasks enter the kernel, locate the init task (*)
and grab its restart context, and then wait for their turn to restore.
When a task (init or not) completes its restart, it hands the control
over to the next in line, by waking that task.
An array of pids (the one saved during the checkpoint) is used to
synchronize the operation. The first task in the array is the init
task (*). The restart context (ctx) maintain a "current position" in
the array, which indicates which task is currently active. Once the
currently active task completes its own restart, it increments that
position and wakes up the next task.
Restart assumes that userspace provides meaningful data, otherwise
it's garbage-in-garbage-out. In this case, the syscall may block
indefinitely, but in TASK_INTERRUPTIBLE, so the user can ctrl-c or
otherwise kill the stray restarting tasks.
In terms of security, restart runs as the user the invokes it, so it
will not allow a user to do more than is otherwise permitted by the
usual system semantics and policy.
Currently we ignore threads and zombies, as well as session ids.
Add support for multiple processes
(*) For containers, restart should be called inside a fresh container
by the init task of that container. However, it is also possible to
restart applications not necessarily inside a container, and without
restoring the original pids of the processes (that is, provided that
the application can tolerate such behavior). This is useful to allow
multi-process restart of tasks not isolated inside a container, and
also for debugging.
Changelog[v14]:
  - Revert change to pr_debug(), back to ckpt_debug()
  - Discard field 'h.parent'
  - Check whether calls to ckpt_hbuf_get() fail
Changelog[v13]:
  - Clear root_task->checkpoint_ctx regardless of error condition
  - Remove unused argument 'ctx' from do_restore_task() prototype
  - Remove unused member 'pids_err' from 'struct ckpt_ctx'
Changelog[v12]:
  - Replace obsolete ckpt_debug() with pr_debug()
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
 checkpoint/restart.c             |  242 ++++++++++++++++++++++++++++++++++++--
 checkpoint/sys.c                 |   27 ++++-
 include/linux/checkpoint.h       |    3 +
 include/linux/checkpoint_types.h |   17 +++-
 include/linux/sched.h            |    4 +
 5 files changed, 277 insertions(+), 16 deletions(-)
diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index 8b8229e..5e68835 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -13,6 +13,7 @@
 
 #include <linux/version.h>
 #include <linux/sched.h>
+#include <linux/wait.h>
 #include <linux/file.h>
 #include <linux/magic.h>
 #include <linux/utsname.h>
@@ -353,12 +354,233 @@ static int restore_read_tail(struct ckpt_ctx *ctx)
 	return ret;
 }
 
+/* restore_read_tree - read the tasks tree into the checkpoint context */
+static int restore_read_tree(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_tree *h;
+	int size, ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TREE);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	ret = -EINVAL;
+	if (h->nr_tasks < 0)
+		goto out;
+
+	ctx->nr_pids = h->nr_tasks;
+	size = sizeof(*ctx->pids_arr) * ctx->nr_pids;
+	if (size < 0)		/* overflow ? */
+		goto out;
+
+	ctx->pids_arr = kmalloc(size, GFP_KERNEL);
+	if (!ctx->pids_arr) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	ret = _ckpt_read_buffer(ctx, ctx->pids_arr, size);
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+static inline pid_t active_pid(struct ckpt_ctx *ctx)
+{
+	return ctx->pids_arr[ctx->active_pid].vpid;
+}
+
+static int restore_wait_task(struct ckpt_ctx *ctx)
+{
+	pid_t pid = task_pid_vnr(current);
+
+	ckpt_debug("pid %d waiting\n", pid);
+	return wait_event_interruptible(ctx->waitq, active_pid(ctx) == pid);
+}
+
+static int restore_next_task(struct ckpt_ctx *ctx)
+{
+	struct task_struct *task;
+
+	ctx->active_pid++;
+
+	ckpt_debug("active_pid %d of %d\n", ctx->active_pid, ctx->nr_pids);
+	if (ctx->active_pid == ctx->nr_pids) {
+		complete(&ctx->complete);
+		return 0;
+	}
+
+	ckpt_debug("pids_next %d\n", active_pid(ctx));
+
+	rcu_read_lock();
+	task = find_task_by_pid_ns(active_pid(ctx), ctx->root_nsproxy->pid_ns);
+	if (task)
+		wake_up_process(task);
+	rcu_read_unlock();
+
+	if (!task) {
+		complete(&ctx->complete);
+		return -ESRCH;
+	}
+
+	return 0;
+}
+
+/* FIXME: this should be per container */
+DECLARE_WAIT_QUEUE_HEAD(restore_waitq);
+
+static int do_restore_task(pid_t pid)
+{
+	struct task_struct *root_task;
+	struct ckpt_ctx *ctx = NULL;
+	int ret;
+
+	rcu_read_lock();
+	root_task = find_task_by_pid_ns(pid, current->nsproxy->pid_ns);
+	if (root_task)
+		get_task_struct(root_task);
+	rcu_read_unlock();
+
+	if (!root_task)
+		return -EINVAL;
+
+	/*
+	 * wait for container init to initialize the restart context, then
+	 * grab a reference to that context, and if we're the last task to
+	 * do it, notify the container init.
+	 */
+	ret = wait_event_interruptible(restore_waitq,
+				       root_task->checkpoint_ctx);
+	if (ret < 0)
+		goto out;
+
+	task_lock(root_task);
+	ctx = root_task->checkpoint_ctx;
+	if (ctx)
+		ckpt_ctx_get(ctx);
+	task_unlock(root_task);
+
+	if (!ctx) {
+		ret = -EAGAIN;
+		goto out;
+	}
+
+	if (atomic_dec_and_test(&ctx->tasks_count))
+		complete(&ctx->complete);
+
+	/* wait for our turn, do the restore, and tell next task in line */
+	ret = restore_wait_task(ctx);
+	if (ret < 0)
+		goto out;
+
+	ret = restore_task(ctx);
+	if (ret < 0)
+		goto out;
+
+	ret = restore_next_task(ctx);
+ out:
+	ckpt_ctx_put(ctx);
+	put_task_struct(root_task);
+	return ret;
+}
+
+/**
+ * wait_all_tasks_start - wait for all tasks to enter sys_restart()
+ * @ctx: checkpoint context
+ *
+ * Called by the container root to wait until all restarting tasks
+ * are ready to restore their state. Temporarily advertises the 'ctx'
+ * on 'current->checkpoint_ctx' so that others can grab a reference
+ * to it, and clears it once synchronization completes. See also the
+ * related code in do_restore_task().
+ */
+static int wait_all_tasks_start(struct ckpt_ctx *ctx)
+{
+	int ret;
+
+	if (ctx->nr_pids == 1)
+		return 0;
+
+	init_completion(&ctx->complete);
+	current->checkpoint_ctx = ctx;
+
+	wake_up_all(&restore_waitq);
+
+	ret = wait_for_completion_interruptible(&ctx->complete);
+
+	task_lock(current);
+	current->checkpoint_ctx = NULL;
+	task_unlock(current);
+
+	return ret;
+}
+
+static int wait_all_tasks_finish(struct ckpt_ctx *ctx)
+{
+	int ret;
+
+	if (ctx->nr_pids == 1)
+		return 0;
+
+	init_completion(&ctx->complete);
+
+	ret = restore_next_task(ctx);
+	if (ret < 0)
+		return ret;
+
+	ret = wait_for_completion_interruptible(&ctx->complete);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
 /* setup restart-specific parts of ctx */
 static int init_restart_ctx(struct ckpt_ctx *ctx, pid_t pid)
 {
+	ctx->root_pid = pid;
+	ctx->root_task = current;
+	ctx->root_nsproxy = current->nsproxy;
+
+	get_task_struct(ctx->root_task);
+	get_nsproxy(ctx->root_nsproxy);
+
+	atomic_set(&ctx->tasks_count, ctx->nr_pids - 1);
+
 	return 0;
 }
 
+static int do_restore_root(struct ckpt_ctx *ctx, pid_t pid)
+{
+	int ret;
+
+	ret = restore_read_header(ctx);
+	if (ret < 0)
+		return ret;
+	ret = restore_read_tree(ctx);
+	if (ret < 0)
+		return ret;
+
+	ret = init_restart_ctx(ctx, pid);
+	if (ret < 0)
+		return ret;
+
+	/* wait for all other tasks to enter do_restore_task() */
+	ret = wait_all_tasks_start(ctx);
+	if (ret < 0)
+		return ret;
+
+	ret = restore_task(ctx);
+	if (ret < 0)
+		return ret;
+
+	/* wait for all other tasks to complete do_restore_task() */
+	ret = wait_all_tasks_finish(ctx);
+	if (ret < 0)
+		return ret;
+
+	return restore_read_tail(ctx);
+}
+
 static int restore_retval(void)
 {
 	struct pt_regs *regs = task_pt_regs(current);
@@ -391,18 +613,18 @@ int do_restart(struct ckpt_ctx *ctx, pid_t pid)
 {
 	int ret;
 
-	ret = init_restart_ctx(ctx, pid);
-	if (ret < 0)
-		return ret;
-	ret = restore_read_header(ctx);
-	if (ret < 0)
-		return ret;
-	ret = restore_task(ctx);
-	if (ret < 0)
-		return ret;
-	ret = restore_read_tail(ctx);
+	if (ctx)
+		ret = do_restore_root(ctx, pid);
+	else
+		ret = do_restore_task(pid);
+
 	if (ret < 0)
 		return ret;
 
+	/*
+	 * The retval from either is what we return to the caller when all
+	 * goes well: this is the retval from the original syscall that was
+	 * interrupted during checkpoint (zero if the task was in userspace).
+	 */
 	return restore_retval();
 }
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 46eadf5..f6cf0ac 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -188,6 +188,8 @@ static void task_arr_free(struct ckpt_ctx *ctx)
 
 static void ckpt_ctx_free(struct ckpt_ctx *ctx)
 {
+	BUG_ON(atomic_read(&ctx->refcount));
+
 	if (ctx->file)
 		fput(ctx->file);
 
@@ -203,6 +205,8 @@ static void ckpt_ctx_free(struct ckpt_ctx *ctx)
 	if (ctx->root_task)
 		put_task_struct(ctx->root_task);
 
+	kfree(ctx->pids_arr);
+
 	kfree(ctx);
 }
 
@@ -220,8 +224,10 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
 	ctx->kflags = kflags;
 	ctx->ktime_begin = ktime_get();
 
+	atomic_set(&ctx->refcount, 0);
 	INIT_LIST_HEAD(&ctx->pgarr_list);
 	INIT_LIST_HEAD(&ctx->pgarr_pool);
+	init_waitqueue_head(&ctx->waitq);
 
 	err = -EBADF;
 	ctx->file = fget(fd);
@@ -232,12 +238,24 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
 	if (ckpt_obj_hash_alloc(ctx) < 0)
 		goto err;
 
+	atomic_inc(&ctx->refcount);
 	return ctx;
  err:
 	ckpt_ctx_free(ctx);
 	return ERR_PTR(err);
 }
 
+void ckpt_ctx_get(struct ckpt_ctx *ctx)
+{
+	atomic_inc(&ctx->refcount);
+}
+
+void ckpt_ctx_put(struct ckpt_ctx *ctx)
+{
+	if (ctx && atomic_dec_and_test(&ctx->refcount))
+		ckpt_ctx_free(ctx);
+}
+
 /**
  * sys_checkpoint - checkpoint a container
  * @pid: pid of the container init(1) process
@@ -269,7 +287,7 @@ SYSCALL_DEFINE3(checkpoint, pid_t, pid, int, fd, unsigned long, flags)
 	if (!ret)
 		ret = ctx->crid;
 
-	ckpt_ctx_free(ctx);
+	ckpt_ctx_put(ctx);
 	return ret;
 }
 
@@ -284,7 +302,7 @@ SYSCALL_DEFINE3(checkpoint, pid_t, pid, int, fd, unsigned long, flags)
  */
 SYSCALL_DEFINE3(restart, pid_t, pid, int, fd, unsigned long, flags)
 {
-	struct ckpt_ctx *ctx;
+	struct ckpt_ctx *ctx = NULL;
 	int ret;
 
 	/* no flags for now */
@@ -294,13 +312,14 @@ SYSCALL_DEFINE3(restart, pid_t, pid, int, fd, unsigned long, flags)
 	if (!ckpt_unpriv_allowed && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	ctx = ckpt_ctx_alloc(fd, flags, CKPT_CTX_RESTART);
+	if (pid == task_pid_vnr(current))
+		ctx = ckpt_ctx_alloc(fd, flags, CKPT_CTX_RESTART);
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
 
 	ret = do_restart(ctx, pid);
 
-	ckpt_ctx_free(ctx);
+	ckpt_ctx_put(ctx);
 	return ret;
 }
 
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index e1204cf..e9efa34 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -51,6 +51,9 @@ extern int ckpt_obj_lookup_add(struct ckpt_ctx *ctx, void *ptr,
 extern int ckpt_obj_insert(struct ckpt_ctx *ctx, void *ptr, int objref,
 			   enum obj_type type);
 
+extern void ckpt_ctx_get(struct ckpt_ctx *ctx);
+extern void ckpt_ctx_put(struct ckpt_ctx *ctx);
+
 extern int do_checkpoint(struct ckpt_ctx *ctx, pid_t pid);
 extern int do_restart(struct ckpt_ctx *ctx, pid_t pid);
 
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index d5db5c9..f39e1c1 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -25,6 +25,8 @@ struct ckpt_hdr_vma;
 #include <linux/path.h>
 #include <linux/fs.h>
 #include <linux/ktime.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
 
 #include <linux/sched.h>
 
@@ -45,8 +47,7 @@ struct ckpt_ctx {
 	struct file *file;	/* input/output file */
 	int total;		/* total read/written */
 
-	struct task_struct **tasks_arr;	/* array of all tasks in container */
-	int nr_tasks;			/* size of tasks array */
+	atomic_t refcount;
 
 	struct ckpt_obj_hash *obj_hash;	/* repository for shared objects */
 
@@ -56,6 +57,18 @@ struct ckpt_ctx {
 
 	struct list_head pgarr_list;	/* page array to dump VMA contents */
 	struct list_head pgarr_pool;	/* pool of empty page arrays chain */
+
+	/* [multi-process checkpoint] */
+	struct task_struct **tasks_arr; /* array of all tasks [checkpoint] */
+	int nr_tasks;                   /* size of tasks array */
+
+	/* [multi-process restart] */
+	struct ckpt_hdr_pids *pids_arr;	/* array of all pids [restart] */
+	int nr_pids;			/* size of pids array */
+	int active_pid;			/* (next) position in pids array */
+	atomic_t tasks_count;		/* sync of tasks: used to coordinate */
+	struct completion complete;	/* container root and other tasks on */
+	wait_queue_head_t waitq;	/* start, end, and restart ordering */
 };
 
 /* ckpt_ctx: kflags */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b4c38bc..d057e7a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1429,6 +1429,10 @@ struct task_struct {
 	/* state flags for use by tracers */
 	unsigned long trace;
 #endif
+
+#ifdef CONFIG_CHECKPOINT
+	struct ckpt_ctx *checkpoint_ctx;
+#endif
 };
 
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 24/43] c/r: detect resource leaks for whole-container checkpoint
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (18 preceding siblings ...)
  2009-05-27 17:32 ` [RFC v16][PATCH 23/43] c/r: restart " Oren Laadan
@ 2009-05-27 17:32 ` Oren Laadan
  2009-05-27 17:32 ` [RFC v16][PATCH 25/43] tee: don't return 0 when another task drains/fills a pipe Oren Laadan
                   ` (14 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
Add a 'users' count to objhash items, and, for a !CHECKPOINT_SUBTREE
checkpoint, return an error code if the actual objects' counts are
higher, indicating leaks (references to the objects from a task not
being checkpointed).  Of course, by this time most of the checkpoint
image has been written out to disk, so this is purely advisory.  But
then, it's probably naive to argue that anything more than an advisory
'this went wrong' error code is useful.
The comparison of the objhash user counts to object refcounts as a
basis for checking for leaks comes from Alexey's OpenVZ-based c/r
patchset.
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
 checkpoint/checkpoint.c    |    8 ++++
 checkpoint/objhash.c       |   82 ++++++++++++++++++++++++++++++++++++++++++--
 include/linux/checkpoint.h |    1 +
 3 files changed, 88 insertions(+), 3 deletions(-)
diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index 92f219e..b70adf4 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -578,6 +578,14 @@ int do_checkpoint(struct ckpt_ctx *ctx, pid_t pid)
 	if (ret < 0)
 		goto out;
 
+	if (!(ctx->uflags & CHECKPOINT_SUBTREE)) {
+		/* verify that all objects are contained (no leaks) */
+		if (!ckpt_obj_contained(ctx)) {
+			ret = -EBUSY;
+			goto out;
+		}
+	}
+
 	/* on success, return (unique) checkpoint identifier */
 	ctx->crid = atomic_inc_return(&ctx_count);
 	ret = ctx->crid;
diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
index ff9388d..e481911 100644
--- a/checkpoint/objhash.c
+++ b/checkpoint/objhash.c
@@ -27,19 +27,23 @@ struct ckpt_obj_ops {
 	enum obj_type obj_type;
 	void (*ref_drop)(void *ptr);
 	int (*ref_grab)(void *ptr);
+	int (*ref_users)(void *ptr);
 	int (*checkpoint)(struct ckpt_ctx *ctx, void *ptr);
 	void *(*restore)(struct ckpt_ctx *ctx);
 };
 
 struct ckpt_obj {
+	int users;
 	int objref;
 	void *ptr;
 	struct ckpt_obj_ops *ops;
 	struct hlist_node hash;
+	struct hlist_node next;
 };
 
 struct ckpt_obj_hash {
 	struct hlist_head *head;
+	struct hlist_head list;
 	int next_free_objref;
 };
 
@@ -53,7 +57,7 @@ void *restore_bad(struct ckpt_ctx *ctx)
 	return ERR_PTR(-EINVAL);
 }
 
-/* helper grab/drop functions: */
+/* helper grab/drop/users functions */
 
 static void obj_no_drop(void *ptr)
 {
@@ -86,6 +90,11 @@ static void obj_file_table_drop(void *ptr)
 	put_files_struct((struct files_struct *) ptr);
 }
 
+static int obj_file_table_users(void *ptr)
+{
+	return atomic_read(&((struct files_struct *) ptr)->count);
+}
+
 static int obj_file_grab(void *ptr)
 {
 	get_file((struct file *) ptr);
@@ -97,6 +106,11 @@ static void obj_file_drop(void *ptr)
 	fput((struct file *) ptr);
 }
 
+static int obj_file_users(void *ptr)
+{
+	return atomic_long_read(&((struct file *) ptr)->f_count);
+}
+
 static int obj_mm_grab(void *ptr)
 {
 	atomic_inc(&((struct mm_struct *) ptr)->mm_users);
@@ -108,6 +122,11 @@ static void obj_mm_drop(void *ptr)
 	mmput((struct mm_struct *) ptr);
 }
 
+static int obj_mm_users(void *ptr)
+{
+	return atomic_read(&((struct mm_struct *) ptr)->mm_users);
+}
+
 static struct ckpt_obj_ops ckpt_obj_ops[] = {
 	/* ignored object */
 	{
@@ -131,6 +150,7 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = {
 		.obj_type = CKPT_OBJ_FILE_TABLE,
 		.ref_drop = obj_file_table_drop,
 		.ref_grab = obj_file_table_grab,
+		.ref_users = obj_file_table_users,
 		.checkpoint = checkpoint_file_table,
 		.restore = restore_file_table,
 	},
@@ -140,6 +160,7 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = {
 		.obj_type = CKPT_OBJ_FILE,
 		.ref_drop = obj_file_drop,
 		.ref_grab = obj_file_grab,
+		.ref_users = obj_file_users,
 		.checkpoint = checkpoint_file,
 		.restore = restore_file,
 	},
@@ -149,6 +170,7 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = {
 		.obj_type = CKPT_OBJ_MM,
 		.ref_drop = obj_mm_drop,
 		.ref_grab = obj_mm_grab,
+		.ref_users = obj_mm_users,
 		.checkpoint = checkpoint_mm,
 		.restore = restore_mm,
 	},
@@ -201,6 +223,7 @@ int ckpt_obj_hash_alloc(struct ckpt_ctx *ctx)
 
 	obj_hash->head = head;
 	obj_hash->next_free_objref = 1;
+	INIT_HLIST_HEAD(&obj_hash->list);
 
 	ctx->obj_hash = obj_hash;
 	return 0;
@@ -259,6 +282,7 @@ static int obj_new(struct ckpt_ctx *ctx, void *ptr, int objref,
 
 	obj->ptr = ptr;
 	obj->ops = ops;
+	obj->users = 2;  /* extra reference that objhash itself takes */
 
 	if (objref) {
 		/* use @obj->objref to index (restart) */
@@ -271,10 +295,12 @@ static int obj_new(struct ckpt_ctx *ctx, void *ptr, int objref,
 	}
 
 	ret = ops->ref_grab(obj->ptr);
-	if (ret < 0)
+	if (ret < 0) {
 		kfree(obj);
-	else
+	} else {
 		hlist_add_head(&obj->hash, &ctx->obj_hash->head[i]);
+		hlist_add_head(&obj->next, &ctx->obj_hash->list);
+	}
 
 	return (ret < 0 ? ret : obj->objref);
 }
@@ -335,6 +361,7 @@ int ckpt_obj_lookup_add(struct ckpt_ctx *ctx, void *ptr,
 		return -EINVAL;
 	} else {
 		objref = obj->objref;
+		obj->users++;
 		*first = 0;
 	}
 
@@ -342,6 +369,54 @@ int ckpt_obj_lookup_add(struct ckpt_ctx *ctx, void *ptr,
 	return objref;
 }
 
+/* increment the 'users' count of an object */
+static void ckpt_obj_users_inc(struct ckpt_ctx *ctx, void *ptr, int increment)
+{
+	struct ckpt_obj *obj;
+
+	obj = obj_find_by_ptr(ctx, ptr);
+	if (obj)
+		obj->users += increment;
+}
+
+/**
+ * ckpt_obj_contained - test if shared objects are "contained" in checkpoint
+ * @ctx: checkpoint
+ *
+ * Loops through all objects in the table and compares the number of
+ * references accumulated during checkpoint, with the reference count
+ * reported by the kernel.
+ *
+ * Return 1 if respective counts match for all objects, 0 otherwise.
+ */
+int ckpt_obj_contained(struct ckpt_ctx *ctx)
+{
+	struct ckpt_obj *obj;
+	struct hlist_node *node;
+
+	/* account for ctx->file reference (if in the table already) */
+	ckpt_obj_users_inc(ctx, ctx->file, 1);
+
+	hlist_for_each_entry(obj, node, &ctx->obj_hash->list, next) {
+		if (!obj->ops->ref_users)
+			continue;
+		if (obj->ops->ref_users(obj->ptr) != obj->users) {
+			ckpt_debug("usage leak: %s\n", obj->ops->obj_name);
+			ckpt_write_err(ctx, "%s leak: users %d != c/r %d\n",
+				       obj->ops->obj_name,
+				       obj->ops->ref_users(obj->ptr),
+				       obj->users);
+			printk(KERN_NOTICE "c/r: %s users %d != count %d\n",
+			       obj->ops->obj_name,
+			       obj->ops->ref_users(obj->ptr),
+			       obj->users);
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
 /**
  * checkpoint_obj - if not already in hash, add object and checkpoint
  * @ctx: checkpoint context
@@ -371,6 +446,7 @@ int checkpoint_obj(struct ckpt_ctx *ctx, void *ptr, enum obj_type type)
 	obj = obj_find_by_ptr(ctx, ptr);
 	if (obj) {
 		BUG_ON(obj->ops->obj_type != type);
+		obj->users++;
 		return obj->objref;
 	}
 
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index e9efa34..171e92e 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -44,6 +44,7 @@ extern int ckpt_obj_hash_alloc(struct ckpt_ctx *ctx);
 
 extern int restore_obj(struct ckpt_ctx *ctx, struct ckpt_hdr_objref *h);
 extern int checkpoint_obj(struct ckpt_ctx *ctx, void *ptr, enum obj_type type);
+extern int ckpt_obj_contained(struct ckpt_ctx *ctx);
 extern void *ckpt_obj_fetch(struct ckpt_ctx *ctx, int objref,
 			    enum obj_type type);
 extern int ckpt_obj_lookup_add(struct ckpt_ctx *ctx, void *ptr,
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 25/43] tee: don't return 0 when another task drains/fills a pipe
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (19 preceding siblings ...)
  2009-05-27 17:32 ` [RFC v16][PATCH 24/43] c/r: detect resource leaks for whole-container checkpoint Oren Laadan
@ 2009-05-27 17:32 ` Oren Laadan
  2009-05-27 17:32 ` [RFC v16][PATCH 26/43] splice: added support for pipe-to-pipe splice() Oren Laadan
                   ` (13 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
This patch is a modified version of Max Kellerman patch that fixes
a race in do_tee() (see http://patchwork/kernel/org/patch/21040).
It differs in that it rafactors link_pipe() so that the following
patch (that adds support for splice() between pipes, also based on
a patch by Max Kellerman), can better share code.
Below is Max's original description:
--
Cite from the tee() manual page:
 "A return value of 0 means that there was no data to transfer, and it
 would not make sense to block, because there are no writers connected
 to the write end of the pipe"
There is however a race condition in the tee() implementation, which
violates this definition:
- do_tee() ensures that ipipe is readable and opipe is writable by
  calling link_ipipe_prep() and link_opipe_prep()
- these two functions unlock the pipe after they have waited
- during this unlocked phase, there is a short window where other
  tasks may drain the input pipe or fill the output pipe
- do_tee() now calls link_pipe(), which re-locks both pipes
- link_pipe() sees that it is unable to read ("i >= ipipe->nrbufs ||
  opipe->nrbufs >= PIPE_BUFFERS") and breaks from the loop
- link_pipe() returns 0
Although there may be writers connected to the input pipe, tee() now
returns 0, and the caller (spuriously) assumes this is the end of the
stream.
This patch wraps the link_[io]pipe_prep() invocation in a loop within
link_pipe(), and loops until the result is reliable.
--
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
 fs/splice.c |   80 +++++++++++++++++++++++++++++++++++++++++++++--------------
 1 files changed, 61 insertions(+), 19 deletions(-)
diff --git a/fs/splice.c b/fs/splice.c
index 666953d..92dd63c 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1586,6 +1586,59 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 	return ret;
 }
 
+/**
+ * link_pipe_prep - make sure there's readable data and writable room
+ * @ipipe: the input pipe
+ * @opipe: the output pipe
+ * @flags: splice modifier flags
+ *
+ * Wrap the link_[io]pipe_prep() invocation in a loop until the result
+ * is reliable.
+ *
+ * Expects pipes to be unlocked, and on success returns them locked.
+ */
+static int link_pipe_prep(struct pipe_inode_info *ipipe,
+			  struct pipe_inode_info *opipe,
+			  unsigned int flags)
+{
+	int ret;
+
+	while (1) {
+		/* wait for ipipe to become ready to read */
+		ret = link_ipipe_prep(ipipe, flags);
+		if (ret)
+			return ret;
+
+		/* wait for opipe to become ready to write */
+		ret = link_opipe_prep(opipe, flags);
+		if (ret)
+			return ret;
+
+		/*
+		 * Potential ABBA deadlock, work around it by ordering
+		 * lock grabbing by inode address. Otherwise two
+		 * different processes could deadlock (one doing tee
+		 * from A -> B, the other from B -> A).
+		 */
+		pipe_double_lock(ipipe, opipe);
+
+		/* see if the tee() is still possible */
+		if ((ipipe->nrbufs > 0 || ipipe->writers == 0) &&
+		    opipe->nrbufs < PIPE_BUFFERS)
+			/* yes, it is - keep the locks and end this
+			   loop */
+			break;
+
+		/* no - someone has drained ipipe or has filled opipe
+		   between link_[io]pipe_pre()'s lock and our lock.
+		   Drop both locks and wait again. */
+		pipe_unlock(ipipe);
+		pipe_unlock(opipe);
+	}
+
+	return 0;
+}
+
 /*
  * Link contents of ipipe to opipe.
  */
@@ -1594,14 +1647,13 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 		     size_t len, unsigned int flags)
 {
 	struct pipe_buffer *ibuf, *obuf;
-	int ret = 0, i = 0, nbuf;
+	int ret, i = 0, nbuf;
 
-	/*
-	 * Potential ABBA deadlock, work around it by ordering lock
-	 * grabbing by pipe info address. Otherwise two different processes
-	 * could deadlock (one doing tee from A -> B, the other from B -> A).
-	 */
-	pipe_double_lock(ipipe, opipe);
+	ret = link_pipe_prep(ipipe, opipe, flags);
+	if (ret < 0)
+		return ret;
+
+	/* pipes are now locked */
 
 	do {
 		if (!opipe->readers) {
@@ -1685,18 +1737,8 @@ static long do_tee(struct file *in, struct file *out, size_t len,
 	 * Duplicate the contents of ipipe to opipe without actually
 	 * copying the data.
 	 */
-	if (ipipe && opipe && ipipe != opipe) {
-		/*
-		 * Keep going, unless we encounter an error. The ipipe/opipe
-		 * ordering doesn't really matter.
-		 */
-		ret = link_ipipe_prep(ipipe, flags);
-		if (!ret) {
-			ret = link_opipe_prep(opipe, flags);
-			if (!ret)
-				ret = link_pipe(ipipe, opipe, len, flags);
-		}
-	}
+	if (ipipe && opipe && ipipe != opipe)
+		ret = link_pipe(ipipe, opipe, len, flags);
 
 	return ret;
 }
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 26/43] splice: added support for pipe-to-pipe splice()
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (20 preceding siblings ...)
  2009-05-27 17:32 ` [RFC v16][PATCH 25/43] tee: don't return 0 when another task drains/fills a pipe Oren Laadan
@ 2009-05-27 17:32 ` Oren Laadan
  2009-05-27 17:32 ` [RFC v16][PATCH 27/43] c/r: support for open pipes Oren Laadan
                   ` (12 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
This patch is a modified version of Max Kellerman patch that allows
splice() between pipes (see http://patchwork/kernel/org/patch/21042).
By refactoring link_pipe(), do_tee() and do_splice_pipes() shrink
considerably. Below is Max's original description:
--
This patch enables the splice() system call to copy buffers from one
pipe to another.  This obvious and trivial use case for splice() was
not supported until now.
It reuses the functions link_ipipe_prep() and link_opipe_prep() from
the tee() system call implementation.
--
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
 fs/splice.c |  203 ++++++++++++++++++++++++++++++++++++++++++++++++-----------
 1 files changed, 166 insertions(+), 37 deletions(-)
diff --git a/fs/splice.c b/fs/splice.c
index 92dd63c..96e0d58 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -903,13 +903,95 @@ ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
 EXPORT_SYMBOL(generic_splice_sendpage);
 
 /*
+ * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
+ * location, so checking ->i_pipe is not enough to verify that this is a
+ * pipe.
+ */
+static inline struct pipe_inode_info *pipe_info(struct inode *inode)
+{
+	if (S_ISFIFO(inode->i_mode))
+		return inode->i_pipe;
+
+	return NULL;
+}
+
+static int link_pipe_prep(struct pipe_inode_info *ipipe,
+			  struct pipe_inode_info *opipe,
+			  unsigned int flags);
+static long do_link_pipe(struct pipe_inode_info *ipipe,
+			 struct pipe_inode_info *opipe,
+			 size_t len, unsigned int flags, int move);
+
+/**
+* Splice pages from one pipe to another.
+*
+* @ipipe the input pipe
+* @opipe the output pipe
+* @len the maximum number of bytes to move
+* @flags splice modifier flags
+*/
+static long do_splice_pipes(struct pipe_inode_info *ipipe,
+			    struct pipe_inode_info *opipe,
+			    size_t len, unsigned int flags)
+{
+	int do_wakeup;
+	long ret;
+
+	if (ipipe == opipe)
+		return -EINVAL;
+
+	ret = link_pipe_prep(ipipe, opipe, flags);
+	if (ret < 0)
+		return ret;
+
+	/* both pipes are now locked */
+
+	do_wakeup = ipipe->nrbufs;
+	ret = do_link_pipe(ipipe, opipe, len, flags, 1);
+	do_wakeup -= ipipe->nrbufs;
+
+	pipe_unlock(ipipe);
+	pipe_unlock(opipe);
+
+	if (do_wakeup) {
+		/* at least one buffer was removed from the
+		   input pipe: wake up potential writers */
+		smp_mb();
+		if (waitqueue_active(&ipipe->wait))
+			wake_up_interruptible(&ipipe->wait);
+		kill_fasync(&ipipe->fasync_writers, SIGIO, POLL_OUT);
+	}
+
+	/*
+	 * If we put data in the output pipe, wakeup any potential
+	 * readers.
+	 */
+	if (ret > 0) {
+		smp_mb();
+		if (waitqueue_active(&opipe->wait))
+			wake_up_interruptible(&opipe->wait);
+		kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
+	}
+
+	return ret;
+}
+
+/*
  * Attempt to initiate a splice from pipe to file.
  */
 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 			   loff_t *ppos, size_t len, unsigned int flags)
 {
+	struct pipe_inode_info *opipe;
 	int ret;
 
+	opipe = pipe_info(out->f_dentry->d_inode);
+	if (opipe) {
+		if (unlikely(!(out->f_mode & FMODE_WRITE)))
+			return -EBADF;
+		return do_splice_pipes(pipe, opipe, len, flags);
+	}
+
 	if (unlikely(!out->f_op || !out->f_op->splice_write))
 		return -EINVAL;
 
@@ -933,8 +1015,16 @@ static long do_splice_to(struct file *in, loff_t *ppos,
 			 struct pipe_inode_info *pipe, size_t len,
 			 unsigned int flags)
 {
+	struct pipe_inode_info *ipipe;
 	int ret;
 
+	ipipe = pipe_info(in->f_dentry->d_inode);
+	if (ipipe) {
+		if (unlikely(!(in->f_mode & FMODE_READ)))
+			return -EBADF;
+		return do_splice_pipes(ipipe, pipe, len, flags);
+	}
+
 	if (unlikely(!in->f_op || !in->f_op->splice_read))
 		return -EINVAL;
 
@@ -1113,19 +1203,6 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 }
 
 /*
- * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
- * location, so checking ->i_pipe is not enough to verify that this is a
- * pipe.
- */
-static inline struct pipe_inode_info *pipe_info(struct inode *inode)
-{
-	if (S_ISFIFO(inode->i_mode))
-		return inode->i_pipe;
-
-	return NULL;
-}
-
-/*
  * Determine where to splice to/from.
  */
 static long do_splice(struct file *in, loff_t __user *off_in,
@@ -1140,7 +1217,10 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 	if (pipe) {
 		if (off_in)
 			return -ESPIPE;
+
 		if (off_out) {
+			if (pipe_info(out->f_path.dentry->d_inode))
+				return -ESPIPE;
 			if (out->f_op->llseek == no_llseek)
 				return -EINVAL;
 			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
@@ -1639,21 +1719,36 @@ static int link_pipe_prep(struct pipe_inode_info *ipipe,
 	return 0;
 }
 
-/*
- * Link contents of ipipe to opipe.
- */
-static int link_pipe(struct pipe_inode_info *ipipe,
-		     struct pipe_inode_info *opipe,
-		     size_t len, unsigned int flags)
+/**
+* Returns the nth pipe buffer after the current one.
+*
+* @i the buffer index, relative to the current one
+*/
+static inline struct pipe_buffer *
+pipe_buffer_at(struct pipe_inode_info *pipe, unsigned i)
 {
-	struct pipe_buffer *ibuf, *obuf;
-	int ret, i = 0, nbuf;
+	BUG_ON(i >= PIPE_BUFFERS);
 
-	ret = link_pipe_prep(ipipe, opipe, flags);
-	if (ret < 0)
-		return ret;
+	return pipe->bufs + ((pipe->curbuf + i) & (PIPE_BUFFERS - 1));
+}
 
-	/* pipes are now locked */
+/**
+ * do_link_pipe - copy or move (splice) contents of ipipe to opipe.
+ * @ipipe: the input pipe
+ * @opipe: the output pipe
+ * @len: the maximum number of byte to copy/move
+ * @flags: splice modifier flags
+ * @move: indicates move operation (otherwise copy)
+ *
+ * expects both pipes to be locked
+ */
+static long do_link_pipe(struct pipe_inode_info *ipipe,
+			struct pipe_inode_info *opipe,
+			size_t len, unsigned int flags, int move)
+{
+	struct pipe_buffer *ibuf, *obuf;
+	long ret = 0;
+	int i = 0;
 
 	do {
 		if (!opipe->readers) {
@@ -1670,16 +1765,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 		if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
 			break;
 
-		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
-		nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
-
-		/*
-		 * Get a reference to this pipe buffer,
-		 * so we can copy the contents over.
-		 */
-		ibuf->ops->get(ipipe, ibuf);
-
-		obuf = opipe->bufs + nbuf;
+		ibuf = pipe_buffer_at(ipipe, i);
+		obuf = pipe_buffer_at(opipe, opipe->nrbufs);
 		*obuf = *ibuf;
 
 		/*
@@ -1688,13 +1775,35 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 		 */
 		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
 
-		if (obuf->len > len)
+		/* increase the reference count */
+		obuf->ops->get(opipe, obuf);
+
+		/* partial or complete copy/move ? */
+		if (obuf->len > len) {
 			obuf->len = len;
+			if (move) {
+				/* remove portion from ibuf */
+				ibuf->offset += len;
+				ibuf->len -= len;
+			}
+		} else {
+			if (move) {
+				/* remove entirely from ibuf */
+				ibuf->ops->release(ipipe, ibuf);
+				ibuf->ops = NULL;
+
+				ipipe->curbuf = (ipipe->curbuf + 1) &
+					(PIPE_BUFFERS - 1);
+				ipipe->nrbufs--;
+			} else {
+				/* advance pointer in ibuf */
+				i++;
+			}
+		}
 
 		opipe->nrbufs++;
 		ret += obuf->len;
 		len -= obuf->len;
-		i++;
 	} while (len);
 
 	/*
@@ -1704,6 +1813,26 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 	if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
 		ret = -EAGAIN;
 
+	return ret;
+}
+
+/*
+ * Link contents of ipipe to opipe.
+ */
+long link_pipe(struct pipe_inode_info *ipipe,
+	       struct pipe_inode_info *opipe,
+	       size_t len, unsigned int flags)
+{
+	long ret;
+
+	ret = link_pipe_prep(ipipe, opipe, flags);
+	if (ret < 0)
+		return ret;
+
+	/* pipes are now locked */
+
+	ret = do_link_pipe(ipipe, opipe, len, flags, 0);
+
 	pipe_unlock(ipipe);
 	pipe_unlock(opipe);
 
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 27/43] c/r: support for open pipes
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (21 preceding siblings ...)
  2009-05-27 17:32 ` [RFC v16][PATCH 26/43] splice: added support for pipe-to-pipe splice() Oren Laadan
@ 2009-05-27 17:32 ` Oren Laadan
  2009-05-27 17:32 ` [RFC v16][PATCH 28/43] c/r: make ckpt_may_checkpoint_task() check each namespace individually Oren Laadan
                   ` (11 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
A pipe is a double-headed inode with a buffer attached to it. We
checkpoint the pipe buffer only once, as soon as we hit one side of
the pipe, regardless whether it is read- or write- end.
To checkpoint a file descriptor that refers to a pipe (either end), we
first lookup the inode in the hash table: If not found, it is the
first encounter of this pipe. Besides the file descriptor, we also (a)
save the pipe data, and (b) register the pipe inode in the hash. If
found, it is the second encounter of this pipe, namely, as we hit the
other end of the same pipe. In both cases we write the pipe-objref of
the inode.
To restore, create a new pipe and thus have two file pointers (read-
and write- ends). We only use one of them, depending on which side was
checkpointed first. We register the file pointer of the other end in
the hash table, with the pipe_objref given for this pipe from the
checkpoint, to be used later when the other arrives. At this point we
also restore the contents of the pipe buffers.
To save the pipe buffer, given a source pipe, use do_tee() to clone
its contents into a temporary 'struct pipe_inode_info', and then use
do_splice_from() to transfer it directly to the checkpoint image file.
To restore the pipe buffer, with a fresh newly allocated target pipe,
use do_splice_to() to splice the data directly between the checkpoint
image file and the pipe.
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
 checkpoint/files.c             |    7 ++
 fs/pipe.c                      |  173 ++++++++++++++++++++++++++++++++++++++++
 fs/splice.c                    |   10 +-
 include/linux/checkpoint_hdr.h |   12 +++
 include/linux/pipe_fs_i.h      |    6 ++
 include/linux/splice.h         |    9 ++
 6 files changed, 212 insertions(+), 5 deletions(-)
diff --git a/checkpoint/files.c b/checkpoint/files.c
index d7583d3..b264e40 100644
--- a/checkpoint/files.c
+++ b/checkpoint/files.c
@@ -17,6 +17,7 @@
 #include <linux/file.h>
 #include <linux/fdtable.h>
 #include <linux/fsnotify.h>
+#include <linux/pipe_fs_i.h>
 #include <linux/syscalls.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
@@ -433,6 +434,12 @@ static struct restore_file_ops restore_file_ops[] = {
 		.file_type = CKPT_FILE_GENERIC,
 		.restore = generic_file_restore,
 	},
+	/* pipes */
+	{
+		.file_name = "PIPE",
+		.file_type = CKPT_FILE_PIPE,
+		.restore = pipe_file_restore,
+	},
 };
 
 static struct file *do_restore_file(struct ckpt_ctx *ctx)
diff --git a/fs/pipe.c b/fs/pipe.c
index 13414ec..d0aba56 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -13,6 +13,7 @@
 #include <linux/fs.h>
 #include <linux/mount.h>
 #include <linux/pipe_fs_i.h>
+#include <linux/splice.h>
 #include <linux/uio.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
@@ -22,6 +23,9 @@
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
 
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+
 /*
  * We use a start+len construction, which provides full use of the 
  * allocated memory.
@@ -795,6 +799,172 @@ pipe_rdwr_open(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+#ifdef CONFIG_CHECKPOINT
+static int checkpoint_pipe(struct ckpt_ctx *ctx, struct inode *inode)
+{
+	struct ckpt_hdr_file_pipe_state *h;
+	struct pipe_inode_info *pipe;
+	int len, ret = -ENOMEM;
+
+	pipe = alloc_pipe_info(NULL);
+	if (!pipe)
+		return ret;
+
+	pipe->readers = 1;	/* bluff link_pipe() below */
+	len = link_pipe(inode->i_pipe, pipe, INT_MAX, SPLICE_F_NONBLOCK);
+	if (len == -EAGAIN)
+		len = 0;
+	if (len < 0) {
+		ret = len;
+		goto out;
+	}
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE_PIPE);
+	if (!h)
+		goto out;
+	h->pipe_len = len;
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0)
+		goto out;
+
+	ret = do_splice_from(pipe, ctx->file, &ctx->file->f_pos, len, 0);
+	if (ret < 0)
+		goto out;
+	if (ret != len)
+		ret = -EPIPE;  /* can occur due to an error in target file */
+ out:
+	__free_pipe_info(pipe);
+	return ret;
+}
+
+static int pipe_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
+{
+	struct ckpt_hdr_file_pipe *h;
+	struct inode *inode = file->f_dentry->d_inode;
+	int objref, first, ret;
+
+	objref = ckpt_obj_lookup_add(ctx, inode, CKPT_OBJ_INODE, &first);
+	if (objref < 0)
+		return objref;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
+	if (!h)
+		return -ENOMEM;
+
+	h->common.f_type = CKPT_FILE_PIPE;
+	h->pipe_objref = objref;
+
+	ret = checkpoint_file_common(ctx, file, &h->common);
+	if (ret < 0)
+		goto out;
+	ret = ckpt_write_obj(ctx, &h->common.h);
+	if (ret < 0)
+		goto out;
+
+	if (first)
+		ret = checkpoint_pipe(ctx, inode);
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+static int restore_pipe(struct ckpt_ctx *ctx, struct file *file)
+{
+	struct ckpt_hdr_file_pipe_state *h;
+	struct pipe_inode_info *pipe;
+	int len, ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_FILE_PIPE);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	len = h->pipe_len;
+	ckpt_hdr_put(ctx, h);
+
+	if (len < 0)
+		return -EINVAL;
+
+	pipe = file->f_dentry->d_inode->i_pipe;
+	ret = do_splice_to(ctx->file, &ctx->file->f_pos, pipe, len, 0);
+
+	if (ret >= 0 && ret != len)
+		ret = -EPIPE;  /* can occur due to an error in source file */
+
+	return ret;
+}
+
+struct file *pipe_file_restore(struct ckpt_ctx *ctx, struct ckpt_hdr_file *ptr)
+{
+	struct ckpt_hdr_file_pipe *h = (struct ckpt_hdr_file_pipe *) ptr;
+	struct file *file;
+	int fds[2], which, ret;
+
+	if (ptr->h.type != CKPT_HDR_FILE  ||
+	    ptr->h.len != sizeof(*h) || ptr->f_type != CKPT_FILE_PIPE)
+		return ERR_PTR(-EINVAL);
+
+	if (h->pipe_objref <= 0)
+		return ERR_PTR(-EINVAL);
+
+	file = ckpt_obj_fetch(ctx, h->pipe_objref, CKPT_OBJ_FILE);
+	/*
+	 * If ckpt_obj_fetch() returned NULL, then this is the first
+	 * time we see this pipe so need to restore the contents.
+	 * Otherwise, use the file pointer skip forward.
+	 */
+	if (!IS_ERR(file)) {
+		get_file(file);
+	} else if (PTR_ERR(file) == -EINVAL) {
+		/* first encounter of this pipe: create it */
+		ret = do_pipe_flags(fds, 0);
+		if (ret < 0)
+			return file;
+
+		which = (ptr->f_flags & O_WRONLY ? 1 : 0);
+		/*
+		 * Below we return the file corersponding to one side
+		 * of the pipe for our caller to use. Now insert the
+		 * other side of the pipe to the hash, to be picked up
+		 * when that side is restored.
+		 */
+		file = fget(fds[1-which]);	/* the 'other' side */
+		if (!file)	/* this should _never_ happen ! */
+			return ERR_PTR(-EBADF);
+		ret = ckpt_obj_insert(ctx, file,
+				      h->pipe_objref, CKPT_OBJ_FILE);
+		if (ret < 0)
+			goto out;
+
+		ret = restore_pipe(ctx, file);
+		fput(file);
+		if (ret < 0)
+			return ERR_PTR(ret);
+
+		file = fget(fds[which]);	/* 'this' side */
+		if (!file)	/* this should _never_ happen ! */
+			return ERR_PTR(-EBADF);
+
+		/* get rid of the file descriptors (caller sets that) */
+		sys_close(fds[which]);
+		sys_close(fds[1-which]);
+	} else {
+		return file;
+	}
+
+	ret = restore_file_common(ctx, file, ptr);
+ out:
+	if (ret < 0) {
+		fput(file);
+		file = ERR_PTR(ret);
+	}
+
+	return file;
+}
+#else
+#define pipe_file_checkpoint  NULL
+#endif /* CONFIG_CHECKPOINT */
+
 /*
  * The file_operations structs are not static because they
  * are also used in linux/fs/fifo.c to do operations on FIFOs.
@@ -811,6 +981,7 @@ const struct file_operations read_pipefifo_fops = {
 	.open		= pipe_read_open,
 	.release	= pipe_read_release,
 	.fasync		= pipe_read_fasync,
+	.checkpoint	= pipe_file_checkpoint,
 };
 
 const struct file_operations write_pipefifo_fops = {
@@ -823,6 +994,7 @@ const struct file_operations write_pipefifo_fops = {
 	.open		= pipe_write_open,
 	.release	= pipe_write_release,
 	.fasync		= pipe_write_fasync,
+	.checkpoint	= pipe_file_checkpoint,
 };
 
 const struct file_operations rdwr_pipefifo_fops = {
@@ -836,6 +1008,7 @@ const struct file_operations rdwr_pipefifo_fops = {
 	.open		= pipe_rdwr_open,
 	.release	= pipe_rdwr_release,
 	.fasync		= pipe_rdwr_fasync,
+	.checkpoint	= pipe_file_checkpoint,
 };
 
 struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
diff --git a/fs/splice.c b/fs/splice.c
index 96e0d58..de34b7c 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -979,8 +979,8 @@ static long do_splice_pipes(struct pipe_inode_info *ipipe,
 /*
  * Attempt to initiate a splice from pipe to file.
  */
-static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
-			   loff_t *ppos, size_t len, unsigned int flags)
+long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
+		    loff_t *ppos, size_t len, unsigned int flags)
 {
 	struct pipe_inode_info *opipe;
 	int ret;
@@ -1011,9 +1011,9 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 /*
  * Attempt to initiate a splice from a file to a pipe.
  */
-static long do_splice_to(struct file *in, loff_t *ppos,
-			 struct pipe_inode_info *pipe, size_t len,
-			 unsigned int flags)
+long do_splice_to(struct file *in, loff_t *ppos,
+		  struct pipe_inode_info *pipe, size_t len,
+		  unsigned int flags)
 {
 	struct pipe_inode_info *ipipe;
 	int ret;
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 36328e1..14654e8 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -59,6 +59,7 @@ enum {
 	CKPT_HDR_FILE_DESC,
 	CKPT_HDR_FILE_NAME,
 	CKPT_HDR_FILE,
+	CKPT_HDR_FILE_PIPE,
 
 	CKPT_HDR_MM = 401,
 	CKPT_HDR_VMA,
@@ -197,6 +198,7 @@ struct ckpt_hdr_file_desc {
 enum file_type {
 	CKPT_FILE_IGNORE = 0,
 	CKPT_FILE_GENERIC,
+	CKPT_FILE_PIPE,
 	CKPT_FILE_MAX
 };
 
@@ -215,6 +217,16 @@ struct ckpt_hdr_file_generic {
 	struct ckpt_hdr_file common;
 } __attribute__((aligned(8)));
 
+struct ckpt_hdr_file_pipe {
+	struct ckpt_hdr_file common;
+	__s32 pipe_objref;
+} __attribute__((aligned(8)));
+
+struct ckpt_hdr_file_pipe_state {
+	struct ckpt_hdr h;
+	__s32 pipe_len;
+} __attribute__((aligned(8)));
+
 /* memory layout */
 struct ckpt_hdr_mm {
 	struct ckpt_hdr h;
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index c8f0385..b877598 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -153,4 +153,10 @@ void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
 int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *);
 int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *);
 
+/* checkpoint/restart */
+#ifdef CONFIG_CHECKPOINT
+extern struct file *pipe_file_restore(struct ckpt_ctx *ctx,
+				      struct ckpt_hdr_file *ptr);
+#endif
+
 #endif
diff --git a/include/linux/splice.h b/include/linux/splice.h
index 5f3faa9..50b475d 100644
--- a/include/linux/splice.h
+++ b/include/linux/splice.h
@@ -83,4 +83,13 @@ extern ssize_t splice_to_pipe(struct pipe_inode_info *,
 extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *,
 				      splice_direct_actor *);
 
+extern long link_pipe(struct pipe_inode_info *ipipe,
+		      struct pipe_inode_info *opipe,
+		      size_t len, unsigned int flags);
+extern long do_splice_to(struct file *in, loff_t *ppos,
+			 struct pipe_inode_info *pipe, size_t len,
+			 unsigned int flags);
+extern long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
+			   loff_t *ppos, size_t len, unsigned int flags);
+
 #endif
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 28/43] c/r: make ckpt_may_checkpoint_task() check each namespace individually
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (22 preceding siblings ...)
  2009-05-27 17:32 ` [RFC v16][PATCH 27/43] c/r: support for open pipes Oren Laadan
@ 2009-05-27 17:32 ` Oren Laadan
  2009-05-27 17:32 ` [RFC v16][PATCH 29/43] c/r: support for UTS namespace Oren Laadan
                   ` (10 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Dan Smith,
	Oren Laadan
From: Dan Smith <danms@us.ibm.com>
Signed-off-by: Dan Smith <danms@us.ibm.com>
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
 checkpoint/checkpoint.c        |   20 ++++++--
 checkpoint/objhash.c           |   28 +++++++++++
 checkpoint/process.c           |  101 ++++++++++++++++++++++++++++++++++++++++
 include/linux/checkpoint.h     |    4 ++
 include/linux/checkpoint_hdr.h |    8 +++
 5 files changed, 157 insertions(+), 4 deletions(-)
diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index b70adf4..e66f82b 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -267,6 +267,8 @@ static int checkpoint_all_tasks(struct ckpt_ctx *ctx)
 static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
 {
 	struct task_struct *root = ctx->root_task;
+	struct nsproxy *nsproxy;
+	int ret = 0;
 
 	ckpt_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns));
 
@@ -306,11 +308,21 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
 		return -EINVAL;
 	}
 
-	/* FIX: change this when namespaces are added */
-	if (task_nsproxy(t) != ctx->root_nsproxy)
-		return -EPERM;
+	rcu_read_lock();
+	nsproxy = task_nsproxy(t);
+	if (nsproxy->uts_ns != ctx->root_nsproxy->uts_ns)
+		ret = -EPERM;
+	if (nsproxy->ipc_ns != ctx->root_nsproxy->ipc_ns)
+		ret = -EPERM;
+	if (nsproxy->mnt_ns != ctx->root_nsproxy->mnt_ns)
+		ret = -EPERM;
+	if (nsproxy->pid_ns != ctx->root_nsproxy->pid_ns)
+		ret = -EPERM;
+	if (nsproxy->net_ns != ctx->root_nsproxy->net_ns)
+		ret = -EPERM;
+	rcu_read_unlock();
 
-	return 0;
+	return ret;
 }
 
 #define CKPT_HDR_PIDS_CHUNK	256
diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
index e481911..56553ae 100644
--- a/checkpoint/objhash.c
+++ b/checkpoint/objhash.c
@@ -127,6 +127,22 @@ static int obj_mm_users(void *ptr)
 	return atomic_read(&((struct mm_struct *) ptr)->mm_users);
 }
 
+static int obj_ns_grab(void *ptr)
+{
+	get_nsproxy((struct nsproxy *) ptr);
+	return 0;
+}
+
+static void obj_ns_drop(void *ptr)
+{
+	put_nsproxy((struct nsproxy *) ptr);
+}
+
+static int obj_ns_users(void *ptr)
+{
+	return atomic_read(&((struct nsproxy *) ptr)->count);
+}
+
 static struct ckpt_obj_ops ckpt_obj_ops[] = {
 	/* ignored object */
 	{
@@ -174,6 +190,16 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = {
 		.checkpoint = checkpoint_mm,
 		.restore = restore_mm,
 	},
+	/* ns object */
+	{
+		.obj_name = "NSPROXY",
+		.obj_type = CKPT_OBJ_NS,
+		.ref_drop = obj_ns_drop,
+		.ref_grab = obj_ns_grab,
+		.ref_users = obj_ns_users,
+		.checkpoint = checkpoint_ns,
+		.restore = restore_ns,
+	},
 };
 
 
@@ -396,6 +422,8 @@ int ckpt_obj_contained(struct ckpt_ctx *ctx)
 
 	/* account for ctx->file reference (if in the table already) */
 	ckpt_obj_users_inc(ctx, ctx->file, 1);
+	/* account for ctx->root_nsproxy reference (if in the table already) */
+	ckpt_obj_users_inc(ctx, ctx->root_nsproxy, 1);
 
 	hlist_for_each_entry(obj, node, &ctx->obj_hash->list, next) {
 		if (!obj->ops->ref_users)
diff --git a/checkpoint/process.c b/checkpoint/process.c
index 876be3e..fbe0d16 100644
--- a/checkpoint/process.c
+++ b/checkpoint/process.c
@@ -12,6 +12,7 @@
 #define CKPT_DFLAG  CKPT_DSYS
 
 #include <linux/sched.h>
+#include <linux/nsproxy.h>
 #include <linux/posix-timers.h>
 #include <linux/futex.h>
 #include <linux/poll.h>
@@ -49,6 +50,45 @@ static int checkpoint_task_struct(struct ckpt_ctx *ctx, struct task_struct *t)
 	return ckpt_write_string(ctx, t->comm, TASK_COMM_LEN);
 }
 
+
+static int do_checkpoint_ns(struct ckpt_ctx *ctx, struct nsproxy *nsproxy)
+{
+	return 0;
+}
+
+int checkpoint_ns(struct ckpt_ctx *ctx, void *ptr)
+{
+	return do_checkpoint_ns(ctx, (struct nsproxy *) ptr);
+}
+
+static int checkpoint_task_ns(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct ckpt_hdr_task_ns *h;
+	struct nsproxy *nsproxy;
+	int ns_objref;
+	int ret;
+
+	rcu_read_lock();
+	nsproxy = task_nsproxy(t);
+	get_nsproxy(nsproxy);
+	rcu_read_unlock();
+
+	ns_objref = checkpoint_obj(ctx, nsproxy, CKPT_OBJ_NS);
+	put_nsproxy(nsproxy);
+
+	ckpt_debug("nsproxy: objref %d\n", ns_objref);
+	if (ns_objref < 0)
+		return ns_objref;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TASK_NS);
+	if (!h)
+		return -ENOMEM;
+	h->ns_objref = ns_objref;
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
 static int checkpoint_task_objs(struct ckpt_ctx *ctx, struct task_struct *t)
 {
 	struct ckpt_hdr_task_objs *h;
@@ -56,6 +96,18 @@ static int checkpoint_task_objs(struct ckpt_ctx *ctx, struct task_struct *t)
 	int mm_objref;
 	int ret;
 
+	/*
+	 * Shared objects may have dependencies among them: task->mm
+	 * depends on task->nsproxy (by ipc_ns). Therefore first save
+	 * the namespaces, and then the remaining shared objects.
+	 * During restart a task will already have its namespaces
+	 * restored when it gets to restore, e.g. its memory.
+	 */
+
+	ret = checkpoint_task_ns(ctx, t);
+	if (ret < 0)
+		return ret;
+
 	files_objref = checkpoint_obj_file_table(ctx, t);
 	ckpt_debug("files: objref %d\n", files_objref);
 	if (files_objref < 0) {
@@ -248,11 +300,60 @@ static int restore_task_struct(struct ckpt_ctx *ctx)
 	return ret;
 }
 
+static struct nsproxy *do_restore_ns(struct ckpt_ctx *ctx)
+{
+	struct nsproxy *nsproxy;
+
+	nsproxy = task_nsproxy(current);
+	get_nsproxy(nsproxy);
+	return nsproxy;
+}
+
+void *restore_ns(struct ckpt_ctx *ctx)
+{
+	return (void *) do_restore_ns(ctx);
+}
+
+static int restore_task_ns(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_task_ns *h;
+	struct nsproxy *nsproxy;
+	int ret = 0;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TASK_NS);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	nsproxy = ckpt_obj_fetch(ctx, h->ns_objref, CKPT_OBJ_NS);
+	if (IS_ERR(nsproxy)) {
+		ret = PTR_ERR(nsproxy);
+		goto out;
+	}
+
+	if (nsproxy != task_nsproxy(current)) {
+		get_nsproxy(nsproxy);
+		switch_task_namespaces(current, nsproxy);
+	}
+ out:
+	ckpt_debug("nsproxy: ret %d (%p)\n", ret, task_nsproxy(current));
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
 static int restore_task_objs(struct ckpt_ctx *ctx)
 {
 	struct ckpt_hdr_task_objs *h;
 	int ret;
 
+	/*
+	 * Namespaces come first, because ->mm depends on ->nsproxy,
+	 * and because shared objects are restored before they are
+	 * referenced. See comment in checkpoint_task_objs.
+	 */
+	ret = restore_task_ns(ctx);
+	if (ret < 0)
+		return ret;
+
 	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TASK_OBJS);
 	if (IS_ERR(h))
 		return PTR_ERR(h);
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 171e92e..a7125fc 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -77,6 +77,10 @@ extern int checkpoint_restart_block(struct ckpt_ctx *ctx,
 				    struct task_struct *t);
 extern int restore_restart_block(struct ckpt_ctx *ctx);
 
+/* namespaces */
+extern int checkpoint_ns(struct ckpt_ctx *ctx, void *ptr);
+extern void *restore_ns(struct ckpt_ctx *ctx);
+
 /* file table */
 extern int checkpoint_obj_file_table(struct ckpt_ctx *ctx,
 				     struct task_struct *t);
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 14654e8..da1ae79 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -48,6 +48,7 @@ enum {
 
 	CKPT_HDR_TREE = 101,
 	CKPT_HDR_TASK,
+	CKPT_HDR_TASK_NS,
 	CKPT_HDR_TASK_OBJS,
 	CKPT_HDR_RESTART_BLOCK,
 	CKPT_HDR_THREAD,
@@ -90,6 +91,7 @@ enum obj_type {
 	CKPT_OBJ_FILE_TABLE,
 	CKPT_OBJ_FILE,
 	CKPT_OBJ_MM,
+	CKPT_OBJ_NS,
 	CKPT_OBJ_MAX
 };
 
@@ -152,6 +154,12 @@ struct ckpt_hdr_task {
 	__u32 task_comm_len;
 } __attribute__((aligned(8)));
 
+/* namespaces */
+struct ckpt_hdr_task_ns {
+	struct ckpt_hdr h;
+	__s32 ns_objref;
+} __attribute__((aligned(8)));
+
 /* task's shared resources */
 struct ckpt_hdr_task_objs {
 	struct ckpt_hdr h;
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 29/43] c/r: support for UTS namespace
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (23 preceding siblings ...)
  2009-05-27 17:32 ` [RFC v16][PATCH 28/43] c/r: make ckpt_may_checkpoint_task() check each namespace individually Oren Laadan
@ 2009-05-27 17:32 ` Oren Laadan
  2009-05-27 17:32 ` [RFC v16][PATCH 32/43] c/r (ipc): allow allocation of a desired ipc identifier Oren Laadan
                   ` (9 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Dan Smith,
	Oren Laadan
From: Dan Smith <danms@us.ibm.com>
This patch adds a "phase" of checkpoint that saves out information about any
namespaces the task(s) may have.  Do this by tracking the namespace objects
of the tasks and making sure that tasks with the same namespace that follow
get properly referenced in the checkpoint stream.
Changes:
  - Take uts_sem around access to uts data
  - Remove the kernel restore path
  - Punt on nested namespaces
  - Use __NEW_UTS_LEN in nodename and domainname buffers
  - Add a note to Documentation/checkpoint/internals.txt to indicate where
    in the save/restore process the UTS information is kept
  - Store (and track) the objref of the namespace itself instead of the
    nsproxy (based on comments from Dave on IRC)
  - Remove explicit check for non-root nsproxy
  - Store the nodename and domainname lengths and use ckpt_write_string()
    to store the actual name strings
  - Catch failure of ckpt_obj_add_ptr() in ckpt_write_namespaces()
  - Remove "types" bitfield and use the "is this new" flag to determine
    whether or not we should write out a new ns descriptor
  - Replace kernel restore path
  - Move the namespace information to be directly after the task
    information record
  - Update Documentation to reflect new location of namespace info
  - Support checkpoint and restart of nested UTS namespaces
Signed-off-by: Dan Smith <danms@us.ibm.com>
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
 checkpoint/checkpoint.c        |    2 -
 checkpoint/objhash.c           |   26 +++++++
 checkpoint/process.c           |  160 +++++++++++++++++++++++++++++++++++++++-
 include/linux/checkpoint_hdr.h |   15 ++++
 4 files changed, 200 insertions(+), 3 deletions(-)
diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index e66f82b..904f19b 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -310,8 +310,6 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
 
 	rcu_read_lock();
 	nsproxy = task_nsproxy(t);
-	if (nsproxy->uts_ns != ctx->root_nsproxy->uts_ns)
-		ret = -EPERM;
 	if (nsproxy->ipc_ns != ctx->root_nsproxy->ipc_ns)
 		ret = -EPERM;
 	if (nsproxy->mnt_ns != ctx->root_nsproxy->mnt_ns)
diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
index 56553ae..8b7adc6 100644
--- a/checkpoint/objhash.c
+++ b/checkpoint/objhash.c
@@ -143,6 +143,22 @@ static int obj_ns_users(void *ptr)
 	return atomic_read(&((struct nsproxy *) ptr)->count);
 }
 
+static int obj_uts_ns_grab(void *ptr)
+{
+	get_uts_ns((struct uts_namespace *) ptr);
+	return 0;
+}
+
+static void obj_uts_ns_drop(void *ptr)
+{
+	put_uts_ns((struct uts_namespace *) ptr);
+}
+
+static int obj_uts_ns_users(void *ptr)
+{
+	return atomic_read(&((struct uts_namespace *) ptr)->kref.refcount);
+}
+
 static struct ckpt_obj_ops ckpt_obj_ops[] = {
 	/* ignored object */
 	{
@@ -200,6 +216,16 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = {
 		.checkpoint = checkpoint_ns,
 		.restore = restore_ns,
 	},
+	/* uts_ns object */
+	{
+		.obj_name = "UTS_NS",
+		.obj_type = CKPT_OBJ_UTS_NS,
+		.ref_drop = obj_uts_ns_drop,
+		.ref_grab = obj_uts_ns_grab,
+		.ref_users = obj_uts_ns_users,
+		.checkpoint = checkpoint_bad,
+		.restore = restore_bad,
+	},
 };
 
 
diff --git a/checkpoint/process.c b/checkpoint/process.c
index fbe0d16..a827987 100644
--- a/checkpoint/process.c
+++ b/checkpoint/process.c
@@ -16,8 +16,10 @@
 #include <linux/posix-timers.h>
 #include <linux/futex.h>
 #include <linux/poll.h>
+#include <linux/utsname.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
+#include <linux/syscalls.h>
 
 /***********************************************************************
  * Checkpoint
@@ -50,10 +52,69 @@ static int checkpoint_task_struct(struct ckpt_ctx *ctx, struct task_struct *t)
 	return ckpt_write_string(ctx, t->comm, TASK_COMM_LEN);
 }
 
+static int checkpoint_uts_ns(struct ckpt_ctx *ctx, struct uts_namespace *uts_ns)
+{
+	struct ckpt_hdr_utsns *h;
+	int domainname_len;
+	int nodename_len;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_UTS_NS);
+	if (!h)
+		return -ENOMEM;
+
+	nodename_len = sizeof(uts_ns->name.nodename);
+	domainname_len = sizeof(uts_ns->name.domainname);
+
+	h->nodename_len = nodename_len;
+	h->domainname_len = domainname_len;
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0)
+		return ret;
+
+	down_read(&uts_sem);
+	ret = ckpt_write_string(ctx, uts_ns->name.nodename, nodename_len);
+	if (ret < 0)
+		goto up;
+	ret = ckpt_write_string(ctx, uts_ns->name.domainname, domainname_len);
+ up:
+	up_read(&uts_sem);
+	return ret;
+}
 
 static int do_checkpoint_ns(struct ckpt_ctx *ctx, struct nsproxy *nsproxy)
 {
-	return 0;
+	struct ckpt_hdr_ns *h;
+	int ns_flags = 0;
+	int uts_objref;
+	int first, ret;
+
+	uts_objref = ckpt_obj_lookup_add(ctx, nsproxy->uts_ns,
+					 CKPT_OBJ_UTS_NS, &first);
+	if (uts_objref <= 0)
+		return uts_objref;
+	if (first)
+		ns_flags |= CLONE_NEWUTS;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_NS);
+	if (!h)
+		return -ENOMEM;
+
+	h->flags = ns_flags;
+	h->uts_objref = uts_objref;
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0)
+		return ret;
+
+	if (ns_flags & CLONE_NEWUTS)
+		ret = checkpoint_uts_ns(ctx, nsproxy->uts_ns);
+
+	/* FIX: Write other namespaces here */
+	return ret;
 }
 
 int checkpoint_ns(struct ckpt_ctx *ctx, void *ptr)
@@ -300,10 +361,107 @@ static int restore_task_struct(struct ckpt_ctx *ctx)
 	return ret;
 }
 
+static int do_restore_uts_ns(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_utsns *h;
+	struct uts_namespace *ns;
+	int ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_UTS_NS);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	ret = -EINVAL;
+	if (h->nodename_len > sizeof(ns->name.nodename) ||
+	    h->domainname_len > sizeof(ns->name.domainname))
+		goto out;
+
+	ns = current->nsproxy->uts_ns;
+
+	/* no need to take uts_sem because we are the sole users */
+
+	memset(ns->name.nodename, 0, sizeof(ns->name.nodename));
+	ret = _ckpt_read_string(ctx, ns->name.nodename, h->nodename_len);
+	if (ret < 0)
+		goto out;
+	memset(ns->name.domainname, 0, sizeof(ns->name.domainname));
+	ret = _ckpt_read_string(ctx, ns->name.domainname, h->domainname_len);
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+static int restore_uts_ns(struct ckpt_ctx *ctx, int ns_objref, int flags)
+{
+	struct uts_namespace *uts_ns;
+	int ret = 0;
+
+	uts_ns = ckpt_obj_fetch(ctx, ns_objref, CKPT_OBJ_UTS_NS);
+	if (PTR_ERR(uts_ns) == -EINVAL)
+		uts_ns = NULL;
+	else if (IS_ERR(uts_ns))
+		return PTR_ERR(uts_ns);
+
+	/* sanity: CLONE_NEWUTS if-and-only-if uts_ns is NULL (first timer) */
+	if (!!uts_ns ^ !(flags & CLONE_NEWUTS))
+		return -EINVAL;
+
+	if (!uts_ns) {
+		ret = do_restore_uts_ns(ctx);
+		if (ret < 0)
+			return ret;
+		ret = ckpt_obj_insert(ctx, current->nsproxy->uts_ns,
+				    ns_objref, CKPT_OBJ_UTS_NS);
+	} else {
+		struct uts_namespace *old_uts_ns;
+
+		/* safe because nsproxy->count must be 1 ... */
+		BUG_ON(atomic_read(¤t->nsproxy->count) != 1);
+
+		old_uts_ns = current->nsproxy->uts_ns;
+		current->nsproxy->uts_ns = uts_ns;
+		get_uts_ns(uts_ns);
+		put_uts_ns(old_uts_ns);
+	}
+
+	return ret;
+}
+
 static struct nsproxy *do_restore_ns(struct ckpt_ctx *ctx)
 {
+	struct ckpt_hdr_ns *h;
 	struct nsproxy *nsproxy;
+	int ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_NS);
+	if (IS_ERR(h))
+		return (struct nsproxy *) h;
+
+	ret = -EINVAL;
+	if (h->uts_objref <= 0)
+		goto out;
+	if (h->flags & ~CLONE_NEWUTS)
+		goto out;
 
+	/* each unseen-before namespace will be un-shared now */
+	ret = sys_unshare(h->flags);
+	if (ret)
+		goto out;
+
+	/*
+	 * For each unseen-before namespace 'xxx', it is now safe to
+	 * modify the nsproxy->xxx_ns without locking because unshare()
+	 * gave a brand new nsproxy and nsproxy->xxx_ns, and we're the
+	 * sole users at this point.
+	 */
+	ret = restore_uts_ns(ctx, h->uts_objref, h->flags);
+	ckpt_debug("uts ns: %d\n", ret);
+
+	/* FIX: add more namespaces here */
+ out:
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0)
+		return ERR_PTR(ret);
 	nsproxy = task_nsproxy(current);
 	get_nsproxy(nsproxy);
 	return nsproxy;
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index da1ae79..1603279 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -53,6 +53,8 @@ enum {
 	CKPT_HDR_RESTART_BLOCK,
 	CKPT_HDR_THREAD,
 	CKPT_HDR_CPU,
+	CKPT_HDR_NS,
+	CKPT_HDR_UTS_NS,
 
 	/* 201-299: reserved for arch-dependent */
 
@@ -92,6 +94,7 @@ enum obj_type {
 	CKPT_OBJ_FILE,
 	CKPT_OBJ_MM,
 	CKPT_OBJ_NS,
+	CKPT_OBJ_UTS_NS,
 	CKPT_OBJ_MAX
 };
 
@@ -160,6 +163,12 @@ struct ckpt_hdr_task_ns {
 	__s32 ns_objref;
 } __attribute__((aligned(8)));
 
+struct ckpt_hdr_ns {
+	struct ckpt_hdr h;
+	__u32 flags;
+	__s32 uts_objref;
+} __attribute__((aligned(8)));
+
 /* task's shared resources */
 struct ckpt_hdr_task_objs {
 	struct ckpt_hdr h;
@@ -235,6 +244,12 @@ struct ckpt_hdr_file_pipe_state {
 	__s32 pipe_len;
 } __attribute__((aligned(8)));
 
+struct ckpt_hdr_utsns {
+	struct ckpt_hdr h;
+	__u32 nodename_len;
+	__u32 domainname_len;
+} __attribute__((aligned(8)));
+
 /* memory layout */
 struct ckpt_hdr_mm {
 	struct ckpt_hdr h;
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 32/43] c/r (ipc): allow allocation of a desired ipc identifier
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (24 preceding siblings ...)
  2009-05-27 17:32 ` [RFC v16][PATCH 29/43] c/r: support for UTS namespace Oren Laadan
@ 2009-05-27 17:32 ` Oren Laadan
  2009-05-27 17:32 ` [RFC v16][PATCH 33/43] c/r (ipc): helpers to save and restore kern_ipc_perm structures Oren Laadan
                   ` (8 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
During restart, we need to allocate ipc objects that with the same
identifiers as recorded during checkpoint. Modify the allocation
code allow an in-kernel caller to request a specific ipc identifier.
The system call interface remains unchanged.
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
 ipc/msg.c  |   17 ++++++++++++-----
 ipc/sem.c  |   17 ++++++++++++-----
 ipc/shm.c  |   19 +++++++++++++------
 ipc/util.c |   42 +++++++++++++++++++++++++++++-------------
 ipc/util.h |   12 +++++++++---
 5 files changed, 75 insertions(+), 32 deletions(-)
diff --git a/ipc/msg.c b/ipc/msg.c
index 2ceab7f..1db7c45 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -73,7 +73,7 @@ struct msg_sender {
 #define msg_unlock(msq)		ipc_unlock(&(msq)->q_perm)
 
 static void freeque(struct ipc_namespace *, struct kern_ipc_perm *);
-static int newque(struct ipc_namespace *, struct ipc_params *);
+static int newque(struct ipc_namespace *, struct ipc_params *, int);
 #ifdef CONFIG_PROC_FS
 static int sysvipc_msg_proc_show(struct seq_file *s, void *it);
 #endif
@@ -174,10 +174,12 @@ static inline void msg_rmid(struct ipc_namespace *ns, struct msg_queue *s)
  * newque - Create a new msg queue
  * @ns: namespace
  * @params: ptr to the structure that contains the key and msgflg
+ * @req_id: request desired id if available (-1 if don't care)
  *
  * Called with msg_ids.rw_mutex held (writer)
  */
-static int newque(struct ipc_namespace *ns, struct ipc_params *params)
+static int
+newque(struct ipc_namespace *ns, struct ipc_params *params, int req_id)
 {
 	struct msg_queue *msq;
 	int id, retval;
@@ -201,7 +203,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params)
 	/*
 	 * ipc_addid() locks msq
 	 */
-	id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni);
+	id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni, req_id);
 	if (id < 0) {
 		security_msg_queue_free(msq);
 		ipc_rcu_putref(msq);
@@ -309,7 +311,7 @@ static inline int msg_security(struct kern_ipc_perm *ipcp, int msgflg)
 	return security_msg_queue_associate(msq, msgflg);
 }
 
-SYSCALL_DEFINE2(msgget, key_t, key, int, msgflg)
+int do_msgget(key_t key, int msgflg, int req_id)
 {
 	struct ipc_namespace *ns;
 	struct ipc_ops msg_ops;
@@ -324,7 +326,12 @@ SYSCALL_DEFINE2(msgget, key_t, key, int, msgflg)
 	msg_params.key = key;
 	msg_params.flg = msgflg;
 
-	return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params);
+	return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params, req_id);
+}
+
+SYSCALL_DEFINE2(msgget, key_t, key, int, msgflg)
+{
+	return do_msgget(key, msgflg, -1);
 }
 
 static inline unsigned long
diff --git a/ipc/sem.c b/ipc/sem.c
index 16a2189..207dbbb 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -92,7 +92,7 @@
 #define sem_unlock(sma)		ipc_unlock(&(sma)->sem_perm)
 #define sem_checkid(sma, semid)	ipc_checkid(&sma->sem_perm, semid)
 
-static int newary(struct ipc_namespace *, struct ipc_params *);
+static int newary(struct ipc_namespace *, struct ipc_params *, int);
 static void freeary(struct ipc_namespace *, struct kern_ipc_perm *);
 #ifdef CONFIG_PROC_FS
 static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
@@ -227,11 +227,13 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
  * newary - Create a new semaphore set
  * @ns: namespace
  * @params: ptr to the structure that contains key, semflg and nsems
+ * @req_id: request desired id if available (-1 if don't care)
  *
  * Called with sem_ids.rw_mutex held (as a writer)
  */
 
-static int newary(struct ipc_namespace *ns, struct ipc_params *params)
+static int
+newary(struct ipc_namespace *ns, struct ipc_params *params, int req_id)
 {
 	int id;
 	int retval;
@@ -263,7 +265,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
 		return retval;
 	}
 
-	id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni);
+	id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni, req_id);
 	if (id < 0) {
 		security_sem_free(sma);
 		ipc_rcu_putref(sma);
@@ -308,7 +310,7 @@ static inline int sem_more_checks(struct kern_ipc_perm *ipcp,
 	return 0;
 }
 
-SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
+int do_semget(key_t key, int nsems, int semflg, int req_id)
 {
 	struct ipc_namespace *ns;
 	struct ipc_ops sem_ops;
@@ -327,7 +329,12 @@ SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
 	sem_params.flg = semflg;
 	sem_params.u.nsems = nsems;
 
-	return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params);
+	return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params, req_id);
+}
+
+SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
+{
+	return do_semget(key, nsems, semflg, -1);
 }
 
 /*
diff --git a/ipc/shm.c b/ipc/shm.c
index faa46da..7dd5f0c 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -62,7 +62,7 @@ static struct vm_operations_struct shm_vm_ops;
 #define shm_unlock(shp)			\
 	ipc_unlock(&(shp)->shm_perm)
 
-static int newseg(struct ipc_namespace *, struct ipc_params *);
+static int newseg(struct ipc_namespace *, struct ipc_params *, int);
 static void shm_open(struct vm_area_struct *vma);
 static void shm_close(struct vm_area_struct *vma);
 static void shm_destroy (struct ipc_namespace *ns, struct shmid_kernel *shp);
@@ -83,7 +83,7 @@ void shm_init_ns(struct ipc_namespace *ns)
  * Called with shm_ids.rw_mutex (writer) and the shp structure locked.
  * Only shm_ids.rw_mutex remains locked on exit.
  */
-static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
+void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
 {
 	struct shmid_kernel *shp;
 	shp = container_of(ipcp, struct shmid_kernel, shm_perm);
@@ -326,11 +326,13 @@ static struct vm_operations_struct shm_vm_ops = {
  * newseg - Create a new shared memory segment
  * @ns: namespace
  * @params: ptr to the structure that contains key, size and shmflg
+ * @req_id: request desired id if available (-1 if don't care)
  *
  * Called with shm_ids.rw_mutex held as a writer.
  */
 
-static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
+static int
+newseg(struct ipc_namespace *ns, struct ipc_params *params, int req_id)
 {
 	key_t key = params->key;
 	int shmflg = params->flg;
@@ -386,7 +388,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 		goto no_file;
 	ima_shm_check(file);
 
-	id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
+	id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni, req_id);
 	if (id < 0) {
 		error = id;
 		goto no_id;
@@ -444,7 +446,7 @@ static inline int shm_more_checks(struct kern_ipc_perm *ipcp,
 	return 0;
 }
 
-SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg)
+int do_shmget(key_t key, size_t size, int shmflg, int req_id)
 {
 	struct ipc_namespace *ns;
 	struct ipc_ops shm_ops;
@@ -460,7 +462,12 @@ SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg)
 	shm_params.flg = shmflg;
 	shm_params.u.size = size;
 
-	return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params);
+	return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params, req_id);
+}
+
+SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg)
+{
+	return do_shmget(key, size, shmflg, -1);
 }
 
 static inline unsigned long copy_shmid_to_user(void __user *buf, struct shmid64_ds *in, int version)
diff --git a/ipc/util.c b/ipc/util.c
index b8e4ba9..ca248ec 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -247,10 +247,12 @@ int ipc_get_maxid(struct ipc_ids *ids)
  *	Called with ipc_ids.rw_mutex held as a writer.
  */
  
-int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size)
+int
+ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size, int req_id)
 {
 	uid_t euid;
 	gid_t egid;
+	int lid = 0;
 	int id, err;
 
 	if (size > IPCMNI)
@@ -259,28 +261,41 @@ int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size)
 	if (ids->in_use >= size)
 		return -ENOSPC;
 
+	if (req_id >= 0)
+		lid = ipcid_to_idx(req_id);
+
 	spin_lock_init(&new->lock);
 	new->deleted = 0;
 	rcu_read_lock();
 	spin_lock(&new->lock);
 
-	err = idr_get_new(&ids->ipcs_idr, new, &id);
+	err = idr_get_new_above(&ids->ipcs_idr, new, lid, &id);
 	if (err) {
 		spin_unlock(&new->lock);
 		rcu_read_unlock();
 		return err;
 	}
 
+	if (req_id >= 0) {
+		if (id != lid) {
+			idr_remove(&ids->ipcs_idr, id);
+			spin_unlock(&new->lock);
+			rcu_read_unlock();
+			return -EBUSY;
+		}
+		new->seq = req_id / SEQ_MULTIPLIER;
+	} else {
+		new->seq = ids->seq++;
+		if (ids->seq > ids->seq_max)
+			ids->seq = 0;
+	}
+
 	ids->in_use++;
 
 	current_euid_egid(&euid, &egid);
 	new->cuid = new->uid = euid;
 	new->gid = new->cgid = egid;
 
-	new->seq = ids->seq++;
-	if(ids->seq > ids->seq_max)
-		ids->seq = 0;
-
 	new->id = ipc_buildid(id, new->seq);
 	return id;
 }
@@ -296,7 +311,7 @@ int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size)
  *	when the key is IPC_PRIVATE.
  */
 static int ipcget_new(struct ipc_namespace *ns, struct ipc_ids *ids,
-		struct ipc_ops *ops, struct ipc_params *params)
+		struct ipc_ops *ops, struct ipc_params *params, int req_id)
 {
 	int err;
 retry:
@@ -306,7 +321,7 @@ retry:
 		return -ENOMEM;
 
 	down_write(&ids->rw_mutex);
-	err = ops->getnew(ns, params);
+	err = ops->getnew(ns, params, req_id);
 	up_write(&ids->rw_mutex);
 
 	if (err == -EAGAIN)
@@ -351,6 +366,7 @@ static int ipc_check_perms(struct kern_ipc_perm *ipcp, struct ipc_ops *ops,
  *	@ids: IPC identifer set
  *	@ops: the actual creation routine to call
  *	@params: its parameters
+ *	@req_id: request desired id if available (-1 if don't care)
  *
  *	This routine is called by sys_msgget, sys_semget() and sys_shmget()
  *	when the key is not IPC_PRIVATE.
@@ -360,7 +376,7 @@ static int ipc_check_perms(struct kern_ipc_perm *ipcp, struct ipc_ops *ops,
  *	On success, the ipc id is returned.
  */
 static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids,
-		struct ipc_ops *ops, struct ipc_params *params)
+		struct ipc_ops *ops, struct ipc_params *params, int req_id)
 {
 	struct kern_ipc_perm *ipcp;
 	int flg = params->flg;
@@ -381,7 +397,7 @@ retry:
 		else if (!err)
 			err = -ENOMEM;
 		else
-			err = ops->getnew(ns, params);
+			err = ops->getnew(ns, params, req_id);
 	} else {
 		/* ipc object has been locked by ipc_findkey() */
 
@@ -742,12 +758,12 @@ struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id)
  * Common routine called by sys_msgget(), sys_semget() and sys_shmget().
  */
 int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
-			struct ipc_ops *ops, struct ipc_params *params)
+		struct ipc_ops *ops, struct ipc_params *params, int req_id)
 {
 	if (params->key == IPC_PRIVATE)
-		return ipcget_new(ns, ids, ops, params);
+		return ipcget_new(ns, ids, ops, params, req_id);
 	else
-		return ipcget_public(ns, ids, ops, params);
+		return ipcget_public(ns, ids, ops, params, req_id);
 }
 
 /**
diff --git a/ipc/util.h b/ipc/util.h
index 1187332..c75e3b2 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -70,7 +70,7 @@ struct ipc_params {
  *      . routine to call for an extra check if needed
  */
 struct ipc_ops {
-	int (*getnew) (struct ipc_namespace *, struct ipc_params *);
+	int (*getnew) (struct ipc_namespace *, struct ipc_params *, int);
 	int (*associate) (struct kern_ipc_perm *, int);
 	int (*more_checks) (struct kern_ipc_perm *, struct ipc_params *);
 };
@@ -93,7 +93,7 @@ void __init ipc_init_proc_interface(const char *path, const char *header,
 #define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER)
 
 /* must be called with ids->rw_mutex acquired for writing */
-int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int);
+int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int, int);
 
 /* must be called with ids->rw_mutex acquired for reading */
 int ipc_get_maxid(struct ipc_ids *);
@@ -170,6 +170,12 @@ static inline void ipc_unlock(struct kern_ipc_perm *perm)
 
 struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id);
 int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
-			struct ipc_ops *ops, struct ipc_params *params);
+		struct ipc_ops *ops, struct ipc_params *params, int req_id);
+
+/* for checkpoint/restart */
+extern int do_shmget(key_t key, size_t size, int shmflg, int req_id);
+
+extern void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp);
+
 
 #endif
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 33/43] c/r (ipc): helpers to save and restore kern_ipc_perm structures
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (25 preceding siblings ...)
  2009-05-27 17:32 ` [RFC v16][PATCH 32/43] c/r (ipc): allow allocation of a desired ipc identifier Oren Laadan
@ 2009-05-27 17:32 ` Oren Laadan
       [not found] ` <1243445589-32388-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
                   ` (7 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
Add the helpers to save and restore the contents of 'struct
kern_ipc_perm'. Add header structures for ipc state. Put
place-holders to save and restore ipc state.
TODO:
This patch does _not_ address the issues of users/groups and the
related security issues. For now, it saves the old user/group of
ipc objects, but does not restore them during restart.
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
 include/linux/checkpoint.h     |    7 +++-
 include/linux/checkpoint_hdr.h |   29 ++++++++++++++
 ipc/Makefile                   |    1 +
 ipc/checkpoint.c               |   81 ++++++++++++++++++++++++++++++++++++++++
 ipc/util.h                     |    8 ++++
 5 files changed, 125 insertions(+), 1 deletions(-)
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 5a42399..9a7517f 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -10,6 +10,10 @@
  *  distribution for more details.
  */
 
+#include <linux/sched.h>
+#include <linux/nsproxy.h>
+#include <linux/ipc_namespace.h>
+
 #include <linux/checkpoint_types.h>
 #include <linux/checkpoint_hdr.h>
 #include <asm/checkpoint_hdr.h>
@@ -157,8 +161,9 @@ extern int restore_memory_contents(struct ckpt_ctx *ctx, struct inode *inode);
 #define CKPT_DFILE	0x10		/* files and filesystem */
 #define CKPT_DMEM	0x20		/* memory state */
 #define CKPT_DPAGE	0x40		/* memory pages */
+#define CKPT_DIPC	0x80		/* sysvipc */
 
-#define CKPT_DDEFAULT	0x37		/* default debug level */
+#define CKPT_DDEFAULT	0xb7		/* default debug level */
 
 #ifndef CKPT_DFLAG
 #define CKPT_DFLAG	0x0		/* nothing */
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 44a48dc..05769f4 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -70,6 +70,11 @@ enum {
 	CKPT_HDR_PGARR,
 	CKPT_HDR_MM_CONTEXT,
 
+	CKPT_HDR_IPC = 501,
+	CKPT_HDR_IPC_SHM,
+	CKPT_HDR_IPC_MSG,
+	CKPT_HDR_IPC_SEM,
+
 	CKPT_HDR_TAIL = 9001,
 
 	CKPT_HDR_ERROR = 9999,
@@ -299,4 +304,28 @@ struct ckpt_hdr_pgarr {
 } __attribute__((aligned(8)));
 
 
+/* ipc commons */
+struct ckpt_hdr_ipc_perms {
+	__s32 id;
+	__u32 key;
+	__u32 uid;
+	__u32 gid;
+	__u32 cuid;
+	__u32 cgid;
+	__u32 mode;
+	__u32 _padding;
+	__u64 seq;
+} __attribute__((aligned(8)));
+
+
+#define CKPT_TST_OVERFLOW_16(a, b) \
+	((sizeof(a) > sizeof(b)) && ((a) > SHORT_MAX))
+
+#define CKPT_TST_OVERFLOW_32(a, b) \
+	((sizeof(a) > sizeof(b)) && ((a) > INT_MAX))
+
+#define CKPT_TST_OVERFLOW_64(a, b) \
+	((sizeof(a) > sizeof(b)) && ((a) > LONG_MAX))
+
+
 #endif /* _CHECKPOINT_CKPT_HDR_H_ */
diff --git a/ipc/Makefile b/ipc/Makefile
index 4e1955e..aa6c8dd 100644
--- a/ipc/Makefile
+++ b/ipc/Makefile
@@ -9,4 +9,5 @@ obj_mq-$(CONFIG_COMPAT) += compat_mq.o
 obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y)
 obj-$(CONFIG_IPC_NS) += namespace.o
 obj-$(CONFIG_POSIX_MQUEUE_SYSCTL) += mq_sysctl.o
+obj-$(CONFIG_CHECKPOINT) += checkpoint.o
 
diff --git a/ipc/checkpoint.c b/ipc/checkpoint.c
new file mode 100644
index 0000000..b7b48b0
--- /dev/null
+++ b/ipc/checkpoint.c
@@ -0,0 +1,81 @@
+/*
+ *  Checkpoint logic and helpers
+ *
+ *  Copyright (C) 2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DIPC
+
+#include <linux/ipc.h>
+#include <linux/msg.h>
+#include <linux/sched.h>
+#include <linux/ipc_namespace.h>
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+
+#include "util.h"
+
+int checkpoint_ipcns(struct ckpt_ctx *ctx, struct ipc_namespace *ipc_ns)
+{
+	return 0;
+}
+
+int restore_ipcns(struct ckpt_ctx *ctx)
+{
+	return 0;
+}
+
+int checkpoint_fill_ipc_perms(struct ckpt_hdr_ipc_perms *h,
+			      struct kern_ipc_perm *perm)
+{
+	if (ipcperms(perm, S_IROTH))
+		return -EACCES;
+
+	h->id = perm->id;
+	h->key = perm->key;
+	h->uid = perm->uid;
+	h->gid = perm->gid;
+	h->cuid = perm->cuid;
+	h->cgid = perm->cgid;
+	h->mode = perm->mode & S_IRWXUGO;
+	h->seq = perm->seq;
+
+	return 0;
+}
+
+int restore_load_ipc_perms(struct ckpt_hdr_ipc_perms *h,
+			   struct kern_ipc_perm *perm)
+{
+	if (h->id < 0)
+		return -EINVAL;
+	if (CKPT_TST_OVERFLOW_16(h->uid, perm->uid) ||
+	    CKPT_TST_OVERFLOW_16(h->gid, perm->gid) ||
+	    CKPT_TST_OVERFLOW_16(h->cuid, perm->cuid) ||
+	    CKPT_TST_OVERFLOW_16(h->cgid, perm->cgid) ||
+	    CKPT_TST_OVERFLOW_16(h->mode, perm->mode))
+		return -EINVAL;
+	if (h->seq >= USHORT_MAX)
+		return -EINVAL;
+	if (h->mode & ~S_IRWXUGO)
+		return -EINVAL;
+
+	/* FIX: verify the ->mode field makes sense */
+
+	perm->id = h->id;
+	perm->key = h->key;
+#if 0 /* FIX: requires security checks */
+	perm->uid = h->uid;
+	perm->gid = h->gid;
+	perm->cuid = h->cuid;
+	perm->cgid = h->cgid;
+#endif
+	perm->mode = h->mode;
+	perm->seq = h->seq;
+
+	return 0;
+}
diff --git a/ipc/util.h b/ipc/util.h
index c75e3b2..1356909 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -11,6 +11,7 @@
 #define _IPC_UTIL_H
 
 #include <linux/err.h>
+#include <linux/checkpoint_hdr.h>
 
 #define SEQ_MULTIPLIER	(IPCMNI)
 
@@ -177,5 +178,12 @@ extern int do_shmget(key_t key, size_t size, int shmflg, int req_id);
 
 extern void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp);
 
+#ifdef CONFIG_CHECKPOINT
+extern int checkpoint_fill_ipc_perms(struct ckpt_hdr_ipc_perms *h,
+				     struct kern_ipc_perm *perm);
+extern int restore_load_ipc_perms(struct ckpt_hdr_ipc_perms *h,
+				  struct kern_ipc_perm *perm);
+#endif
+
 
 #endif
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- [parent not found: <1243445589-32388-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>] 
- * [RFC v16][PATCH 13/43] c/r: introduce method '->checkpoint()' in struct vm_operations_struct
       [not found] ` <1243445589-32388-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-05-27 17:32   ` Oren Laadan
  2009-05-27 17:32   ` [RFC v16][PATCH 15/43] c/r: restore memory address space (private memory) Oren Laadan
                     ` (7 subsequent siblings)
  8 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, linux-api-u79uwXL29TY76Z2rM5mHXA,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 include/linux/mm.h |    5 +++++
 1 files changed, 5 insertions(+), 0 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bff1f0d..05f0ed9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -14,6 +14,8 @@
 #include <linux/debug_locks.h>
 #include <linux/mm_types.h>
 
+#include <linux/checkpoint_types.h>
+
 struct mempolicy;
 struct anon_vma;
 struct file_ra_state;
@@ -220,6 +222,9 @@ struct vm_operations_struct {
 	int (*migrate)(struct vm_area_struct *vma, const nodemask_t *from,
 		const nodemask_t *to, unsigned long flags);
 #endif
+#ifdef CONFIG_CHECKPOINT
+	int (*checkpoint)(struct ckpt_ctx *ctx, struct vm_area_struct *vma);
+#endif
 };
 
 struct mmu_gather;
-- 
1.6.0.4
--
To unsubscribe from this list: send the line "unsubscribe linux-api" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 15/43] c/r: restore memory address space (private memory)
       [not found] ` <1243445589-32388-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  2009-05-27 17:32   ` [RFC v16][PATCH 13/43] c/r: introduce method '->checkpoint()' in struct vm_operations_struct Oren Laadan
@ 2009-05-27 17:32   ` Oren Laadan
  2009-05-27 17:32   ` [RFC v16][PATCH 16/43] c/r: export shmem_getpage() to support shared memory Oren Laadan
                     ` (6 subsequent siblings)
  8 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, linux-api-u79uwXL29TY76Z2rM5mHXA,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
Restoring the memory address space begins with nuking the existing one
of the current process, and then reading the vma state and contents.
Call do_mmap_pgoffset() for each vma and then read in the data.
Changelog[v16]:
  - Restore mm->exe_file
Changelog[v14]:
  - Introduce per vma-type restore() function
  - Merge restart code into same file as checkpoint (memory.c)
  - Compare saved 'vdso' field of mm_context with current value
  - Check whether calls to ckpt_hbuf_get() fail
  - Discard field 'h->parent'
  - Revert change to pr_debug(), back to ckpt_debug()
Changelog[v13]:
  - Avoid access to hh->vma_type after the header is freed
  - Test for no vma's in exit_mmap() before calling unmap_vma() (or it
    may crash if restart fails after having removed all vma's)
Changelog[v12]:
  - Replace obsolete ckpt_debug() with pr_debug()
Changelog[v9]:
  - Introduce ckpt_ctx_checkpoint() for checkpoint-specific ctx setup
Changelog[v7]:
  - Fix argument given to kunmap_atomic() in memory dump/restore
Changelog[v6]:
  - Balance all calls to ckpt_hbuf_get() with matching ckpt_hbuf_put()
    (even though it's not really needed)
Changelog[v5]:
  - Improve memory restore code (following Dave Hansen's comments)
  - Change dump format (and code) to allow chunks of <vaddrs, pages>
    instead of one long list of each
  - Memory restore now maps user pages explicitly to copy data into them,
    instead of reading directly to user space; got rid of mprotect_fixup()
Changelog[v4]:
  - Use standard list_... for ckpt_pgarr
Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 arch/x86/include/asm/ldt.h       |    7 +
 arch/x86/mm/checkpoint.c         |   64 ++++++
 checkpoint/memory.c              |  463 ++++++++++++++++++++++++++++++++++++++
 checkpoint/objhash.c             |    1 +
 checkpoint/process.c             |    3 +
 fs/exec.c                        |    2 +-
 include/linux/checkpoint.h       |    7 +
 include/linux/checkpoint_hdr.h   |    2 +-
 include/linux/checkpoint_types.h |    1 +
 include/linux/mm.h               |   12 +
 mm/filemap.c                     |   19 ++
 mm/mmap.c                        |   23 ++-
 12 files changed, 601 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/ldt.h b/arch/x86/include/asm/ldt.h
index 46727eb..f2845f9 100644
--- a/arch/x86/include/asm/ldt.h
+++ b/arch/x86/include/asm/ldt.h
@@ -37,4 +37,11 @@ struct user_desc {
 #define MODIFY_LDT_CONTENTS_CODE	2
 
 #endif /* !__ASSEMBLY__ */
+
+#ifdef __KERNEL__
+#include <linux/linkage.h>
+asmlinkage int sys_modify_ldt(int func, void __user *ptr,
+			      unsigned long bytecount);
+#endif
+
 #endif /* _ASM_X86_LDT_H */
diff --git a/arch/x86/mm/checkpoint.c b/arch/x86/mm/checkpoint.c
index dc4fbb4..c781416 100644
--- a/arch/x86/mm/checkpoint.c
+++ b/arch/x86/mm/checkpoint.c
@@ -13,6 +13,7 @@
 
 #include <asm/desc.h>
 #include <asm/i387.h>
+#include <asm/elf.h>
 
 #include <linux/checkpoint_types.h>
 #include <asm/checkpoint_hdr.h>
@@ -461,3 +462,66 @@ int restore_read_header_arch(struct ckpt_ctx *ctx)
 	ckpt_hdr_put(ctx, h);
 	return ret;
 }
+
+int restore_mm_context(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+	struct ckpt_hdr_mm_context *h;
+	unsigned int n;
+	int ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_MM_CONTEXT);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	ckpt_debug("nldt %d vdso %#lx (%p)\n",
+		 h->nldt, (unsigned long) h->vdso, mm->context.vdso);
+
+	ret = -EINVAL;
+	if (h->vdso != (unsigned long) mm->context.vdso)
+		goto out;
+	if (h->ldt_entry_size != LDT_ENTRY_SIZE)
+		goto out;
+
+	ret = _ckpt_read_obj_type(ctx, NULL,
+				  h->nldt * LDT_ENTRY_SIZE,
+				  CKPT_HDR_MM_CONTEXT_LDT);
+	if (ret < 0)
+		goto out;
+
+	/*
+	 * to utilize the syscall modify_ldt() we first convert the data
+	 * in the checkpoint image from 'struct desc_struct' to 'struct
+	 * user_desc' with reverse logic of include/asm/desc.h:fill_ldt()
+	 */
+	for (n = 0; n < h->nldt; n++) {
+		struct user_desc info;
+		struct desc_struct desc;
+		mm_segment_t old_fs;
+
+		ret = ckpt_kread(ctx, &desc, LDT_ENTRY_SIZE);
+		if (ret < 0)
+			break;
+
+		info.entry_number = n;
+		info.base_addr = desc.base0 | (desc.base1 << 16);
+		info.limit = desc.limit0;
+		info.seg_32bit = desc.d;
+		info.contents = desc.type >> 2;
+		info.read_exec_only = (desc.type >> 1) ^ 1;
+		info.limit_in_pages = desc.g;
+		info.seg_not_present = desc.p ^ 1;
+		info.useable = desc.avl;
+
+		old_fs = get_fs();
+		set_fs(get_ds());
+		ret = sys_modify_ldt(1, (struct user_desc __user *) &info,
+				     sizeof(info));
+		set_fs(old_fs);
+
+		if (ret < 0)
+			break;
+	}
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
diff --git a/checkpoint/memory.c b/checkpoint/memory.c
index 6d1a3cd..99bafaa 100644
--- a/checkpoint/memory.c
+++ b/checkpoint/memory.c
@@ -15,6 +15,9 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/file.h>
+#include <linux/err.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
 #include <linux/pagemap.h>
 #include <linux/mm_types.h>
 #include <linux/proc_fs.h>
@@ -631,3 +634,463 @@ int checkpoint_obj_mm(struct ckpt_ctx *ctx, struct task_struct *t)
 
 	return objref;
 }
+
+/***********************************************************************
+ * Restart
+ *
+ * Unlike checkpoint, restart is executed in the context of each restarting
+ * process: vma regions are restored via a call to mmap(), and the data is
+ * read into the address space of the current process.
+ */
+
+/**
+ * read_pages_vaddrs - read addresses of pages to page-array chain
+ * @ctx - restart context
+ * @nr_pages - number of address to read
+ */
+static int read_pages_vaddrs(struct ckpt_ctx *ctx, unsigned long nr_pages)
+{
+	struct ckpt_pgarr *pgarr;
+	unsigned long *vaddrp;
+	int nr, ret;
+
+	while (nr_pages) {
+		pgarr = pgarr_current(ctx);
+		if (!pgarr)
+			return -ENOMEM;
+		nr = pgarr_nr_free(pgarr);
+		if (nr > nr_pages)
+			nr = nr_pages;
+		vaddrp = &pgarr->vaddrs[pgarr->nr_used];
+		ret = ckpt_kread(ctx, vaddrp, nr * sizeof(unsigned long));
+		if (ret < 0)
+			return ret;
+		pgarr->nr_used += nr;
+		nr_pages -= nr;
+	}
+	return 0;
+}
+
+static int restore_read_page(struct ckpt_ctx *ctx, struct page *page, void *p)
+{
+	void *ptr;
+	int ret;
+
+	ret = ckpt_kread(ctx, p, PAGE_SIZE);
+	if (ret < 0)
+		return ret;
+
+	ptr = kmap_atomic(page, KM_USER1);
+	memcpy(ptr, p, PAGE_SIZE);
+	kunmap_atomic(ptr, KM_USER1);
+
+	return 0;
+}
+
+/**
+ * read_pages_contents - read in data of pages in page-array chain
+ * @ctx - restart context
+ */
+static int read_pages_contents(struct ckpt_ctx *ctx)
+{
+	struct mm_struct *mm = current->mm;
+	struct ckpt_pgarr *pgarr;
+	unsigned long *vaddrs;
+	char *buf;
+	int i, ret = 0;
+
+	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	down_read(&mm->mmap_sem);
+	list_for_each_entry_reverse(pgarr, &ctx->pgarr_list, list) {
+		vaddrs = pgarr->vaddrs;
+		for (i = 0; i < pgarr->nr_used; i++) {
+			struct page *page;
+
+			_ckpt_debug(CKPT_DPAGE, "got page %#lx\n", vaddrs[i]);
+			ret = get_user_pages(current, mm, vaddrs[i],
+					     1, 1, 1, &page, NULL);
+			if (ret < 0)
+				goto out;
+
+			ret = restore_read_page(ctx, page, buf);
+			page_cache_release(page);
+
+			if (ret < 0)
+				goto out;
+		}
+	}
+
+ out:
+	up_read(&mm->mmap_sem);
+	kfree(buf);
+	return 0;
+}
+
+/**
+ * restore_memory_contents - restore contents of a VMA with private memory
+ * @ctx - restart context
+ *
+ * Reads a header that specifies how many pages will follow, then reads
+ * a list of virtual addresses into ctx->pgarr_list page-array chain,
+ * followed by the actual contents of the corresponding pages. Iterates
+ * these steps until reaching a header specifying "0" pages, which marks
+ * the end of the contents.
+ */
+static int restore_memory_contents(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_pgarr *h;
+	unsigned long nr_pages;
+	int len, ret = 0;
+
+	while (1) {
+		h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_PGARR);
+		if (IS_ERR(h))
+			break;
+
+		ckpt_debug("total pages %ld\n", (unsigned long) h->nr_pages);
+
+		nr_pages = h->nr_pages;
+		ckpt_hdr_put(ctx, h);
+
+		if (!nr_pages)
+			break;
+
+		len = nr_pages * (sizeof(unsigned long) + PAGE_SIZE);
+		ret = _ckpt_read_buffer(ctx, NULL, len);
+		if (ret < 0)
+			break;
+
+		ret = read_pages_vaddrs(ctx, nr_pages);
+		if (ret < 0)
+			break;
+		ret = read_pages_contents(ctx);
+		if (ret < 0)
+			break;
+		pgarr_reset_all(ctx);
+	}
+
+	return ret;
+}
+
+/**
+ * calc_map_prot_bits - convert vm_flags to mmap protection
+ * orig_vm_flags: source vm_flags
+ */
+static unsigned long calc_map_prot_bits(unsigned long orig_vm_flags)
+{
+	unsigned long vm_prot = 0;
+
+	if (orig_vm_flags & VM_READ)
+		vm_prot |= PROT_READ;
+	if (orig_vm_flags & VM_WRITE)
+		vm_prot |= PROT_WRITE;
+	if (orig_vm_flags & VM_EXEC)
+		vm_prot |= PROT_EXEC;
+	if (orig_vm_flags & PROT_SEM)   /* only (?) with IPC-SHM  */
+		vm_prot |= PROT_SEM;
+
+	return vm_prot;
+}
+
+/**
+ * calc_map_flags_bits - convert vm_flags to mmap flags
+ * orig_vm_flags: source vm_flags
+ */
+static unsigned long calc_map_flags_bits(unsigned long orig_vm_flags)
+{
+	unsigned long vm_flags = 0;
+
+	vm_flags = MAP_FIXED;
+	if (orig_vm_flags & VM_GROWSDOWN)
+		vm_flags |= MAP_GROWSDOWN;
+	if (orig_vm_flags & VM_DENYWRITE)
+		vm_flags |= MAP_DENYWRITE;
+	if (orig_vm_flags & VM_EXECUTABLE)
+		vm_flags |= MAP_EXECUTABLE;
+	if (orig_vm_flags & VM_MAYSHARE)
+		vm_flags |= MAP_SHARED;
+	else
+		vm_flags |= MAP_PRIVATE;
+
+	return vm_flags;
+}
+
+/**
+ * generic_vma_restore - restore a vma
+ * @mm - address space
+ * @file - file to map (NULL for anonymous)
+ * @h - vma header data
+ */
+static unsigned long generic_vma_restore(struct mm_struct *mm,
+					 struct file *file,
+					 struct ckpt_hdr_vma *h)
+{
+	unsigned long vm_size, vm_start, vm_flags, vm_prot, vm_pgoff;
+	unsigned long addr;
+
+	if (h->vm_end < h->vm_start)
+		return -EINVAL;
+	if (h->vma_objref < 0)
+		return -EINVAL;
+	if (h->vm_flags & CKPT_VMA_NOT_SUPPORTED)
+		return -ENOSYS;
+
+	vm_start = h->vm_start;
+	vm_pgoff = h->vm_pgoff;
+	vm_size = h->vm_end - h->vm_start;
+	vm_prot = calc_map_prot_bits(h->vm_flags);
+	vm_flags = calc_map_flags_bits(h->vm_flags);
+
+	down_write(&mm->mmap_sem);
+	addr = do_mmap_pgoff(file, vm_start, vm_size,
+			     vm_prot, vm_flags, vm_pgoff);
+	up_write(&mm->mmap_sem);
+	ckpt_debug("size %#lx prot %#lx flag %#lx pgoff %#lx => %#lx\n",
+		 vm_size, vm_prot, vm_flags, vm_pgoff, addr);
+
+	return addr;
+}
+
+/**
+ * private_vma_restore - read vma data, recreate it and read contents
+ * @ctx: checkpoint context
+ * @mm: memory address space
+ * @file: file to use for mapping
+ * @h - vma header data
+ */
+int private_vma_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
+			struct file *file, struct ckpt_hdr_vma *h)
+{
+	unsigned long addr;
+
+	if (h->vm_flags & VM_SHARED)
+		return -EINVAL;
+
+	addr = generic_vma_restore(mm, file, h);
+	if (IS_ERR((void *) addr))
+		return PTR_ERR((void *) addr);
+
+	return restore_memory_contents(ctx);
+}
+
+/**
+ * anon_private_restore - read vma data, recreate it and read contents
+ * @ctx: checkpoint context
+ * @mm: memory address space
+ * @h - vma header data
+ */
+static int anon_private_restore(struct ckpt_ctx *ctx,
+				     struct mm_struct *mm,
+				     struct ckpt_hdr_vma *h)
+{
+	/*
+	 * vm_pgoff for anonymous mapping is the "global" page
+	 * offset (namely from addr 0x0), so we force a zero
+	 */
+	h->vm_pgoff = 0;
+
+	return private_vma_restore(ctx, mm, NULL, h);
+}
+
+/* callbacks to restore vma per its type: */
+struct restore_vma_ops {
+	char *vma_name;
+	enum vma_type vma_type;
+	int (*restore) (struct ckpt_ctx *ctx,
+			struct mm_struct *mm,
+			struct ckpt_hdr_vma *ptr);
+};
+
+static struct restore_vma_ops restore_vma_ops[] = {
+	/* ignored vma */
+	{
+		.vma_name = "IGNORE",
+		.vma_type = CKPT_VMA_IGNORE,
+		.restore = NULL,
+	},
+	/* special mapping (vdso) */
+	{
+		.vma_name = "VDSO",
+		.vma_type = CKPT_VMA_VDSO,
+		.restore = special_mapping_restore,
+	},
+	/* anonymous private */
+	{
+		.vma_name = "ANON PRIVATE",
+		.vma_type = CKPT_VMA_ANON,
+		.restore = anon_private_restore,
+	},
+	/* file-mapped private */
+	{
+		.vma_name = "FILE PRIVATE",
+		.vma_type = CKPT_VMA_FILE,
+		.restore = filemap_restore,
+	},
+};
+
+/**
+ * restore_vma - read vma data, recreate it and read contents
+ * @ctx: checkpoint context
+ * @mm: memory address space
+ */
+static int restore_vma(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+	struct ckpt_hdr_vma *h;
+	struct restore_vma_ops *ops;
+	int ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_VMA);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	ckpt_debug("vma %#lx-%#lx flags %#lx type %d vmaref %d\n",
+		   (unsigned long) h->vm_start, (unsigned long) h->vm_end,
+		   (unsigned long) h->vm_flags, (int) h->vma_type,
+		   (int) h->vma_objref);
+
+	ret = -EINVAL;
+	if (h->vm_end < h->vm_start)
+		goto out;
+	if (h->vma_objref < 0)
+		goto out;
+	if (h->vma_type >= CKPT_VMA_MAX)
+		goto out;
+
+	ops = &restore_vma_ops[h->vma_type];
+
+	/* make sure we don't change this accidentally */
+	BUG_ON(ops->vma_type != h->vma_type);
+
+	if (ops->restore) {
+		ckpt_debug("vma type %s\n", ops->vma_name);
+		ret = ops->restore(ctx, mm, h);
+	} else {
+		ckpt_debug("vma ignored\n");
+		ret = 0;
+	}
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+static int destroy_mm(struct mm_struct *mm)
+{
+	struct vm_area_struct *vmnext = mm->mmap;
+	struct vm_area_struct *vma;
+	int ret;
+
+	while (vmnext) {
+		vma = vmnext;
+		vmnext = vmnext->vm_next;
+		ret = do_munmap(mm, vma->vm_start, vma->vm_end-vma->vm_start);
+		if (ret < 0) {
+			pr_warning("c/r: failed do_munmap (%d)\n", ret);
+			return ret;
+		}
+	}
+	return 0;
+}
+
+static struct mm_struct *do_restore_mm(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_mm *h;
+	struct mm_struct *mm;
+	struct file *file;
+	unsigned int nr;
+	int ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_MM);
+	if (IS_ERR(h))
+		return (struct mm_struct *) h;
+
+	ckpt_debug("map_count %d\n", h->map_count);
+
+	/* XXX need more sanity checks */
+
+	ret = -EINVAL;
+	if ((h->start_code > h->end_code) ||
+	    (h->start_data > h->end_data))
+		goto out;
+	if (h->exe_objref < 0)
+		goto out;
+
+	mm = current->mm;
+
+	/* point of no return -- destruct current mm */
+	down_write(&mm->mmap_sem);
+	ret = destroy_mm(mm);
+	if (ret < 0) {
+		up_write(&mm->mmap_sem);
+		goto out;
+	}
+
+	/* FIX: need also mm->flags */
+
+	mm->start_code = h->start_code;
+	mm->end_code = h->end_code;
+	mm->start_data = h->start_data;
+	mm->end_data = h->end_data;
+	mm->start_brk = h->start_brk;
+	mm->brk = h->brk;
+	mm->start_stack = h->start_stack;
+	mm->arg_start = h->arg_start;
+	mm->arg_end = h->arg_end;
+	mm->env_start = h->env_start;
+	mm->env_end = h->env_end;
+
+	/* restore the ->exe_file */
+	if (h->exe_objref) {
+		file = ckpt_obj_fetch(ctx, h->exe_objref, CKPT_OBJ_FILE);
+		if (IS_ERR(file)) {
+			up_write(&mm->mmap_sem);
+			ret = PTR_ERR(file);
+			goto out;
+		}
+		set_mm_exe_file(mm, file);
+	}
+
+	up_write(&mm->mmap_sem);
+
+	for (nr = h->map_count; nr; nr--) {
+		ret = restore_vma(ctx, mm);
+		if (ret < 0)
+			goto out;
+	}
+
+	ret = restore_mm_context(ctx, mm);
+ out:
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	/* restore_obj() expect an extra reference */
+	atomic_inc(&mm->mm_users);
+	return mm;
+}
+
+void *restore_mm(struct ckpt_ctx *ctx)
+{
+	return (void *) do_restore_mm(ctx);
+}
+
+int restore_obj_mm(struct ckpt_ctx *ctx, int mm_objref)
+{
+	struct mm_struct *mm;
+	int ret;
+
+	mm = ckpt_obj_fetch(ctx, mm_objref, CKPT_OBJ_MM);
+	if (IS_ERR(mm))
+		return PTR_ERR(mm);
+
+	if (mm == current->mm)
+		return 0;
+
+	ret = exec_mmap(mm);
+	if (ret < 0)
+		return ret;
+
+	atomic_inc(&mm->mm_users);
+	return 0;
+}
diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
index 4113c7d..79325d9 100644
--- a/checkpoint/objhash.c
+++ b/checkpoint/objhash.c
@@ -121,6 +121,7 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = {
 		.ref_drop = obj_mm_drop,
 		.ref_grab = obj_mm_grab,
 		.checkpoint = checkpoint_mm,
+		.restore = restore_mm,
 	},
 };
 
diff --git a/checkpoint/process.c b/checkpoint/process.c
index a0b1bf9..3ce82cb 100644
--- a/checkpoint/process.c
+++ b/checkpoint/process.c
@@ -143,6 +143,9 @@ static int restore_task_objs(struct ckpt_ctx *ctx)
 	ret = restore_obj_file_table(ctx, h->files_objref);
 	ckpt_debug("file_table: ret %d (%p)\n", ret, current->files);
 
+	ret = restore_obj_mm(ctx, h->mm_objref);
+	ckpt_debug("mm: ret %d (%p)\n", ret, current->mm);
+
 	ckpt_hdr_put(ctx, h);
 	return ret;
 }
diff --git a/fs/exec.c b/fs/exec.c
index 895823d..d7c8f96 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -694,7 +694,7 @@ int kernel_read(struct file *file, unsigned long offset,
 
 EXPORT_SYMBOL(kernel_read);
 
-static int exec_mmap(struct mm_struct *mm)
+int exec_mmap(struct mm_struct *mm)
 {
 	struct task_struct *tsk;
 	struct mm_struct * old_mm, *active_mm;
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 30c22f6..a22eb65 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -67,6 +67,7 @@ extern int checkpoint_mm_context(struct ckpt_ctx *ctx, struct mm_struct *mm);
 extern int restore_read_header_arch(struct ckpt_ctx *ctx);
 extern int restore_thread(struct ckpt_ctx *ctx);
 extern int restore_cpu(struct ckpt_ctx *ctx);
+extern int restore_mm_context(struct ckpt_ctx *ctx, struct mm_struct *mm);
 
 /* file table */
 extern int checkpoint_obj_file_table(struct ckpt_ctx *ctx,
@@ -97,7 +98,13 @@ extern int private_vma_checkpoint(struct ckpt_ctx *ctx,
 				  int vma_objref);
 
 extern int checkpoint_obj_mm(struct ckpt_ctx *ctx, struct task_struct *t);
+extern int restore_obj_mm(struct ckpt_ctx *ctx, int mm_objref);
 extern int checkpoint_mm(struct ckpt_ctx *ctx, void *ptr);
+extern void *restore_mm(struct ckpt_ctx *ctx);
+
+extern int private_vma_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
+			       struct file *file, struct ckpt_hdr_vma *h);
+
 
 #define CKPT_VMA_NOT_SUPPORTED					\
 	(VM_SHARED | VM_MAYSHARE | VM_IO | VM_HUGETLB |		\
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index f8a4173..671dcab 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -188,7 +188,7 @@ struct ckpt_hdr_mm {
 	__u64 arg_start, arg_end, env_start, env_end;
 } __attribute__((aligned(8)));
 
-/* vma subtypes */
+/* vma subtypes - index into restore_vma_dispatch[] */
 enum vma_type {
 	CKPT_VMA_IGNORE = 0,
 	CKPT_VMA_VDSO,		/* special vdso vma */
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index c30d7bf..a0ea5f6 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -16,6 +16,7 @@
 #ifdef __KERNEL__
 struct ckpt_ctx;
 struct ckpt_hdr_file;
+struct ckpt_hdr_vma;
 
 #include <linux/list.h>
 #include <linux/path.h>
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 05f0ed9..ae70b50 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1169,6 +1169,9 @@ extern int do_munmap(struct mm_struct *, unsigned long, size_t);
 
 extern unsigned long do_brk(unsigned long, unsigned long);
 
+/* fs/exec.c */
+extern int exec_mmap(struct mm_struct *mm);
+
 /* filemap.c */
 extern unsigned long page_unuse(struct page *);
 extern void truncate_inode_pages(struct address_space *, loff_t);
@@ -1182,6 +1185,15 @@ extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
 int write_one_page(struct page *page, int wait);
 void task_dirty_inc(struct task_struct *tsk);
 
+
+/* checkpoint/restart */
+#ifdef CONFIG_CHECKPOINT
+extern int filemap_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
+			   struct ckpt_hdr_vma *hh);
+extern int special_mapping_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
+				   struct ckpt_hdr_vma *hh);
+#endif
+
 /* readahead.c */
 #define VM_MAX_READAHEAD	128	/* kbytes */
 #define VM_MIN_READAHEAD	16	/* kbytes (includes current page) */
diff --git a/mm/filemap.c b/mm/filemap.c
index fdeb4b7..86da5d5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1648,6 +1648,25 @@ static int filemap_checkpoint(struct ckpt_ctx *ctx, struct vm_area_struct *vma)
 
 	return private_vma_checkpoint(ctx, vma, CKPT_VMA_FILE, vma_objref);
 }
+
+int filemap_restore(struct ckpt_ctx *ctx,
+		    struct mm_struct *mm,
+		    struct ckpt_hdr_vma *h)
+{
+	struct file *file;
+	int ret;
+
+	if (h->vma_type == CKPT_VMA_FILE &&
+	    (h->vm_flags & (VM_SHARED | VM_MAYSHARE)))
+		return -EINVAL;
+
+	file = ckpt_obj_fetch(ctx, h->vma_objref, CKPT_OBJ_FILE);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	ret = private_vma_restore(ctx, mm, file, h);
+	return ret;
+}
 #endif /* CONFIG_CHECKPOINT */
 
 struct vm_operations_struct generic_file_vm_ops = {
diff --git a/mm/mmap.c b/mm/mmap.c
index 003354b..e8e9124 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2110,7 +2110,7 @@ void exit_mmap(struct mm_struct *mm)
 	tlb = tlb_gather_mmu(mm, 1);
 	/* update_hiwater_rss(mm) here? but nobody should be looking */
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
-	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
+	end = vma ? unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL) : 0;
 	vm_unacct_memory(nr_accounted);
 	free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
 	tlb_finish_mmu(tlb, 0, end);
@@ -2269,6 +2269,14 @@ static void special_mapping_close(struct vm_area_struct *vma)
 }
 
 #ifdef CONFIG_CHECKPOINT
+/*
+ * FIX:
+ *   - checkpoint vdso pages (once per distinct vdso is enough)
+ *   - check for compatilibility between saved and current vdso
+ *   - accommodate for dynamic kernel data in vdso page
+ *
+ * Current, we require COMPAT_VDSO which somewhat mitigates the issue
+ */
 static int special_mapping_checkpoint(struct ckpt_ctx *ctx,
 				      struct vm_area_struct *vma)
 {
@@ -2290,6 +2298,19 @@ static int special_mapping_checkpoint(struct ckpt_ctx *ctx,
 
 	return generic_vma_checkpoint(ctx, vma, CKPT_VMA_VDSO, 0);
 }
+
+int special_mapping_restore(struct ckpt_ctx *ctx,
+			    struct mm_struct *mm,
+			    struct ckpt_hdr_vma *h)
+{
+	/*
+	 * FIX:
+	 * Currently, we only handle VDSO/vsyscall special handling.
+	 * Even that, is very basic - call arch_setup_additional_pages
+	 * requiring the same mapping (start address) as before.
+	 */
+	return arch_setup_additional_pages(NULL, h->vm_start, 0);
+}
 #endif /* CONFIG_CHECKPOINT */
 
 static struct vm_operations_struct special_mapping_vmops = {
-- 
1.6.0.4
--
To unsubscribe from this list: send the line "unsubscribe linux-api" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 16/43] c/r: export shmem_getpage() to support shared memory
       [not found] ` <1243445589-32388-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  2009-05-27 17:32   ` [RFC v16][PATCH 13/43] c/r: introduce method '->checkpoint()' in struct vm_operations_struct Oren Laadan
  2009-05-27 17:32   ` [RFC v16][PATCH 15/43] c/r: restore memory address space (private memory) Oren Laadan
@ 2009-05-27 17:32   ` Oren Laadan
  2009-05-27 17:32   ` [RFC v16][PATCH 18/43] c/r: restore anonymous- and file-mapped- " Oren Laadan
                     ` (5 subsequent siblings)
  8 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, linux-api-u79uwXL29TY76Z2rM5mHXA,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
Export functionality to retrieve specific pages from shared memory
given an inode in shmem-fs; this will be used in the next two patches
to provide support for c/r of shared memory.
mm/shmem.c:
- shmem_getpage() and 'enum sgp_type' moved to linux/mm.h
Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 include/linux/mm.h |   11 +++++++++++
 mm/shmem.c         |   15 ++-------------
 2 files changed, 13 insertions(+), 13 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ae70b50..53e916a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -330,6 +330,17 @@ void put_pages_list(struct list_head *pages);
 
 void split_page(struct page *page, unsigned int order);
 
+/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
+enum sgp_type {
+	SGP_READ,	/* don't exceed i_size, don't allocate page */
+	SGP_CACHE,	/* don't exceed i_size, may allocate page */
+	SGP_DIRTY,	/* like SGP_CACHE, but set new page dirty */
+	SGP_WRITE,	/* may exceed i_size, may allocate page */
+};
+
+extern int shmem_getpage(struct inode *inode, unsigned long idx,
+			 struct page **pagep, enum sgp_type sgp, int *type);
+
 /*
  * Compound pages have a destructor function.  Provide a
  * prototype for that function and accessor functions.
diff --git a/mm/shmem.c b/mm/shmem.c
index b25f95c..f260336 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -99,14 +99,6 @@ static struct vfsmount *shm_mnt;
 /* Pretend that each entry is of this size in directory's i_size */
 #define BOGO_DIRENT_SIZE 20
 
-/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
-enum sgp_type {
-	SGP_READ,	/* don't exceed i_size, don't allocate page */
-	SGP_CACHE,	/* don't exceed i_size, may allocate page */
-	SGP_DIRTY,	/* like SGP_CACHE, but set new page dirty */
-	SGP_WRITE,	/* may exceed i_size, may allocate page */
-};
-
 #ifdef CONFIG_TMPFS
 static unsigned long shmem_default_max_blocks(void)
 {
@@ -119,9 +111,6 @@ static unsigned long shmem_default_max_inodes(void)
 }
 #endif
 
-static int shmem_getpage(struct inode *inode, unsigned long idx,
-			 struct page **pagep, enum sgp_type sgp, int *type);
-
 static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
 {
 	/*
@@ -1202,8 +1191,8 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
  * vm. If we swap it in we mark it dirty since we also free the swap
  * entry since a page cannot live in both the swap and page cache
  */
-static int shmem_getpage(struct inode *inode, unsigned long idx,
-			struct page **pagep, enum sgp_type sgp, int *type)
+int shmem_getpage(struct inode *inode, unsigned long idx,
+		  struct page **pagep, enum sgp_type sgp, int *type)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct shmem_inode_info *info = SHMEM_I(inode);
-- 
1.6.0.4
--
To unsubscribe from this list: send the line "unsubscribe linux-api" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 18/43] c/r: restore anonymous- and file-mapped- shared memory
       [not found] ` <1243445589-32388-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
                     ` (2 preceding siblings ...)
  2009-05-27 17:32   ` [RFC v16][PATCH 16/43] c/r: export shmem_getpage() to support shared memory Oren Laadan
@ 2009-05-27 17:32   ` Oren Laadan
  2009-05-27 17:32   ` [RFC v16][PATCH 30/43] c/r: stub implementation for IPC namespace Oren Laadan
                     ` (4 subsequent siblings)
  8 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, linux-api-u79uwXL29TY76Z2rM5mHXA,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
The bulk of the work is in ckpt_read_vma(), which has been refactored:
the part that create the suitable 'struct file *' for the mapping is
now larger and moved to a separate function. What's left is to read
the VMA description, get the file pointer, create the mapping, and
proceed to read the contents in.
Both anonymous shared VMAs that have been read earlier (as indicated
by a look up to objhash) and file-mapped shared VMAs are skipped.
Anonymous shared VMAs seen for the first time have their contents
read in directly to the backing inode, as indexed by the page numbers
(as opposed to virtual addresses).
Changelog[v14]:
  - Introduce patch
Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 checkpoint/memory.c        |   66 ++++++++++++++++++++++++++++++++-----------
 include/linux/checkpoint.h |    6 ++++
 include/linux/mm.h         |    2 +
 mm/filemap.c               |   13 ++++++++-
 mm/shmem.c                 |   49 ++++++++++++++++++++++++++++++++
 5 files changed, 118 insertions(+), 18 deletions(-)
diff --git a/checkpoint/memory.c b/checkpoint/memory.c
index 2b73abc..c163b76 100644
--- a/checkpoint/memory.c
+++ b/checkpoint/memory.c
@@ -785,13 +785,36 @@ static int restore_read_page(struct ckpt_ctx *ctx, struct page *page, void *p)
 	return 0;
 }
 
+static struct page *bring_private_page(unsigned long addr)
+{
+	struct page *page;
+	int ret;
+
+	ret = get_user_pages(current, current->mm, addr, 1, 1, 1, &page, NULL);
+	if (ret < 0)
+		page = ERR_PTR(ret);
+	return page;
+}
+
+static struct page *bring_shared_page(unsigned long idx, struct inode *ino)
+{
+	struct page *page = NULL;
+	int ret;
+
+	ret = shmem_getpage(ino, idx, &page, SGP_WRITE, NULL);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	if (page)
+		unlock_page(page);
+	return page;
+}
+
 /**
  * read_pages_contents - read in data of pages in page-array chain
  * @ctx - restart context
  */
-static int read_pages_contents(struct ckpt_ctx *ctx)
+static int read_pages_contents(struct ckpt_ctx *ctx, struct inode *inode)
 {
-	struct mm_struct *mm = current->mm;
 	struct ckpt_pgarr *pgarr;
 	unsigned long *vaddrs;
 	char *buf;
@@ -801,17 +824,22 @@ static int read_pages_contents(struct ckpt_ctx *ctx)
 	if (!buf)
 		return -ENOMEM;
 
-	down_read(&mm->mmap_sem);
+	down_read(¤t->mm->mmap_sem);
 	list_for_each_entry_reverse(pgarr, &ctx->pgarr_list, list) {
 		vaddrs = pgarr->vaddrs;
 		for (i = 0; i < pgarr->nr_used; i++) {
 			struct page *page;
 
 			_ckpt_debug(CKPT_DPAGE, "got page %#lx\n", vaddrs[i]);
-			ret = get_user_pages(current, mm, vaddrs[i],
-					     1, 1, 1, &page, NULL);
-			if (ret < 0)
+			if (inode)
+				page = bring_shared_page(vaddrs[i], inode);
+			else
+				page = bring_private_page(vaddrs[i]);
+
+			if (IS_ERR(page)) {
+				ret = PTR_ERR(page);
 				goto out;
+			}
 
 			ret = restore_read_page(ctx, page, buf);
 			page_cache_release(page);
@@ -822,14 +850,15 @@ static int read_pages_contents(struct ckpt_ctx *ctx)
 	}
 
  out:
-	up_read(&mm->mmap_sem);
+	up_read(¤t->mm->mmap_sem);
 	kfree(buf);
 	return 0;
 }
 
 /**
- * restore_memory_contents - restore contents of a VMA with private memory
+ * restore_memory_contents - restore contents of a memory region
  * @ctx - restart context
+ * @inode - backing inode
  *
  * Reads a header that specifies how many pages will follow, then reads
  * a list of virtual addresses into ctx->pgarr_list page-array chain,
@@ -837,7 +866,7 @@ static int read_pages_contents(struct ckpt_ctx *ctx)
  * these steps until reaching a header specifying "0" pages, which marks
  * the end of the contents.
  */
-static int restore_memory_contents(struct ckpt_ctx *ctx)
+int restore_memory_contents(struct ckpt_ctx *ctx, struct inode *inode)
 {
 	struct ckpt_hdr_pgarr *h;
 	unsigned long nr_pages;
@@ -864,7 +893,7 @@ static int restore_memory_contents(struct ckpt_ctx *ctx)
 		ret = read_pages_vaddrs(ctx, nr_pages);
 		if (ret < 0)
 			break;
-		ret = read_pages_contents(ctx);
+		ret = read_pages_contents(ctx, inode);
 		if (ret < 0)
 			break;
 		pgarr_reset_all(ctx);
@@ -922,9 +951,9 @@ static unsigned long calc_map_flags_bits(unsigned long orig_vm_flags)
  * @file - file to map (NULL for anonymous)
  * @h - vma header data
  */
-static unsigned long generic_vma_restore(struct mm_struct *mm,
-					 struct file *file,
-					 struct ckpt_hdr_vma *h)
+unsigned long generic_vma_restore(struct mm_struct *mm,
+				  struct file *file,
+				  struct ckpt_hdr_vma *h)
 {
 	unsigned long vm_size, vm_start, vm_flags, vm_prot, vm_pgoff;
 	unsigned long addr;
@@ -971,7 +1000,7 @@ int private_vma_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
 	if (IS_ERR((void *) addr))
 		return PTR_ERR((void *) addr);
 
-	return restore_memory_contents(ctx);
+	return restore_memory_contents(ctx, NULL);
 }
 
 /**
@@ -1031,16 +1060,19 @@ static struct restore_vma_ops restore_vma_ops[] = {
 	{
 		.vma_name = "ANON SHARED",
 		.vma_type = CKPT_VMA_SHM_ANON,
+		.restore = shmem_restore,
 	},
 	/* anonymous shared (skipped) */
 	{
 		.vma_name = "ANON SHARED (skip)",
 		.vma_type = CKPT_VMA_SHM_ANON_SKIP,
+		.restore = shmem_restore,
 	},
 	/* file-mapped shared */
 	{
 		.vma_name = "FILE SHARED",
 		.vma_type = CKPT_VMA_SHM_FILE,
+		.restore = filemap_restore,
 	},
 };
 
@@ -1059,15 +1091,15 @@ static int restore_vma(struct ckpt_ctx *ctx, struct mm_struct *mm)
 	if (IS_ERR(h))
 		return PTR_ERR(h);
 
-	ckpt_debug("vma %#lx-%#lx flags %#lx type %d vmaref %d\n",
+	ckpt_debug("vma %#lx-%#lx flags %#lx type %d vmaref %d inoref %d\n",
 		   (unsigned long) h->vm_start, (unsigned long) h->vm_end,
 		   (unsigned long) h->vm_flags, (int) h->vma_type,
-		   (int) h->vma_objref);
+		   (int) h->vma_objref, (int) h->ino_objref);
 
 	ret = -EINVAL;
 	if (h->vm_end < h->vm_start)
 		goto out;
-	if (h->vma_objref < 0)
+	if (h->vma_objref < 0 || h->ino_objref < 0)
 		goto out;
 	if (h->vma_type >= CKPT_VMA_MAX)
 		goto out;
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 18b4941..169aa70 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -106,9 +106,15 @@ extern int restore_obj_mm(struct ckpt_ctx *ctx, int mm_objref);
 extern int checkpoint_mm(struct ckpt_ctx *ctx, void *ptr);
 extern void *restore_mm(struct ckpt_ctx *ctx);
 
+extern unsigned long generic_vma_restore(struct mm_struct *mm,
+					 struct file *file,
+					 struct ckpt_hdr_vma *h);
+
 extern int private_vma_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
 			       struct file *file, struct ckpt_hdr_vma *h);
 
+extern int restore_memory_contents(struct ckpt_ctx *ctx, struct inode *inode);
+
 
 #define CKPT_VMA_NOT_SUPPORTED						\
 	(VM_IO | VM_HUGETLB | VM_NONLINEAR | VM_PFNMAP |		\
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 53e916a..c8e8972 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1203,6 +1203,8 @@ extern int filemap_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
 			   struct ckpt_hdr_vma *hh);
 extern int special_mapping_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
 				   struct ckpt_hdr_vma *hh);
+extern int shmem_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
+			 struct ckpt_hdr_vma *hh);
 #endif
 
 /* readahead.c */
diff --git a/mm/filemap.c b/mm/filemap.c
index 0d28481..d669395 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1691,17 +1691,28 @@ int filemap_restore(struct ckpt_ctx *ctx,
 		    struct ckpt_hdr_vma *h)
 {
 	struct file *file;
+	unsigned long addr;
 	int ret;
 
 	if (h->vma_type == CKPT_VMA_FILE &&
 	    (h->vm_flags & (VM_SHARED | VM_MAYSHARE)))
 		return -EINVAL;
+	if (h->vma_type == CKPT_VMA_SHM_FILE &&
+	    !(h->vm_flags & (VM_SHARED | VM_MAYSHARE)))
+		return -EINVAL;
 
 	file = ckpt_obj_fetch(ctx, h->vma_objref, CKPT_OBJ_FILE);
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 
-	ret = private_vma_restore(ctx, mm, file, h);
+	if (h->vma_type == CKPT_VMA_FILE) {
+		/* private mapped file */
+		ret = private_vma_restore(ctx, mm, file, h);
+	} else {
+		/* shared mapped file */
+		addr = generic_vma_restore(mm, file, h);
+		ret = (IS_ERR((void *) addr) ? PTR_ERR((void *) addr) : 0);
+	}
 	return ret;
 }
 #endif /* CONFIG_CHECKPOINT */
diff --git a/mm/shmem.c b/mm/shmem.c
index d349c10..829eefa 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2409,6 +2409,55 @@ static int shmem_checkpoint(struct ckpt_ctx *ctx, struct vm_area_struct *vma)
 
 	return shmem_vma_checkpoint(ctx, vma, vma_type, ino_objref);
 }
+
+int shmem_restore(struct ckpt_ctx *ctx,
+		  struct mm_struct *mm, struct ckpt_hdr_vma *h)
+{
+	unsigned long addr;
+	struct file *file;
+	int ret = 0;
+
+	file = ckpt_obj_fetch(ctx, h->ino_objref, CKPT_OBJ_FILE);
+	if (PTR_ERR(file) == -EINVAL)
+		file = NULL;
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	/* if file is NULL, this is the premiere - create and insert */
+	if (!file) {
+		if (h->vma_type != CKPT_VMA_SHM_ANON)
+			return -EINVAL;
+		/*
+		 * in theory could pass NULL to mmap and let it create
+		 * the file. But, if 'shm_size != vm_end - vm_start',
+		 * or if 'vm_pgoff != 0', then the vma reflects only a
+		 * portion of the shm object and we need to "manually"
+		 * create the full shm object.
+		 */
+		file = shmem_file_setup("/dev/zero", h->ino_size, h->vm_flags);
+		if (IS_ERR(file))
+			return PTR_ERR(file);
+		ret = ckpt_obj_insert(ctx, file, h->ino_objref, CKPT_OBJ_FILE);
+		if (ret < 0)
+			goto out;
+	} else {
+		if (h->vma_type != CKPT_VMA_SHM_ANON_SKIP)
+			return -EINVAL;
+		/* Already need fput() for the file above; keep path simple */
+		get_file(file);
+	}
+
+	addr = generic_vma_restore(mm, file, h);
+	if (IS_ERR((void *) addr))
+		return PTR_ERR((void *) addr);
+
+	if (h->vma_type == CKPT_VMA_SHM_ANON)
+		ret = restore_memory_contents(ctx, file->f_dentry->d_inode);
+ out:
+	fput(file);
+	return ret;
+}
+
 #endif /* CONFIG_CHECKPOINT */
 
 static void init_once(void *foo)
-- 
1.6.0.4
--
To unsubscribe from this list: send the line "unsubscribe linux-api" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 30/43] c/r: stub implementation for IPC namespace
       [not found] ` <1243445589-32388-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
                     ` (3 preceding siblings ...)
  2009-05-27 17:32   ` [RFC v16][PATCH 18/43] c/r: restore anonymous- and file-mapped- " Oren Laadan
@ 2009-05-27 17:32   ` Oren Laadan
  2009-05-27 17:32   ` [RFC v16][PATCH 31/43] deferqueue: generic queue to defer work Oren Laadan
                     ` (3 subsequent siblings)
  8 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, linux-api-u79uwXL29TY76Z2rM5mHXA,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Dan Smith,
	Oren Laadan
From: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Changes:
 - Update to match UTS changes
Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 checkpoint/checkpoint.c        |    2 --
 checkpoint/objhash.c           |   28 ++++++++++++++++++++++++++++
 checkpoint/process.c           |   24 ++++++++++++++++++++++--
 include/linux/checkpoint.h     |   15 +++++++++++++++
 include/linux/checkpoint_hdr.h |    3 +++
 5 files changed, 68 insertions(+), 4 deletions(-)
diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index 904f19b..afc7300 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -310,8 +310,6 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
 
 	rcu_read_lock();
 	nsproxy = task_nsproxy(t);
-	if (nsproxy->ipc_ns != ctx->root_nsproxy->ipc_ns)
-		ret = -EPERM;
 	if (nsproxy->mnt_ns != ctx->root_nsproxy->mnt_ns)
 		ret = -EPERM;
 	if (nsproxy->pid_ns != ctx->root_nsproxy->pid_ns)
diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
index 8b7adc6..045a920 100644
--- a/checkpoint/objhash.c
+++ b/checkpoint/objhash.c
@@ -15,6 +15,8 @@
 #include <linux/hash.h>
 #include <linux/file.h>
 #include <linux/fdtable.h>
+#include <linux/sched.h>
+#include <linux/ipc_namespace.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
 
@@ -159,6 +161,22 @@ static int obj_uts_ns_users(void *ptr)
 	return atomic_read(&((struct uts_namespace *) ptr)->kref.refcount);
 }
 
+static int obj_ipc_ns_grab(void *ptr)
+{
+	get_ipc_ns((struct ipc_namespace *) ptr);
+	return 0;
+}
+
+static void obj_ipc_ns_drop(void *ptr)
+{
+	put_ipc_ns((struct ipc_namespace *) ptr);
+}
+
+static int obj_ipc_ns_users(void *ptr)
+{
+	return atomic_read(&((struct ipc_namespace *) ptr)->count);
+}
+
 static struct ckpt_obj_ops ckpt_obj_ops[] = {
 	/* ignored object */
 	{
@@ -226,6 +244,16 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = {
 		.checkpoint = checkpoint_bad,
 		.restore = restore_bad,
 	},
+	/* ipc_ns object */
+	{
+		.obj_name = "IPC_NS",
+		.obj_type = CKPT_OBJ_IPC_NS,
+		.ref_drop = obj_ipc_ns_drop,
+		.ref_grab = obj_ipc_ns_grab,
+		.ref_users = obj_ipc_ns_users,
+		.checkpoint = checkpoint_bad,
+		.restore = restore_bad,
+	},
 };
 
 
diff --git a/checkpoint/process.c b/checkpoint/process.c
index a827987..eff3d76 100644
--- a/checkpoint/process.c
+++ b/checkpoint/process.c
@@ -89,6 +89,7 @@ static int do_checkpoint_ns(struct ckpt_ctx *ctx, struct nsproxy *nsproxy)
 	struct ckpt_hdr_ns *h;
 	int ns_flags = 0;
 	int uts_objref;
+	int ipc_objref;
 	int first, ret;
 
 	uts_objref = ckpt_obj_lookup_add(ctx, nsproxy->uts_ns,
@@ -98,12 +99,20 @@ static int do_checkpoint_ns(struct ckpt_ctx *ctx, struct nsproxy *nsproxy)
 	if (first)
 		ns_flags |= CLONE_NEWUTS;
 
+	ipc_objref = ckpt_obj_lookup_add(ctx, nsproxy->ipc_ns,
+					 CKPT_OBJ_IPC_NS, &first);
+	if (ipc_objref < 0)
+		return ipc_objref;
+	if (first)
+		ns_flags |= CLONE_NEWIPC;
+
 	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_NS);
 	if (!h)
 		return -ENOMEM;
 
 	h->flags = ns_flags;
 	h->uts_objref = uts_objref;
+	h->ipc_objref = ipc_objref;
 
 	ret = ckpt_write_obj(ctx, &h->h);
 	ckpt_hdr_put(ctx, h);
@@ -112,6 +121,10 @@ static int do_checkpoint_ns(struct ckpt_ctx *ctx, struct nsproxy *nsproxy)
 
 	if (ns_flags & CLONE_NEWUTS)
 		ret = checkpoint_uts_ns(ctx, nsproxy->uts_ns);
+#if 0
+	if (!ret && (ns_flags & CLONE_NEWIPC))
+		ret = checkpoint_ipc_ns(ctx, nsproxy->ipc_ns);
+#endif
 
 	/* FIX: Write other namespaces here */
 	return ret;
@@ -438,9 +451,10 @@ static struct nsproxy *do_restore_ns(struct ckpt_ctx *ctx)
 		return (struct nsproxy *) h;
 
 	ret = -EINVAL;
-	if (h->uts_objref <= 0)
+	if (h->uts_objref <= 0 ||
+	    h->ipc_objref <= 0)
 		goto out;
-	if (h->flags & ~CLONE_NEWUTS)
+	if (h->flags & ~(CLONE_NEWUTS | CLONE_NEWIPC))
 		goto out;
 
 	/* each unseen-before namespace will be un-shared now */
@@ -456,6 +470,12 @@ static struct nsproxy *do_restore_ns(struct ckpt_ctx *ctx)
 	 */
 	ret = restore_uts_ns(ctx, h->uts_objref, h->flags);
 	ckpt_debug("uts ns: %d\n", ret);
+	if (ret < 0)
+		goto out;
+#if 0
+	ret = restore_ipc_ns(ctx, h->ipc_objref, h->flags);
+	ckpt_debug("ipc ns: %d\n", ret);
+#endif
 
 	/* FIX: add more namespaces here */
  out:
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index a7125fc..5a42399 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -81,6 +81,21 @@ extern int restore_restart_block(struct ckpt_ctx *ctx);
 extern int checkpoint_ns(struct ckpt_ctx *ctx, void *ptr);
 extern void *restore_ns(struct ckpt_ctx *ctx);
 
+#if 0
+/* ipc-ns */
+#ifdef CONFIG_SYSVIPC
+extern int checkpoint_ipc_ns(struct ckpt_ctx *ctx,
+			     struct ipc_namespace *ipc_ns);
+extern int restore_ipc_ns(struct ckpt_ctx *ctx, int ns_objref, int flags);
+#else
+static inline int checkpoint_ipc_ns(struct ckpt_ctx *ctx,
+				    struct ipc_namespace *ipc_ns)
+{ return 0; }
+static inline int restore_ipc_ns(struct ckpt_ctx *ctx)
+{ return 0; }
+#endif /* CONFIG_SYSVIPC */
+#endif
+
 /* file table */
 extern int checkpoint_obj_file_table(struct ckpt_ctx *ctx,
 				     struct task_struct *t);
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 1603279..44a48dc 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -55,6 +55,7 @@ enum {
 	CKPT_HDR_CPU,
 	CKPT_HDR_NS,
 	CKPT_HDR_UTS_NS,
+	CKPT_HDR_IPC_NS,
 
 	/* 201-299: reserved for arch-dependent */
 
@@ -95,6 +96,7 @@ enum obj_type {
 	CKPT_OBJ_MM,
 	CKPT_OBJ_NS,
 	CKPT_OBJ_UTS_NS,
+	CKPT_OBJ_IPC_NS,
 	CKPT_OBJ_MAX
 };
 
@@ -167,6 +169,7 @@ struct ckpt_hdr_ns {
 	struct ckpt_hdr h;
 	__u32 flags;
 	__s32 uts_objref;
+	__u32 ipc_objref;
 } __attribute__((aligned(8)));
 
 /* task's shared resources */
-- 
1.6.0.4
--
To unsubscribe from this list: send the line "unsubscribe linux-api" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 31/43] deferqueue: generic queue to defer work
       [not found] ` <1243445589-32388-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
                     ` (4 preceding siblings ...)
  2009-05-27 17:32   ` [RFC v16][PATCH 30/43] c/r: stub implementation for IPC namespace Oren Laadan
@ 2009-05-27 17:32   ` Oren Laadan
  2009-05-27 17:33   ` [RFC v16][PATCH 34/43] c/r: save and restore ipc namespace basics Oren Laadan
                     ` (2 subsequent siblings)
  8 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, linux-api-u79uwXL29TY76Z2rM5mHXA,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
Add a interface to postpone an action until the end of the entire
checkpoint or restart operation. This is useful when during the
scan of tasks an operation cannot be performed in place, to avoid
the need for a second scan.
One use case is when restoring an ipc shared memory region that has
been deleted (but is still attached), during restart it needs to be
create, attached and then deleted. However, creation and attachment
are performed in distinct locations, so deletion can not be performed
on the spot. Instead, this work (delete) is deferred until later.
(This example is in one of the following patches).
This interface allows chronic procrastination in the kernel:
deferqueue_create(void):
    Allocates and returns a new deferqueue.
deferqueue_run(deferqueue):
    Executes all the pending works in the queue. Returns the number
    of works executed, or an error upon the first error reported by
    a deferred work.
deferqueue_add(deferqueue, data, size, func, dtor):
    Enqueue a deferred work. @function is the callback function to
    do the work, which will be called with @data as an argument.
    @size tells the size of data. @dtor is a destructor callback
    that is invoked for deferred works remaining in the queue when
    the queue is destroyed. NOTE: for a given deferred work, @dtor
    is _not_ called if @func was already called (regardless of the
    return value of the latter).
deferqueue_destroy(deferqueue):
    Free the deferqueue and any queued items while invoking the
    @dtor callback for each queued item.
Why aren't we using the existing kernel workqueue mechanism?  We need
to defer to work until the end of the operation: not earlier, since we
need other things to be in place; not later, to not block waiting for
it. However, the workqueue schedules the work for 'some time later'.
Also, the kernel workqueue may run in any task context, but we require
many times that an operation be run in the context of some specific
restarting task (e.g., restoring IPC state of a certain ipc_ns).
Instead, this mechanism is a simple way for the c/r operation as a
whole, and later a task in particular, to defer some action until
later (but not arbitrarily later) _in the restore_ operation.
Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 checkpoint/Kconfig         |    5 ++
 include/linux/deferqueue.h |   58 +++++++++++++++++++++++
 kernel/Makefile            |    1 +
 kernel/deferqueue.c        |  109 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 173 insertions(+), 0 deletions(-)
diff --git a/checkpoint/Kconfig b/checkpoint/Kconfig
index 1761b0a..53ed6fa 100644
--- a/checkpoint/Kconfig
+++ b/checkpoint/Kconfig
@@ -2,9 +2,14 @@
 # implemented the hooks for processor state etc. needed by the
 # core checkpoint/restart code.
 
+config DEFERQUEUE
+	bool
+	default n
+
 config CHECKPOINT
 	bool "Enable checkpoint/restart (EXPERIMENTAL)"
 	depends on CHECKPOINT_SUPPORT && EXPERIMENTAL
+	select DEFERQUEUE
 	help
 	  Application checkpoint/restart is the ability to save the
 	  state of a running application so that it can later resume
diff --git a/include/linux/deferqueue.h b/include/linux/deferqueue.h
new file mode 100644
index 0000000..2eb58cf
--- /dev/null
+++ b/include/linux/deferqueue.h
@@ -0,0 +1,58 @@
+/*
+ * deferqueue.h --- deferred work queue handling for Linux.
+ */
+
+#ifndef _LINUX_DEFERQUEUE_H
+#define _LINUX_DEFERQUEUE_H
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+/*
+ * This interface allows chronic procrastination in the kernel:
+ *
+ * deferqueue_create(void):
+ *     Allocates and returns a new deferqueue.
+ *
+ * deferqueue_run(deferqueue):
+ *     Executes all the pending works in the queue. Returns the number
+ *     of works executed, or an error upon the first error reported by
+ *     a deferred work.
+ *
+ * deferqueue_add(deferqueue, data, size, func, dtor):
+ * 	Enqueue a deferred work. @function is the callback function to
+ *      do the work, which will be called with @data as an argument.
+ *      @size tells the size of data. @dtor is a destructor callback
+ *      that is invoked for deferred works remaining in the queue when
+ *      the queue is destroyed. NOTE: for a given deferred work, @dtor
+ *      is _not_ called if @func was already called (regardless of the
+ *      return value of the latter).
+ *
+ * deferqueue_destroy(deferqueue):
+ *      Free the deferqueue and any queued items while invoking the
+ *      @dtor callback for each queued item.
+ */
+
+
+typedef int (*deferqueue_func_t)(void *);
+
+struct deferqueue_entry {
+	deferqueue_func_t function;
+	deferqueue_func_t destructor;
+	struct list_head list;
+	char data[0];
+};
+
+struct deferqueue_head {
+	spinlock_t lock;
+	struct list_head list;
+};
+
+struct deferqueue_head *deferqueue_create(void);
+void deferqueue_destroy(struct deferqueue_head *head);
+int deferqueue_add(struct deferqueue_head *head, void *data, int size,
+		   deferqueue_func_t func, deferqueue_func_t dtor);
+int deferqueue_run(struct deferqueue_head *head);
+
+#endif
diff --git a/kernel/Makefile b/kernel/Makefile
index 4242366..6bc638d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -22,6 +22,7 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
 endif
 
+obj-$(CONFIG_DEFERQUEUE) += deferqueue.o
 obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
 obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
diff --git a/kernel/deferqueue.c b/kernel/deferqueue.c
new file mode 100644
index 0000000..efd99d5
--- /dev/null
+++ b/kernel/deferqueue.c
@@ -0,0 +1,109 @@
+/*
+ *  Infrastructure to manage deferred work
+ *
+ *  This differs from a workqueue in that the work must be deferred
+ *  until specifically run by the caller.
+ *
+ *  As the only user currently is checkpoint/restart, which has
+ *  very simple usage, the locking is kept simple.  Adding rules
+ *  is protected by the head->lock.  But deferqueue_run() is only
+ *  called once, after all entries have been added.  So it is not
+ *  protected.  Similarly, _destroy is only called once when the
+ *  ckpt_ctx is releeased, so it is not locked or refcounted.  These
+ *  can of course be added if needed by other users.
+ *
+ *  Why not use workqueue ?  We need to defer work until the end of an
+ *  operation: not earlier, since we need other things to be in place;
+ *  not later, to not block waiting for it. However, the workqueue
+ *  schedules the work for 'some time later'. Also, workqueue may run
+ *  in any task context, but we require many times that an operation
+ *  be run in the context of some specific restarting task (e.g.,
+ *  restoring IPC state of a certain ipc_ns).
+ *
+ *  Instead, this mechanism is a simple way for the c/r operation as a
+ *  whole, and later a task in particular, to defer some action until
+ *  later (but not arbitrarily later) _in the restore_ operation.
+ *
+ *  Copyright (C) 2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/deferqueue.h>
+
+struct deferqueue_head *deferqueue_create(void)
+{
+	struct deferqueue_head *h = kmalloc(sizeof(*h), GFP_KERNEL);
+	if (h) {
+		spin_lock_init(&h->lock);
+		INIT_LIST_HEAD(&h->list);
+	}
+	return h;
+}
+
+void deferqueue_destroy(struct deferqueue_head *h)
+{
+	if (!list_empty(&h->list)) {
+		struct deferqueue_entry *dq, *n;
+
+		pr_debug("%s: freeing non-empty queue\n", __func__);
+		list_for_each_entry_safe(dq, n, &h->list, list) {
+			dq->destructor(dq->data);
+			list_del(&dq->list);
+			kfree(dq);
+		}
+	}
+	kfree(h);
+}
+
+int deferqueue_add(struct deferqueue_head *head, void *data, int size,
+		   deferqueue_func_t func, deferqueue_func_t dtor)
+{
+	struct deferqueue_entry *dq;
+
+	dq = kmalloc(sizeof(dq) + size, GFP_KERNEL);
+	if (!dq)
+		return -ENOMEM;
+
+	dq->function = func;
+	dq->destructor = dtor;
+	memcpy(dq->data, data, size);
+
+	pr_debug("%s: adding work %p func %p dtor %p\n",
+		 __func__, dq, func, dtor);
+	spin_lock(&head->lock);
+	list_add_tail(&head->list, &dq->list);
+	spin_unlock(&head->lock);
+	return 0;
+}
+
+/*
+ * deferqueue_run - perform all work in the work queue
+ * @head: deferqueue_head from which to run
+ *
+ * returns: number of works performed, or < 0 on error
+ */
+int deferqueue_run(struct deferqueue_head *head)
+{
+	struct deferqueue_entry *dq, *n;
+	int nr = 0;
+	int ret;
+
+	list_for_each_entry_safe(dq, n, &head->list, list) {
+		pr_debug("doing work %p function %p\n", dq, dq->function);
+		/* don't call destructor - function callback should do it */
+		ret = dq->function(dq->data);
+		if (ret < 0)
+			pr_debug("wq function failed %d\n", ret);
+		list_del(&dq->list);
+		kfree(dq);
+		nr++;
+	}
+
+	return nr;
+}
-- 
1.6.0.4
--
To unsubscribe from this list: send the line "unsubscribe linux-api" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 34/43] c/r: save and restore ipc namespace basics
       [not found] ` <1243445589-32388-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
                     ` (5 preceding siblings ...)
  2009-05-27 17:32   ` [RFC v16][PATCH 31/43] deferqueue: generic queue to defer work Oren Laadan
@ 2009-05-27 17:33   ` Oren Laadan
  2009-05-27 17:33   ` [RFC v16][PATCH 38/43] c/r: support message-queues sysv-ipc Oren Laadan
  2009-05-27 17:33   ` [RFC v16][PATCH 39/43] c/r (ipc): export interface from ipc/sem.c to cleanup ipc sem Oren Laadan
  8 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:33 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, linux-api-u79uwXL29TY76Z2rM5mHXA,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
Save and restores the common state (parameters) of ipc namespace.
Also add logic to iterate through the objects of sysvipc shared memory,
message queues and semaphores. The logic to save and restore the state
of these objects will be added in the next few patches.
Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 checkpoint/process.c           |    4 -
 include/linux/checkpoint.h     |    5 +-
 include/linux/checkpoint_hdr.h |   22 +++++
 ipc/checkpoint.c               |  203 ++++++++++++++++++++++++++++++++++++++--
 4 files changed, 220 insertions(+), 14 deletions(-)
diff --git a/checkpoint/process.c b/checkpoint/process.c
index eff3d76..b604a85 100644
--- a/checkpoint/process.c
+++ b/checkpoint/process.c
@@ -121,10 +121,8 @@ static int do_checkpoint_ns(struct ckpt_ctx *ctx, struct nsproxy *nsproxy)
 
 	if (ns_flags & CLONE_NEWUTS)
 		ret = checkpoint_uts_ns(ctx, nsproxy->uts_ns);
-#if 0
 	if (!ret && (ns_flags & CLONE_NEWIPC))
 		ret = checkpoint_ipc_ns(ctx, nsproxy->ipc_ns);
-#endif
 
 	/* FIX: Write other namespaces here */
 	return ret;
@@ -472,10 +470,8 @@ static struct nsproxy *do_restore_ns(struct ckpt_ctx *ctx)
 	ckpt_debug("uts ns: %d\n", ret);
 	if (ret < 0)
 		goto out;
-#if 0
 	ret = restore_ipc_ns(ctx, h->ipc_objref, h->flags);
 	ckpt_debug("ipc ns: %d\n", ret);
-#endif
 
 	/* FIX: add more namespaces here */
  out:
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 9a7517f..d5498bc 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -85,7 +85,6 @@ extern int restore_restart_block(struct ckpt_ctx *ctx);
 extern int checkpoint_ns(struct ckpt_ctx *ctx, void *ptr);
 extern void *restore_ns(struct ckpt_ctx *ctx);
 
-#if 0
 /* ipc-ns */
 #ifdef CONFIG_SYSVIPC
 extern int checkpoint_ipc_ns(struct ckpt_ctx *ctx,
@@ -98,7 +97,9 @@ static inline int checkpoint_ipc_ns(struct ckpt_ctx *ctx,
 static inline int restore_ipc_ns(struct ckpt_ctx *ctx)
 { return 0; }
 #endif /* CONFIG_SYSVIPC */
-#endif
+
+extern int checkpoint_ipcns(struct ckpt_ctx *ctx, struct ipc_namespace *ipc_ns);
+extern int restore_ipcns(struct ckpt_ctx *ctx);
 
 /* file table */
 extern int checkpoint_obj_file_table(struct ckpt_ctx *ctx,
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 05769f4..406b5d6 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -305,6 +305,28 @@ struct ckpt_hdr_pgarr {
 
 
 /* ipc commons */
+struct ckpt_hdr_ipcns {
+	struct ckpt_hdr h;
+	__u64 shm_ctlmax;
+	__u64 shm_ctlall;
+	__s32 shm_ctlmni;
+
+	__s32 msg_ctlmax;
+	__s32 msg_ctlmnb;
+	__s32 msg_ctlmni;
+
+	__s32 sem_ctl_msl;
+	__s32 sem_ctl_mns;
+	__s32 sem_ctl_opm;
+	__s32 sem_ctl_mni;
+} __attribute__((aligned(8)));
+
+struct ckpt_hdr_ipc {
+	struct ckpt_hdr h;
+	__u32 ipc_type;
+	__u32 ipc_count;
+} __attribute__((aligned(8)));
+
 struct ckpt_hdr_ipc_perms {
 	__s32 id;
 	__u32 key;
diff --git a/ipc/checkpoint.c b/ipc/checkpoint.c
index b7b48b0..436be5e 100644
--- a/ipc/checkpoint.c
+++ b/ipc/checkpoint.c
@@ -20,15 +20,12 @@
 
 #include "util.h"
 
-int checkpoint_ipcns(struct ckpt_ctx *ctx, struct ipc_namespace *ipc_ns)
-{
-	return 0;
-}
+/* for ckpt_debug */
+static char *ipc_ind_to_str[] = { "sem", "msg", "shm" };
 
-int restore_ipcns(struct ckpt_ctx *ctx)
-{
-	return 0;
-}
+/**************************************************************************
+ * Checkpoint
+ */
 
 int checkpoint_fill_ipc_perms(struct ckpt_hdr_ipc_perms *h,
 			      struct kern_ipc_perm *perm)
@@ -48,6 +45,82 @@ int checkpoint_fill_ipc_perms(struct ckpt_hdr_ipc_perms *h,
 	return 0;
 }
 
+static int checkpoint_ipc_any(struct ckpt_ctx *ctx,
+			      struct ipc_namespace *ipc_ns,
+			      int ipc_ind, int ipc_type,
+			      int (*func)(int id, void *p, void *data))
+{
+	struct ckpt_hdr_ipc *h;
+	struct ipc_ids *ipc_ids = &ipc_ns->ids[ipc_ind];
+	int ret = -ENOMEM;
+
+	down_read(&ipc_ids->rw_mutex);
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_IPC);
+	if (!h)
+		goto out;
+
+	h->ipc_type = ipc_type;
+	h->ipc_count = ipc_ids->in_use;
+	ckpt_debug("ipc-%s count %d\n", ipc_ind_to_str[ipc_ind], h->ipc_count);
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0)
+		goto out;
+
+	ret = idr_for_each(&ipc_ids->ipcs_idr, func, ctx);
+	ckpt_debug("ipc-%s ret %d\n", ipc_ind_to_str[ipc_ind], ret);
+ out:
+	up_read(&ipc_ids->rw_mutex);
+	return ret;
+}
+
+int checkpoint_ipc_ns(struct ckpt_ctx *ctx, struct ipc_namespace *ipc_ns)
+{
+	struct ckpt_hdr_ipcns *h;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_IPC_NS);
+	if (!h)
+		return -ENOMEM;
+
+	h->shm_ctlmax = ipc_ns->shm_ctlmax;
+	h->shm_ctlall = ipc_ns->shm_ctlall;
+	h->shm_ctlmni = ipc_ns->shm_ctlmni;
+
+	h->msg_ctlmax = ipc_ns->msg_ctlmax;
+	h->msg_ctlmnb = ipc_ns->msg_ctlmnb;
+	h->msg_ctlmni = ipc_ns->msg_ctlmni;
+
+	h->sem_ctl_msl = ipc_ns->sem_ctls[0];
+	h->sem_ctl_mns = ipc_ns->sem_ctls[1];
+	h->sem_ctl_opm = ipc_ns->sem_ctls[2];
+	h->sem_ctl_mni = ipc_ns->sem_ctls[3];
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0)
+		return ret;
+
+#if 0 /* NEXT FEW PATCHES */
+	ret = checkpoint_ipc_any(ctx, ipc_ns, IPC_SHM_IDS,
+				 CKPT_HDR_IPC_SHM, checkpoint_ipc_shm);
+	if (ret < 0)
+		return ret;
+	ret = checkpoint_ipc_any(ctx, ipc_ns, IPC_MSG_IDS,
+				 CKPT_HDR_IPC_MSG, checkpoint_ipc_msg);
+	if (ret < 0)
+		return ret;
+	ret = checkpoint_ipc_any(ctx, ipc_ns, IPC_SEM_IDS,
+				 CKPT_HDR_IPC_SEM, checkpoint_ipc_sem);
+#endif
+	return ret;
+}
+
+/**************************************************************************
+ * Restart
+ */
+
 int restore_load_ipc_perms(struct ckpt_hdr_ipc_perms *h,
 			   struct kern_ipc_perm *perm)
 {
@@ -79,3 +152,117 @@ int restore_load_ipc_perms(struct ckpt_hdr_ipc_perms *h,
 
 	return 0;
 }
+
+static int restore_ipc_any(struct ckpt_ctx *ctx, int ipc_ind, int ipc_type,
+			   int (*func)(struct ckpt_ctx *ctx))
+{
+	struct ckpt_hdr_ipc *h;
+	int n, ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_IPC);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	ckpt_debug("ipc-%s: count %d\n", ipc_ind_to_str[ipc_ind], h->ipc_count);
+
+	ret = -EINVAL;
+	if (h->ipc_type != ipc_type)
+		goto out;
+
+	ret = 0;
+	for (n = 0; n < h->ipc_count; n++) {
+		ret = (*func)(ctx);
+		if (ret < 0)
+			goto out;
+	}
+ out:
+	ckpt_debug("ipc-%s: ret %d\n", ipc_ind_to_str[ipc_ind], ret);
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+static int do_restore_ipc_ns(struct ckpt_ctx *ctx)
+{
+	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
+	struct ckpt_hdr_ipcns *h;
+	int ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_IPC_NS);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	ret = -EINVAL;
+	if (h->shm_ctlmax < 0 || h->shm_ctlall < 0 || h->shm_ctlmni < 0)
+		goto out;
+	if (h->msg_ctlmax < 0 || h->msg_ctlmnb < 0 || h->msg_ctlmni < 0)
+		goto out;
+	if (h->sem_ctl_msl < 0 || h->sem_ctl_mns < 0 ||
+	    h->sem_ctl_opm < 0 || h->sem_ctl_mni < 0)
+		goto out;
+
+	/* this is a brand new ipc_ns: safe to rewrite its properties */
+	ipc_ns->shm_ctlmax = h->shm_ctlmax;
+	ipc_ns->shm_ctlall = h->shm_ctlall;
+	ipc_ns->shm_ctlmni = h->shm_ctlmni;
+
+	ipc_ns->msg_ctlmax = h->msg_ctlmax;
+	ipc_ns->msg_ctlmnb = h->msg_ctlmnb;
+	ipc_ns->msg_ctlmni = h->msg_ctlmni;
+
+	ipc_ns->sem_ctls[0] = h->sem_ctl_msl;
+	ipc_ns->sem_ctls[1] = h->sem_ctl_mns;
+	ipc_ns->sem_ctls[2] = h->sem_ctl_opm;
+	ipc_ns->sem_ctls[3] = h->sem_ctl_mni;
+
+#if 0 /* NEXT FEW PATCHES */
+	ret = restore_ipc_any(ctx, IPC_SHM_IDS,
+			      CKPT_HDR_IPC_SHM, restore_ipc_shm);
+	if (ret < 0)
+		goto out;
+	ret = ckpt_read_ipc_any(ctx, IPC_MSG_IDS,
+			      CKPT_HDR_IPC_MSG, restore_ipc_msg);
+	if (ret < 0)
+		goto out;
+	ret = restore_ipc_any(ctx, IPC_SEM_IDS,
+			      CKPT_HDR_IPC_SEM, restore_ipc_sem);
+#endif
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+int restore_ipc_ns(struct ckpt_ctx *ctx, int ns_objref, int flags)
+{
+	struct ipc_namespace *ipc_ns;
+	int ret = 0;
+
+	ipc_ns = ckpt_obj_fetch(ctx, ns_objref, CKPT_OBJ_IPC_NS);
+	if (PTR_ERR(ipc_ns) == -EINVAL)
+		ipc_ns = NULL;
+	if (IS_ERR(ipc_ns))
+		return PTR_ERR(ipc_ns);
+
+	/* sanity: CLONE_NEWIPC if-and-only-if ipc_ns is NULL (first timer) */
+	if (!!ipc_ns ^ !(flags & CLONE_NEWIPC))
+		return -EINVAL;
+
+	if (!ipc_ns) {
+		ret = do_restore_ipc_ns(ctx);
+		if (ret < 0)
+			return ret;
+		ret = ckpt_obj_insert(ctx, current->nsproxy->ipc_ns,
+				      ns_objref, CKPT_OBJ_IPC_NS);
+	} else {
+		struct ipc_namespace *old_ipc_ns;
+
+		/* safe because nsproxy->count must be 1 ... */
+		BUG_ON(atomic_read(¤t->nsproxy->count) != 1);
+
+		old_ipc_ns = current->nsproxy->ipc_ns;
+		current->nsproxy->ipc_ns = ipc_ns;
+		get_ipc_ns(ipc_ns);
+		put_ipc_ns(old_ipc_ns);
+	}
+
+	return ret;
+}
-- 
1.6.0.4
--
To unsubscribe from this list: send the line "unsubscribe linux-api" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 38/43] c/r: support message-queues sysv-ipc
       [not found] ` <1243445589-32388-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
                     ` (6 preceding siblings ...)
  2009-05-27 17:33   ` [RFC v16][PATCH 34/43] c/r: save and restore ipc namespace basics Oren Laadan
@ 2009-05-27 17:33   ` Oren Laadan
  2009-05-27 17:33   ` [RFC v16][PATCH 39/43] c/r (ipc): export interface from ipc/sem.c to cleanup ipc sem Oren Laadan
  8 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:33 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, linux-api-u79uwXL29TY76Z2rM5mHXA,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
Checkpoint of sysvipc message-queues is performed by iterating through
all 'msq' objects and dumping the contents of each one. The message
queued on each 'msq' are dumped with that object.
Message of a specific queue get written one by one. The queue lock
cannot be held while dumping them, but the loop must be protected from
someone (who ?) writing or reading. To do that we grab the lock, then
hijack the entire chain of messages from the queue, drop the lock,
and then safely dump them in a loop. Finally, with the lock held, we
re-attach the chain while verifying that there isn't other (new) data
on that queue.
Writing the message contents themselves is straight forward. The code
is similar to that in ipc/msgutil.c, the main difference being that
we deal with kernel memory and not user memory.
Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 include/linux/checkpoint_hdr.h |   21 +++-
 ipc/Makefile                   |    2 +-
 ipc/checkpoint.c               |    6 +-
 ipc/checkpoint_msg.c           |  362 ++++++++++++++++++++++++++++++++++++++++
 ipc/util.h                     |    3 +
 5 files changed, 389 insertions(+), 5 deletions(-)
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index f7e331d..b05f39c 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -73,6 +73,7 @@ enum {
 	CKPT_HDR_IPC = 501,
 	CKPT_HDR_IPC_SHM,
 	CKPT_HDR_IPC_MSG,
+	CKPT_HDR_IPC_MSG_MSG,
 	CKPT_HDR_IPC_SEM,
 
 	CKPT_HDR_TAIL = 9001,
@@ -356,6 +357,25 @@ struct ckpt_hdr_ipc_shm {
 	__u32 objref;
 } __attribute__((aligned(8)));
 
+struct ckpt_hdr_ipc_msg {
+	struct ckpt_hdr h;
+	struct ckpt_hdr_ipc_perms perms;
+	__u64 q_stime;
+	__u64 q_rtime;
+	__u64 q_ctime;
+	__u64 q_cbytes;
+	__u64 q_qnum;
+	__u64 q_qbytes;
+	__s32 q_lspid;
+	__s32 q_lrpid;
+} __attribute__((aligned(8)));
+
+struct ckpt_hdr_ipc_msg_msg {
+	struct ckpt_hdr h;
+	__s32 m_type;
+	__u32 m_ts;
+} __attribute__((aligned(8)));
+
 
 #define CKPT_TST_OVERFLOW_16(a, b) \
 	((sizeof(a) > sizeof(b)) && ((a) > SHORT_MAX))
@@ -366,5 +386,4 @@ struct ckpt_hdr_ipc_shm {
 #define CKPT_TST_OVERFLOW_64(a, b) \
 	((sizeof(a) > sizeof(b)) && ((a) > LONG_MAX))
 
-
 #endif /* _CHECKPOINT_CKPT_HDR_H_ */
diff --git a/ipc/Makefile b/ipc/Makefile
index 7e23683..ca408ff 100644
--- a/ipc/Makefile
+++ b/ipc/Makefile
@@ -9,5 +9,5 @@ obj_mq-$(CONFIG_COMPAT) += compat_mq.o
 obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y)
 obj-$(CONFIG_IPC_NS) += namespace.o
 obj-$(CONFIG_POSIX_MQUEUE_SYSCTL) += mq_sysctl.o
-obj-$(CONFIG_CHECKPOINT) += checkpoint.o checkpoint_shm.o
+obj-$(CONFIG_CHECKPOINT) += checkpoint.o checkpoint_shm.o checkpoint_msg.o
 
diff --git a/ipc/checkpoint.c b/ipc/checkpoint.c
index 25d2277..7eece96 100644
--- a/ipc/checkpoint.c
+++ b/ipc/checkpoint.c
@@ -104,11 +104,11 @@ int checkpoint_ipc_ns(struct ckpt_ctx *ctx, struct ipc_namespace *ipc_ns)
 
 	ret = checkpoint_ipc_any(ctx, ipc_ns, IPC_SHM_IDS,
 				 CKPT_HDR_IPC_SHM, checkpoint_ipc_shm);
-#if 0 /* NEXT FEW PATCHES */
 	if (ret < 0)
 		return ret;
 	ret = checkpoint_ipc_any(ctx, ipc_ns, IPC_MSG_IDS,
 				 CKPT_HDR_IPC_MSG, checkpoint_ipc_msg);
+#if 0 /* NEXT FEW PATCHES */
 	if (ret < 0)
 		return ret;
 	ret = checkpoint_ipc_any(ctx, ipc_ns, IPC_SEM_IDS,
@@ -216,11 +216,11 @@ static int do_restore_ipc_ns(struct ckpt_ctx *ctx)
 
 	ret = restore_ipc_any(ctx, IPC_SHM_IDS,
 			      CKPT_HDR_IPC_SHM, restore_ipc_shm);
-#if 0 /* NEXT FEW PATCHES */
 	if (ret < 0)
 		goto out;
-	ret = ckpt_read_ipc_any(ctx, IPC_MSG_IDS,
+	ret = restore_ipc_any(ctx, IPC_MSG_IDS,
 			      CKPT_HDR_IPC_MSG, restore_ipc_msg);
+#if 0 /* NEXT FEW PATCHES */
 	if (ret < 0)
 		goto out;
 	ret = restore_ipc_any(ctx, IPC_SEM_IDS,
diff --git a/ipc/checkpoint_msg.c b/ipc/checkpoint_msg.c
new file mode 100644
index 0000000..a988a9e
--- /dev/null
+++ b/ipc/checkpoint_msg.c
@@ -0,0 +1,362 @@
+/*
+ *  Checkpoint/restart - dump state of sysvipc msg
+ *
+ *  Copyright (C) 2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DIPC
+
+#include <linux/mm.h>
+#include <linux/msg.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/nsproxy.h>
+#include <linux/ipc_namespace.h>
+
+#include "util.h"
+
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+
+/************************************************************************
+ * ipc checkpoint
+ */
+
+static int fill_ipc_msg_hdr(struct ckpt_ctx *ctx,
+			    struct ckpt_hdr_ipc_msg *h,
+			    struct msg_queue *msq)
+{
+	int ret = 0;
+
+	ipc_lock_by_ptr(&msq->q_perm);
+
+	ret = checkpoint_fill_ipc_perms(&h->perms, &msq->q_perm);
+	if (ret < 0)
+		goto unlock;
+
+	h->q_stime = msq->q_stime;
+	h->q_rtime = msq->q_rtime;
+	h->q_ctime = msq->q_ctime;
+	h->q_cbytes = msq->q_cbytes;
+	h->q_qnum = msq->q_qnum;
+	h->q_qbytes = msq->q_qbytes;
+	h->q_lspid = msq->q_lspid;
+	h->q_lrpid = msq->q_lrpid;
+
+ unlock:
+	ipc_unlock(&msq->q_perm);
+	ckpt_debug("msg: lspid %d rspid %d qnum %lld qbytes %lld\n",
+		 h->q_lspid, h->q_lrpid, h->q_qnum, h->q_qbytes);
+
+	return ret;
+}
+
+static int checkpoint_msg_contents(struct ckpt_ctx *ctx, struct msg_msg *msg)
+{
+	struct ckpt_hdr_ipc_msg_msg *h;
+	struct msg_msgseg *seg;
+	int total, len;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_IPC_MSG_MSG);
+	if (!h)
+		return -ENOMEM;
+
+	h->m_type = msg->m_type;
+	h->m_ts = msg->m_ts;
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0)
+		return ret;
+
+	total = msg->m_ts;
+	len = min(total, (int) DATALEN_MSG);
+	ret = ckpt_write_buffer(ctx, (msg + 1), len);
+	if (ret < 0)
+		return ret;
+
+	seg = msg->next;
+	total -= len;
+
+	while (total) {
+		len = min(total, (int) DATALEN_SEG);
+		ret = ckpt_write_buffer(ctx, (seg + 1), len);
+		if (ret < 0)
+			break;
+		seg = seg->next;
+		total -= len;
+	}
+
+	return ret;
+}
+
+static int checkpoint_msg_queue(struct ckpt_ctx *ctx, struct msg_queue *msq)
+{
+	struct list_head messages;
+	struct msg_msg *msg;
+	int ret = -EBUSY;
+
+	/*
+	 * Scanning the msq requires the lock, but then we can't write
+	 * data out from inside. Instead, we grab the lock, remove all
+	 * messages to our own list, drop the lock, write the messages,
+	 * and finally re-attach the them to the msq with the lock taken.
+	 */
+	ipc_lock_by_ptr(&msq->q_perm);
+	if (!list_empty(&msq->q_receivers))
+		goto unlock;
+	if (!list_empty(&msq->q_senders))
+		goto unlock;
+	if (list_empty(&msq->q_messages))
+		goto unlock;
+	/* temporarily take out all messages */
+	INIT_LIST_HEAD(&messages);
+	list_splice_init(&msq->q_messages, &messages);
+ unlock:
+	ipc_unlock(&msq->q_perm);
+
+	list_for_each_entry(msg, &messages, m_list) {
+		ret = checkpoint_msg_contents(ctx, msg);
+		if (ret < 0)
+			break;
+	}
+
+	/* put all the messages back in */
+	ipc_lock_by_ptr(&msq->q_perm);
+	list_splice(&messages, &msq->q_messages);
+	ipc_unlock(&msq->q_perm);
+
+	return ret;
+}
+
+int checkpoint_ipc_msg(int id, void *p, void *data)
+{
+	struct ckpt_hdr_ipc_msg *h;
+	struct ckpt_ctx *ctx = (struct ckpt_ctx *) data;
+	struct kern_ipc_perm *perm = (struct kern_ipc_perm *) p;
+	struct msg_queue *msq;
+	int ret;
+
+	msq = container_of(perm, struct msg_queue, q_perm);
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_IPC_MSG);
+	if (!h)
+		return -ENOMEM;
+
+	ret = fill_ipc_msg_hdr(ctx, h, msq);
+	if (ret < 0)
+		goto out;
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	if (ret < 0)
+		goto out;
+
+	if (h->q_qnum)
+		ret = checkpoint_msg_queue(ctx, msq);
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+
+/************************************************************************
+ * ipc restart
+ */
+
+static int load_ipc_msg_hdr(struct ckpt_ctx *ctx,
+			    struct ckpt_hdr_ipc_msg *h,
+			    struct msg_queue *msq)
+{
+	int ret = 0;
+
+	ret = restore_load_ipc_perms(&h->perms, &msq->q_perm);
+	if (ret < 0)
+		return ret;
+
+	ckpt_debug("msq: lspid %d lrpid %d qnum %lld qbytes %lld\n",
+		 h->q_lspid, h->q_lrpid, h->q_qnum, h->q_qbytes);
+
+	if (h->q_lspid < 0 || h->q_lrpid < 0)
+		return -EINVAL;
+
+	msq->q_stime = h->q_stime;
+	msq->q_rtime = h->q_rtime;
+	msq->q_ctime = h->q_ctime;
+	msq->q_lspid = h->q_lspid;
+	msq->q_lrpid = h->q_lrpid;
+
+	return 0;
+}
+
+static struct msg_msg *restore_msg_contents_one(struct ckpt_ctx *ctx, int *clen)
+{
+	struct ckpt_hdr_ipc_msg_msg *h;
+	struct msg_msg *msg = NULL;
+	struct msg_msgseg *seg, **pseg;
+	int total, len;
+	int ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_IPC_MSG_MSG);
+	if (IS_ERR(h))
+		return (struct msg_msg *) h;
+
+	ret = -EINVAL;
+	if (h->m_type < 1)
+		goto out;
+	if (h->m_ts > current->nsproxy->ipc_ns->msg_ctlmax)
+		goto out;
+
+	total = h->m_ts;
+	len = min(total, (int) DATALEN_MSG);
+	msg = kmalloc(sizeof(*msg) + len, GFP_KERNEL);
+	if (!msg) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	msg->next = NULL;
+	pseg = &msg->next;
+
+	ret = _ckpt_read_buffer(ctx, (msg + 1), len);
+	if (ret < 0)
+		goto out;
+
+	total -= len;
+	while (total) {
+		len = min(total, (int) DATALEN_SEG);
+		seg = kmalloc(sizeof(*seg) + len, GFP_KERNEL);
+		if (!seg) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		seg->next = NULL;
+		*pseg = seg;
+		pseg = &seg->next;
+
+		ret = _ckpt_read_buffer(ctx, (seg + 1), len);
+		if (ret < 0)
+			goto out;
+		total -= len;
+	}
+
+	msg->m_type = h->m_type;
+	msg->m_ts = h->m_ts;
+	*clen = h->m_ts;
+ out:
+	if (ret < 0 && msg) {
+		free_msg(msg);
+		msg = ERR_PTR(ret);
+	}
+	ckpt_hdr_put(ctx, h);
+	return msg;
+}
+
+static inline void free_msg_list(struct list_head *queue)
+{
+	struct msg_msg *msg, *tmp;
+
+	list_for_each_entry_safe(msg, tmp, queue, m_list)
+		free_msg(msg);
+}
+
+static int restore_msg_contents(struct ckpt_ctx *ctx, struct list_head *queue,
+				unsigned long qnum, unsigned long *cbytes)
+{
+	struct msg_msg *msg;
+	int clen = 0;
+	int ret = 0;
+
+	INIT_LIST_HEAD(queue);
+
+	*cbytes = 0;
+	while (qnum--) {
+		msg = restore_msg_contents_one(ctx, &clen);
+		if (IS_ERR(msg))
+			goto fail;
+		list_add_tail(&msg->m_list, queue);
+		*cbytes += clen;
+	}
+	return 0;
+ fail:
+	ret = PTR_ERR(msg);
+	free_msg_list(queue);
+	return ret;
+}
+
+int restore_ipc_msg(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_ipc_msg *h;
+	struct kern_ipc_perm *perms;
+	struct msg_queue *msq;
+	struct ipc_ids *msg_ids = ¤t->nsproxy->ipc_ns->ids[IPC_MSG_IDS];
+	struct list_head messages;
+	unsigned long cbytes;
+	int msgflag;
+	int ret;
+
+	INIT_LIST_HEAD(&messages);
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_IPC_MSG);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	ret = -EINVAL;
+	if (h->perms.id < 0)
+		goto out;
+
+	/* read queued messages into temporary queue */
+	ret = restore_msg_contents(ctx, &messages, h->q_qnum, &cbytes);
+	if (ret < 0)
+		goto out;
+
+	ret = -EINVAL;
+	if (h->q_cbytes != cbytes)
+		goto out;
+
+	/* restore the message queue */
+	msgflag = h->perms.mode | IPC_CREAT | IPC_EXCL;
+	ckpt_debug("msg: do_msgget key %d flag %#x id %d\n",
+		 h->perms.key, msgflag, h->perms.id);
+	ret = do_msgget(h->perms.key, msgflag, h->perms.id);
+	ckpt_debug("msg: do_msgget ret %d\n", ret);
+	if (ret < 0)
+		goto out;
+
+	down_write(&msg_ids->rw_mutex);
+
+	/* we are the sole owners/users of this ipc_ns, it can't go away */
+	perms = ipc_lock(msg_ids, h->perms.id);
+	BUG_ON(IS_ERR(perms));	/* ipc_ns is private to us */
+
+	msq = container_of(perms, struct msg_queue, q_perm);
+	BUG_ON(!list_empty(&msq->q_messages));	/* ipc_ns is private to us */
+
+	/* attach queued messages we read before */
+	list_splice_init(&messages, &msq->q_messages);
+
+	/* adjust msq and namespace statistics */
+	atomic_add(h->q_cbytes, ¤t->nsproxy->ipc_ns->msg_bytes);
+	atomic_add(h->q_qnum, ¤t->nsproxy->ipc_ns->msg_hdrs);
+	msq->q_cbytes = h->q_cbytes;
+	msq->q_qbytes = h->q_qbytes;
+	msq->q_qnum = h->q_qnum;
+
+	ret = load_ipc_msg_hdr(ctx, h, msq);
+	ipc_unlock(perms);
+
+	if (ret < 0) {
+		ckpt_debug("msq: need to remove (%d)\n", ret);
+		freeque(current->nsproxy->ipc_ns, perms);
+	}
+	up_write(&msg_ids->rw_mutex);
+ out:
+	free_msg_list(&messages);  /* no-op if all ok, else cleanup msgs */
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
diff --git a/ipc/util.h b/ipc/util.h
index db067b0..2a05fb3 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -196,6 +196,9 @@ extern int restore_load_ipc_perms(struct ckpt_hdr_ipc_perms *h,
 
 extern int checkpoint_ipc_shm(int id, void *p, void *data);
 extern int restore_ipc_shm(struct ckpt_ctx *ctx);
+
+extern int checkpoint_ipc_msg(int id, void *p, void *data);
+extern int restore_ipc_msg(struct ckpt_ctx *ctx);
 #endif
 
 #endif
-- 
1.6.0.4
--
To unsubscribe from this list: send the line "unsubscribe linux-api" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 39/43] c/r (ipc): export interface from ipc/sem.c to cleanup ipc sem
       [not found] ` <1243445589-32388-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
                     ` (7 preceding siblings ...)
  2009-05-27 17:33   ` [RFC v16][PATCH 38/43] c/r: support message-queues sysv-ipc Oren Laadan
@ 2009-05-27 17:33   ` Oren Laadan
  8 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:33 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, linux-api-u79uwXL29TY76Z2rM5mHXA,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
Export freeary() which will be used in the next patch during restart
to cleanup an ipc sem.
Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 ipc/sem.c  |    3 +--
 ipc/util.h |    1 +
 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/ipc/sem.c b/ipc/sem.c
index 207dbbb..c60076e 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -93,7 +93,6 @@
 #define sem_checkid(sma, semid)	ipc_checkid(&sma->sem_perm, semid)
 
 static int newary(struct ipc_namespace *, struct ipc_params *, int);
-static void freeary(struct ipc_namespace *, struct kern_ipc_perm *);
 #ifdef CONFIG_PROC_FS
 static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
 #endif
@@ -521,7 +520,7 @@ static void free_un(struct rcu_head *head)
  * as a writer and the spinlock for this semaphore set hold. sem_ids.rw_mutex
  * remains locked on exit.
  */
-static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
+void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
 {
 	struct sem_undo *un, *tu;
 	struct sem_queue *q, *tq;
diff --git a/ipc/util.h b/ipc/util.h
index 2a05fb3..347ffb2 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -185,6 +185,7 @@ int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
 extern int do_shmget(key_t key, size_t size, int shmflg, int req_id);
 extern int do_msgget(key_t key, int msgflg, int req_id);
 extern void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp);
+extern void freeary(struct ipc_namespace *, struct kern_ipc_perm *);
 
 extern void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp);
 
-- 
1.6.0.4
--
To unsubscribe from this list: send the line "unsubscribe linux-api" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related	[flat|nested] 50+ messages in thread
 
- * [RFC v16][PATCH 35/43] c/r (ipc): export interface from ipc/shm.c to delete ipc shm
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (27 preceding siblings ...)
       [not found] ` <1243445589-32388-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-05-27 17:33 ` Oren Laadan
  2009-05-27 17:33 ` [RFC v16][PATCH 36/43] c/r: support share-memory sysv-ipc Oren Laadan
                   ` (5 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:33 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
Export shmctl_down() which will be used in the next patch during
restart to delete an ipc shm (the shm is mapped already, so it
won't be lost).
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
 include/linux/shm.h |    4 ++++
 ipc/shm.c           |    4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/include/linux/shm.h b/include/linux/shm.h
index eca6235..ec36e99 100644
--- a/include/linux/shm.h
+++ b/include/linux/shm.h
@@ -118,6 +118,10 @@ static inline int is_file_shm_hugepages(struct file *file)
 }
 #endif
 
+struct ipc_namespace;
+extern int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd,
+		       struct shmid_ds __user *buf, int version);
+
 #endif /* __KERNEL__ */
 
 #endif /* _LINUX_SHM_H_ */
diff --git a/ipc/shm.c b/ipc/shm.c
index 7dd5f0c..8aba22f 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -598,8 +598,8 @@ static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss,
  * to be held in write mode.
  * NOTE: no locks must be held, the rw_mutex is taken inside this function.
  */
-static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd,
-		       struct shmid_ds __user *buf, int version)
+int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd,
+		struct shmid_ds __user *buf, int version)
 {
 	struct kern_ipc_perm *ipcp;
 	struct shmid64_ds shmid64;
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 36/43] c/r: support share-memory sysv-ipc
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (28 preceding siblings ...)
  2009-05-27 17:33 ` [RFC v16][PATCH 35/43] c/r (ipc): export interface from ipc/shm.c to delete ipc shm Oren Laadan
@ 2009-05-27 17:33 ` Oren Laadan
  2009-05-27 17:33 ` [RFC v16][PATCH 37/43] c/r (ipc): make 'struct msg_msgseg' visible in ipc/util.h Oren Laadan
                   ` (4 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:33 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
Checkpoint of sysvipc shared memory is performed in two steps: first,
the entire ipc namespace is dumped as a whole by iterating through all
shm objects and dumping the contents of each one. The shmem inode is
registered in the objhash. Second, for each vma that refers to ipc
shared memory we find the inode in the objhash, and save the objref.
(If we find a new inode, that indicates that the ipc namespace is not
entirely frozen and someone must have manipulated it since step 1).
Handling of shm objects that have been deleted (via IPC_RMID) is left
to a later patch in this series.
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
 checkpoint/memory.c              |   28 ++++-
 checkpoint/sys.c                 |   10 ++
 include/linux/checkpoint.h       |    3 +
 include/linux/checkpoint_hdr.h   |   19 +++-
 include/linux/checkpoint_types.h |    1 +
 include/linux/shm.h              |    9 ++
 ipc/Makefile                     |    2 +-
 ipc/checkpoint.c                 |    4 +-
 ipc/checkpoint_shm.c             |  261 ++++++++++++++++++++++++++++++++++++++
 ipc/shm.c                        |   73 +++++++++++
 ipc/util.h                       |    4 +-
 11 files changed, 406 insertions(+), 8 deletions(-)
diff --git a/checkpoint/memory.c b/checkpoint/memory.c
index c163b76..997359f 100644
--- a/checkpoint/memory.c
+++ b/checkpoint/memory.c
@@ -20,6 +20,7 @@
 #include <linux/mman.h>
 #include <linux/pagemap.h>
 #include <linux/mm_types.h>
+#include <linux/shm.h>
 #include <linux/proc_fs.h>
 #include <linux/swap.h>
 #include <linux/checkpoint.h>
@@ -459,9 +460,9 @@ static int vma_dump_pages(struct ckpt_ctx *ctx, int total)
  * virtual addresses into ctx->pgarr_list page-array chain. Then dump
  * the addresses, followed by the page contents.
  */
-static int checkpoint_memory_contents(struct ckpt_ctx *ctx,
-				      struct vm_area_struct *vma,
-				      struct inode *inode)
+int checkpoint_memory_contents(struct ckpt_ctx *ctx,
+			       struct vm_area_struct *vma,
+			       struct inode *inode)
 {
 	struct ckpt_hdr_pgarr *h;
 	unsigned long addr, end;
@@ -1022,6 +1023,13 @@ static int anon_private_restore(struct ckpt_ctx *ctx,
 	return private_vma_restore(ctx, mm, NULL, h);
 }
 
+static int bad_vma_restore(struct ckpt_ctx *ctx,
+			   struct mm_struct *mm,
+			   struct ckpt_hdr_vma *h)
+{
+	return -EINVAL;
+}
+
 /* callbacks to restore vma per its type: */
 struct restore_vma_ops {
 	char *vma_name;
@@ -1074,6 +1082,20 @@ static struct restore_vma_ops restore_vma_ops[] = {
 		.vma_type = CKPT_VMA_SHM_FILE,
 		.restore = filemap_restore,
 	},
+	/* sysvipc shared */
+	{
+		.vma_name = "IPC SHARED",
+		.vma_type = CKPT_VMA_SHM_IPC,
+		/* ipc inode itself is restore by restore_ipc_ns()... */
+		.restore = bad_vma_restore,
+
+	},
+	/* sysvipc shared (skip) */
+	{
+		.vma_name = "IPC SHARED (skip)",
+		.vma_type = CKPT_VMA_SHM_IPC_SKIP,
+		.restore = ipcshm_restore,
+	},
 };
 
 /**
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index f6cf0ac..ac3bf7c 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -20,6 +20,7 @@
 #include <linux/uaccess.h>
 #include <linux/capability.h>
 #include <linux/checkpoint.h>
+#include <linux/deferqueue.h>
 
 /*
  * ckpt_unpriv_allowed - sysctl controlled, do not allow checkpoints or
@@ -188,8 +189,17 @@ static void task_arr_free(struct ckpt_ctx *ctx)
 
 static void ckpt_ctx_free(struct ckpt_ctx *ctx)
 {
+	int ret;
+
 	BUG_ON(atomic_read(&ctx->refcount));
 
+	if (ctx->deferqueue) {
+		ret = deferqueue_run(ctx->deferqueue);
+		if (ret != 0)
+			pr_warning("c/r: deferqueue had %d entries\n", ret);
+		deferqueue_destroy(ctx->deferqueue);
+	}
+
 	if (ctx->file)
 		fput(ctx->file);
 
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index d5498bc..064dd25 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -145,6 +145,9 @@ extern unsigned long generic_vma_restore(struct mm_struct *mm,
 extern int private_vma_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
 			       struct file *file, struct ckpt_hdr_vma *h);
 
+extern int checkpoint_memory_contents(struct ckpt_ctx *ctx,
+				      struct vm_area_struct *vma,
+				      struct inode *inode);
 extern int restore_memory_contents(struct ckpt_ctx *ctx, struct inode *inode);
 
 
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 406b5d6..f7e331d 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -278,7 +278,9 @@ enum vma_type {
 	CKPT_VMA_SHM_ANON,	/* shared anonymous */
 	CKPT_VMA_SHM_ANON_SKIP,	/* shared anonymous (skip contents) */
 	CKPT_VMA_SHM_FILE,	/* shared mapped file, only msync */
-	CKPT_VMA_MAX
+	CKPT_VMA_SHM_IPC,	/* shared sysvipc */
+	CKPT_VMA_SHM_IPC_SKIP,	/* shared sysvipc (skip contents) */
+	CKPT_VMA_MAX,
 };
 
 /* vma descriptor */
@@ -328,6 +330,7 @@ struct ckpt_hdr_ipc {
 } __attribute__((aligned(8)));
 
 struct ckpt_hdr_ipc_perms {
+	struct ckpt_hdr h;
 	__s32 id;
 	__u32 key;
 	__u32 uid;
@@ -339,6 +342,20 @@ struct ckpt_hdr_ipc_perms {
 	__u64 seq;
 } __attribute__((aligned(8)));
 
+struct ckpt_hdr_ipc_shm {
+	struct ckpt_hdr h;
+	struct ckpt_hdr_ipc_perms perms;
+	__u64 shm_segsz;
+	__u64 shm_atim;
+	__u64 shm_dtim;
+	__u64 shm_ctim;
+	__s32 shm_cprid;
+	__s32 shm_lprid;
+	__u32 mlock_uid;
+	__u32 flags;
+	__u32 objref;
+} __attribute__((aligned(8)));
+
 
 #define CKPT_TST_OVERFLOW_16(a, b) \
 	((sizeof(a) > sizeof(b)) && ((a) > SHORT_MAX))
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index f39e1c1..45a0f80 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -50,6 +50,7 @@ struct ckpt_ctx {
 	atomic_t refcount;
 
 	struct ckpt_obj_hash *obj_hash;	/* repository for shared objects */
+	struct deferqueue_head *deferqueue;	/* queue of deferred work */
 
 	struct path fs_mnt;     /* container root (FIXME) */
 
diff --git a/include/linux/shm.h b/include/linux/shm.h
index ec36e99..97972eb 100644
--- a/include/linux/shm.h
+++ b/include/linux/shm.h
@@ -122,6 +122,15 @@ struct ipc_namespace;
 extern int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd,
 		       struct shmid_ds __user *buf, int version);
 
+#ifdef CONFIG_CHECKPOINT
+#ifdef CONFIG_SYSVIPC
+extern int ipcshm_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
+			  struct ckpt_hdr_vma *h);
+#else
+define ipcshm_restart NULL
+#endif
+#endif
+
 #endif /* __KERNEL__ */
 
 #endif /* _LINUX_SHM_H_ */
diff --git a/ipc/Makefile b/ipc/Makefile
index aa6c8dd..7e23683 100644
--- a/ipc/Makefile
+++ b/ipc/Makefile
@@ -9,5 +9,5 @@ obj_mq-$(CONFIG_COMPAT) += compat_mq.o
 obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y)
 obj-$(CONFIG_IPC_NS) += namespace.o
 obj-$(CONFIG_POSIX_MQUEUE_SYSCTL) += mq_sysctl.o
-obj-$(CONFIG_CHECKPOINT) += checkpoint.o
+obj-$(CONFIG_CHECKPOINT) += checkpoint.o checkpoint_shm.o
 
diff --git a/ipc/checkpoint.c b/ipc/checkpoint.c
index 436be5e..25d2277 100644
--- a/ipc/checkpoint.c
+++ b/ipc/checkpoint.c
@@ -102,9 +102,9 @@ int checkpoint_ipc_ns(struct ckpt_ctx *ctx, struct ipc_namespace *ipc_ns)
 	if (ret < 0)
 		return ret;
 
-#if 0 /* NEXT FEW PATCHES */
 	ret = checkpoint_ipc_any(ctx, ipc_ns, IPC_SHM_IDS,
 				 CKPT_HDR_IPC_SHM, checkpoint_ipc_shm);
+#if 0 /* NEXT FEW PATCHES */
 	if (ret < 0)
 		return ret;
 	ret = checkpoint_ipc_any(ctx, ipc_ns, IPC_MSG_IDS,
@@ -214,9 +214,9 @@ static int do_restore_ipc_ns(struct ckpt_ctx *ctx)
 	ipc_ns->sem_ctls[2] = h->sem_ctl_opm;
 	ipc_ns->sem_ctls[3] = h->sem_ctl_mni;
 
-#if 0 /* NEXT FEW PATCHES */
 	ret = restore_ipc_any(ctx, IPC_SHM_IDS,
 			      CKPT_HDR_IPC_SHM, restore_ipc_shm);
+#if 0 /* NEXT FEW PATCHES */
 	if (ret < 0)
 		goto out;
 	ret = ckpt_read_ipc_any(ctx, IPC_MSG_IDS,
diff --git a/ipc/checkpoint_shm.c b/ipc/checkpoint_shm.c
new file mode 100644
index 0000000..713f910
--- /dev/null
+++ b/ipc/checkpoint_shm.c
@@ -0,0 +1,261 @@
+/*
+ *  Checkpoint/restart - dump state of sysvipc shm
+ *
+ *  Copyright (C) 2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DIPC
+
+#include <linux/mm.h>
+#include <linux/shm.h>
+#include <linux/shmem_fs.h>
+#include <linux/hugetlb.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/syscalls.h>
+#include <linux/nsproxy.h>
+#include <linux/ipc_namespace.h>
+#include <linux/deferqueue.h>
+
+#include <linux/msg.h>	/* needed for util.h that uses 'struct msg_msg' */
+#include "util.h"
+
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+
+/************************************************************************
+ * ipc checkpoint
+ */
+
+static int fill_ipc_shm_hdr(struct ckpt_ctx *ctx,
+			    struct ckpt_hdr_ipc_shm *h,
+			    struct shmid_kernel *shp)
+{
+	int ret = 0;
+
+	ipc_lock_by_ptr(&shp->shm_perm);
+
+	ret = checkpoint_fill_ipc_perms(&h->perms, &shp->shm_perm);
+	if (ret < 0)
+		goto unlock;
+
+	h->shm_segsz = shp->shm_segsz;
+	h->shm_atim = shp->shm_atim;
+	h->shm_dtim = shp->shm_dtim;
+	h->shm_ctim = shp->shm_ctim;
+	h->shm_cprid = shp->shm_cprid;
+	h->shm_lprid = shp->shm_lprid;
+
+	if (shp->mlock_user)
+		h->mlock_uid = shp->mlock_user->uid;
+	else
+		h->mlock_uid = (unsigned int) -1;
+
+	h->flags = 0;
+	/* check if shm was setup with SHM_NORESERVE */
+	if (SHMEM_I(shp->shm_file->f_dentry->d_inode)->flags & VM_NORESERVE)
+		h->flags |= SHM_NORESERVE;
+	/* check if shm was setup with SHM_HUGETLB (unsupported yet) */
+	if (is_file_hugepages(shp->shm_file)) {
+		pr_warning("c/r: unsupported SHM_HUGETLB\n");
+		ret = -ENOSYS;
+	}
+
+ unlock:
+	ipc_unlock(&shp->shm_perm);
+	ckpt_debug("shm: cprid %d lprid %d segsz %lld mlock %d\n",
+		 h->shm_cprid, h->shm_lprid, h->shm_segsz, h->mlock_uid);
+
+	return ret;
+}
+
+int checkpoint_ipc_shm(int id, void *p, void *data)
+{
+	struct ckpt_hdr_ipc_shm *h;
+	struct ckpt_ctx *ctx = (struct ckpt_ctx *) data;
+	struct kern_ipc_perm *perm = (struct kern_ipc_perm *) p;
+	struct shmid_kernel *shp;
+	struct inode *inode;
+	int first, objref;
+	int ret;
+
+	shp = container_of(perm, struct shmid_kernel, shm_perm);
+	inode = shp->shm_file->f_dentry->d_inode;
+
+	objref = ckpt_obj_lookup_add(ctx, inode, CKPT_OBJ_INODE, &first);
+	if (objref < 0)
+		return objref;
+	/* this must be the first time we see this region */
+	BUG_ON(!first);
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_IPC_SHM);
+	if (!h)
+		return -ENOMEM;
+
+	ret = fill_ipc_shm_hdr(ctx, h, shp);
+	if (ret < 0)
+		goto out;
+
+	h->objref = objref;
+	ckpt_debug("shm: objref %d\n", h->objref);
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	if (ret < 0)
+		goto out;
+
+	ret = checkpoint_memory_contents(ctx, NULL, inode);
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+/************************************************************************
+ * ipc restart
+ */
+
+struct dq_ipcshm_del {
+	/*
+	 * XXX: always keep ->ipcns first so that put_ipc_ns() can
+	 * be safely provided as the dtor for this deferqueue object
+	 */
+	struct ipc_namespace *ipcns;
+	int id;
+};
+
+static int ipc_shm_delete(void *data)
+{
+	struct dq_ipcshm_del *dq = (struct dq_ipcshm_del *) data;
+	mm_segment_t old_fs;
+	int ret;
+
+	old_fs = get_fs();
+	set_fs(get_ds());
+	ret = shmctl_down(dq->ipcns, dq->id, IPC_RMID, NULL, 0);
+	set_fs(old_fs);
+
+	put_ipc_ns(dq->ipcns);
+	return ret;
+}
+
+static int load_ipc_shm_hdr(struct ckpt_ctx *ctx,
+			    struct ckpt_hdr_ipc_shm *h,
+			    struct shmid_kernel *shp)
+{
+	int ret;
+
+	ret = restore_load_ipc_perms(&h->perms, &shp->shm_perm);
+	if (ret < 0)
+		return ret;
+
+	ckpt_debug("shm: cprid %d lprid %d segsz %lld mlock %d\n",
+		 h->shm_cprid, h->shm_lprid, h->shm_segsz, h->mlock_uid);
+
+	if (h->shm_cprid < 0 || h->shm_lprid < 0)
+		return -EINVAL;
+
+	shp->shm_segsz = h->shm_segsz;
+	shp->shm_atim = h->shm_atim;
+	shp->shm_dtim = h->shm_dtim;
+	shp->shm_ctim = h->shm_ctim;
+	shp->shm_cprid = h->shm_cprid;
+	shp->shm_lprid = h->shm_lprid;
+
+	return 0;
+}
+
+int restore_ipc_shm(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_ipc_shm *h;
+	struct kern_ipc_perm *perms;
+	struct shmid_kernel *shp;
+	struct ipc_ids *shm_ids = ¤t->nsproxy->ipc_ns->ids[IPC_SHM_IDS];
+	struct file *file;
+	int shmflag;
+	int ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_IPC_SHM);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	ret = -EINVAL;
+	if (h->perms.id < 0)
+		goto out;
+
+#define CKPT_SHMFL_MASK  (SHM_NORESERVE | SHM_HUGETLB)
+	if (h->flags & ~CKPT_SHMFL_MASK)
+		goto out;
+
+	ret = -ENOSYS;
+	if (h->mlock_uid != (unsigned int) -1)	/* FIXME: support SHM_LOCK */
+		goto out;
+	if (h->flags & SHM_HUGETLB)	/* FIXME: support SHM_HUGETLB */
+		goto out;
+
+	/*
+	 * SHM_DEST means that the shm is to be deleted after creation.
+	 * However, deleting before it's actually attached is quite silly.
+	 * Instead, we defer this task to until restart has succeeded.
+	 */
+	if (h->perms.mode & SHM_DEST) {
+		struct dq_ipcshm_del dq;
+
+		/* to not confuse the rest of the code */
+		h->perms.mode &= ~SHM_DEST;
+
+		dq.id = h->perms.id;
+		dq.ipcns = current->nsproxy->ipc_ns;
+		get_ipc_ns(dq.ipcns);
+
+		/* XXX can safely use put_ipc_ns() as dtor, see above */
+		ret = deferqueue_add(ctx->deferqueue, &dq, sizeof(dq),
+				     (deferqueue_func_t) ipc_shm_delete,
+				     (deferqueue_func_t) put_ipc_ns);
+		if (ret < 0)
+			goto out;
+	}
+
+	shmflag = h->flags | h->perms.mode | IPC_CREAT | IPC_EXCL;
+	ckpt_debug("shm: do_shmget size %lld flag %#x id %d\n",
+		 h->shm_segsz, shmflag, h->perms.id);
+	ret = do_shmget(h->perms.key, h->shm_segsz, shmflag, h->perms.id);
+	ckpt_debug("shm: do_shmget ret %d\n", ret);
+	if (ret < 0)
+		goto out;
+
+	down_write(&shm_ids->rw_mutex);
+
+	/* we are the sole owners/users of this ipc_ns, it can't go away */
+	perms = ipc_lock(shm_ids, h->perms.id);
+	BUG_ON(IS_ERR(perms));  /* ipc_ns is private to us */
+
+	shp = container_of(perms, struct shmid_kernel, shm_perm);
+	file = shp->shm_file;
+	get_file(file);
+
+	ret = load_ipc_shm_hdr(ctx, h, shp);
+	ipc_unlock(perms);
+	if (ret < 0)
+		goto mutex;
+
+	/* deposit in objhash and read contents in */
+	ret = ckpt_obj_insert(ctx, file, h->objref, CKPT_OBJ_FILE);
+	if (ret < 0)
+		goto mutex;
+	ret = restore_memory_contents(ctx, file->f_dentry->d_inode);
+ mutex:
+	fput(file);
+	if (ret < 0) {
+		ckpt_debug("shm: need to remove (%d)\n", ret);
+		do_shm_rmid(current->nsproxy->ipc_ns, perms);
+	}
+	up_write(&shm_ids->rw_mutex);
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
diff --git a/ipc/shm.c b/ipc/shm.c
index 8aba22f..0ed6a9d 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -40,6 +40,8 @@
 #include <linux/mount.h>
 #include <linux/ipc_namespace.h>
 #include <linux/ima.h>
+#include <linux/checkpoint_hdr.h>
+#include <linux/checkpoint.h>
 
 #include <asm/uaccess.h>
 
@@ -305,6 +307,74 @@ int is_file_shm_hugepages(struct file *file)
 	return ret;
 }
 
+#ifdef CONFIG_CHECKPOINT
+static int ipcshm_checkpoint(struct ckpt_ctx *ctx, struct vm_area_struct *vma)
+{
+	int ino_objref;
+	int first;
+
+	ino_objref = ckpt_obj_lookup_add(ctx, vma->vm_file->f_dentry->d_inode,
+				       CKPT_OBJ_INODE, &first);
+	if (ino_objref < 0)
+		return ino_objref;
+
+	/*
+	 * This shouldn't happen, because all IPC regions should have
+	 * been already dumped by now via ipc namespaces; It means
+	 * the ipc_ns has been modified recently during checkpoint.
+	 */
+	if (first)
+		return -EBUSY;
+
+	return generic_vma_checkpoint(ctx, vma, CKPT_VMA_SHM_IPC_SKIP,
+				      0, ino_objref);
+}
+
+int ipcshm_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
+		   struct ckpt_hdr_vma *h)
+{
+	struct file *file;
+	int shmid, shmflg = 0;
+	mm_segment_t old_fs;
+	unsigned long start;
+	unsigned long addr;
+	int ret;
+
+	if (!h->ino_objref)
+		return -EINVAL;
+	/* FIX: verify the vm_flags too */
+
+	file = ckpt_obj_fetch(ctx, h->ino_objref, CKPT_OBJ_FILE);
+	if (IS_ERR(file))
+		PTR_ERR(file);
+
+	shmid = file->f_dentry->d_inode->i_ino;
+
+	if (!(h->vm_flags & VM_WRITE))
+		shmflg |= SHM_RDONLY;
+
+	/*
+	 * FIX: do_shmat() has limited interface: all-or-nothing
+	 * mapping. If the vma, however, reflects a partial mapping
+	 * then we need to modify that function to accomplish the
+	 * desired outcome.  Partial mapping can exist due to the user
+	 * call shmat() and then unmapping part of the region.
+	 * Currently, we at least detect this and call it a foul play.
+	 */
+	if (((h->vm_end - h->vm_start) != h->ino_size) || h->vm_pgoff)
+		return -ENOSYS;
+
+	old_fs = get_fs();
+	set_fs(get_ds());
+	start = h->vm_start;
+	ret = do_shmat(shmid, (char __user *) start, shmflg, &addr);
+	set_fs(old_fs);
+
+	BUG_ON(ret >= 0 && addr != h->vm_start);
+	return ret;
+}
+#endif
+
 static const struct file_operations shm_file_operations = {
 	.mmap		= shm_mmap,
 	.fsync		= shm_fsync,
@@ -320,6 +390,9 @@ static struct vm_operations_struct shm_vm_ops = {
 	.set_policy = shm_set_policy,
 	.get_policy = shm_get_policy,
 #endif
+#if defined(CONFIG_CHECKPOINT)
+	.checkpoint = ipcshm_checkpoint,
+#endif
 };
 
 /**
diff --git a/ipc/util.h b/ipc/util.h
index 1356909..5a6373f 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -183,7 +183,9 @@ extern int checkpoint_fill_ipc_perms(struct ckpt_hdr_ipc_perms *h,
 				     struct kern_ipc_perm *perm);
 extern int restore_load_ipc_perms(struct ckpt_hdr_ipc_perms *h,
 				  struct kern_ipc_perm *perm);
-#endif
 
+extern int checkpoint_ipc_shm(int id, void *p, void *data);
+extern int restore_ipc_shm(struct ckpt_ctx *ctx);
+#endif
 
 #endif
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 37/43] c/r (ipc): make 'struct msg_msgseg' visible in ipc/util.h
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (29 preceding siblings ...)
  2009-05-27 17:33 ` [RFC v16][PATCH 36/43] c/r: support share-memory sysv-ipc Oren Laadan
@ 2009-05-27 17:33 ` Oren Laadan
  2009-05-27 17:33 ` [RFC v16][PATCH 40/43] c/r: support semaphore sysv-ipc Oren Laadan
                   ` (3 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:33 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
Move the definition of 'struct msg_msgseg' and constants DATALEN_*
to ipc/util.h, where they are visible to ipc/ckpt_msg.c
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
 ipc/msg.c     |    3 +--
 ipc/msgutil.c |    8 --------
 ipc/util.h    |   10 ++++++++++
 3 files changed, 11 insertions(+), 10 deletions(-)
diff --git a/ipc/msg.c b/ipc/msg.c
index 1db7c45..1d5d087 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -72,7 +72,6 @@ struct msg_sender {
 
 #define msg_unlock(msq)		ipc_unlock(&(msq)->q_perm)
 
-static void freeque(struct ipc_namespace *, struct kern_ipc_perm *);
 static int newque(struct ipc_namespace *, struct ipc_params *, int);
 #ifdef CONFIG_PROC_FS
 static int sysvipc_msg_proc_show(struct seq_file *s, void *it);
@@ -278,7 +277,7 @@ static void expunge_all(struct msg_queue *msq, int res)
  * msg_ids.rw_mutex (writer) and the spinlock for this message queue are held
  * before freeque() is called. msg_ids.rw_mutex remains locked on exit.
  */
-static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
+void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
 {
 	struct list_head *tmp;
 	struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm);
diff --git a/ipc/msgutil.c b/ipc/msgutil.c
index f095ee2..e119243 100644
--- a/ipc/msgutil.c
+++ b/ipc/msgutil.c
@@ -36,14 +36,6 @@ struct ipc_namespace init_ipc_ns = {
 
 atomic_t nr_ipc_ns = ATOMIC_INIT(1);
 
-struct msg_msgseg {
-	struct msg_msgseg* next;
-	/* the next part of the message follows immediately */
-};
-
-#define DATALEN_MSG	(PAGE_SIZE-sizeof(struct msg_msg))
-#define DATALEN_SEG	(PAGE_SIZE-sizeof(struct msg_msgseg))
-
 struct msg_msg *load_msg(const void __user *src, int len)
 {
 	struct msg_msg *msg;
diff --git a/ipc/util.h b/ipc/util.h
index 5a6373f..db067b0 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -140,6 +140,14 @@ extern void free_msg(struct msg_msg *msg);
 extern struct msg_msg *load_msg(const void __user *src, int len);
 extern int store_msg(void __user *dest, struct msg_msg *msg, int len);
 
+struct msg_msgseg {
+	struct msg_msgseg *next;
+	/* the next part of the message follows immediately */
+};
+
+#define DATALEN_MSG	(PAGE_SIZE-sizeof(struct msg_msg))
+#define DATALEN_SEG	(PAGE_SIZE-sizeof(struct msg_msgseg))
+
 extern void recompute_msgmni(struct ipc_namespace *);
 
 static inline int ipc_buildid(int id, int seq)
@@ -175,6 +183,8 @@ int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
 
 /* for checkpoint/restart */
 extern int do_shmget(key_t key, size_t size, int shmflg, int req_id);
+extern int do_msgget(key_t key, int msgflg, int req_id);
+extern void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp);
 
 extern void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp);
 
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 40/43] c/r: support semaphore sysv-ipc
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (30 preceding siblings ...)
  2009-05-27 17:33 ` [RFC v16][PATCH 37/43] c/r (ipc): make 'struct msg_msgseg' visible in ipc/util.h Oren Laadan
@ 2009-05-27 17:33 ` Oren Laadan
  2009-05-27 17:33 ` [RFC v16][PATCH 41/43] c/r: (s390): expose a constant for the number of words (CRs) Oren Laadan
                   ` (2 subsequent siblings)
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:33 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan
Checkpoint of sysvipc semaphores is performed by iterating through all
sem objects and dumping the contents of each one. The semaphore array
of each sem is dumped with that object.
The semaphore array (sem->sem_base) holds an array of 'struct sem',
which is a {int, int}. Because this translates into the same format
on 32- and 64-bit architectures, the checkpoint format is simply the
dump of this array as is.
TODO: this patch does not handle semaphore-undo -- this data should be
saved per-task while iterating through the tasks.
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
 include/linux/checkpoint_hdr.h |    8 ++
 ipc/Makefile                   |    3 +-
 ipc/checkpoint.c               |    4 -
 ipc/checkpoint_sem.c           |  220 ++++++++++++++++++++++++++++++++++++++++
 ipc/util.h                     |    5 +
 5 files changed, 235 insertions(+), 5 deletions(-)
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index b05f39c..cd427d8 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -376,6 +376,14 @@ struct ckpt_hdr_ipc_msg_msg {
 	__u32 m_ts;
 } __attribute__((aligned(8)));
 
+struct ckpt_hdr_ipc_sem {
+	struct ckpt_hdr h;
+	struct ckpt_hdr_ipc_perms perms;
+	__u64 sem_otime;
+	__u64 sem_ctime;
+	__u32 sem_nsems;
+} __attribute__((aligned(8)));
+
 
 #define CKPT_TST_OVERFLOW_16(a, b) \
 	((sizeof(a) > sizeof(b)) && ((a) > SHORT_MAX))
diff --git a/ipc/Makefile b/ipc/Makefile
index ca408ff..81af168 100644
--- a/ipc/Makefile
+++ b/ipc/Makefile
@@ -9,5 +9,6 @@ obj_mq-$(CONFIG_COMPAT) += compat_mq.o
 obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y)
 obj-$(CONFIG_IPC_NS) += namespace.o
 obj-$(CONFIG_POSIX_MQUEUE_SYSCTL) += mq_sysctl.o
-obj-$(CONFIG_CHECKPOINT) += checkpoint.o checkpoint_shm.o checkpoint_msg.o
+obj-$(CONFIG_CHECKPOINT) += checkpoint.o \
+			checkpoint_shm.o checkpoint_msg.o checkpoint_sem.o
 
diff --git a/ipc/checkpoint.c b/ipc/checkpoint.c
index 7eece96..f621226 100644
--- a/ipc/checkpoint.c
+++ b/ipc/checkpoint.c
@@ -108,12 +108,10 @@ int checkpoint_ipc_ns(struct ckpt_ctx *ctx, struct ipc_namespace *ipc_ns)
 		return ret;
 	ret = checkpoint_ipc_any(ctx, ipc_ns, IPC_MSG_IDS,
 				 CKPT_HDR_IPC_MSG, checkpoint_ipc_msg);
-#if 0 /* NEXT FEW PATCHES */
 	if (ret < 0)
 		return ret;
 	ret = checkpoint_ipc_any(ctx, ipc_ns, IPC_SEM_IDS,
 				 CKPT_HDR_IPC_SEM, checkpoint_ipc_sem);
-#endif
 	return ret;
 }
 
@@ -220,12 +218,10 @@ static int do_restore_ipc_ns(struct ckpt_ctx *ctx)
 		goto out;
 	ret = restore_ipc_any(ctx, IPC_MSG_IDS,
 			      CKPT_HDR_IPC_MSG, restore_ipc_msg);
-#if 0 /* NEXT FEW PATCHES */
 	if (ret < 0)
 		goto out;
 	ret = restore_ipc_any(ctx, IPC_SEM_IDS,
 			      CKPT_HDR_IPC_SEM, restore_ipc_sem);
-#endif
  out:
 	ckpt_hdr_put(ctx, h);
 	return ret;
diff --git a/ipc/checkpoint_sem.c b/ipc/checkpoint_sem.c
new file mode 100644
index 0000000..34dea40
--- /dev/null
+++ b/ipc/checkpoint_sem.c
@@ -0,0 +1,220 @@
+/*
+ *  Checkpoint/restart - dump state of sysvipc sem
+ *
+ *  Copyright (C) 2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DIPC
+
+#include <linux/mm.h>
+#include <linux/sem.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/nsproxy.h>
+#include <linux/ipc_namespace.h>
+
+#include <linux/msg.h>	/* needed for util.h that uses 'struct msg_msg' */
+#include "util.h"
+
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+
+/************************************************************************
+ * ipc checkpoint
+ */
+
+static int fill_ipc_sem_hdr(struct ckpt_ctx *ctx,
+			       struct ckpt_hdr_ipc_sem *h,
+			       struct sem_array *sem)
+{
+	int ret = 0;
+
+	ipc_lock_by_ptr(&sem->sem_perm);
+
+	ret = checkpoint_fill_ipc_perms(&h->perms, &sem->sem_perm);
+	if (ret < 0)
+		goto unlock;
+
+	h->sem_otime = sem->sem_otime;
+	h->sem_ctime = sem->sem_ctime;
+	h->sem_nsems = sem->sem_nsems;
+
+ unlock:
+	ipc_unlock(&sem->sem_perm);
+	ckpt_debug("sem: nsems %u\n", h->sem_nsems);
+
+	return ret;
+}
+
+/**
+ * ckpt_write_sem_array - dump the state of a semaphore array
+ * @ctx: checkpoint context
+ * @sem: semphore array
+ *
+ * The state of a sempahore is an array of 'struct sem'. This structure
+ * is {int, int}, which translates to the same format {32 bits, 32 bits}
+ * on both 32- and 64-bit architectures. So we simply dump the array.
+ *
+ * The sem-undo information is not saved per ipc_ns, but rather per task.
+ */
+static int checkpoint_sem_array(struct ckpt_ctx *ctx, struct sem_array *sem)
+{
+	/* this is a "best-effort" test, so lock not needed */
+	if (!list_empty(&sem->sem_pending))
+		return -EBUSY;
+
+	/* our caller holds the mutex, so this is safe */
+	return ckpt_write_buffer(ctx, sem->sem_base,
+			       sem->sem_nsems * sizeof(*sem->sem_base));
+}
+
+int checkpoint_ipc_sem(int id, void *p, void *data)
+{
+	struct ckpt_hdr_ipc_sem *h;
+	struct ckpt_ctx *ctx = (struct ckpt_ctx *) data;
+	struct kern_ipc_perm *perm = (struct kern_ipc_perm *) p;
+	struct sem_array *sem;
+	int ret;
+
+	sem = container_of(perm, struct sem_array, sem_perm);
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_IPC_SEM);
+	if (!h)
+		return -ENOMEM;
+
+	ret = fill_ipc_sem_hdr(ctx, h, sem);
+	if (ret < 0)
+		goto out;
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	if (ret < 0)
+		goto out;
+
+	if (h->sem_nsems)
+		ret = checkpoint_sem_array(ctx, sem);
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+/************************************************************************
+ * ipc restart
+ */
+
+static int load_ipc_sem_hdr(struct ckpt_ctx *ctx,
+			       struct ckpt_hdr_ipc_sem *h,
+			       struct sem_array *sem)
+{
+	int ret = 0;
+
+	ret = restore_load_ipc_perms(&h->perms, &sem->sem_perm);
+	if (ret < 0)
+		return ret;
+
+	ckpt_debug("sem: nsems %u\n", h->sem_nsems);
+
+	sem->sem_otime = h->sem_otime;
+	sem->sem_ctime = h->sem_ctime;
+	sem->sem_nsems = h->sem_nsems;
+
+	return 0;
+}
+
+/**
+ * ckpt_read_sem_array - read the state of a semaphore array
+ * @ctx: checkpoint context
+ * @sem: semphore array
+ *
+ * Expect the data in an array of 'struct sem': {32 bit, 32 bit}.
+ * See comment in ckpt_write_sem_array().
+ *
+ * The sem-undo information is not restored per ipc_ns, but rather per task.
+ */
+static struct sem *restore_sem_array(struct ckpt_ctx *ctx, int nsems)
+{
+	struct sem *sma;
+	int i, ret;
+
+	sma = kmalloc(nsems * sizeof(*sma), GFP_KERNEL);
+	ret = _ckpt_read_buffer(ctx, sma, nsems * sizeof(*sma));
+	if (ret < 0)
+		goto out;
+
+	/* validate sem array contents */
+	for (i = 0; i < nsems; i++) {
+		if (sma[i].semval < 0 || sma[i].sempid < 0) {
+			ret = -EINVAL;
+			break;
+		}
+	}
+ out:
+	if (ret < 0) {
+		kfree(sma);
+		sma = ERR_PTR(ret);
+	}
+	return sma;
+}
+
+int restore_ipc_sem(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_ipc_sem *h;
+	struct kern_ipc_perm *perms;
+	struct sem_array *sem;
+	struct sem *sma = NULL;
+	struct ipc_ids *sem_ids = ¤t->nsproxy->ipc_ns->ids[IPC_SEM_IDS];
+	int semflag, ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_IPC_SEM);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	ret = -EINVAL;
+	if (h->perms.id < 0)
+		goto out;
+	if (h->sem_nsems < 0)
+		goto out;
+
+	/* read sempahore array state */
+	sma = restore_sem_array(ctx, h->sem_nsems);
+	if (IS_ERR(sma)) {
+		ret = PTR_ERR(sma);
+		goto out;
+	}
+
+	/* restore the message queue now */
+	semflag = h->perms.mode | IPC_CREAT | IPC_EXCL;
+	ckpt_debug("sem: do_semget key %d flag %#x id %d\n",
+		 h->perms.key, semflag, h->perms.id);
+	ret = do_semget(h->perms.key, h->sem_nsems, semflag, h->perms.id);
+	ckpt_debug("sem: do_msgget ret %d\n", ret);
+	if (ret < 0)
+		goto out;
+
+	down_write(&sem_ids->rw_mutex);
+
+	/* we are the sole owners/users of this ipc_ns, it can't go away */
+	perms = ipc_lock(sem_ids, h->perms.id);
+	BUG_ON(IS_ERR(perms));  /* ipc_ns is private to us */
+
+	sem = container_of(perms, struct sem_array, sem_perm);
+	memcpy(sem->sem_base, sma, sem->sem_nsems * sizeof(*sma));
+
+	ret = load_ipc_sem_hdr(ctx, h, sem);
+	ipc_unlock(perms);
+
+	if (ret < 0) {
+		ckpt_debug("sem: need to remove (%d)\n", ret);
+		freeary(current->nsproxy->ipc_ns, perms);
+	}
+	up_write(&sem_ids->rw_mutex);
+ out:
+	kfree(sma);
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
diff --git a/ipc/util.h b/ipc/util.h
index 347ffb2..020de7b 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -184,6 +184,8 @@ int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
 /* for checkpoint/restart */
 extern int do_shmget(key_t key, size_t size, int shmflg, int req_id);
 extern int do_msgget(key_t key, int msgflg, int req_id);
+extern int do_semget(key_t key, int nsems, int semflg, int req_id);
+
 extern void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp);
 extern void freeary(struct ipc_namespace *, struct kern_ipc_perm *);
 
@@ -200,6 +202,9 @@ extern int restore_ipc_shm(struct ckpt_ctx *ctx);
 
 extern int checkpoint_ipc_msg(int id, void *p, void *data);
 extern int restore_ipc_msg(struct ckpt_ctx *ctx);
+
+extern int checkpoint_ipc_sem(int id, void *p, void *data);
+extern int restore_ipc_sem(struct ckpt_ctx *ctx);
 #endif
 
 #endif
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 41/43] c/r: (s390): expose a constant for the number of words (CRs)
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (31 preceding siblings ...)
  2009-05-27 17:33 ` [RFC v16][PATCH 40/43] c/r: support semaphore sysv-ipc Oren Laadan
@ 2009-05-27 17:33 ` Oren Laadan
  2009-05-27 18:39   ` Alexey Dobriyan
  2009-05-27 17:33 ` [RFC v16][PATCH 42/43] c/r: add CKPT_COPY() macro Oren Laadan
  2009-05-27 17:33 ` [RFC v16][PATCH 43/43] c/r: define s390-specific checkpoint-restart code Oren Laadan
  34 siblings, 1 reply; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:33 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Oren Laadan,
	Dan Smith
We need to use this value in the checkpoint/restart code and would like to
have a constant instead of a magic '3'.
Changelog:
    Mar 30:
            . Add CHECKPOINT_SUPPORT in Kconfig (Nathan Lynch)
    Mar 03:
            . Picked up additional use of magic '3' in ptrace.h
Signed-off-by: Dan Smith <danms@us.ibm.com>
---
 arch/s390/Kconfig |    4 ++++
 1 files changed, 4 insertions(+), 0 deletions(-)
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 2eca5fe..bf62cad 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -49,6 +49,10 @@ config GENERIC_TIME_VSYSCALL
 config GENERIC_CLOCKEVENTS
 	def_bool y
 
+config CHECKPOINT_SUPPORT
+	bool
+	default y if 64BIT
+
 config GENERIC_BUG
 	bool
 	depends on BUG
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * Re: [RFC v16][PATCH 41/43] c/r: (s390): expose a constant for the number of words (CRs)
  2009-05-27 17:33 ` [RFC v16][PATCH 41/43] c/r: (s390): expose a constant for the number of words (CRs) Oren Laadan
@ 2009-05-27 18:39   ` Alexey Dobriyan
  0 siblings, 0 replies; 50+ messages in thread
From: Alexey Dobriyan @ 2009-05-27 18:39 UTC (permalink / raw)
  To: Oren Laadan
  Cc: Andrew Morton, Linus Torvalds, containers, linux-kernel, linux-mm,
	linux-api, Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Dan Smith
On Wed, May 27, 2009 at 01:33:07PM -0400, Oren Laadan wrote:
> We need to use this value in the checkpoint/restart code and would like to
> have a constant instead of a magic '3'.
> 
> Changelog:
>     Mar 30:
>             . Add CHECKPOINT_SUPPORT in Kconfig (Nathan Lynch)
>     Mar 03:
>             . Picked up additional use of magic '3' in ptrace.h
> 
> Signed-off-by: Dan Smith <danms@us.ibm.com>
> ---
>  arch/s390/Kconfig |    4 ++++
>  1 files changed, 4 insertions(+), 0 deletions(-)
> 
> diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
> index 2eca5fe..bf62cad 100644
> --- a/arch/s390/Kconfig
> +++ b/arch/s390/Kconfig
> @@ -49,6 +49,10 @@ config GENERIC_TIME_VSYSCALL
>  config GENERIC_CLOCKEVENTS
>  	def_bool y
>  
> +config CHECKPOINT_SUPPORT
> +	bool
> +	default y if 64BIT
> +
>  config GENERIC_BUG
>  	bool
>  	depends on BUG
Changelog and content aren't compatible.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 50+ messages in thread 
 
- * [RFC v16][PATCH 42/43] c/r: add CKPT_COPY() macro
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (32 preceding siblings ...)
  2009-05-27 17:33 ` [RFC v16][PATCH 41/43] c/r: (s390): expose a constant for the number of words (CRs) Oren Laadan
@ 2009-05-27 17:33 ` Oren Laadan
  2009-05-27 17:33 ` [RFC v16][PATCH 43/43] c/r: define s390-specific checkpoint-restart code Oren Laadan
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:33 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Dan Smith,
	Oren Laadan
From: Dan Smith <danms@us.ibm.com>
As suggested by Dave[1], this provides us a way to make the copy-in and
copy-out processes symmetric.  CKPT_COPY_ARRAY() provides us a way to do
the same thing but for arrays.  It's not critical, but it helps us unify
the checkpoint and restart paths for some things.
Changelog:
    Mar 04:
            . Removed semicolons
            . Added build-time check for __must_be_array in CKPT_COPY_ARRAY
    Feb 27:
            . Changed CKPT_COPY() to use assignment, eliminating the need
              for the CKPT_COPY_BIT() macro
            . Add CKPT_COPY_ARRAY() macro to help copying register arrays,
              etc
            . Move the macro definitions inside the CR #ifdef
    Feb 25:
            . Changed WARN_ON() to BUILD_BUG_ON()
Signed-off-by: Dan Smith <danms@us.ibm.com>
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
1: https://lists.linux-foundation.org/pipermail/containers/2009-February/015821.html (all the way at the bottom)
---
 include/linux/checkpoint.h |   29 +++++++++++++++++++++++++++++
 1 files changed, 29 insertions(+), 0 deletions(-)
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 064dd25..669e90c 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -157,6 +157,34 @@ extern int restore_memory_contents(struct ckpt_ctx *ctx, struct inode *inode);
 	 VM_MAPPED_COPY | VM_INSERTPAGE | VM_MIXEDMAP | VM_SAO)
 
 
+/* useful macros to copy fields and buffers to/from ckpt_hdr_xxx structures */
+#define CKPT_CPT 1
+#define CKPT_RST 2
+
+#define CKPT_COPY(op, SAVE, LIVE)				        \
+	do {							\
+		if (op == CKPT_CPT)				\
+			SAVE = LIVE;				\
+		else						\
+			LIVE = SAVE;				\
+	} while (0)
+
+/*
+ * Copy @count items from @LIVE to @SAVE if op is CKPT_CPT (otherwise,
+ * copy in the reverse direction)
+ */
+#define CKPT_COPY_ARRAY(op, SAVE, LIVE, count)				\
+	do {								\
+		(void)__must_be_array(SAVE);				\
+		(void)__must_be_array(LIVE);				\
+		BUILD_BUG_ON(sizeof(*SAVE) != sizeof(*LIVE));		\
+		if (op == CKPT_CPT)					\
+			memcpy(SAVE, LIVE, count * sizeof(*SAVE));	\
+		else							\
+			memcpy(LIVE, SAVE, count * sizeof(*SAVE));	\
+	} while (0)
+
+
 /* debugging flags */
 #define CKPT_DBASE	0x1		/* anything */
 #define CKPT_DSYS	0x2		/* generic (system) */
@@ -189,6 +217,7 @@ extern unsigned long ckpt_debug_level;
  * CKPT_DBASE is the base flags, doesn't change
  * CKPT_DFLAG is to be redfined in each source file
  */
+
 #define ckpt_debug(fmt, args...)  \
 	_ckpt_debug(CKPT_DBASE | CKPT_DFLAG, fmt, ## args)
 
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread
- * [RFC v16][PATCH 43/43] c/r: define s390-specific checkpoint-restart code
  2009-05-27 17:32 [RFC v16][PATCH 00/43] Kernel based checkpoint/restart Oren Laadan
                   ` (33 preceding siblings ...)
  2009-05-27 17:33 ` [RFC v16][PATCH 42/43] c/r: add CKPT_COPY() macro Oren Laadan
@ 2009-05-27 17:33 ` Oren Laadan
  34 siblings, 0 replies; 50+ messages in thread
From: Oren Laadan @ 2009-05-27 17:33 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, containers, linux-kernel, linux-mm, linux-api,
	Serge Hallyn, Dave Hansen, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Pavel Emelyanov, Alexey Dobriyan, Dan Smith
From: Dan Smith <danms@us.ibm.com>
Implement the s390 arch-specific checkpoint/restart helpers.  This
is on top of Oren Laadan's c/r code.
With these, I am able to checkpoint and restart simple programs as per
Oren's patch intro.  While on x86 I never had to freeze a single task
to checkpoint it, on s390 I do need to.  That is a prereq for consistent
snapshots (esp with multiple processes) anyway so I don't see that as
a problem.
Changelog:
    Apr 11:
            . Introduce ckpt_arch_vdso()
    Feb 27:
            . Add checkpoint_s390.h
            . Fixed up save and restore of PSW, with the non-address bits
              properly masked out
    Feb 25:
            . Make checkpoint_hdr.h safe for inclusion in userspace
            . Replace comment about vsdo code
            . Add comment about restoring access registers
            . Write and read an empty ckpt_hdr_head_arch record to appease
              code (mktree) that expects it to be there
            . Utilize NUM_CKPT_WORDS in checkpoint_hdr.h
    Feb 24:
            . Use CKPT_COPY() to unify the un/loading of cpu and mm state
            . Fix fprs definition in ckpt_hdr_cpu
            . Remove debug WARN_ON() from checkpoint.c
    Feb 23:
            . Macro-ize the un/packing of trace flags
            . Fix the crash when externally-linked
            . Break out the restart functions into restart.c
            . Remove unneeded s390_enable_sie() call
    Jan 30:
            . Switched types in ckpt_hdr_cpu to __u64 etc.
              (Per Oren suggestion)
            . Replaced direct inclusion of structs in
              ckpt_hdr_cpu with the struct members.
              (Per Oren suggestion)
            . Also ended up adding a bunch of new things
              into restart (mm_segment, ksp, etc) in vain
              attempt to get code using fpu to not segfault
              after restart.
Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Signed-off-by: Dan Smith <danms@us.ibm.com>
---
 arch/s390/include/asm/checkpoint_hdr.h |   81 ++++++++++++++
 arch/s390/include/asm/unistd.h         |    4 +-
 arch/s390/kernel/compat_wrapper.S      |   12 ++
 arch/s390/kernel/syscalls.S            |    2 +
 arch/s390/mm/Makefile                  |    1 +
 arch/s390/mm/checkpoint.c              |  183 ++++++++++++++++++++++++++++++++
 arch/s390/mm/checkpoint_s390.h         |   23 ++++
 7 files changed, 305 insertions(+), 1 deletions(-)
diff --git a/arch/s390/include/asm/checkpoint_hdr.h b/arch/s390/include/asm/checkpoint_hdr.h
new file mode 100644
index 0000000..185194b
--- /dev/null
+++ b/arch/s390/include/asm/checkpoint_hdr.h
@@ -0,0 +1,81 @@
+#ifndef __ASM_S390_CKPT_HDR_H
+#define __ASM_S390_CKPT_HDR_H
+/*
+ *  Checkpoint/restart - architecture specific headers s/390
+ *
+ *  Copyright IBM Corp. 2009
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#include <linux/types.h>
+#include <linux/checkpoint_hdr.h>
+#include <asm/ptrace.h>
+
+#ifdef __KERNEL__
+#include <asm/processor.h>
+#else
+#include <sys/user.h>
+#endif
+
+/*
+ * Notes
+ * NUM_GPRS defined in <asm/ptrace.h> to be 16
+ * NUM_FPRS defined in <asm/ptrace.h> to be 16
+ * NUM_APRS defined in <asm/ptrace.h> to be 16
+ * NUM_CR_WORDS defined in <asm/ptrace.h> to be 3
+ */
+struct ckpt_hdr_cpu {
+	struct ckpt_hdr h;
+	__u64 args[1];
+	__u64 gprs[NUM_GPRS];
+	__u64 orig_gpr2;
+	__u16 svcnr;
+	__u16 ilc;
+	__u32 acrs[NUM_ACRS];
+	__u64 ieee_instruction_pointer;
+
+	/* psw_t */
+	__u64 psw_t_mask;
+	__u64 psw_t_addr;
+
+	/* s390_fp_regs_t */
+	__u32 fpc;
+	union {
+		float f;
+		double d;
+		__u64 ui;
+		struct {
+			__u32 fp_hi;
+			__u32 fp_lo;
+		} fp;
+	} fprs[NUM_FPRS];
+
+	/* per_struct */
+	__u64 per_control_regs[NUM_CR_WORDS];
+	__u64 starting_addr;
+	__u64 ending_addr;
+	__u64 address;
+	__u16 perc_atmid;
+	__u8 access_id;
+	__u8 single_step;
+	__u8 instruction_fetch;
+};
+
+struct ckpt_hdr_mm_context {
+	struct ckpt_hdr h;
+	unsigned long vdso_base;
+	int noexec;
+	int has_pgste;
+	int alloc_pgste;
+	unsigned long asce_bits;
+	unsigned long asce_limit;
+};
+
+struct ckpt_hdr_header_arch {
+	struct ckpt_hdr h;
+};
+
+#endif /* __ASM_S390_CKPT_HDR__H */
diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h
index f0f19e6..3d22f17 100644
--- a/arch/s390/include/asm/unistd.h
+++ b/arch/s390/include/asm/unistd.h
@@ -267,7 +267,9 @@
 #define __NR_epoll_create1	327
 #define	__NR_preadv		328
 #define	__NR_pwritev		329
-#define NR_syscalls 330
+#define __NR_checkpoint		330
+#define __NR_restart		331
+#define NR_syscalls 332
 
 /* 
  * There are some system calls that are not present on 64 bit, some
diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S
index fb38af6..ece87c8 100644
--- a/arch/s390/kernel/compat_wrapper.S
+++ b/arch/s390/kernel/compat_wrapper.S
@@ -1823,3 +1823,15 @@ compat_sys_pwritev_wrapper:
 	llgfr	%r5,%r5			# u32
 	llgfr	%r6,%r6			# u32
 	jg	compat_sys_pwritev	# branch to system call
+
+	.globl sys_checkpoint_wrapper
+sys_checkpoint_wrapper:
+	lgfr	%r2,%r2			# pid_t
+	lgfr	%r3,%r3			# int
+	llgfr	%r4,%r4			# unsigned long
+
+	.globl sys_restore_wrapper
+sys_restore_wrapper:
+	lgfr	%r2,%r2			# int
+	lgfr	%r3,%r3			# int
+	llgfr	%r4,%r4			# unsigned long
diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S
index 2c7739f..e755e93 100644
--- a/arch/s390/kernel/syscalls.S
+++ b/arch/s390/kernel/syscalls.S
@@ -338,3 +338,5 @@ SYSCALL(sys_dup3,sys_dup3,sys_dup3_wrapper)
 SYSCALL(sys_epoll_create1,sys_epoll_create1,sys_epoll_create1_wrapper)
 SYSCALL(sys_preadv,sys_preadv,compat_sys_preadv_wrapper)
 SYSCALL(sys_pwritev,sys_pwritev,compat_sys_pwritev_wrapper)
+SYSCALL(sys_checkpoint,sys_checkpoint,sys_checkpoint_wrapper) /* 330 */
+SYSCALL(sys_restart,sys_restart,sys_restore_wrapper)
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile
index 2a74581..d1c3fbf 100644
--- a/arch/s390/mm/Makefile
+++ b/arch/s390/mm/Makefile
@@ -6,3 +6,4 @@ obj-y	 := init.o fault.o extmem.o mmap.o vmem.o pgtable.o
 obj-$(CONFIG_CMM) += cmm.o
 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
 obj-$(CONFIG_PAGE_STATES) += page-states.o
+obj-$(CONFIG_CHECKPOINT) += checkpoint.o
diff --git a/arch/s390/mm/checkpoint.c b/arch/s390/mm/checkpoint.c
new file mode 100644
index 0000000..f97f619
--- /dev/null
+++ b/arch/s390/mm/checkpoint.c
@@ -0,0 +1,183 @@
+/*
+ *  Checkpoint/restart - architecture specific support for s390
+ *
+ *  Copyright IBM Corp. 2009
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#include <linux/kernel.h>
+#include <asm/system.h>
+#include <asm/pgtable.h>
+#include <asm/elf.h>
+
+#include <linux/checkpoint.h>
+#include <asm/checkpoint_hdr.h>
+
+/**************************************************************************
+ * Checkpoint
+ */
+
+static void s390_copy_regs(int op, struct ckpt_hdr_cpu *h,
+			   struct task_struct *t)
+{
+	struct pt_regs *regs = task_pt_regs(t);
+	struct thread_struct *thr = &t->thread;
+
+	/* Save the whole PSW to facilitate forensic debugging, but only
+	 * restore the address portion to avoid letting userspace do
+	 * bad things by manipulating its value.
+	 */
+	if (op == CKPT_CPT) {
+		CKPT_COPY(op, h->psw_t_addr, regs->psw.addr);
+	} else {
+		regs->psw.addr &= ~PSW_ADDR_INSN;
+		regs->psw.addr |= h->psw_t_addr;
+	}
+
+	CKPT_COPY(op, h->args[0], regs->args[0]);
+	CKPT_COPY(op, h->orig_gpr2, regs->orig_gpr2);
+	CKPT_COPY(op, h->svcnr, regs->svcnr);
+	CKPT_COPY(op, h->ilc, regs->ilc);
+	CKPT_COPY(op, h->ieee_instruction_pointer,
+		thr->ieee_instruction_pointer);
+	CKPT_COPY(op, h->psw_t_mask, regs->psw.mask);
+	CKPT_COPY(op, h->fpc, thr->fp_regs.fpc);
+	CKPT_COPY(op, h->starting_addr, thr->per_info.starting_addr);
+	CKPT_COPY(op, h->ending_addr, thr->per_info.ending_addr);
+	CKPT_COPY(op, h->address, thr->per_info.lowcore.words.address);
+	CKPT_COPY(op, h->perc_atmid, thr->per_info.lowcore.words.perc_atmid);
+	CKPT_COPY(op, h->access_id, thr->per_info.lowcore.words.access_id);
+	CKPT_COPY(op, h->single_step, thr->per_info.single_step);
+	CKPT_COPY(op, h->instruction_fetch, thr->per_info.instruction_fetch);
+
+	CKPT_COPY_ARRAY(op, h->gprs, regs->gprs, NUM_GPRS);
+	CKPT_COPY_ARRAY(op, h->fprs, thr->fp_regs.fprs, NUM_FPRS);
+	CKPT_COPY_ARRAY(op, h->acrs, thr->acrs, NUM_ACRS);
+	CKPT_COPY_ARRAY(op, h->per_control_regs,
+		      thr->per_info.control_regs.words.cr, NUM_CR_WORDS);
+}
+
+static void s390_mm(int op, struct ckpt_hdr_mm_context *h,
+		    struct mm_struct *mm)
+{
+	CKPT_COPY(op, h->noexec, mm->context.noexec);
+	CKPT_COPY(op, h->has_pgste, mm->context.has_pgste);
+	CKPT_COPY(op, h->alloc_pgste, mm->context.alloc_pgste);
+	CKPT_COPY(op, h->asce_bits, mm->context.asce_bits);
+	CKPT_COPY(op, h->asce_limit, mm->context.asce_limit);
+}
+
+int checkpoint_thread(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	return 0;
+}
+
+/* dump the cpu state and registers of a given task */
+int checkpoint_cpu(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct ckpt_hdr_cpu *h;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_CPU);
+	if (!h)
+		return -ENOMEM;
+
+	s390_copy_regs(CKPT_CPT, h, t);
+
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+/* Write an empty header since it is assumed to be there */
+int checkpoint_write_header_arch(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_header_arch *h;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_HEADER_ARCH);
+	if (!h)
+		return -ENOMEM;
+
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+int checkpoint_mm_context(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+	struct ckpt_hdr_mm_context *h;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_MM_CONTEXT);
+	if (!h)
+		return -ENOMEM;
+
+	s390_mm(CKPT_CPT, h, mm);
+
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+/**************************************************************************
+ * Restart
+ */
+
+int restore_thread(struct ckpt_ctx *ctx)
+{
+	return 0;
+}
+
+int restore_cpu(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_cpu *h;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_CPU);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	s390_copy_regs(CKPT_RST, h, current);
+
+	/* s390 does not restore the access registers after a syscall,
+	 * but does on a task switch.  Since we're switching tasks (in
+	 * a way), we need to replicate that behavior here.
+	 */
+	restore_access_regs(h->acrs);
+
+	ckpt_hdr_put(ctx, h);
+	return 0;
+}
+
+int restore_read_header_arch(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_header_arch *h;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_HEADER_ARCH);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	ckpt_hdr_put(ctx, h);
+	return 0;
+}
+
+
+int restore_mm_context(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+	struct ckpt_hdr_mm_context *h;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_MM_CONTEXT);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	s390_mm(CKPT_RST, h, mm);
+
+	ckpt_hdr_put(ctx, h);
+	return 0;
+}
diff --git a/arch/s390/mm/checkpoint_s390.h b/arch/s390/mm/checkpoint_s390.h
new file mode 100644
index 0000000..c3bf24d
--- /dev/null
+++ b/arch/s390/mm/checkpoint_s390.h
@@ -0,0 +1,23 @@
+/*
+ *  Checkpoint/restart - architecture specific support for s390
+ *
+ *  Copyright IBM Corp. 2009
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#ifndef _S390_CHECKPOINT_H
+#define _S390_CHECKPOINT_H
+
+#include <linux/checkpoint_hdr.h>
+#include <linux/sched.h>
+#include <linux/mm_types.h>
+
+extern void checkpoint_s390_regs(int op, struct ckpt_hdr_cpu *h,
+				 struct task_struct *t);
+extern void checkpoint_s390_mm(int op, struct ckpt_hdr_mm_context *h,
+			       struct mm_struct *mm);
+
+#endif /* _S390_CHECKPOINT_H */
-- 
1.6.0.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 50+ messages in thread