linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Cyrill Gorcunov <gorcunov@openvz.org>
To: linux-kernel@vger.kernel.org
Cc: Andrew Vagin <avagin@parallels.com>,
	Pavel Emelyanov <xemul@parallels.com>,
	James Bottomley <jbottomley@parallels.com>,
	Glauber Costa <glommer@parallels.com>,
	"H. Peter Anvin" <hpa@zytor.com>, Ingo Molnar <mingo@elte.hu>,
	Tejun Heo <tj@kernel.org>, Dave Hansen <dave@linux.vnet.ibm.com>,
	"Eric W. Biederman" <ebiederm@xmission.com>,
	Daniel Lezcano <dlezcano@fr.ibm.com>,
	Alexey Dobriyan <adobriyan@gmail.com>,
	Cyrill Gorcunov <gorcunov@openvz.org>
Subject: [patch 5/5] elf: Add support for loading ET_CKPT files
Date: Fri, 14 Oct 2011 15:04:21 +0400	[thread overview]
Message-ID: <20111014110511.670174429@openvz.org> (raw)
In-Reply-To: 20111014110416.552685686@openvz.org

[-- Attachment #1: binfmt-elf-for-cr-5 --]
[-- Type: text/plain, Size: 26381 bytes --]

This patch adds the ability to run so-called "checkpoint" files by
extending the Elf file format, which includes

 - new Elf file type ET_CKPT

 - three additional program header types PT_CKPT_VMA, PT_CKPT_CORE
   and PT_CKPT_PAGES.

     PT_CKPT_VMA -- holds a 'vma_entry' structure, which describes the
     memory area the kernel should map. It might also contain a file descriptor,
     in which case the kernel maps the file provided. Usually such a file is
     opened by a user-space helper tool which prepares the 'vma_entry' structure
     for the kernel.

     PT_CKPT_CORE -- a 'core_entry' structure (registers, tls, task-specific
     settings). The structure is defined as a 16K container which should be
     enough for most cases. 8K of it is reserved for arch-specific settings.

     PT_CKPT_PAGES -- the set of all pages whose contents should be restored.

Apart from the Elf extension, flush_old_exec() has been split into two
functions -- the former flush_old_exec() and flush_exec_keep_thread().
The latter doesn't call de_thread(), which allows the thread
relationship to be kept. Also an arch_setup_additional_pages_at() helper
is added to set up the vdso at a predefined address.

At the moment only the pure x86-64 architecture is supported.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Andrew Vagin <avagin@parallels.com>
CC: Pavel Emelyanov <xemul@parallels.com>
CC: James Bottomley <jbottomley@parallels.com>
CC: Glauber Costa <glommer@parallels.com>
CC: H. Peter Anvin <hpa@zytor.com>
CC: Ingo Molnar <mingo@elte.hu>
CC: Tejun Heo <tj@kernel.org>
CC: Dave Hansen <dave@linux.vnet.ibm.com>
CC: Eric W. Biederman <ebiederm@xmission.com>
CC: Daniel Lezcano <dlezcano@fr.ibm.com>
CC: Alexey Dobriyan <adobriyan@gmail.com>
---
 arch/x86/include/asm/elf.h      |    3 
 arch/x86/include/asm/elf_ckpt.h |   80 ++++++++
 arch/x86/kernel/Makefile        |    2 
 arch/x86/kernel/elf_ckpt.c      |  161 ++++++++++++++++++
 arch/x86/vdso/vma.c             |   22 ++
 fs/Kconfig.binfmt               |   11 +
 fs/Makefile                     |    1 
 fs/binfmt_elf.c                 |   17 +
 fs/binfmt_elf_ckpt.c            |  356 ++++++++++++++++++++++++++++++++++++++++
 fs/exec.c                       |   27 +--
 include/linux/binfmts.h         |    1 
 include/linux/elf_ckpt.h        |  103 +++++++++++
 12 files changed, 772 insertions(+), 12 deletions(-)

Index: linux-2.6.git/arch/x86/include/asm/elf.h
===================================================================
--- linux-2.6.git.orig/arch/x86/include/asm/elf.h
+++ linux-2.6.git/arch/x86/include/asm/elf.h
@@ -314,7 +314,8 @@ struct linux_binprm;
 #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
 extern int arch_setup_additional_pages(struct linux_binprm *bprm,
 				       int uses_interp);
-
+extern int arch_setup_additional_pages_at(struct linux_binprm *bprm,
+					  void *addr, int uses_interp);
 extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
 #define compat_arch_setup_additional_pages	syscall32_setup_pages
 
Index: linux-2.6.git/arch/x86/include/asm/elf_ckpt.h
===================================================================
--- /dev/null
+++ linux-2.6.git/arch/x86/include/asm/elf_ckpt.h
@@ -0,0 +1,80 @@
+#ifndef _LINUX_ELF_X86_CHECKPOINT_H
+#define _LINUX_ELF_X86_CHECKPOINT_H
+
+#include <linux/errno.h>
+
+#include <asm/types.h>
+#include <asm/ptrace.h>
+
+#define CKPT_GDT_ENTRY_TLS_ENTRIES	3
+
+struct user_regs_entry {	/* copied field by field into user_regs_struct */
+	__u64	r15;
+	__u64	r14;
+	__u64	r13;
+	__u64	r12;
+	__u64	bp;
+	__u64	bx;
+	__u64	r11;
+	__u64	r10;
+	__u64	r9;
+	__u64	r8;
+	__u64	ax;
+	__u64	cx;
+	__u64	dx;
+	__u64	si;
+	__u64	di;
+	__u64	orig_ax;
+	__u64	ip;
+	__u64	cs;
+	__u64	flags;
+	__u64	sp;
+	__u64	ss;
+	__u64	fs_base;	/* 0: loader skips ARCH_SET_FS */
+	__u64	gs_base;	/* 0: loader skips ARCH_SET_GS */
+	__u64	ds;
+	__u64	es;
+	__u64	fs;
+	__u64	gs;
+} __packed;
+
+struct desc_struct_entry {	/* one descriptor as two 32-bit words */
+	__u32	a;
+	__u32	b;
+} __packed;
+
+struct user_fpregs_entry {	/* copied into user_i387_struct by the loader */
+	__u16	cwd;
+	__u16	swd;
+	__u16	twd;
+	__u16	fop;
+	__u64	rip;
+	__u64	rdp;
+	__u32	mxcsr;
+	__u32	mxcsr_mask;
+	__u32	st_space[32];
+	__u32	xmm_space[64];
+	__u32	padding[24];	/* not copied by the loader */
+} __packed;
+
+struct ckpt_arch_entry {	/* stored in core_entry->arch[] */
+	struct user_regs_entry		gpregs;
+	struct user_fpregs_entry	fpregs;
+	struct desc_struct_entry	tls_array[CKPT_GDT_ENTRY_TLS_ENTRIES];
+};
+
+struct core_entry;
+
+#ifdef CONFIG_X86_64
+extern int load_elf_ckpt_arch(struct task_struct *tsk, struct pt_regs *regs,
+			      struct core_entry *core_entry);
+#else /* !CONFIG_X86_64 */
+static inline int
+load_elf_ckpt_arch(struct task_struct *tsk, struct pt_regs *regs,
+		   struct core_entry *core_entry)
+{
+	return -ENOEXEC;	/* no arch loader outside x86-64 */
+}
+#endif /* CONFIG_X86_64 */
+
+#endif /* _LINUX_ELF_X86_CHECKPOINT_H */
Index: linux-2.6.git/arch/x86/kernel/Makefile
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/Makefile
+++ linux-2.6.git/arch/x86/kernel/Makefile
@@ -99,6 +99,8 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) 
 obj-$(CONFIG_SWIOTLB)			+= pci-swiotlb.o
 obj-$(CONFIG_OF)			+= devicetree.o
 
+obj-$(CONFIG_BINFMT_ELF_CKPT)		+= elf_ckpt.o
+
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
Index: linux-2.6.git/arch/x86/kernel/elf_ckpt.c
===================================================================
--- /dev/null
+++ linux-2.6.git/arch/x86/kernel/elf_ckpt.c
@@ -0,0 +1,161 @@
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/binfmts.h>
+#include <linux/string.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/personality.h>
+#include <linux/elfcore.h>
+#include <linux/init.h>
+#include <linux/highuid.h>
+#include <linux/compiler.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/security.h>
+#include <linux/random.h>
+#include <linux/elf.h>
+#include <linux/utsname.h>
+#include <linux/coredump.h>
+#include <linux/regset.h>
+
+#include <asm/uaccess.h>
+#include <asm/param.h>
+#include <asm/page.h>
+#include <asm/prctl.h>
+#include <asm/proto.h>
+#include <asm/i387.h>
+
+#include <linux/elf_ckpt.h>
+#include <linux/flex_array.h>
+#include <asm/tlbflush.h>
+#include <asm/desc.h>
+
+#ifdef CONFIG_X86_64
+
+#define cp_reg(d, s, r) d.r = s.r	/* copy member r from s to d */
+/* Apply the x86-64 state stored in @core_entry->arch; returns 0 or -errno. */
+int load_elf_ckpt_arch(struct task_struct *tsk, struct pt_regs *regs,
+		       struct core_entry *core_entry)
+{
+	struct ckpt_arch_entry *arch = (struct ckpt_arch_entry *)core_entry->arch;
+	struct thread_struct *thread = &current->thread;
+	/* NOTE(review): @tsk is not referenced; all state goes to current. */
+	struct user_regs_struct gpregs;
+	struct user_i387_struct	fpregs;
+
+	mm_segment_t old_fs;
+	int i, ret;
+
+	if (core_entry->header.arch != CKPT_HEADER_ARCH_X86_64) {
+		pr_err("elf-ckpt-x86: Unsupported or corrupted header\n");
+		return -ENOEXEC;
+	}
+	/* The image layout must match the kernel TLS count and fit arch[]. */
+	BUILD_BUG_ON(CKPT_GDT_ENTRY_TLS_ENTRIES != GDT_ENTRY_TLS_ENTRIES);
+	BUILD_BUG_ON(sizeof(struct ckpt_arch_entry) > CKPT_ARCH_SIZE);
+	/* Zeroed first, so members not copied below stay 0. */
+	memset(&gpregs, 0, sizeof(gpregs));
+	memset(&fpregs, 0, sizeof(fpregs));
+
+	/*
+	 * General purpose registers
+	 */
+	cp_reg(gpregs, arch->gpregs, r15);
+	cp_reg(gpregs, arch->gpregs, r14);
+	cp_reg(gpregs, arch->gpregs, r13);
+	cp_reg(gpregs, arch->gpregs, r12);
+	cp_reg(gpregs, arch->gpregs, bp);
+	cp_reg(gpregs, arch->gpregs, bx);
+	cp_reg(gpregs, arch->gpregs, r11);
+	cp_reg(gpregs, arch->gpregs, r10);
+	cp_reg(gpregs, arch->gpregs, r9);
+	cp_reg(gpregs, arch->gpregs, r8);
+	cp_reg(gpregs, arch->gpregs, ax);
+	cp_reg(gpregs, arch->gpregs, cx);
+	cp_reg(gpregs, arch->gpregs, dx);
+	cp_reg(gpregs, arch->gpregs, si);
+	cp_reg(gpregs, arch->gpregs, di);
+	cp_reg(gpregs, arch->gpregs, orig_ax);
+	cp_reg(gpregs, arch->gpregs, ip);
+	cp_reg(gpregs, arch->gpregs, cs);
+	cp_reg(gpregs, arch->gpregs, flags);
+	cp_reg(gpregs, arch->gpregs, sp);
+	cp_reg(gpregs, arch->gpregs, ss);
+	cp_reg(gpregs, arch->gpregs, fs_base);
+	cp_reg(gpregs, arch->gpregs, gs_base);
+	cp_reg(gpregs, arch->gpregs, ds);
+	cp_reg(gpregs, arch->gpregs, es);
+	cp_reg(gpregs, arch->gpregs, fs);
+	cp_reg(gpregs, arch->gpregs, gs);
+	/* &gpregs is a kernel address; presumably why KERNEL_DS is set here. */
+	old_fs = get_fs();
+	set_fs(KERNEL_DS);
+	ret = arch_ptrace(current, PTRACE_SETREGS, 0, (unsigned long)&gpregs);
+	set_fs(old_fs);
+	if (ret)
+		goto out;
+	/* Hand the register frame of current back to the caller via @regs. */
+	*regs = *task_pt_regs(current);
+	/* Mirror stack pointer and segment values into thread_struct. */
+	thread->usersp	= arch->gpregs.sp;
+	thread->ds	= arch->gpregs.ds;
+	thread->es	= arch->gpregs.es;
+	thread->fs	= arch->gpregs.fs;
+	thread->gs	= arch->gpregs.gs;
+
+	thread->fsindex	= thread->fs;
+	thread->gsindex = thread->gs;
+
+	for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) {
+		thread->tls_array[i].a = arch->tls_array[i].a;
+		thread->tls_array[i].b = arch->tls_array[i].b;
+	}
+	/* A zero base in the image skips the corresponding arch_prctl call. */
+	if (arch->gpregs.fs_base) {
+		ret = do_arch_prctl(current, ARCH_SET_FS, arch->gpregs.fs_base);
+		if (ret)
+			goto out;
+	}
+
+	if (arch->gpregs.gs_base) {
+		ret = do_arch_prctl(current, ARCH_SET_GS, arch->gpregs.gs_base);
+		if (ret)
+			goto out;
+	}
+
+	/* Restoring FPU */
+	if (core_entry->task_flags & PF_USED_MATH) {
+		/* padding[] of the image is not copied, it stays zeroed. */
+		cp_reg(fpregs, arch->fpregs, cwd);
+		cp_reg(fpregs, arch->fpregs, swd);
+		cp_reg(fpregs, arch->fpregs, twd);
+		cp_reg(fpregs, arch->fpregs, fop);
+		cp_reg(fpregs, arch->fpregs, rip);
+		cp_reg(fpregs, arch->fpregs, rdp);
+		cp_reg(fpregs, arch->fpregs, mxcsr);
+		cp_reg(fpregs, arch->fpregs, mxcsr_mask);
+
+		for (i = 0; i < ARRAY_SIZE(arch->fpregs.st_space); i++)
+			cp_reg(fpregs, arch->fpregs, st_space[i]);
+
+		for (i = 0; i < ARRAY_SIZE(arch->fpregs.xmm_space); i++)
+			cp_reg(fpregs, arch->fpregs, xmm_space[i]);
+
+		old_fs = get_fs();
+		set_fs(KERNEL_DS);
+		ret = arch_ptrace(current, PTRACE_SETFPREGS, 0, (unsigned long)&fpregs);
+		set_fs(old_fs);
+		if (ret)
+			goto out;
+	}
+
+out:
+	return ret;
+}
+
+#endif /* CONFIG_X86_64 */
Index: linux-2.6.git/arch/x86/vdso/vma.c
===================================================================
--- linux-2.6.git.orig/arch/x86/vdso/vma.c
+++ linux-2.6.git/arch/x86/vdso/vma.c
@@ -137,6 +137,28 @@ up_fail:
 	return ret;
 }
 
+int arch_setup_additional_pages_at(struct linux_binprm *bprm, void *addr, int uses_interp)
+{
+	/* Map the vDSO at @addr; 0 when it is disabled, else the mapping result. */
+	struct mm_struct *mm = current->mm;
+	int err = 0;
+
+	if (vdso_enabled) {
+		down_write(&mm->mmap_sem);
+		mm->context.vdso = addr;
+		err = install_special_mapping(mm, (unsigned long)addr,
+					      vdso_size,
+					      VM_READ | VM_EXEC | VM_MAYREAD |
+					      VM_MAYWRITE | VM_MAYEXEC |
+					      VM_ALWAYSDUMP, vdso_pages);
+		/* Do not leave a stale vDSO pointer behind on failure. */
+		if (err)
+			mm->context.vdso = NULL;
+		up_write(&mm->mmap_sem);
+	}
+	return err;
+}
+
 static __init int vdso_setup(char *s)
 {
 	vdso_enabled = simple_strtoul(s, NULL, 0);
Index: linux-2.6.git/fs/Kconfig.binfmt
===================================================================
--- linux-2.6.git.orig/fs/Kconfig.binfmt
+++ linux-2.6.git/fs/Kconfig.binfmt
@@ -23,6 +23,17 @@ config BINFMT_ELF
 	  ld.so (check the file <file:Documentation/Changes> for location and
 	  latest version).
 
+config BINFMT_ELF_CKPT
+	bool "Kernel support for CKPT ELF binaries"
+	default n
+	depends on BINFMT_ELF && X86_64
+	help
+	  ELF CKPT (checkpoint) is an extension to ELF format to restore
+	  checkpointed processes. It's not confirmed yet and highly
+	  experimental.
+
+	  If unsure, say N.
+
 config COMPAT_BINFMT_ELF
 	bool
 	depends on COMPAT && BINFMT_ELF
Index: linux-2.6.git/fs/Makefile
===================================================================
--- linux-2.6.git.orig/fs/Makefile
+++ linux-2.6.git/fs/Makefile
@@ -37,6 +37,7 @@ obj-$(CONFIG_BINFMT_MISC)	+= binfmt_misc
 obj-y				+= binfmt_script.o
 
 obj-$(CONFIG_BINFMT_ELF)	+= binfmt_elf.o
+obj-$(CONFIG_BINFMT_ELF_CKPT)	+= binfmt_elf_ckpt.o
 obj-$(CONFIG_COMPAT_BINFMT_ELF)	+= compat_binfmt_elf.o
 obj-$(CONFIG_BINFMT_ELF_FDPIC)	+= binfmt_elf_fdpic.o
 obj-$(CONFIG_BINFMT_SOM)	+= binfmt_som.o
Index: linux-2.6.git/fs/binfmt_elf.c
===================================================================
--- linux-2.6.git.orig/fs/binfmt_elf.c
+++ linux-2.6.git/fs/binfmt_elf.c
@@ -30,6 +30,7 @@
 #include <linux/security.h>
 #include <linux/random.h>
 #include <linux/elf.h>
+#include <linux/elf_ckpt.h>
 #include <linux/utsname.h>
 #include <linux/coredump.h>
 #include <asm/uaccess.h>
@@ -592,7 +593,11 @@ static int load_elf_binary(struct linux_
 	if (memcmp(loc->elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
 		goto out;
 
-	if (loc->elf_ex.e_type != ET_EXEC && loc->elf_ex.e_type != ET_DYN)
+	if (loc->elf_ex.e_type != ET_EXEC &&
+#ifdef CONFIG_BINFMT_ELF_CKPT
+	    loc->elf_ex.e_type != ET_CKPT  &&
+#endif
+	    loc->elf_ex.e_type != ET_DYN)
 		goto out;
 	if (!elf_check_arch(&loc->elf_ex))
 		goto out;
@@ -619,6 +624,16 @@ static int load_elf_binary(struct linux_
 		goto out_free_ph;
 	}
 
+#ifdef CONFIG_BINFMT_ELF_CKPT
+	if (loc->elf_ex.e_type == ET_CKPT) {
+		retval = load_elf_ckpt(bprm, regs, &loc->elf_ex,
+				       (struct elf_phdr *)elf_phdata);
+		if (!retval)
+			set_binfmt(&elf_format);
+		goto out_free_ph;
+	}
+#endif
+
 	elf_ppnt = elf_phdata;
 	elf_bss = 0;
 	elf_brk = 0;
Index: linux-2.6.git/fs/binfmt_elf_ckpt.c
===================================================================
--- /dev/null
+++ linux-2.6.git/fs/binfmt_elf_ckpt.c
@@ -0,0 +1,356 @@
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/binfmts.h>
+#include <linux/string.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/personality.h>
+#include <linux/elfcore.h>
+#include <linux/init.h>
+#include <linux/highuid.h>
+#include <linux/compiler.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/security.h>
+#include <linux/random.h>
+#include <linux/elf.h>
+#include <linux/utsname.h>
+#include <linux/coredump.h>
+#include <linux/regset.h>
+
+#include <asm/uaccess.h>
+#include <asm/param.h>
+#include <asm/page.h>
+#include <asm/prctl.h>
+#include <asm/proto.h>
+#include <asm/i387.h>
+
+#include <linux/elf_ckpt.h>
+#include <asm/elf_ckpt.h>
+
+#include <linux/flex_array.h>
+#include <asm/tlbflush.h>
+#include <asm/desc.h>
+
+/*
+ * load_elf_ckpt - replace the current image with an ET_CKPT checkpoint
+ *
+ * Reads the PT_CKPT_CORE/PT_CKPT_VMA/PT_CKPT_PAGES segments of bprm->file,
+ * maps every regular VMA, fills in the saved pages and restores registers
+ * via load_elf_ckpt_arch().  Returns 0 or a negative errno; a failure after
+ * the old image was flushed also sends SIGKILL to current (see out_unmap).
+ */
+int load_elf_ckpt(struct linux_binprm *bprm, struct pt_regs *regs,
+		  struct elfhdr *elf_ex, struct elf_phdr *elf_phdr)
+{
+	struct elf_phdr *elf_phdr_pages = NULL;
+	struct flex_array *fa = NULL;
+	struct vma_entry *vma_entry_ptr;
+	int nr_vma_found = 0, nr_vma_mapped = 0;
+	struct vma_entry vma_entry;
+	struct file *file = NULL;
+	unsigned long map_addr;
+
+#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
+	unsigned long vdso = -1UL;
+#endif
+
+	struct core_entry *core_entry = NULL;
+	unsigned long start_stack = -1UL;
+
+	int i, ret = -ENOEXEC;
+	loff_t off;
+
+	BUILD_BUG_ON(CKPT_TASK_COMM_LEN != TASK_COMM_LEN);
+	BUILD_BUG_ON(CKPT_PAGE_SIZE != PAGE_SIZE);
+	BUILD_BUG_ON(CKPT_CORE_SIZE != sizeof(*core_entry));
+
+	/*
+	 * An early check for header version so if we fail here
+	 * we would not need to use flex array at all.
+	 */
+	for (i = 0; i < elf_ex->e_phnum; i++) {
+		if (elf_phdr[i].p_type != PT_CKPT_CORE)
+			continue;
+
+		core_entry = vmalloc(sizeof(*core_entry));
+		if (!core_entry) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		ret = kernel_read(bprm->file, elf_phdr[i].p_offset,
+				  (char *)core_entry, sizeof(*core_entry));
+		if (ret != sizeof(*core_entry)) {
+			pr_err("elf-ckpt: Can't read core_entry\n");
+			ret = -EIO;
+			goto out;
+		}
+
+		if (core_entry->header.version != CKPT_HEADER_VERSION) {
+			pr_err("elf-ckpt: Unsupported or corrupted header\n");
+			ret = -ENOEXEC;
+			goto out;
+		}
+
+		break;
+	}
+
+	if (i == elf_ex->e_phnum) {
+		pr_err("elf-ckpt: No header found\n");
+		ret = -ENOEXEC;
+		goto out;
+	}
+
+	/* One slot per program header is enough for every PT_CKPT_VMA. */
+	fa = flex_array_alloc(sizeof(vma_entry), elf_ex->e_phnum, GFP_KERNEL);
+	if (!fa || flex_array_prealloc(fa, 0, elf_ex->e_phnum, GFP_KERNEL)) {
+		ret = -ENOMEM;
+		goto out;	/* "out" frees fa when it was allocated */
+	}
+
+	ret = flush_exec_keep_thread(bprm);
+	if (ret)
+		goto out;
+
+	current->flags &= ~PF_FORKNOEXEC;
+	current->mm->def_flags = 0;
+
+	/*
+	 * We don't care about parameters passed (such as argc, argv, env)
+	 * when executing a checkpoint file because we're going to substitute
+	 * everything anyway.
+	 */
+	down_write(&current->mm->mmap_sem);
+	do_munmap(current->mm, 0, TASK_SIZE);
+	up_write(&current->mm->mmap_sem);
+
+	SET_PERSONALITY(*elf_ex);
+
+	for (i = 0; i < elf_ex->e_phnum; i++) {
+		/* Collect VMA entries densely: slot k holds the k-th one found */
+		switch (elf_phdr[i].p_type) {
+		case PT_CKPT_VMA:
+			ret = kernel_read(bprm->file, elf_phdr[i].p_offset,
+					  (char *)&vma_entry, sizeof(vma_entry));
+			if (ret != sizeof(vma_entry)) {
+				pr_err("elf-ckpt: Can't read vma_entry\n");
+				ret = -EIO;
+				goto out_unmap;
+			}
+			if (flex_array_put(fa, nr_vma_found, &vma_entry, GFP_KERNEL))
+				BUG();
+
+			/* We need to know if there is executable stack */
+			if (vma_entry.status & VMA_AREA_STACK) {
+				if (vma_entry.prot & PROT_EXEC)
+					current->personality |= READ_IMPLIES_EXEC;
+			}
+
+			nr_vma_found++;
+			continue;
+		case PT_CKPT_PAGES:
+			elf_phdr_pages = &elf_phdr[i];
+			continue;
+		default:
+			continue;
+		}
+	}
+
+	/* Be sure it has the file structure we expected to see. */
+	if (!elf_phdr_pages || !nr_vma_found) {
+		ret = -ENOEXEC;
+		goto out_unmap;
+	}
+
+	/*
+	 * VMA randomization still needs to be set (just in case if
+	 * the program we restore will exec() something else later).
+	 */
+	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
+		current->flags |= PF_RANDOMIZE;
+
+	/*
+	 * FIXME: Note it flushes signal handlers as well,
+	 * so we need to dump queued signals and restore
+	 * them here.
+	 */
+	setup_new_exec(bprm);
+
+	current->mm->free_area_cache = current->mm->mmap_base;
+	current->mm->cached_hole_size = 0;
+
+	for (i = 0; i < nr_vma_found; i++) {
+		vma_entry_ptr = flex_array_get(fa, i);
+
+#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
+		if (vma_entry_ptr->status & VMA_AREA_VDSO)
+			vdso = vma_entry_ptr->start;
+#endif
+
+		if (vma_entry_ptr->status & VMA_AREA_STACK) {
+			/* Note if stack is VM_GROWSUP -- it should be reversed */
+			start_stack = vma_entry_ptr->start;
+		}
+
+		/* Anything special should be ignored */
+		if (!(vma_entry_ptr->status & VMA_AREA_REGULAR))
+			continue;
+
+		/* It's a file mmap'ed */
+		if (vma_entry_ptr->fd != -1) {
+			file = fget((unsigned int)vma_entry_ptr->fd);
+			if (!file) {
+				ret = -EBADF;
+				goto out_unmap;
+			}
+		} else
+			file = NULL;
+
+		down_write(&current->mm->mmap_sem);
+		map_addr = do_mmap(file,
+				   vma_entry_ptr->start,
+				   vma_entry_ptr->end - vma_entry_ptr->start,
+				   vma_entry_ptr->prot,
+				   vma_entry_ptr->flags | MAP_FIXED,
+				   vma_entry_ptr->pgoff);
+		up_write(&current->mm->mmap_sem);
+
+		if (file) {
+			/* ->fd is still the descriptor number, close it */
+			fput(file);
+			do_close((unsigned int)vma_entry_ptr->fd);
+		}
+
+		if ((unsigned long)(map_addr) >= TASK_SIZE) {
+			ret = IS_ERR((void *)map_addr) ? PTR_ERR((void*)map_addr) : -EINVAL;
+			goto out_unmap;
+		}
+
+		nr_vma_mapped++;
+	}
+
+#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
+	if (vdso == -1UL) {
+		pr_err("elf-ckpt: Can't find VDSO address\n");
+		ret = -ENOEXEC;
+		goto out_unmap;
+	}
+#endif
+
+	if (start_stack == -1UL) {
+		pr_err("elf-ckpt: Can't find stack VMA\n");
+		ret = -ENOEXEC;
+		goto out_unmap;
+	}
+
+	/* The name it has before */
+	set_task_comm(current, core_entry->task_comm);
+
+	bprm->p = core_entry->mm_start_stack;
+
+	current->mm->start_code		= core_entry->mm_start_code;
+	current->mm->end_code		= core_entry->mm_end_code;
+	current->mm->start_data		= core_entry->mm_start_data;
+	current->mm->end_data		= core_entry->mm_end_data;
+	current->mm->start_stack	= core_entry->mm_start_stack;
+	current->mm->start_brk		= core_entry->mm_start_brk;
+	current->mm->brk		= core_entry->mm_brk;
+
+#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
+	ret = arch_setup_additional_pages_at(bprm, (void *)vdso, 0);
+	if (ret) {
+		pr_err("elf-ckpt: Can't setup additional pages at %lx with %d\n",
+			vdso, ret);
+		goto out_unmap;
+	}
+#endif
+
+	/*
+	 * Restore pages: { __u64 va; page data } records, va == 0 ends them
+	 */
+	off = elf_phdr_pages->p_offset;
+	while (1) {
+		struct vm_area_struct *vma;
+		struct page *page;
+		void *page_data;
+		__u64 va;
+
+		ret = kernel_read(bprm->file, off, (char *)&va, sizeof(va));
+		if (ret != sizeof(va)) {
+			pr_err("elf-ckpt: Can't read page virtual address: "
+			       "ret = %d off = %lx\n", ret, (unsigned long)off);
+			ret = -EIO;
+			goto out_unmap;
+		}
+
+		/* End of pages reached */
+		if (!va)
+			break;
+
+		vma = find_vma(current->mm, (unsigned long)va);
+		if (!vma) {
+			pr_err("elf-ckpt: No VMA for page: %16lx\n", (unsigned long)va);
+			ret = -ESRCH;
+			goto out_unmap;
+		}
+
+		ret = get_user_pages(current, current->mm, (unsigned long)va,
+				     1, 1, 1, &page, NULL);
+		if (ret != 1) {
+			pr_err("elf-ckpt: Can't get user page: %16lx\n", (unsigned long)va);
+			ret = -EFAULT;
+			goto out_unmap;
+		}
+
+		page_data = kmap(page);
+		ret = kernel_read(bprm->file, off + sizeof(va), page_data, PAGE_SIZE);
+		kunmap(page);
+		put_page(page);
+
+		if (ret != PAGE_SIZE) {
+			pr_err("elf-ckpt: Can't read data on page: %16lx\n", (unsigned long)va);
+			ret = -EFAULT;
+			goto out_unmap;
+		}
+
+		off += sizeof(va) + PAGE_SIZE;
+	}
+
+	/*
+	 * Architecture specific setup for registers and friends is done
+	 * last: if anything failed earlier there is no point in doing it.
+	 */
+	ret = load_elf_ckpt_arch(current, regs, core_entry);
+	if (ret)
+		goto out_unmap;
+
+	/* We're done */
+	ret = 0;
+out:
+	if (core_entry)
+		vfree(core_entry);
+	if (fa)
+		flex_array_free(fa);
+	return ret;
+
+out_unmap:
+	/* Undo only what was mapped: the first nr_vma_mapped regular VMAs. */
+	for (i = 0; i < nr_vma_found && nr_vma_mapped; i++) {
+		vma_entry_ptr = flex_array_get(fa, i);
+		if (!(vma_entry_ptr->status & VMA_AREA_REGULAR))
+			continue;
+		nr_vma_mapped--;
+		down_write(&current->mm->mmap_sem);
+		do_munmap(current->mm, vma_entry_ptr->start,
+			  vma_entry_ptr->end - vma_entry_ptr->start);
+		up_write(&current->mm->mmap_sem);
+	}
+	send_sig(SIGKILL, current, 0);
+	goto out;
+}
Index: linux-2.6.git/fs/exec.c
===================================================================
--- linux-2.6.git.orig/fs/exec.c
+++ linux-2.6.git/fs/exec.c
@@ -1071,18 +1071,10 @@ void set_task_comm(struct task_struct *t
 	perf_event_comm(tsk);
 }
 
-int flush_old_exec(struct linux_binprm * bprm)
+int flush_exec_keep_thread(struct linux_binprm * bprm)
 {
 	int retval;
 
-	/*
-	 * Make sure we have a private signal table and that
-	 * we are unassociated from the previous thread group.
-	 */
-	retval = de_thread(current);
-	if (retval)
-		goto out;
-
 	set_mm_exe_file(bprm->mm, bprm->file);
 
 	/*
@@ -1101,10 +1093,25 @@ int flush_old_exec(struct linux_binprm *
 	current->personality &= ~bprm->per_clear;
 
 	return 0;
-
 out:
 	return retval;
 }
+EXPORT_SYMBOL(flush_exec_keep_thread);
+
+int flush_old_exec(struct linux_binprm * bprm)
+{
+	/*
+	 * de_thread() comes first: it makes sure we have a private signal
+	 * table and are unassociated from the previous thread group.  Its
+	 * error, if any, is returned unchanged; otherwise the rest of the
+	 * flush is delegated to flush_exec_keep_thread().
+	 */
+	int err = de_thread(current);
+
+	if (!err)
+		err = flush_exec_keep_thread(bprm);
+	return err;
+}
 EXPORT_SYMBOL(flush_old_exec);
 
 void would_dump(struct linux_binprm *bprm, struct file *file)
Index: linux-2.6.git/include/linux/binfmts.h
===================================================================
--- linux-2.6.git.orig/include/linux/binfmts.h
+++ linux-2.6.git/include/linux/binfmts.h
@@ -110,6 +110,7 @@ extern int prepare_binprm(struct linux_b
 extern int __must_check remove_arg_zero(struct linux_binprm *);
 extern int search_binary_handler(struct linux_binprm *, struct pt_regs *);
 extern int flush_old_exec(struct linux_binprm * bprm);
+extern int flush_exec_keep_thread(struct linux_binprm * bprm);
 extern void setup_new_exec(struct linux_binprm * bprm);
 extern void would_dump(struct linux_binprm *, struct file *);
 
Index: linux-2.6.git/include/linux/elf_ckpt.h
===================================================================
--- /dev/null
+++ linux-2.6.git/include/linux/elf_ckpt.h
@@ -0,0 +1,103 @@
+#ifndef _LINUX_ELF_CHECKPOINT_H
+#define _LINUX_ELF_CHECKPOINT_H
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <linux/elf-em.h>
+
+#include <asm/elf.h>
+#include <asm/elf_ckpt.h>
+
+/*
+ * Elf extension includes new Elf file type
+ * and program header types as well.
+ */
+#define ET_CKPT				5
+
+#define PT_CKPT_OFFSET			0x01010101
+
+#define PT_CKPT_VMA			(PT_LOOS + PT_CKPT_OFFSET + 1)
+#define PT_CKPT_CORE			(PT_LOOS + PT_CKPT_OFFSET + 2)
+#define PT_CKPT_PAGES			(PT_LOOS + PT_CKPT_OFFSET + 3)
+
+#define CKPT_PAGE_SIZE			4096
+#define CKPT_TASK_COMM_LEN		16
+
+#define CKPT_HEADER_VERSION		1
+#define CKPT_HEADER_ARCH_X86_64		1
+
+#define VMA_AREA_REGULAR		(1 <<  0)
+#define VMA_AREA_STACK			(1 <<  1)
+#define VMA_AREA_VSYSCALL		(1 <<  2)
+#define VMA_AREA_VDSO			(1 <<  3)
+#define VMA_FORCE_READ			(1 <<  4)
+#define VMA_AREA_HEAP			(1 <<  5)
+#define VMA_FILE_PRIVATE		(1 <<  6)
+#define VMA_FILE_SHARED			(1 <<  7)
+#define VMA_ANON_SHARED			(1 <<  8)
+#define VMA_ANON_PRIVATE		(1 <<  9)
+#define VMA_FORCE_WRITE			(1 << 10)
+
+struct vma_entry {
+	__u64	start;			/* address the area is mapped at */
+	__u64	end;			/* mapping length is end - start */
+	__u64	pgoff;			/* passed to do_mmap() as is */
+	__u32	prot;			/* passed to do_mmap() as is */
+	__u32	flags;			/* loader ORs in MAP_FIXED */
+	__u32	status;			/* from VMA_x above */
+	__u32	pid;			/* pid VMA belongs to */
+	__s64	fd;			/* -1: no file, else an open fd */
+	__u64	ino;
+	__u32	dev_maj;
+	__u32	dev_min;
+} __packed;
+
+struct page_entry {
+	__u64	va;			/* page virtual address, 0 ends the list */
+	__u8	data[CKPT_PAGE_SIZE];	/* page contents */
+} __packed;
+
+struct image_header {
+	__u16	version;		/* loader requires CKPT_HEADER_VERSION */
+	__u16	arch;			/* CKPT_HEADER_ARCH_x */
+	__u32	flags;
+} __packed;
+
+#define CKPT_ARCH_SIZE		(2 * 4096)
+#define CKPT_CORE_SIZE		(4 * 4096)
+
+struct core_entry {
+  union {
+    struct {
+	struct image_header		header;
+	__u8				arch[CKPT_ARCH_SIZE];	/* should be enough for all archs */
+	__u32				task_personality;
+	__u8				task_comm[CKPT_TASK_COMM_LEN];	/* given to set_task_comm() */
+	__u32				task_flags;	/* PF_USED_MATH selects FPU restore */
+	__u64				mm_start_code;	/* mm_x: copied into current->mm */
+	__u64				mm_end_code;
+	__u64				mm_start_data;
+	__u64				mm_end_data;
+	__u64				mm_start_stack;	/* also becomes bprm->p */
+	__u64				mm_start_brk;
+	__u64				mm_brk;
+    };
+    __u8				__core_pad[CKPT_CORE_SIZE];	/* pads to CKPT_CORE_SIZE */
+  };
+} __packed;
+
+#ifdef CONFIG_BINFMT_ELF_CKPT
+extern int load_elf_ckpt(struct linux_binprm *bprm, struct pt_regs *regs,
+			 struct elfhdr *elf_ex, struct elf_phdr *elf_phdr);
+#else /* !CONFIG_BINFMT_ELF_CKPT */
+static inline int load_elf_ckpt(struct linux_binprm *bprm, struct pt_regs *regs,
+				struct elfhdr *elf_ex, struct elf_phdr *elf_phdr)
+{
+	return -ENOEXEC;	/* checkpoint loading not built in */
+}
+#endif /* CONFIG_BINFMT_ELF_CKPT */
+
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_ELF_CHECKPOINT_H */


  parent reply	other threads:[~2011-10-14 11:05 UTC|newest]

Thread overview: 33+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-10-14 11:04 [patch 0/5] [RFC] Checkpoint/restore and Elf extension Cyrill Gorcunov
2011-10-14 11:04 ` [patch 1/5] proc: Introduce the Children: line in /proc/<pid>/status Cyrill Gorcunov
2011-10-14 16:36   ` Tejun Heo
2011-10-14 11:04 ` [patch 2/5] fs: Add do_close helper Cyrill Gorcunov
2011-10-14 11:04 ` [patch 3/5] fs, proc: Add /proc/$pid/tls entry Cyrill Gorcunov
2011-10-14 16:40   ` Tejun Heo
2011-10-14 16:43     ` Cyrill Gorcunov
2011-10-14 11:04 ` [patch 4/5] fs, proc: Add start_data, end_data, start_brk members to /proc/$pid/stat Cyrill Gorcunov
2011-10-14 11:04 ` Cyrill Gorcunov [this message]
2011-10-14 17:10   ` [patch 5/5] elf: Add support for loading ET_CKPT files Tejun Heo
2011-10-14 17:33     ` Tejun Heo
2011-10-19  9:03       ` Pavel Emelyanov
2011-10-19 18:22         ` Tejun Heo
2011-10-19 18:49           ` Cyrill Gorcunov
2011-10-19 18:52             ` Cyrill Gorcunov
2011-10-19 18:53               ` Tejun Heo
2011-10-19 19:56           ` Cyrill Gorcunov
2011-10-21 18:26             ` Tejun Heo
2011-10-21 18:36               ` Cyrill Gorcunov
2011-10-21 18:42                 ` Cyrill Gorcunov
2011-10-21 18:48                   ` Tejun Heo
2011-10-21 18:53                     ` Cyrill Gorcunov
2011-10-22  6:34                     ` Pavel Emelyanov
2011-10-20  8:33           ` Pavel Emelyanov
2011-10-20 15:56             ` Tejun Heo
2011-10-20 16:04               ` Cyrill Gorcunov
2011-10-20 17:30               ` Pavel Emelyanov
2011-10-15 18:59     ` Cyrill Gorcunov
2011-10-21 11:06     ` Glauber Costa
2011-10-21 11:20       ` Cyrill Gorcunov
2011-10-21 11:21         ` Glauber Costa
2011-10-21 11:35           ` Cyrill Gorcunov
2011-10-22 16:49     ` Dan Merillat

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20111014110511.670174429@openvz.org \
    --to=gorcunov@openvz.org \
    --cc=adobriyan@gmail.com \
    --cc=avagin@parallels.com \
    --cc=dave@linux.vnet.ibm.com \
    --cc=dlezcano@fr.ibm.com \
    --cc=ebiederm@xmission.com \
    --cc=glommer@parallels.com \
    --cc=hpa@zytor.com \
    --cc=jbottomley@parallels.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=tj@kernel.org \
    --cc=xemul@parallels.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).