All of lore.kernel.org
 help / color / mirror / Atom feed
* c/r: support for x86-64 arch
@ 2009-12-06 20:31 Oren Laadan
       [not found] ` <1260131469-2917-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Oren Laadan @ 2009-12-06 20:31 UTC (permalink / raw)
  To: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
  Cc: Alexey Dobriyan, Louis Rilling, Dave Hansen

The following patches add experimental support for x86-64 arch. The
code is based on Alexey's submission from a while ago.
                                                                     
The basic case of 64bit process checkpoint/restart works. Other cases
such as 32bit processes checkpoint/restart on 64->64, 32->64 and also
64->32 are not tested. Nor is self-checkpoint.
                                                                    
Being far from an expert on x86-64, I collected bits and pieces from
other places in the kernel - so this needs a serious review, including:

- How load_cpu_regs() restores the task's current state - I tried to
 follow similar work done by context switch code

- For self-checkpoint make sure we get the correct "running" state
 from current registers (e.g. segments), not from ptregs.
                                                                    
The first patch relocates and splits current x86-32 code. The second
patch adds support for x86-64. The third patch provides the user-cr
eclone() wrapper based on Dave and Louis's work.
     
Oren.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 1/2] c/r: [x86_32] sys_restore to use ptregs prototype
       [not found] ` <1260131469-2917-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-12-06 20:31   ` Oren Laadan
       [not found]     ` <1260131469-2917-2-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Oren Laadan @ 2009-12-06 20:31 UTC (permalink / raw)
  To: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
  Cc: Alexey Dobriyan, Louis Rilling, Dave Hansen

Similar to other select syscalls (fork, clone, execve), sys_restart
needs to access the pt_regs structure, so that it can modify it to
restore the original state from the time of the checkpoint.

(This is less of an issue for x86-32, but it is required for those
architectures that otherwise save/restore only partial state (e.g. not
all registers) during syscall entry/exit, like x86-64.)

This patch prepares to support c/r on x86-64, specifically:

* Change the syscall prototype and definition to accept the pt_regs
  struct as an argument (passed in the %eax register).

* Move arch/x86/mm/checkpoint*.c to arch/x86/kernel/...

* Split 32bit-dependent part of arch/x86/kernel/checkpoint.c into a
  new arch/x86/kernel/checkpoint_32.c

Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 arch/x86/include/asm/syscalls.h      |    5 +
 arch/x86/kernel/Makefile             |    8 +
 arch/x86/{mm => kernel}/checkpoint.c |  293 +++++++++-------------------------
 arch/x86/kernel/checkpoint_32.c      |  191 ++++++++++++++++++++++
 arch/x86/kernel/entry_32.S           |    3 +
 arch/x86/kernel/syscall_table_32.S   |    2 +-
 arch/x86/mm/Makefile                 |    2 -
 checkpoint/sys.c                     |    5 +-
 include/linux/checkpoint.h           |    2 +
 include/linux/syscalls.h             |    2 -
 10 files changed, 288 insertions(+), 225 deletions(-)
 rename arch/x86/{mm => kernel}/checkpoint.c (77%)
 create mode 100644 arch/x86/kernel/checkpoint_32.c

diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 2cadb8e..1079447 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -43,6 +43,11 @@ int sys_clone(struct pt_regs *);
 int sys_eclone(struct pt_regs *);
 int sys_execve(struct pt_regs *);
 
+/* kernel/checkpoint_32.c */
+#ifdef CONFIG_CHECKPOINT
+long sys_restart(struct pt_regs *);
+#endif
+
 /* kernel/signal.c */
 asmlinkage int sys_sigsuspend(int, int, old_sigset_t);
 asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d8e5d0c..2821fd6 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -117,6 +117,14 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
 
 obj-$(CONFIG_SWIOTLB)			+= pci-swiotlb.o
 
+obj-$(CONFIG_CHECKPOINT)	+= checkpoint.o
+
+###
+# 32 bit specific files
+ifeq ($(CONFIG_X86_32),y)
+	obj-$(CONFIG_CHECKPOINT)	+= checkpoint_32.o
+endif
+
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/mm/checkpoint.c b/arch/x86/kernel/checkpoint.c
similarity index 77%
rename from arch/x86/mm/checkpoint.c
rename to arch/x86/kernel/checkpoint.c
index 2752fdf..fbe9521 100644
--- a/arch/x86/mm/checkpoint.c
+++ b/arch/x86/kernel/checkpoint.c
@@ -18,59 +18,11 @@
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
 
-/*
- * helpers to encode/decode/validate registers/segments/eflags
- */
-
-static int check_eflags(__u32 eflags)
-{
-#define X86_EFLAGS_CKPT_MASK  \
-	(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | \
-	 X86_EFLAGS_SF | X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_OF | \
-	 X86_EFLAGS_NT | X86_EFLAGS_AC | X86_EFLAGS_ID | X86_EFLAGS_RF)
-
-	if ((eflags & ~X86_EFLAGS_CKPT_MASK) != (X86_EFLAGS_IF | 0x2))
-		return 0;
-	return 1;
-}
-
-static void restore_eflags(struct pt_regs *regs, __u32 eflags)
-{
-	/*
-	 * A task may have had X86_EFLAGS_RF set at checkpoint, .e.g:
-	 * 1) It ran in a KVM guest, and the guest was being debugged,
-	 * 2) The kernel was debugged using kgbd,
-	 * 3) From Intel's manual: "When calling an event handler,
-	 *    Intel 64 and IA-32 processors establish the value of the
-	 *    RF flag in the EFLAGS image pushed on the stack:
-	 *  - For any fault-class exception except a debug exception
-	 *    generated in response to an instruction breakpoint, the
-	 *    value pushed for RF is 1.
-	 *  - For any interrupt arriving after any iteration of a
-	 *    repeated string instruction but the last iteration, the
-	 *    value pushed for RF is 1.
-	 *  - For any trap-class exception generated by any iteration
-	 *    of a repeated string instruction but the last iteration,
-	 *    the value pushed for RF is 1.
-	 *  - For other cases, the value pushed for RF is the value
-	 *    that was in EFLAG.RF at the time the event handler was
-	 *    called.
-	 *  [from: http://www.intel.com/Assets/PDF/manual/253668.pdf]
-	 *
-	 * The RF flag may be set in EFLAGS by the hardware, or by
-	 * kvm/kgdb, or even by the user with ptrace or by setting a
-	 * suitable context when returning from a signal handler.
-	 *
-	 * Therefore, on restart we (1) prserve X86_EFLAGS_RF from
-	 * checkpoint time, and (2) preserve a X86_EFLAGS_RF of the
-	 * restarting process if it already exists on saved EFLAGS.
-	 * Disable preemption to protect EFLAG test-and-change.
-	 */
-	preempt_disable();
-	eflags |= (regs->flags & X86_EFLAGS_RF);
-	regs->flags = eflags;
-	preempt_enable();
-}
+extern int check_segment(__u16 seg);
+extern __u16 encode_segment(unsigned short seg);
+extern unsigned short decode_segment(__u16 seg);
+extern void save_cpu_regs(struct ckpt_hdr_cpu *h, struct task_struct *t);
+extern int load_cpu_regs(struct ckpt_hdr_cpu *h, struct task_struct *t);
 
 static int check_tls(struct desc_struct *desc)
 {
@@ -81,70 +33,6 @@ static int check_tls(struct desc_struct *desc)
 	return 1;
 }
 
-static int check_segment(__u16 seg)
-{
-	int ret = 0;
-
-	switch (seg) {
-	case CKPT_X86_SEG_NULL:
-	case CKPT_X86_SEG_USER32_CS:
-	case CKPT_X86_SEG_USER32_DS:
-		return 1;
-	}
-	if (seg & CKPT_X86_SEG_TLS) {
-		seg &= ~CKPT_X86_SEG_TLS;
-		if (seg <= GDT_ENTRY_TLS_MAX - GDT_ENTRY_TLS_MIN)
-			ret = 1;
-	} else if (seg & CKPT_X86_SEG_LDT) {
-		seg &= ~CKPT_X86_SEG_LDT;
-		if (seg <= 0x1fff)
-			ret = 1;
-	}
-	return ret;
-}
-
-static __u16 encode_segment(unsigned short seg)
-{
-	if (seg == 0)
-		return CKPT_X86_SEG_NULL;
-	BUG_ON((seg & 3) != 3);
-
-	if (seg == __USER_CS)
-		return CKPT_X86_SEG_USER32_CS;
-	if (seg == __USER_DS)
-		return CKPT_X86_SEG_USER32_DS;
-
-	if (seg & 4)
-		return CKPT_X86_SEG_LDT | (seg >> 3);
-
-	seg >>= 3;
-	if (GDT_ENTRY_TLS_MIN <= seg && seg <= GDT_ENTRY_TLS_MAX)
-		return CKPT_X86_SEG_TLS | (seg - GDT_ENTRY_TLS_MIN);
-
-	printk(KERN_ERR "c/r: (decode) bad segment %#hx\n", seg);
-	BUG();
-}
-
-static unsigned short decode_segment(__u16 seg)
-{
-	if (seg == CKPT_X86_SEG_NULL)
-		return 0;
-	if (seg == CKPT_X86_SEG_USER32_CS)
-		return __USER_CS;
-	if (seg == CKPT_X86_SEG_USER32_DS)
-		return __USER_DS;
-
-	if (seg & CKPT_X86_SEG_TLS) {
-		seg &= ~CKPT_X86_SEG_TLS;
-		return ((GDT_ENTRY_TLS_MIN + seg) << 3) | 3;
-	}
-	if (seg & CKPT_X86_SEG_LDT) {
-		seg &= ~CKPT_X86_SEG_LDT;
-		return (seg << 3) | 7;
-	}
-	BUG();
-}
-
 #define CKPT_X86_TIF_UNSUPPORTED   (_TIF_SECCOMP | _TIF_IO_BITMAP)
 
 /**************************************************************************
@@ -153,10 +41,12 @@ static unsigned short decode_segment(__u16 seg)
 
 static int may_checkpoint_thread(struct ckpt_ctx *ctx, struct task_struct *t)
 {
+#ifdef CONFIG_X86_32
 	if (t->thread.vm86_info) {
 		ckpt_err(ctx, -EBUSY, "%(T)Task in VM86 mode\n");
 		return -EBUSY;
 	}
+#endif
 	if (task_thread_info(t)->flags & CKPT_X86_TIF_UNSUPPORTED) {
 		ckpt_err(ctx, -EBUSY, "%(T)Bad thread info flags %#lx\n",
 			 task_thread_info(t)->flags);
@@ -195,64 +85,10 @@ int checkpoint_thread(struct ckpt_ctx *ctx, struct task_struct *t)
 	return ret;
 }
 
-#ifdef CONFIG_X86_32
-
-static void save_cpu_regs(struct ckpt_hdr_cpu *h, struct task_struct *t)
-{
-	struct thread_struct *thread = &t->thread;
-	struct pt_regs *regs = task_pt_regs(t);
-	unsigned long _gs;
-
-	h->bp = regs->bp;
-	h->bx = regs->bx;
-	h->ax = regs->ax;
-	h->cx = regs->cx;
-	h->dx = regs->dx;
-	h->si = regs->si;
-	h->di = regs->di;
-	h->orig_ax = regs->orig_ax;
-	h->ip = regs->ip;
-
-	h->flags = regs->flags;
-	h->sp = regs->sp;
-
-	h->cs = encode_segment(regs->cs);
-	h->ss = encode_segment(regs->ss);
-	h->ds = encode_segment(regs->ds);
-	h->es = encode_segment(regs->es);
-
-	/*
-	 * for checkpoint in process context (from within a container)
-	 * the GS segment register should be saved from the hardware;
-	 * otherwise it is already saved on the thread structure
-	 */
-	if (t == current)
-		_gs = get_user_gs(regs);
-	else
-		_gs = thread->gs;
-
-	h->fsindex = encode_segment(regs->fs);
-	h->gsindex = encode_segment(_gs);
-
-	/*
-	 * for checkpoint in process context (from within a container),
-	 * the actual syscall is taking place at this very moment; so
-	 * we (optimistically) subtitute the future return value (0) of
-	 * this syscall into the orig_eax, so that upon restart it will
-	 * succeed (or it will endlessly retry checkpoint...)
-	 */
-	if (t == current) {
-		BUG_ON(h->orig_ax < 0);
-		h->ax = 0;
-	}
-}
-
 static void save_cpu_debug(struct ckpt_hdr_cpu *h, struct task_struct *t)
 {
 	struct thread_struct *thread = &t->thread;
 
-	/* debug regs */
-
 	/*
 	 * for checkpoint in process context (from within a container),
 	 * get the actual registers; otherwise get the saved values.
@@ -315,8 +151,6 @@ static int checkpoint_cpu_fpu(struct ckpt_ctx *ctx, struct task_struct *t)
 	return ret;
 }
 
-#endif	/* CONFIG_X86_32 */
-
 /* dump the cpu state and registers of a given task */
 int checkpoint_cpu(struct ckpt_ctx *ctx, struct task_struct *t)
 {
@@ -438,6 +272,13 @@ int restore_thread(struct ckpt_ctx *ctx)
 	load_TLS(thread, cpu);
 	put_cpu();
 
+#if defined(CONFIG_X86_64) && defined(CONFIG_COMPAT)
+	if (h->thread_info_flags & _TIF_IA32)
+		set_thread_flag(TIF_IA32);
+	else
+		clear_thread_flag(TIF_IA32);
+#endif
+
 	/* TODO: restore TIF flags as necessary (e.g. TIF_NOTSC) */
 
 	ret = 0;
@@ -446,49 +287,6 @@ int restore_thread(struct ckpt_ctx *ctx)
 	return ret;
 }
 
-#ifdef CONFIG_X86_32
-
-static int load_cpu_regs(struct ckpt_hdr_cpu *h, struct task_struct *t)
-{
-	struct thread_struct *thread = &t->thread;
-	struct pt_regs *regs = task_pt_regs(t);
-
-	if (!check_eflags(h->flags))
-		return -EINVAL;
-	if (h->cs == CKPT_X86_SEG_NULL)
-		return -EINVAL;
-	if (!check_segment(h->cs) || !check_segment(h->ds) ||
-	    !check_segment(h->es) || !check_segment(h->ss) ||
-	    !check_segment(h->fsindex) || !check_segment(h->gsindex))
-		return -EINVAL;
-
-	regs->bp = h->bp;
-	regs->bx = h->bx;
-	regs->ax = h->ax;
-	regs->cx = h->cx;
-	regs->dx = h->dx;
-	regs->si = h->si;
-	regs->di = h->di;
-	regs->orig_ax = h->orig_ax;
-	regs->ip = h->ip;
-
-	restore_eflags(regs, h->flags);
-	regs->sp = h->sp;
-
-	regs->ds = decode_segment(h->ds);
-	regs->es = decode_segment(h->es);
-	regs->cs = decode_segment(h->cs);
-	regs->ss = decode_segment(h->ss);
-
-	regs->fs = decode_segment(h->fsindex);
-	regs->gs = decode_segment(h->gsindex);
-
-	thread->gs = regs->gs;
-	lazy_load_gs(regs->gs);
-
-	return 0;
-}
-
 static int load_cpu_debug(struct ckpt_hdr_cpu *h, struct task_struct *t)
 {
 	int ret;
@@ -548,7 +346,65 @@ static int restore_cpu_fpu(struct ckpt_ctx *ctx, struct task_struct *t)
 	return ret;
 }
 
-#endif	/* CONFIG_X86_32 */
+static int check_eflags(__u32 eflags)
+{
+#define X86_EFLAGS_CKPT_MASK  \
+	(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | \
+	 X86_EFLAGS_SF | X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_OF | \
+	 X86_EFLAGS_NT | X86_EFLAGS_AC | X86_EFLAGS_ID | X86_EFLAGS_RF)
+
+	if ((eflags & ~X86_EFLAGS_CKPT_MASK) != (X86_EFLAGS_IF | 0x2))
+		return 0;
+	return 1;
+}
+
+static void restore_eflags(struct pt_regs *regs, __u32 eflags)
+{
+	/*
+	 * A task may have had X86_EFLAGS_RF set at checkpoint, e.g.:
+	 * 1) It ran in a KVM guest, and the guest was being debugged,
+	 * 2) The kernel was debugged using kgdb,
+	 * 3) From Intel's manual: "When calling an event handler,
+	 *    Intel 64 and IA-32 processors establish the value of the
+	 *    RF flag in the EFLAGS image pushed on the stack:
+	 *  - For any fault-class exception except a debug exception
+	 *    generated in response to an instruction breakpoint, the
+	 *    value pushed for RF is 1.
+	 *  - For any interrupt arriving after any iteration of a
+	 *    repeated string instruction but the last iteration, the
+	 *    value pushed for RF is 1.
+	 *  - For any trap-class exception generated by any iteration
+	 *    of a repeated string instruction but the last iteration,
+	 *    the value pushed for RF is 1.
+	 *  - For other cases, the value pushed for RF is the value
+	 *    that was in EFLAG.RF at the time the event handler was
+	 *    called.
+	 *  [from: http://www.intel.com/Assets/PDF/manual/253668.pdf]
+	 *
+	 * The RF flag may be set in EFLAGS by the hardware, or by
+	 * kvm/kgdb, or even by the user with ptrace or by setting a
+	 * suitable context when returning from a signal handler.
+	 *
+	 * Therefore, on restart we (1) preserve X86_EFLAGS_RF from
+	 * checkpoint time, and (2) preserve a X86_EFLAGS_RF of the
+	 * restarting process if it already exists on saved EFLAGS.
+	 * Disable preemption to protect EFLAG test-and-change.
+	 */
+	preempt_disable();
+	eflags |= (regs->flags & X86_EFLAGS_RF);
+	regs->flags = eflags;
+	preempt_enable();
+}
+
+static int load_cpu_eflags(struct ckpt_hdr_cpu *h, struct task_struct *t)
+{
+	struct pt_regs *regs = task_pt_regs(t);
+
+	if (!check_eflags(h->flags))
+		return -EINVAL;
+	restore_eflags(regs, h->flags);
+	return 0;
+}
 
 /* read the cpu state and registers for the current task */
 int restore_cpu(struct ckpt_ctx *ctx)
@@ -566,6 +422,9 @@ int restore_cpu(struct ckpt_ctx *ctx)
 	ret = load_cpu_regs(h, t);
 	if (ret < 0)
 		goto out;
+	ret = load_cpu_eflags(h, t);
+	if (ret < 0)
+		goto out;
 	ret = load_cpu_debug(h, t);
 	if (ret < 0)
 		goto out;
diff --git a/arch/x86/kernel/checkpoint_32.c b/arch/x86/kernel/checkpoint_32.c
new file mode 100644
index 0000000..d5ea6a0
--- /dev/null
+++ b/arch/x86/kernel/checkpoint_32.c
@@ -0,0 +1,191 @@
+/*
+ *  Checkpoint/restart - architecture specific support for x86_32
+ *
+ *  Copyright (C) 2008-2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DSYS
+
+#include <asm/desc.h>
+#include <asm/i387.h>
+#include <asm/elf.h>
+
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+
+/*
+ * sys_restart needs to access and modify the pt_regs structure to
+ * restore the original state from the time of the checkpoint.
+ */
+long sys_restart(struct pt_regs *regs)
+{
+	unsigned long flags;
+	int fd, logfd;
+	pid_t pid;
+
+	pid = regs->bx;
+	fd = regs->cx;
+	flags = regs->dx;
+	logfd = regs->di;
+
+	return do_sys_restart(pid, fd, flags, logfd);
+}
+
+/* helpers to encode/decode/validate segments */
+
+static int check_segment(__u16 seg)
+{
+	int ret = 0;
+
+	switch (seg) {
+	case CKPT_X86_SEG_NULL:
+	case CKPT_X86_SEG_USER32_CS:
+	case CKPT_X86_SEG_USER32_DS:
+		return 1;
+	}
+	if (seg & CKPT_X86_SEG_TLS) {
+		seg &= ~CKPT_X86_SEG_TLS;
+		if (seg <= GDT_ENTRY_TLS_MAX - GDT_ENTRY_TLS_MIN)
+			ret = 1;
+	} else if (seg & CKPT_X86_SEG_LDT) {
+		seg &= ~CKPT_X86_SEG_LDT;
+		if (seg <= 0x1fff)
+			ret = 1;
+	}
+	return ret;
+}
+
+static __u16 encode_segment(unsigned short seg)
+{
+	if (seg == 0)
+		return CKPT_X86_SEG_NULL;
+	BUG_ON((seg & 3) != 3);
+
+	if (seg == __USER_CS)
+		return CKPT_X86_SEG_USER32_CS;
+	if (seg == __USER_DS)
+		return CKPT_X86_SEG_USER32_DS;
+
+	if (seg & 4)
+		return CKPT_X86_SEG_LDT | (seg >> 3);
+
+	seg >>= 3;
+	if (GDT_ENTRY_TLS_MIN <= seg && seg <= GDT_ENTRY_TLS_MAX)
+		return CKPT_X86_SEG_TLS | (seg - GDT_ENTRY_TLS_MIN);
+
+	printk(KERN_ERR "c/r: (decode) bad segment %#hx\n", seg);
+	BUG();
+}
+
+static unsigned short decode_segment(__u16 seg)
+{
+	if (seg == CKPT_X86_SEG_NULL)
+		return 0;
+	if (seg == CKPT_X86_SEG_USER32_CS)
+		return __USER_CS;
+	if (seg == CKPT_X86_SEG_USER32_DS)
+		return __USER_DS;
+
+	if (seg & CKPT_X86_SEG_TLS) {
+		seg &= ~CKPT_X86_SEG_TLS;
+		return ((GDT_ENTRY_TLS_MIN + seg) << 3) | 3;
+	}
+	if (seg & CKPT_X86_SEG_LDT) {
+		seg &= ~CKPT_X86_SEG_LDT;
+		return (seg << 3) | 7;
+	}
+	BUG();
+}
+
+void save_cpu_regs(struct ckpt_hdr_cpu *h, struct task_struct *t)
+{
+	struct thread_struct *thread = &t->thread;
+	struct pt_regs *regs = task_pt_regs(t);
+	unsigned long _gs;
+
+	h->bp = regs->bp;
+	h->bx = regs->bx;
+	h->ax = regs->ax;
+	h->cx = regs->cx;
+	h->dx = regs->dx;
+	h->si = regs->si;
+	h->di = regs->di;
+	h->orig_ax = regs->orig_ax;
+	h->ip = regs->ip;
+
+	h->flags = regs->flags;
+	h->sp = regs->sp;
+
+	h->cs = encode_segment(regs->cs);
+	h->ss = encode_segment(regs->ss);
+	h->ds = encode_segment(regs->ds);
+	h->es = encode_segment(regs->es);
+
+	/*
+	 * for checkpoint in process context (from within a container)
+	 * the GS segment register should be saved from the hardware;
+	 * otherwise it is already saved on the thread structure
+	 */
+	if (t == current)
+		_gs = get_user_gs(regs);
+	else
+		_gs = thread->gs;
+
+	h->fsindex = encode_segment(regs->fs);
+	h->gsindex = encode_segment(_gs);
+
+	/*
+	 * for checkpoint in process context (from within a container),
+	 * the actual syscall is taking place at this very moment; so
+	 * we (optimistically) substitute the future return value (0) of
+	 * this syscall into the orig_eax, so that upon restart it will
+	 * succeed (or it will endlessly retry checkpoint...)
+	 */
+	if (t == current) {
+		BUG_ON(h->orig_ax < 0);
+		h->ax = 0;
+	}
+}
+
+int load_cpu_regs(struct ckpt_hdr_cpu *h, struct task_struct *t)
+{
+	struct thread_struct *thread = &t->thread;
+	struct pt_regs *regs = task_pt_regs(t);
+
+	if (h->cs == CKPT_X86_SEG_NULL)
+		return -EINVAL;
+	if (!check_segment(h->cs) || !check_segment(h->ds) ||
+	    !check_segment(h->es) || !check_segment(h->ss) ||
+	    !check_segment(h->fsindex) || !check_segment(h->gsindex))
+		return -EINVAL;
+
+	regs->bp = h->bp;
+	regs->bx = h->bx;
+	regs->ax = h->ax;
+	regs->cx = h->cx;
+	regs->dx = h->dx;
+	regs->si = h->si;
+	regs->di = h->di;
+	regs->orig_ax = h->orig_ax;
+	regs->ip = h->ip;
+
+	regs->sp = h->sp;
+
+	regs->ds = decode_segment(h->ds);
+	regs->es = decode_segment(h->es);
+	regs->cs = decode_segment(h->cs);
+	regs->ss = decode_segment(h->ss);
+
+	regs->fs = decode_segment(h->fsindex);
+	regs->gs = decode_segment(h->gsindex);
+
+	thread->gs = regs->gs;
+	lazy_load_gs(regs->gs);
+
+	return 0;
+}
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 7e7f3c8..ecefd09 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -726,6 +726,9 @@ PTREGSCALL(sigreturn)
 PTREGSCALL(rt_sigreturn)
 PTREGSCALL(vm86)
 PTREGSCALL(vm86old)
+#ifdef CONFIG_CHECKPOINT
+PTREGSCALL(restart)
+#endif
 
 .macro FIXUP_ESPFIX_STACK
 /*
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index a1bc7f7..1ca053e 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -338,4 +338,4 @@ ENTRY(sys_call_table)
 	.long sys_perf_event_open
 	.long ptregs_eclone
 	.long sys_checkpoint
-	.long sys_restart
+	.long ptregs_restart
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 735c0b2..06630d2 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -26,5 +26,3 @@ obj-$(CONFIG_K8_NUMA)		+= k8topology_64.o
 obj-$(CONFIG_ACPI_NUMA)		+= srat_$(BITS).o
 
 obj-$(CONFIG_MEMTEST)		+= memtest.o
-
-obj-$(CONFIG_CHECKPOINT)	+= checkpoint.o
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index afcfa1e..89056d6 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -648,7 +648,7 @@ SYSCALL_DEFINE4(checkpoint, pid_t, pid, int, fd,
 }
 
 /**
- * sys_restart - restart a container
+ * do_sys_restart - restart a container
  * @pid: pid of task root (in coordinator's namespace), or 0
  * @fd: file from which read the checkpoint image
  * @flags: restart operation flags
@@ -657,8 +657,7 @@ SYSCALL_DEFINE4(checkpoint, pid_t, pid, int, fd,
  * Returns negative value on error, or otherwise returns in the realm
  * of the original checkpoint
  */
-SYSCALL_DEFINE4(restart, pid_t, pid, int, fd,
-		unsigned long, flags, int, logfd)
+long do_sys_restart(pid_t pid, int fd, unsigned long flags, int logfd)
 {
 	struct ckpt_ctx *ctx = NULL;
 	long ret;
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index c6c8d56..d81c59c 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -60,6 +60,8 @@
 #define CKPT_LSM_INFO_LEN 200
 #define CKPT_LSM_STRING_MAX 1024
 
+extern long do_sys_restart(pid_t pid, int fd, unsigned long flags, int logfd);
+
 extern int walk_task_subtree(struct task_struct *task,
 			     int (*func)(struct task_struct *, void *),
 			     void *data);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 9ed192f..264a02e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -874,8 +874,6 @@ asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int,
 			  size_t);
 asmlinkage long sys_checkpoint(pid_t pid, int fd, unsigned long flags,
 			       int logfd);
-asmlinkage long sys_restart(pid_t pid, int fd, unsigned long flags,
-			    int logfd);
 
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
-- 
1.6.3.3

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH] user-cr: eclone x86-64 wrapper
       [not found]     ` <1260131469-2917-2-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-12-06 20:31       ` Oren Laadan
       [not found]         ` <1260131469-2917-3-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  2009-12-06 22:51       ` [PATCH 1/2] c/r: [x86_32] sys_restore to use ptregs prototype Oren Laadan
  1 sibling, 1 reply; 9+ messages in thread
From: Oren Laadan @ 2009-12-06 20:31 UTC (permalink / raw)
  To: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
  Cc: Alexey Dobriyan, Louis Rilling, Dave Hansen

Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 clone_x86_64.c |   88 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 88 insertions(+), 0 deletions(-)
 create mode 100644 clone_x86_64.c

diff --git a/clone_x86_64.c b/clone_x86_64.c
new file mode 100644
index 0000000..d6d7e6f
--- /dev/null
+++ b/clone_x86_64.c
@@ -0,0 +1,88 @@
+/*
+ *  clone_x86_64.c: support for eclone() on x86_64
+ *
+ *  Copyright (C) Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
+ *  Copyright (C) Dave Hansen <daveh-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#define _GNU_SOURCE
+
+#include <unistd.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <asm/unistd.h>
+
+/*
+ * libc doesn't support eclone() yet...
+ * below is arch-dependent code to use the syscall
+ */
+#include <linux/checkpoint.h>
+
+#include "eclone.h"
+
+#ifndef __NR_eclone
+#define __NR_eclone 299
+#endif
+
+int eclone(int (*fn)(void *), void *fn_arg, int clone_flags_low,
+	   struct clone_args *clone_args, pid_t *pids)
+{
+	struct clone_args my_args;
+	long retval;
+	void **newstack;
+
+	if (clone_args->child_stack) {
+		/*
+		 * Set up the stack for child:
+		 *  - fn_arg will be the argument for the child function
+		 *  - the fn pointer will be popped into rax after the clone
+		 */
+		newstack = (void **)(unsigned long)(clone_args->child_stack +
+					    clone_args->child_stack_size);
+		*--newstack = fn_arg;
+		*--newstack = fn;
+	} else
+		newstack = (void **)0;
+
+	my_args = *clone_args;
+	my_args.child_stack = (unsigned long)newstack;
+	my_args.child_stack_size = 0;
+
+        __asm__  __volatile__(
+		"movq %6, %%r10\n\t"	/* pids in r10*/
+		"syscall\n\t"		/* Linux/x86_64 system call */
+		"testq %0,%0\n\t"	/* check return value */
+		"jne 1f\n\t"		/* jump if parent */
+		"popq %%rax\n\t"	/* get subthread function */
+		"popq %%rdi\n\t"	/* get the subthread function arg */
+		"call *%%rax\n\t"	/* start subthread function */
+		"movq %2,%0\n\t"
+		"syscall\n"		/* exit system call: exit subthread */
+		"1:\n\t"
+		:"=a" (retval)
+		:"0" (__NR_eclone), "i" (__NR_exit),
+		 "D" (clone_flags_low),	/* rdi */
+		 "S" (&my_args),	/* rsi */
+		 "d" (sizeof(my_args)),	/* rdx */
+		 "m" (pids)		/* gets moved to r10 */
+		:"rcx", "r10", "r11", "cc"
+		);
+	/*
+	 * glibc lists 'cc' as clobbered, so we might as
+	 * well do it too.  'r11' and 'rcx' are clobbered
+	 * by the 'syscall' instruction itself.  'r8' and
+	 * 'r9' are clobbered by the clone, but that
+	 * thread will exit before getting back out to C.
+	 */
+
+	if (retval < 0) {
+		errno = -retval;
+		retval = -1;
+	}
+	return retval;
+}
-- 
1.6.3.3

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 2/2] c/r: x86-64: checkpoint/restart implementation
       [not found]         ` <1260131469-2917-3-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-12-06 20:31           ` Oren Laadan
  2009-12-06 20:35           ` [PATCH] user-cr: eclone x86-64 wrapper Oren Laadan
  1 sibling, 0 replies; 9+ messages in thread
From: Oren Laadan @ 2009-12-06 20:31 UTC (permalink / raw)
  To: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
  Cc: Alexey Dobriyan, Louis Rilling, Dave Hansen

Support for checkpoint and restart for X86_64 architecture.
Partly based on Alexey's work.

 Checkpoint          Restart
 (app/arch)         (app/arch)
--------------------------------
  64/x86-64	->  64/x86-64	  works
  32/x86-64	->  32/x86-64	  ?
  32/x86-64	->  32/x86-32	  ?
  32/x86-32	->  32/x86-64	  ?

Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 arch/x86/Kconfig                      |    2 +-
 arch/x86/include/asm/checkpoint_hdr.h |    6 +
 arch/x86/include/asm/syscalls.h       |    6 +
 arch/x86/include/asm/unistd_64.h      |    4 +
 arch/x86/kernel/Makefile              |    2 +
 arch/x86/kernel/checkpoint_64.c       |  251 +++++++++++++++++++++++++++++++++
 arch/x86/kernel/entry_64.S            |    5 +
 include/linux/checkpoint_hdr.h        |    2 +
 8 files changed, 277 insertions(+), 1 deletions(-)
 create mode 100644 arch/x86/kernel/checkpoint_64.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 69d6077..f6260f5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -88,7 +88,7 @@ config HAVE_LATENCYTOP_SUPPORT
 
 config CHECKPOINT_SUPPORT
 	bool
-	default y if X86_32
+	default y
 
 config MMU
 	def_bool y
diff --git a/arch/x86/include/asm/checkpoint_hdr.h b/arch/x86/include/asm/checkpoint_hdr.h
index 65511ca..0033bfe 100644
--- a/arch/x86/include/asm/checkpoint_hdr.h
+++ b/arch/x86/include/asm/checkpoint_hdr.h
@@ -36,6 +36,10 @@
 #include <asm/processor.h>
 #endif
 
+#ifdef CONFIG_X86_64
+#define CKPT_ARCH_ID	CKPT_ARCH_X86_64
+#endif
+
 #ifdef CONFIG_X86_32
 #define CKPT_ARCH_ID	CKPT_ARCH_X86_32
 #endif
@@ -135,6 +139,8 @@ struct ckpt_hdr_cpu {
 #define CKPT_X86_SEG_NULL	0
 #define CKPT_X86_SEG_USER32_CS	1
 #define CKPT_X86_SEG_USER32_DS	2
+#define CKPT_X86_SEG_USER64_CS	3
+#define CKPT_X86_SEG_USER64_DS	4
 #define CKPT_X86_SEG_TLS	0x4000	/* 0100 0000 0000 00xx */
 #define CKPT_X86_SEG_LDT	0x8000	/* 100x xxxx xxxx xxxx */
 
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 1079447..063cdd0 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -88,6 +88,12 @@ asmlinkage long sys_execve(char __user *, char __user * __user *,
 			   struct pt_regs *);
 long sys_arch_prctl(int, unsigned long);
 
+/* kernel/checkpoint_64.c */
+#ifdef CONFIG_CHECKPOINT
+asmlinkage long sys_restart(pid_t pid, int fd, unsigned long flags, int logfd,
+			    struct pt_regs *regs);
+#endif
+
 /* kernel/signal.c */
 asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *,
 				struct pt_regs *);
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index d2ffc89..c360707 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -663,6 +663,10 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
 __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
 #define __NR_eclone                   		299
 __SYSCALL(__NR_eclone, stub_eclone)
+#define __NR_checkpoint                   	300
+__SYSCALL(__NR_checkpoint, sys_checkpoint)
+#define __NR_restart                   		301
+__SYSCALL(__NR_restart, stub_restart)
 
 
 #ifndef __NO_STUBS
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 2821fd6..ded0ee2 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -138,4 +138,6 @@ ifeq ($(CONFIG_X86_64),y)
 
 	obj-$(CONFIG_PCI_MMCONFIG)	+= mmconf-fam10h_64.o
 	obj-y				+= vsmp_64.o
+
+	obj-$(CONFIG_CHECKPOINT)	+= checkpoint_64.o
 endif
diff --git a/arch/x86/kernel/checkpoint_64.c b/arch/x86/kernel/checkpoint_64.c
new file mode 100644
index 0000000..3901a53
--- /dev/null
+++ b/arch/x86/kernel/checkpoint_64.c
@@ -0,0 +1,251 @@
+/*
+ *  Checkpoint/restart - architecture specific support for x86_64
+ *
+ *  Copyright (C) 2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DSYS
+
+#include <asm/desc.h>
+#include <asm/i387.h>
+#include <asm/elf.h>
+
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+
+/*
+ * sys_restart receives pt_regs so the state saved at checkpoint time can be
+ * restored into it; this body only forwards to do_sys_restart(), @regs unused.
+ */
+asmlinkage long sys_restart(pid_t pid, int fd, unsigned long flags, int logfd,
+			    struct pt_regs *regs)
+{
+	return do_sys_restart(pid, fd, flags, logfd);
+}
+
+/* helpers to encode/decode/validate segments */
+
+/* Return 1 if @seg is a valid encoded (CKPT_X86_SEG_*) segment, 0 otherwise */
+int check_segment(__u16 seg)
+{
+	__u16 idx;
+
+	/* fixed values: NULL, 64-bit user CS/DS and, with compat, 32-bit ones */
+	if (seg == CKPT_X86_SEG_NULL ||
+	    seg == CKPT_X86_SEG_USER64_CS || seg == CKPT_X86_SEG_USER64_DS)
+		return 1;
+#ifdef CONFIG_COMPAT
+	if (seg == CKPT_X86_SEG_USER32_CS || seg == CKPT_X86_SEG_USER32_DS)
+		return 1;
+#endif
+	if (seg & CKPT_X86_SEG_TLS) {
+		/* TLS: remaining bits are an offset from GDT_ENTRY_TLS_MIN */
+		idx = seg & ~CKPT_X86_SEG_TLS;
+		return idx <= GDT_ENTRY_TLS_MAX - GDT_ENTRY_TLS_MIN;
+	}
+	if (seg & CKPT_X86_SEG_LDT) {
+		/* LDT: remaining bits must fit in a 13-bit index */
+		idx = seg & ~CKPT_X86_SEG_LDT;
+		return idx <= 0x1fff;
+	}
+	return 0;
+}
+
+/* Map a hardware selector to its CKPT_X86_SEG_* image encoding */
+__u16 encode_segment(unsigned short seg)
+{
+	unsigned short idx = seg >> 3;	/* descriptor table index */
+
+	if (seg == 0)
+		return CKPT_X86_SEG_NULL;
+	BUG_ON((seg & 3) != 3);		/* low two bits must both be set */
+	if (seg == __USER_CS)
+		return CKPT_X86_SEG_USER64_CS;
+	if (seg == __USER_DS)
+		return CKPT_X86_SEG_USER64_DS;
+#ifdef CONFIG_COMPAT
+	if (seg == __USER32_CS)
+		return CKPT_X86_SEG_USER32_CS;
+	if (seg == __USER32_DS)
+		return CKPT_X86_SEG_USER32_DS;
+#endif
+	if (seg & 4)			/* bit 2 set: encoded as LDT entry */
+		return CKPT_X86_SEG_LDT | idx;
+	if (GDT_ENTRY_TLS_MIN <= idx && idx <= GDT_ENTRY_TLS_MAX)
+		return CKPT_X86_SEG_TLS | (idx - GDT_ENTRY_TLS_MIN);
+
+	/* report the full selector (not the shifted index) from the encoder */
+	printk(KERN_ERR "c/r: (encode) bad segment %#hx\n", seg);
+	BUG();
+}
+
+/* Inverse of encode_segment(): CKPT_X86_SEG_* encoding -> hardware selector */
+unsigned short decode_segment(__u16 seg)
+{
+	switch (seg) {
+	case CKPT_X86_SEG_NULL:
+		return 0;
+	case CKPT_X86_SEG_USER64_CS:
+		return __USER_CS;
+	case CKPT_X86_SEG_USER64_DS:
+		return __USER_DS;
+#ifdef CONFIG_COMPAT
+	case CKPT_X86_SEG_USER32_CS:
+		return __USER32_CS;
+	case CKPT_X86_SEG_USER32_DS:
+		return __USER32_DS;
+#endif
+	}
+
+	if (seg & CKPT_X86_SEG_TLS)	/* TLS slot relative to TLS_MIN */
+		return ((GDT_ENTRY_TLS_MIN +
+			 (seg & ~CKPT_X86_SEG_TLS)) << 3) | 3;
+	if (seg & CKPT_X86_SEG_LDT)	/* LDT entry: | 7 also sets bit 2 */
+		return ((seg & ~CKPT_X86_SEG_LDT) << 3) | 7;
+	/* load_cpu_regs() validates with check_segment() before decoding */
+	BUG();
+}
+
+/* Fill @h with the register and segment state of @t (pt_regs + thread struct) */
+void save_cpu_regs(struct ckpt_hdr_cpu *h, struct task_struct *t)
+{
+	struct pt_regs *regs = task_pt_regs(t);
+	unsigned long _ds, _es, _fs, _gs;
+
+	h->r15 = regs->r15;
+	h->r14 = regs->r14;
+	h->r13 = regs->r13;
+	h->r12 = regs->r12;
+	h->r11 = regs->r11;
+	h->r10 = regs->r10;
+	h->r9 = regs->r9;
+	h->r8 = regs->r8;
+
+	h->bp = regs->bp;
+	h->bx = regs->bx;
+	h->ax = regs->ax;
+	h->cx = regs->cx;
+	h->dx = regs->dx;
+	h->si = regs->si;
+	h->di = regs->di;
+	h->orig_ax = regs->orig_ax;
+	h->ip = regs->ip;
+
+	h->flags = regs->flags;
+	h->sp = regs->sp;
+
+	/*
+	 * cs/ss come from pt_regs. For self-checkpoint (t == current) DS, ES,
+	 * FS, GS are read from the hardware, else from the thread structure.
+	 */
+
+	h->cs = encode_segment(regs->cs);
+	h->ss = encode_segment(regs->ss);
+
+	if (t == current) {
+		savesegment(ds, _ds);
+		savesegment(es, _es);
+		savesegment(fs, _fs);
+		savesegment(gs, _gs);
+	} else {
+		_ds = t->thread.ds;
+		_es = t->thread.es;
+		_fs = t->thread.fsindex;
+		_gs = t->thread.gsindex;
+	}
+	h->ds = encode_segment(_ds);
+	h->es = encode_segment(_es);
+	h->fsindex = encode_segment(_fs);
+	h->gsindex = encode_segment(_gs);
+
+	if (!test_tsk_thread_flag(t, TIF_IA32)) {
+		h->fs = t->thread.fs;
+		h->gs = t->thread.gs;
+	}
+
+	/*
+	 * for self-checkpoint (t == current) the syscall is in progress right
+	 * now; (optimistically) substitute its future return value (0) into
+	 * ax so that after restart it appears to have succeeded (or it would
+	 * endlessly retry). NOTE(review): the BUG_ON is a no-op if orig_ax is
+	 * an unsigned field - confirm its type in struct ckpt_hdr_cpu.
+	 */
+	if (t == current) {
+		BUG_ON(h->orig_ax < 0);
+		h->ax = 0;
+	}
+}
+
+/* Restore pt_regs and segment state of @t from @h; -EINVAL on bad segments */
+int load_cpu_regs(struct ckpt_hdr_cpu *h, struct task_struct *t)
+{
+	struct thread_struct *thread = &t->thread;
+	struct pt_regs *regs = task_pt_regs(t);
+
+	if (h->cs == CKPT_X86_SEG_NULL)
+		return -EINVAL;
+	if (!check_segment(h->cs) || !check_segment(h->ds) ||
+	    !check_segment(h->es) || !check_segment(h->ss) ||
+	    !check_segment(h->fsindex) || !check_segment(h->gsindex))
+		return -EINVAL;
+
+#ifdef CONFIG_COMPAT
+	if (test_tsk_thread_flag(t, TIF_IA32) &&
+	    (!check_segment(h->fs) || !check_segment(h->gs)))
+		return -EINVAL;
+#endif
+
+	regs->r15 = h->r15;
+	regs->r14 = h->r14;
+	regs->r13 = h->r13;
+	regs->r12 = h->r12;
+	regs->r11 = h->r11;
+	regs->r10 = h->r10;
+	regs->r9 = h->r9;
+	regs->r8 = h->r8;
+
+	regs->bp = h->bp;
+	regs->bx = h->bx;
+	regs->ax = h->ax;
+	regs->cx = h->cx;
+	regs->dx = h->dx;
+	regs->si = h->si;
+	regs->di = h->di;
+	regs->orig_ax = h->orig_ax;
+	regs->ip = h->ip;
+	/* NOTE(review): h->flags is not restored here - confirm a caller does */
+	regs->sp = h->sp;
+	thread->usersp = h->sp;
+
+	preempt_disable();
+
+	regs->cs = decode_segment(h->cs);
+	regs->ss = decode_segment(h->ss);
+	thread->ds = decode_segment(h->ds);
+	thread->es = decode_segment(h->es);
+	thread->fsindex = decode_segment(h->fsindex);
+	thread->gsindex = decode_segment(h->gsindex);
+
+	/* not under CONFIG_COMPAT: non-IA32 tasks need fs/gs back regardless */
+	if (!test_tsk_thread_flag(t, TIF_IA32)) {
+		thread->fs = h->fs;
+		thread->gs = h->gs;
+	}
+
+	/* XXX - unsure if this is really needed; presumably t == current here */
+	loadsegment(fs, thread->fsindex);
+	if (thread->fs)
+		wrmsrl(MSR_FS_BASE, thread->fs);
+	load_gs_index(thread->gsindex);
+	if (thread->gs)
+		wrmsrl(MSR_KERNEL_GS_BASE, thread->gs);
+
+	preempt_enable();
+
+	return 0;
+}
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 6d60cd1..e692193 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -699,6 +699,11 @@ END(\label)
 	PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
 	PTREGSCALL stub_iopl, sys_iopl, %rsi
 	PTREGSCALL stub_eclone, sys_eclone, %r8
+#ifdef CONFIG_CHECKPOINT
+	PTREGSCALL stub_restart, sys_restart, %r8
+#else
+	PTREGSCALL stub_restart, sys_ni_syscall, %r8
+#endif
 
 ENTRY(ptregscall_common)
 	DEFAULT_FRAME 1 8	/* offset 8: return address */
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 4e57d37..6468fa9 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -195,6 +195,8 @@ enum {
 #define CKPT_ARCH_PPC32 CKPT_ARCH_PPC32
 	CKPT_ARCH_PPC64,
 #define CKPT_ARCH_PPC64 CKPT_ARCH_PPC64
+	CKPT_ARCH_X86_64,
+#define CKPT_ARCH_X86_64 CKPT_ARCH_X86_64
 };
 
 /* shared objrects (objref) */
-- 
1.6.3.3

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] user-cr: eclone x86-64 wrapper
       [not found]         ` <1260131469-2917-3-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  2009-12-06 20:31           ` [PATCH 2/2] c/r: x86-64: checkpoint/restart implementation Oren Laadan
@ 2009-12-06 20:35           ` Oren Laadan
  1 sibling, 0 replies; 9+ messages in thread
From: Oren Laadan @ 2009-12-06 20:35 UTC (permalink / raw)
  To: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
  Cc: Louis Rilling, Alexey Dobriyan, Dave Hansen


To test this, you need to update the kernel headers for user-cr
	$ scripts/extract_headers -s PATH_TO_CR_KERNEL

Oren.

Oren Laadan wrote:
> Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
> ---
>  clone_x86_64.c |   88 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 files changed, 88 insertions(+), 0 deletions(-)
>  create mode 100644 clone_x86_64.c
> 
> diff --git a/clone_x86_64.c b/clone_x86_64.c
> new file mode 100644
> index 0000000..d6d7e6f
> --- /dev/null
> +++ b/clone_x86_64.c
> @@ -0,0 +1,88 @@
> +/*
> + *  clone_x86_64.c: support for eclone() on x86_64
> + *
> + *  Copyright (C) Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
> + *  Copyright (C) Dave Hansen <daveh-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
> + *
> + *  This file is subject to the terms and conditions of the GNU General Public
> + *  License.  See the file COPYING in the main directory of the Linux
> + *  distribution for more details.
> + */
> +
> +#define _GNU_SOURCE
> +
> +#include <unistd.h>
> +#include <errno.h>
> +#include <sys/types.h>
> +#include <sys/syscall.h>
> +#include <asm/unistd.h>
> +
> +/*
> + * libc doesn't support eclone() yet...
> + * below is arch-dependent code to use the syscall
> + */
> +#include <linux/checkpoint.h>
> +
> +#include "eclone.h"
> +
> +#ifndef __NR_eclone
> +#define __NR_eclone 299
> +#endif
> +
> +int eclone(int (*fn)(void *), void *fn_arg, int clone_flags_low,
> +	   struct clone_args *clone_args, pid_t *pids)
> +{
> +	struct clone_args my_args;
> +	long retval;
> +	void **newstack;
> +
> +	if (clone_args->child_stack) {
> +		/*
> +		 * Set up the stack for child:
> +		 *  - fn_arg will be the argument for the child function
> +		 *  - the fn pointer will be loaded into ebx after the clone
> +		 */
> +		newstack = (void **)(unsigned long)(clone_args->child_stack +
> +					    clone_args->child_stack_size);
> +		*--newstack = fn_arg;
> +		*--newstack = fn;
> +	} else
> +		newstack = (void **)0;
> +
> +	my_args = *clone_args;
> +	my_args.child_stack = (unsigned long)newstack;
> +	my_args.child_stack_size = 0;
> +
> +        __asm__  __volatile__(
> +		"movq %6, %%r10\n\t"	/* pids in r10*/
> +		"syscall\n\t"		/* Linux/x86_64 system call */
> +		"testq %0,%0\n\t"	/* check return value */
> +		"jne 1f\n\t"		/* jump if parent */
> +		"popq %%rax\n\t"	/* get subthread function */
> +		"popq %%rdi\n\t"	/* get the subthread function arg */
> +		"call *%%rax\n\t"	/* start subthread function */
> +		"movq %2,%0\n\t"
> +		"syscall\n"		/* exit system call: exit subthread */
> +		"1:\n\t"
> +		:"=a" (retval)
> +		:"0" (__NR_eclone), "i" (__NR_exit),
> +		 "D" (clone_flags_low),	/* rdi */
> +		 "S" (&my_args),	/* rsi */
> +		 "d" (sizeof(my_args)),	/* rdx */
> +		 "m" (pids)		/* gets moved to r10 */
> +		:"rcx", "r10", "r11", "cc"
> +		);
> +				        /*
> +         * glibc lists 'cc' as clobbered, so we might as
> +	 * well do it too.  'r11' and 'rcx' are clobbered
> +	 * by the 'syscall' instruction itself.  'r8' and
> +	 * 'r9' are clobbered by the clone, but that
> +	 * thread will exit before getting back out to C.
> +         */
> +
> +	if (retval < 0) {
> +		errno = -retval;
> +		retval = -1;
> +	}
> +	return retval;
> +}

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] c/r: [x86_32] sys_restore to use ptregs prototype
       [not found]     ` <1260131469-2917-2-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  2009-12-06 20:31       ` [PATCH] user-cr: eclone x86-64 wrapper Oren Laadan
@ 2009-12-06 22:51       ` Oren Laadan
       [not found]         ` <4B1C357C.2090003-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  1 sibling, 1 reply; 9+ messages in thread
From: Oren Laadan @ 2009-12-06 22:51 UTC (permalink / raw)
  To: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA; +Cc: Nathan Lynch




Oren Laadan wrote:
> Similar to other select syscalls (fork, clone, execve), sys_restart
> needs to access the pt_regs structure, so that it can modify it to
> restore the original state from the time of the checkpoint.
> 
> (This is less of an issue for x86-32, however is required for those
> architectures that otherwise save/restore partial state (e.g. not all
> registers) during syscall entry/exit, like x86-64.
> 
> This patch prepares to support c/r on x86-64, specifically:
> 
> * Changes the syscall prototype and definition to accept the pt_regs
>   struct as an argument (into %eax register).

I forgot to mention that this of course breaks s390 and ppc: you
need to provide an arch-dependent sys_restart() similar to how it's
done here.

Oren.

> 
> * Move arch/x86/mm/checkpoint*.c to arch/x86/kernel/...
> 
> * Split 32bit-dependent part of arch/x86/kernel/checkpoint.c into a
>   new arch/x86/kernel/checkpoint_32.c
> 
> Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
> ---

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] c/r: [x86_32] sys_restore to use ptregs prototype
       [not found]         ` <4B1C357C.2090003-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-12-07 20:55           ` Nathan Lynch
       [not found]             ` <1260219307.7151.3.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Nathan Lynch @ 2009-12-07 20:55 UTC (permalink / raw)
  To: Oren Laadan; +Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA


> > * Changes the syscall prototype and definition to accept the pt_regs
> >   struct as an argument (into %eax register).
> 
> I forgot to mention that this of course breaks s390 and ppc: you
> need to provide an arch-dependent sys_restart() similar to how it's
> done here.

Thanks, here's the fixup for powerpc.


From 981dca4f3a879827d6e19a0cf32c7fd25b08a878 Mon Sep 17 00:00:00 2001
From: Nathan Lynch <ntl-e+AXbWqSrlAAvxtiuMwx3w@public.gmane.org>
Date: Mon, 7 Dec 2009 14:51:13 -0600
Subject: [PATCH] checkpoint/powerpc: fix up restart code for ptregscall semantics

Signed-off-by: Nathan Lynch <ntl-e+AXbWqSrlAAvxtiuMwx3w@public.gmane.org>
---
 arch/powerpc/kernel/process.c |   20 ++++++++++++++++++++
 1 files changed, 20 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 457c269..f9da9eb 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -30,6 +30,7 @@
 #include <linux/init_task.h>
 #include <linux/module.h>
 #include <linux/kallsyms.h>
+#include <linux/checkpoint.h>
 #include <linux/mqueue.h>
 #include <linux/hardirq.h>
 #include <linux/utsname.h>
@@ -990,6 +991,25 @@ out:
 	return error;
 }
 
+/*
+ * powerpc entry point for the restart syscall. The syscall arguments
+ * arrive in a0..a3 (pid, fd, flags, logfd); a4 and a5 are unused. @regs is
+ * only checked with CHECK_FULL_REGS() before calling do_sys_restart().
+ */
+int sys_restart(unsigned long a0, unsigned long a1, unsigned long a2,
+	       unsigned long a3, unsigned long a4, unsigned long a5,
+	       struct pt_regs *regs)
+{
+	pid_t pid = a0;
+	int fd = a1;
+	unsigned long flags = a2;
+	int logfd = a3;
+
+	CHECK_FULL_REGS(regs);
+
+	return do_sys_restart(pid, fd, flags, logfd);
+}
+
 #ifdef CONFIG_IRQSTACKS
 static inline int valid_irq_stack(unsigned long sp, struct task_struct *p,
 				  unsigned long nbytes)
-- 
1.6.0.6

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] c/r: [x86_32] sys_restore to use ptregs prototype
       [not found]             ` <1260219307.7151.3.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
@ 2009-12-09 16:52               ` Serge E. Hallyn
  2009-12-09 17:02               ` Serge E. Hallyn
  1 sibling, 0 replies; 9+ messages in thread
From: Serge E. Hallyn @ 2009-12-09 16:52 UTC (permalink / raw)
  To: Nathan Lynch; +Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA

Quoting Nathan Lynch (ntl-e+AXbWqSrlAAvxtiuMwx3w@public.gmane.org):
> 
> > > * Changes the syscall prototype and definition to accept the pt_regs
> > >   struct as an argument (into %eax register).
> > 
> > I forgot to mention that this of course breaks s390 and ppc: you
> > need to provide an arch-dependent sys_restart() similar to how it's
> > done here.
> 
> Thanks, here's the fixup for powerpc.

Does this need to be in a #ifdef CONFIG_CHECKPOINT?

Near as I can tell there is no dummy do_sys_restart() for the
CONFIG_CHECKPOINT=n case.

> >From 981dca4f3a879827d6e19a0cf32c7fd25b08a878 Mon Sep 17 00:00:00 2001
> From: Nathan Lynch <ntl-e+AXbWqSrlAAvxtiuMwx3w@public.gmane.org>
> Date: Mon, 7 Dec 2009 14:51:13 -0600
> Subject: [PATCH] checkpoint/powerpc: fix up restart code for ptregscall semantics
> 
> Signed-off-by: Nathan Lynch <ntl-e+AXbWqSrlAAvxtiuMwx3w@public.gmane.org>
> ---
>  arch/powerpc/kernel/process.c |   20 ++++++++++++++++++++
>  1 files changed, 20 insertions(+), 0 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
> index 457c269..f9da9eb 100644
> --- a/arch/powerpc/kernel/process.c
> +++ b/arch/powerpc/kernel/process.c
> @@ -30,6 +30,7 @@
>  #include <linux/init_task.h>
>  #include <linux/module.h>
>  #include <linux/kallsyms.h>
> +#include <linux/checkpoint.h>
>  #include <linux/mqueue.h>
>  #include <linux/hardirq.h>
>  #include <linux/utsname.h>
> @@ -990,6 +991,25 @@ out:
>  	return error;
>  }
> 
> +int sys_restart(unsigned long a0, unsigned long a1, unsigned long a2,
> +	       unsigned long a3, unsigned long a4, unsigned long a5,
> +	       struct pt_regs *regs)
> +{
> +	unsigned long flags;
> +	pid_t pid;
> +	int logfd;
> +	int fd;
> +
> +	CHECK_FULL_REGS(regs);
> +
> +	pid = a0;
> +	fd = a1;
> +	flags = a2;
> +	logfd = a3;
> +
> +	return do_sys_restart(pid, fd, flags, logfd);
> +}
> +
>  #ifdef CONFIG_IRQSTACKS
>  static inline int valid_irq_stack(unsigned long sp, struct task_struct *p,
>  				  unsigned long nbytes)
> -- 
> 1.6.0.6
> 
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] c/r: [x86_32] sys_restore to use ptregs prototype
       [not found]             ` <1260219307.7151.3.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
  2009-12-09 16:52               ` Serge E. Hallyn
@ 2009-12-09 17:02               ` Serge E. Hallyn
  1 sibling, 0 replies; 9+ messages in thread
From: Serge E. Hallyn @ 2009-12-09 17:02 UTC (permalink / raw)
  To: Nathan Lynch; +Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA

Here I guess is the s390 version.  If we need the pt_regs later, we
can get it using get_pt_regs(current) as the clone wrapper right
above it does.

Subject: [PATCH 1/1] define s390x sys_restart wrapper

Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 arch/s390/kernel/process.c |    9 +++++++++
 1 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 0a59317..087f52c 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -241,6 +241,15 @@ SYSCALL_DEFINE4(clone, unsigned long, newsp, unsigned long, clone_flags,
 		       parent_tidptr, child_tidptr);
 }
 
+#ifdef CONFIG_CHECKPOINT
+extern long do_sys_restart(pid_t pid, int fd, unsigned long flags, int logfd);
+SYSCALL_DEFINE4(restart, pid_t, pid, int, fd, unsigned long, flags,
+		int, logfd)
+{
+	return do_sys_restart(pid, fd, flags, logfd);	/* plain pass-through */
+}
+#endif /* CONFIG_CHECKPOINT */
+
 SYSCALL_DEFINE4(eclone, unsigned int, flags_low, struct clone_args __user *,
 		uca, int, args_size, pid_t __user *, pids)
 {
-- 
1.6.1

^ permalink raw reply related	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2009-12-09 17:02 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-12-06 20:31 c/r: support for x86-64 arch Oren Laadan
     [not found] ` <1260131469-2917-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-12-06 20:31   ` [PATCH 1/2] c/r: [x86_32] sys_restore to use ptregs prototype Oren Laadan
     [not found]     ` <1260131469-2917-2-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-12-06 20:31       ` [PATCH] user-cr: eclone x86-64 wrapper Oren Laadan
     [not found]         ` <1260131469-2917-3-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-12-06 20:31           ` [PATCH 2/2] c/r: x86-64: checkpoint/restart implementation Oren Laadan
2009-12-06 20:35           ` [PATCH] user-cr: eclone x86-64 wrapper Oren Laadan
2009-12-06 22:51       ` [PATCH 1/2] c/r: [x86_32] sys_restore to use ptregs prototype Oren Laadan
     [not found]         ` <4B1C357C.2090003-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-12-07 20:55           ` Nathan Lynch
     [not found]             ` <1260219307.7151.3.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2009-12-09 16:52               ` Serge E. Hallyn
2009-12-09 17:02               ` Serge E. Hallyn

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.