All of lore.kernel.org
 help / color / mirror / Atom feed
From: Magnus Damm <magnus@valinux.co.jp>
To: Keir Fraser <Keir.Fraser@cl.cam.ac.uk>
Cc: Ian Pratt <m+Ian.Pratt@cl.cam.ac.uk>,
	Kazuo Moriwaka <moriwaka@valinux.co.jp>,
	xen-devel@lists.xensource.com,
	Akio Takebe <takebe_akio@jp.fujitsu.com>,
	magnus.damm@gmail.com, Isaku Yamahata <yamahata@valinux.co.jp>,
	Magnus Damm <magnus@valinux.co.jp>, Horms <horms@verge.net.au>
Subject: [PATCH 03/04] Kexec / Kdump: x86_32 specific code
Date: Mon, 23 Oct 2006 18:05:40 +0900	[thread overview]
Message-ID: <20061023090540.26706.58041.sendpatchset@localhost> (raw)
In-Reply-To: <20061023090515.26706.69407.sendpatchset@localhost>

[PATCH 03/04] Kexec / Kdump: x86_32 specific code

This patch contains the x86_32 implementation of Kexec / Kdump for Xen.

Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>
---

 Applies on top of xen-unstable-11856.

 buildconfigs/linux-defconfig_xen_x86_32                         |   2
 linux-2.6-xen-sparse/arch/i386/Kconfig                          |   2
 linux-2.6-xen-sparse/arch/i386/kernel/Makefile                  |   2
 linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c               |  25
 linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h               |  51 +
 linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h  |   8
 patches/linux-2.6.16.29/git-35...c9.patch                       | 401 +++++++
 patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec..code-i386.patch | 169 +++
 patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec-xen-i386.patch   |  54 +
 patches/linux-2.6.16.29/series                                  |   3
 xen/arch/x86/x86_32/entry.S                                     |   2
 xen/include/asm-x86/x86_32/elf.h                                |  37
 xen/include/asm-x86/x86_32/kexec.h                              |  84 +-
 13 files changed, 817 insertions(+), 23 deletions(-)

--- 0002/buildconfigs/linux-defconfig_xen_x86_32
+++ work/buildconfigs/linux-defconfig_xen_x86_32	2006-10-23 11:36:16.000000000 +0900
@@ -183,6 +183,7 @@ CONFIG_MTRR=y
 CONFIG_REGPARM=y
 CONFIG_SECCOMP=y
 CONFIG_HZ_100=y
+CONFIG_KEXEC=y
 # CONFIG_HZ_250 is not set
 # CONFIG_HZ_1000 is not set
 CONFIG_HZ=100
@@ -1036,6 +1037,7 @@ CONFIG_DNOTIFY=y
 #
 CONFIG_PROC_FS=y
 CONFIG_PROC_KCORE=y
+# CONFIG_PROC_VMCORE is not set
 CONFIG_SYSFS=y
 CONFIG_TMPFS=y
 # CONFIG_HUGETLB_PAGE is not set
--- 0001/linux-2.6-xen-sparse/arch/i386/Kconfig
+++ work/linux-2.6-xen-sparse/arch/i386/Kconfig	2006-10-23 11:36:16.000000000 +0900
@@ -726,7 +726,7 @@ source kernel/Kconfig.hz
 
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && !X86_XEN
+	depends on EXPERIMENTAL && !XEN_UNPRIVILEGED_GUEST
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
--- 0001/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
+++ work/linux-2.6-xen-sparse/arch/i386/kernel/Makefile	2006-10-23 11:36:16.000000000 +0900
@@ -89,7 +89,7 @@ include $(srctree)/scripts/Makefile.xen
 
 obj-y += fixup.o
 microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o
-n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o
+n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o crash.o
 
 obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
 obj-y := $(call cherrypickxen, $(obj-y))
--- 0001/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
+++ work/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c	2006-10-23 11:36:16.000000000 +0900
@@ -69,6 +69,10 @@
 #include "setup_arch_pre.h"
 #include <bios_ebda.h>
 
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
 /* Forward Declaration. */
 void __init find_max_pfn(void);
 
@@ -943,6 +947,7 @@ static void __init parse_cmdline_early (
 		 * after a kernel panic.
 		 */
 		else if (!memcmp(from, "crashkernel=", 12)) {
+#ifndef CONFIG_XEN
 			unsigned long size, base;
 			size = memparse(from+12, &from);
 			if (*from == '@') {
@@ -953,6 +958,10 @@ static void __init parse_cmdline_early (
 				crashk_res.start = base;
 				crashk_res.end   = base + size - 1;
 			}
+#else
+			printk("Ignoring crashkernel command line, "
+			       "parameter will be supplied by xen\n");
+#endif
 		}
 #endif
 #ifdef CONFIG_PROC_VMCORE
@@ -1322,9 +1331,22 @@ void __init setup_bootmem_allocator(void
 	}
 #endif
 #ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
 	if (crashk_res.start != crashk_res.end)
 		reserve_bootmem(crashk_res.start,
 			crashk_res.end - crashk_res.start + 1);
+#else
+	{
+		xen_kexec_reserve_t reservation;
+		BUG_ON(HYPERVISOR_kexec_op(KEXEC_CMD_kexec_reserve,
+					&reservation));
+		if (reservation.size) {
+			crashk_res.start = reservation.start;
+			crashk_res.end = reservation.start + 
+				reservation.size - 1;
+		}
+	}
+#endif
 #endif
 
 	if (!xen_feature(XENFEAT_auto_translated_physmap))
@@ -1389,7 +1411,8 @@ legacy_init_iomem_resources(struct e820e
 			request_resource(res, data_resource);
 #endif
 #ifdef CONFIG_KEXEC
-			request_resource(res, &crashk_res);
+			if (crashk_res.start != crashk_res.end)
+			     request_resource(res, &crashk_res);
 #endif
 		}
 	}
--- /dev/null
+++ work/linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h	2006-10-23 11:36:17.000000000 +0900
@@ -0,0 +1,51 @@
+#ifndef _I386_KEXEC_XEN_H
+#define _I386_KEXEC_XEN_H
+
+#include <asm/ptrace.h>
+#include <asm/types.h>
+#include <xen/interface/arch-x86_32.h>
+
+static inline void crash_translate_regs(struct pt_regs *linux_regs,
+					struct cpu_user_regs *xen_regs)
+{
+	xen_regs->ebx    = linux_regs->ebx;
+	xen_regs->ecx    = linux_regs->ecx;
+	xen_regs->edx    = linux_regs->edx;
+	xen_regs->esi    = linux_regs->esi;
+	xen_regs->edi    = linux_regs->edi;
+	xen_regs->ebp    = linux_regs->ebp;
+	xen_regs->eax    = linux_regs->eax;
+	xen_regs->esp    = linux_regs->esp;
+	xen_regs->ss     = linux_regs->xss;
+	xen_regs->cs     = linux_regs->xcs;
+	xen_regs->ds     = linux_regs->xds;
+	xen_regs->es     = linux_regs->xes;
+	xen_regs->eflags = linux_regs->eflags;
+}
+
+/* Kexec needs to know about the actual physical address.
+ * But in Xen, on some architectures, a physical address is a
+ * pseudo-physical address. */
+#ifdef CONFIG_XEN
+#define kexec_page_to_pfn(page)  pfn_to_mfn(page_to_pfn(page))
+#define kexec_pfn_to_page(pfn)   pfn_to_page(mfn_to_pfn(pfn))
+#define kexec_virt_to_phys(addr) virt_to_machine(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
+#else
+#define kexec_page_to_pfn(page)  page_to_pfn(page)
+#define kexec_pfn_to_page(pfn)   pfn_to_page(pfn)
+#define kexec_virt_to_phys(addr) virt_to_phys(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(addr)
+#endif
+
+#endif /* _I386_KEXEC_XEN_H */
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- 0001/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
+++ work/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h	2006-10-23 11:36:16.000000000 +0900
@@ -385,5 +385,13 @@ HYPERVISOR_xenoprof_op(
 	return _hypercall2(int, xenoprof_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_kexec_op(
+	unsigned long op, void *args)
+{
+	return _hypercall2(int, kexec_op, op, args);
+}
+
+
 
 #endif /* __HYPERCALL_H__ */
--- /dev/null
+++ work/patches/linux-2.6.16.29/git-3566561bfadffcb5dbc85d576be80c0dbf2cccc9.patch	2006-10-23 11:36:17.000000000 +0900
@@ -0,0 +1,401 @@
+From: Magnus Damm <magnus@valinux.co.jp>
+Date: Tue, 26 Sep 2006 08:52:38 +0000 (+0200)
+Subject: [PATCH] i386: Avoid overwriting the current pgd (V4, i386)
+X-Git-Url: http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=3566561bfadffcb5dbc85d576be80c0dbf2cccc9
+
+[PATCH] i386: Avoid overwriting the current pgd (V4, i386)
+
+kexec: Avoid overwriting the current pgd (V4, i386)
+
+This patch upgrades the i386-specific kexec code to avoid overwriting the
+current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used
+to start a secondary kernel that dumps the memory of the previous kernel.
+
+The code introduces a new set of page tables. These tables are used to provide
+an executable identity mapping without overwriting the current pgd.
+
+Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
+Signed-off-by: Andi Kleen <ak@suse.de>
+---
+
+--- a/arch/i386/kernel/machine_kexec.c
++++ b/arch/i386/kernel/machine_kexec.c
+@@ -21,70 +21,13 @@
+ #include <asm/system.h>
+ 
+ #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+-
+-#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+-#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+-#define L2_ATTR (_PAGE_PRESENT)
+-
+-#define LEVEL0_SIZE (1UL << 12UL)
+-
+-#ifndef CONFIG_X86_PAE
+-#define LEVEL1_SIZE (1UL << 22UL)
+-static u32 pgtable_level1[1024] PAGE_ALIGNED;
+-
+-static void identity_map_page(unsigned long address)
+-{
+-	unsigned long level1_index, level2_index;
+-	u32 *pgtable_level2;
+-
+-	/* Find the current page table */
+-	pgtable_level2 = __va(read_cr3());
+-
+-	/* Find the indexes of the physical address to identity map */
+-	level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
+-	level2_index = address / LEVEL1_SIZE;
+-
+-	/* Identity map the page table entry */
+-	pgtable_level1[level1_index] = address | L0_ATTR;
+-	pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
+-
+-	/* Flush the tlb so the new mapping takes effect.
+-	 * Global tlb entries are not flushed but that is not an issue.
+-	 */
+-	load_cr3(pgtable_level2);
+-}
+-
+-#else
+-#define LEVEL1_SIZE (1UL << 21UL)
+-#define LEVEL2_SIZE (1UL << 30UL)
+-static u64 pgtable_level1[512] PAGE_ALIGNED;
+-static u64 pgtable_level2[512] PAGE_ALIGNED;
+-
+-static void identity_map_page(unsigned long address)
+-{
+-	unsigned long level1_index, level2_index, level3_index;
+-	u64 *pgtable_level3;
+-
+-	/* Find the current page table */
+-	pgtable_level3 = __va(read_cr3());
+-
+-	/* Find the indexes of the physical address to identity map */
+-	level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
+-	level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE;
+-	level3_index = address / LEVEL2_SIZE;
+-
+-	/* Identity map the page table entry */
+-	pgtable_level1[level1_index] = address | L0_ATTR;
+-	pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
+-	set_64bit(&pgtable_level3[level3_index],
+-					       __pa(pgtable_level2) | L2_ATTR);
+-
+-	/* Flush the tlb so the new mapping takes effect.
+-	 * Global tlb entries are not flushed but that is not an issue.
+-	 */
+-	load_cr3(pgtable_level3);
+-}
++static u32 kexec_pgd[1024] PAGE_ALIGNED;
++#ifdef CONFIG_X86_PAE
++static u32 kexec_pmd0[1024] PAGE_ALIGNED;
++static u32 kexec_pmd1[1024] PAGE_ALIGNED;
+ #endif
++static u32 kexec_pte0[1024] PAGE_ALIGNED;
++static u32 kexec_pte1[1024] PAGE_ALIGNED;
+ 
+ static void set_idt(void *newidt, __u16 limit)
+ {
+@@ -128,16 +71,6 @@ static void load_segments(void)
+ #undef __STR
+ }
+ 
+-typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)(
+-					unsigned long indirection_page,
+-					unsigned long reboot_code_buffer,
+-					unsigned long start_address,
+-					unsigned int has_pae) ATTRIB_NORET;
+-
+-extern const unsigned char relocate_new_kernel[];
+-extern void relocate_new_kernel_end(void);
+-extern const unsigned int relocate_new_kernel_size;
+-
+ /*
+  * A architecture hook called to validate the
+  * proposed image and prepare the control pages
+@@ -170,25 +103,29 @@ void machine_kexec_cleanup(struct kimage
+  */
+ NORET_TYPE void machine_kexec(struct kimage *image)
+ {
+-	unsigned long page_list;
+-	unsigned long reboot_code_buffer;
+-
+-	relocate_new_kernel_t rnk;
++	unsigned long page_list[PAGES_NR];
++	void *control_page;
+ 
+ 	/* Interrupts aren't acceptable while we reboot */
+ 	local_irq_disable();
+ 
+-	/* Compute some offsets */
+-	reboot_code_buffer = page_to_pfn(image->control_code_page)
+-								<< PAGE_SHIFT;
+-	page_list = image->head;
+-
+-	/* Set up an identity mapping for the reboot_code_buffer */
+-	identity_map_page(reboot_code_buffer);
+-
+-	/* copy it out */
+-	memcpy((void *)reboot_code_buffer, relocate_new_kernel,
+-						relocate_new_kernel_size);
++	control_page = page_address(image->control_code_page);
++	memcpy(control_page, relocate_kernel, PAGE_SIZE);
++
++	page_list[PA_CONTROL_PAGE] = __pa(control_page);
++	page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
++	page_list[PA_PGD] = __pa(kexec_pgd);
++	page_list[VA_PGD] = (unsigned long)kexec_pgd;
++#ifdef CONFIG_X86_PAE
++	page_list[PA_PMD_0] = __pa(kexec_pmd0);
++	page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
++	page_list[PA_PMD_1] = __pa(kexec_pmd1);
++	page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
++#endif
++	page_list[PA_PTE_0] = __pa(kexec_pte0);
++	page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
++	page_list[PA_PTE_1] = __pa(kexec_pte1);
++	page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+ 
+ 	/* The segment registers are funny things, they have both a
+ 	 * visible and an invisible part.  Whenever the visible part is
+@@ -207,8 +144,8 @@ NORET_TYPE void machine_kexec(struct kim
+ 	set_idt(phys_to_virt(0),0);
+ 
+ 	/* now call it */
+-	rnk = (relocate_new_kernel_t) reboot_code_buffer;
+-	(*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae);
++	relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
++			image->start, cpu_has_pae);
+ }
+ 
+ /* crashkernel=size@addr specifies the location to reserve for
+--- a/arch/i386/kernel/relocate_kernel.S
++++ b/arch/i386/kernel/relocate_kernel.S
+@@ -7,16 +7,138 @@
+  */
+ 
+ #include <linux/linkage.h>
++#include <asm/page.h>
++#include <asm/kexec.h>
++
++/*
++ * Must be relocatable PIC code callable as a C function
++ */
++
++#define PTR(x) (x << 2)
++#define PAGE_ALIGNED (1 << PAGE_SHIFT)
++#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
++#define PAE_PGD_ATTR 0x01 /* _PAGE_PRESENT */
++
++	.text
++	.align PAGE_ALIGNED
++	.globl relocate_kernel
++relocate_kernel:
++	movl	8(%esp), %ebp /* list of pages */
++
++#ifdef CONFIG_X86_PAE
++	/* map the control page at its virtual address */
++
++	movl	PTR(VA_PGD)(%ebp), %edi
++	movl	PTR(VA_CONTROL_PAGE)(%ebp), %eax
++	andl	$0xc0000000, %eax
++	shrl	$27, %eax
++	addl	%edi, %eax
++
++	movl	PTR(PA_PMD_0)(%ebp), %edx
++	orl	$PAE_PGD_ATTR, %edx
++	movl	%edx, (%eax)
++
++	movl	PTR(VA_PMD_0)(%ebp), %edi
++	movl	PTR(VA_CONTROL_PAGE)(%ebp), %eax
++	andl	$0x3fe00000, %eax
++	shrl	$18, %eax
++	addl	%edi, %eax
++
++	movl	PTR(PA_PTE_0)(%ebp), %edx
++	orl	$PAGE_ATTR, %edx
++	movl	%edx, (%eax)
++
++	movl	PTR(VA_PTE_0)(%ebp), %edi
++	movl	PTR(VA_CONTROL_PAGE)(%ebp), %eax
++	andl	$0x001ff000, %eax
++	shrl	$9, %eax
++	addl	%edi, %eax
++
++	movl	PTR(PA_CONTROL_PAGE)(%ebp), %edx
++	orl	$PAGE_ATTR, %edx
++	movl	%edx, (%eax)
++
++	/* identity map the control page at its physical address */
++
++	movl	PTR(VA_PGD)(%ebp), %edi
++	movl	PTR(PA_CONTROL_PAGE)(%ebp), %eax
++	andl	$0xc0000000, %eax
++	shrl	$27, %eax
++	addl	%edi, %eax
++
++	movl	PTR(PA_PMD_1)(%ebp), %edx
++	orl	$PAE_PGD_ATTR, %edx
++	movl	%edx, (%eax)
++
++	movl	PTR(VA_PMD_1)(%ebp), %edi
++	movl	PTR(PA_CONTROL_PAGE)(%ebp), %eax
++	andl	$0x3fe00000, %eax
++	shrl	$18, %eax
++	addl	%edi, %eax
++
++	movl	PTR(PA_PTE_1)(%ebp), %edx
++	orl	$PAGE_ATTR, %edx
++	movl	%edx, (%eax)
++
++	movl	PTR(VA_PTE_1)(%ebp), %edi
++	movl	PTR(PA_CONTROL_PAGE)(%ebp), %eax
++	andl	$0x001ff000, %eax
++	shrl	$9, %eax
++	addl	%edi, %eax
++
++	movl	PTR(PA_CONTROL_PAGE)(%ebp), %edx
++	orl	$PAGE_ATTR, %edx
++	movl	%edx, (%eax)
++#else
++	/* map the control page at its virtual address */
++
++	movl	PTR(VA_PGD)(%ebp), %edi
++	movl	PTR(VA_CONTROL_PAGE)(%ebp), %eax
++	andl	$0xffc00000, %eax
++	shrl	$20, %eax
++	addl	%edi, %eax
++
++	movl	PTR(PA_PTE_0)(%ebp), %edx
++	orl	$PAGE_ATTR, %edx
++	movl	%edx, (%eax)
++
++	movl	PTR(VA_PTE_0)(%ebp), %edi
++	movl	PTR(VA_CONTROL_PAGE)(%ebp), %eax
++	andl	$0x003ff000, %eax
++	shrl	$10, %eax
++	addl	%edi, %eax
++
++	movl	PTR(PA_CONTROL_PAGE)(%ebp), %edx
++	orl	$PAGE_ATTR, %edx
++	movl	%edx, (%eax)
++
++	/* identity map the control page at its physical address */
++
++	movl	PTR(VA_PGD)(%ebp), %edi
++	movl	PTR(PA_CONTROL_PAGE)(%ebp), %eax
++	andl	$0xffc00000, %eax
++	shrl	$20, %eax
++	addl	%edi, %eax
++
++	movl	PTR(PA_PTE_1)(%ebp), %edx
++	orl	$PAGE_ATTR, %edx
++	movl	%edx, (%eax)
++
++	movl	PTR(VA_PTE_1)(%ebp), %edi
++	movl	PTR(PA_CONTROL_PAGE)(%ebp), %eax
++	andl	$0x003ff000, %eax
++	shrl	$10, %eax
++	addl	%edi, %eax
++
++	movl	PTR(PA_CONTROL_PAGE)(%ebp), %edx
++	orl	$PAGE_ATTR, %edx
++	movl	%edx, (%eax)
++#endif
+ 
+-	/*
+-	 * Must be relocatable PIC code callable as a C function, that once
+-	 * it starts can not use the previous processes stack.
+-	 */
+-	.globl relocate_new_kernel
+ relocate_new_kernel:
+ 	/* read the arguments and say goodbye to the stack */
+ 	movl  4(%esp), %ebx /* page_list */
+-	movl  8(%esp), %ebp /* reboot_code_buffer */
++	movl  8(%esp), %ebp /* list of pages */
+ 	movl  12(%esp), %edx /* start address */
+ 	movl  16(%esp), %ecx /* cpu_has_pae */
+ 
+@@ -24,11 +146,26 @@ relocate_new_kernel:
+ 	pushl $0
+ 	popfl
+ 
+-	/* set a new stack at the bottom of our page... */
+-	lea   4096(%ebp), %esp
++	/* get physical address of control page now */
++	/* this is impossible after page table switch */
++	movl	PTR(PA_CONTROL_PAGE)(%ebp), %edi
++
++	/* switch to new set of page tables */
++	movl	PTR(PA_PGD)(%ebp), %eax
++	movl	%eax, %cr3
++
++	/* setup a new stack at the end of the physical control page */
++	lea	4096(%edi), %esp
+ 
+-	/* store the parameters back on the stack */
+-	pushl   %edx /* store the start address */
++	/* jump to identity mapped page */
++	movl    %edi, %eax
++	addl    $(identity_mapped - relocate_kernel), %eax
++	pushl   %eax
++	ret
++
++identity_mapped:
++	/* store the start address on the stack */
++	pushl   %edx
+ 
+ 	/* Set cr0 to a known state:
+ 	 * 31 0 == Paging disabled
+@@ -113,8 +250,3 @@ relocate_new_kernel:
+ 	xorl    %edi, %edi
+ 	xorl    %ebp, %ebp
+ 	ret
+-relocate_new_kernel_end:
+-
+-	.globl relocate_new_kernel_size
+-relocate_new_kernel_size:
+-	.long relocate_new_kernel_end - relocate_new_kernel
+--- a/include/asm-i386/kexec.h
++++ b/include/asm-i386/kexec.h
+@@ -1,6 +1,26 @@
+ #ifndef _I386_KEXEC_H
+ #define _I386_KEXEC_H
+ 
++#define PA_CONTROL_PAGE  0
++#define VA_CONTROL_PAGE  1
++#define PA_PGD           2
++#define VA_PGD           3
++#define PA_PTE_0         4
++#define VA_PTE_0         5
++#define PA_PTE_1         6
++#define VA_PTE_1         7
++#ifdef CONFIG_X86_PAE
++#define PA_PMD_0         8
++#define VA_PMD_0         9
++#define PA_PMD_1         10
++#define VA_PMD_1         11
++#define PAGES_NR         12
++#else
++#define PAGES_NR         8
++#endif
++
++#ifndef __ASSEMBLY__
++
+ #include <asm/fixmap.h>
+ #include <asm/ptrace.h>
+ #include <asm/string.h>
+@@ -72,5 +92,12 @@ static inline void crash_setup_regs(stru
+                newregs->eip = (unsigned long)current_text_addr();
+        }
+ }
++asmlinkage NORET_TYPE void
++relocate_kernel(unsigned long indirection_page,
++		unsigned long control_page,
++		unsigned long start_address,
++		unsigned int has_pae) ATTRIB_NORET;
++
++#endif /* __ASSEMBLY__ */
+ 
+ #endif /* _I386_KEXEC_H */
--- /dev/null
+++ work/patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec-move_segment_code-i386.patch	2006-10-23 11:36:17.000000000 +0900
@@ -0,0 +1,169 @@
+kexec: Move asm segment handling code to the assembly file (i386)
+
+This patch moves the idt, gdt, and segment handling code from machine_kexec.c
+to relocate_kernel.S. The main reason behind this move is to avoid code 
+duplication in the Xen hypervisor. With this patch all code required to kexec
+is put on the control page.
+
+On top of that this patch also counts as a cleanup - I think it is much
+nicer to write assembly directly in assembly files than wrap inline assembly
+in C functions for no apparent reason.
+
+Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
+---
+
+ Applies to 2.6.19-rc1.
+
+ machine_kexec.c   |   59 -----------------------------------------------------
+ relocate_kernel.S |   58 +++++++++++++++++++++++++++++++++++++++++++++++-----
+ 2 files changed, 53 insertions(+), 64 deletions(-)
+
+--- 0002/arch/i386/kernel/machine_kexec.c
++++ work/arch/i386/kernel/machine_kexec.c	2006-10-05 15:49:08.000000000 +0900
+@@ -29,48 +29,6 @@ static u32 kexec_pmd1[1024] PAGE_ALIGNED
+ static u32 kexec_pte0[1024] PAGE_ALIGNED;
+ static u32 kexec_pte1[1024] PAGE_ALIGNED;
+ 
+-static void set_idt(void *newidt, __u16 limit)
+-{
+-	struct Xgt_desc_struct curidt;
+-
+-	/* ia32 supports unaliged loads & stores */
+-	curidt.size    = limit;
+-	curidt.address = (unsigned long)newidt;
+-
+-	load_idt(&curidt);
+-};
+-
+-
+-static void set_gdt(void *newgdt, __u16 limit)
+-{
+-	struct Xgt_desc_struct curgdt;
+-
+-	/* ia32 supports unaligned loads & stores */
+-	curgdt.size    = limit;
+-	curgdt.address = (unsigned long)newgdt;
+-
+-	load_gdt(&curgdt);
+-};
+-
+-static void load_segments(void)
+-{
+-#define __STR(X) #X
+-#define STR(X) __STR(X)
+-
+-	__asm__ __volatile__ (
+-		"\tljmp $"STR(__KERNEL_CS)",$1f\n"
+-		"\t1:\n"
+-		"\tmovl $"STR(__KERNEL_DS)",%%eax\n"
+-		"\tmovl %%eax,%%ds\n"
+-		"\tmovl %%eax,%%es\n"
+-		"\tmovl %%eax,%%fs\n"
+-		"\tmovl %%eax,%%gs\n"
+-		"\tmovl %%eax,%%ss\n"
+-		::: "eax", "memory");
+-#undef STR
+-#undef __STR
+-}
+-
+ /*
+  * A architecture hook called to validate the
+  * proposed image and prepare the control pages
+@@ -127,23 +85,6 @@ NORET_TYPE void machine_kexec(struct kim
+ 	page_list[PA_PTE_1] = __pa(kexec_pte1);
+ 	page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+ 
+-	/* The segment registers are funny things, they have both a
+-	 * visible and an invisible part.  Whenever the visible part is
+-	 * set to a specific selector, the invisible part is loaded
+-	 * with from a table in memory.  At no other time is the
+-	 * descriptor table in memory accessed.
+-	 *
+-	 * I take advantage of this here by force loading the
+-	 * segments, before I zap the gdt with an invalid value.
+-	 */
+-	load_segments();
+-	/* The gdt & idt are now invalid.
+-	 * If you want to load them you must set up your own idt & gdt.
+-	 */
+-	set_gdt(phys_to_virt(0),0);
+-	set_idt(phys_to_virt(0),0);
+-
+-	/* now call it */
+ 	relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
+ 			image->start, cpu_has_pae);
+ }
+--- 0002/arch/i386/kernel/relocate_kernel.S
++++ work/arch/i386/kernel/relocate_kernel.S	2006-10-05 16:03:21.000000000 +0900
+@@ -154,14 +154,45 @@ relocate_new_kernel:
+ 	movl	PTR(PA_PGD)(%ebp), %eax
+ 	movl	%eax, %cr3
+ 
++	/* setup idt */
++	movl	%edi, %eax
++	addl	$(idt_48 - relocate_kernel), %eax
++	lidtl	(%eax)
++
++	/* setup gdt */
++	movl	%edi, %eax
++	addl	$(gdt - relocate_kernel), %eax
++	movl	%edi, %esi
++	addl	$((gdt_48 - relocate_kernel) + 2), %esi
++	movl	%eax, (%esi)
++	
++	movl	%edi, %eax
++	addl	$(gdt_48 - relocate_kernel), %eax
++	lgdtl	(%eax)
++
++	/* setup data segment registers */
++	mov	$(gdt_ds - gdt), %eax
++	mov	%eax, %ds
++	mov	%eax, %es
++	mov	%eax, %fs
++	mov	%eax, %gs
++	mov	%eax, %ss
++	
+ 	/* setup a new stack at the end of the physical control page */
+ 	lea	4096(%edi), %esp
+ 
+-	/* jump to identity mapped page */
+-	movl    %edi, %eax
+-	addl    $(identity_mapped - relocate_kernel), %eax
+-	pushl   %eax
+-	ret
++	/* load new code segment and jump to identity mapped page */
++	movl	%edi, %esi
++	xorl	%eax, %eax
++	pushl	%eax
++	pushl	%esi
++	pushl	%eax
++	movl	$(gdt_cs - gdt), %eax
++	pushl	%eax	
++	movl	%edi, %eax
++	addl	$(identity_mapped - relocate_kernel),%eax
++	pushl	%eax
++	iretl
+ 
+ identity_mapped:
+ 	/* store the start address on the stack */
+@@ -250,3 +281,20 @@ identity_mapped:
+ 	xorl    %edi, %edi
+ 	xorl    %ebp, %ebp
+ 	ret
++
++	.align	16
++gdt:
++	.quad	0x0000000000000000	/* NULL descriptor */
++gdt_cs:	
++	.quad	0x00cf9a000000ffff	/* kernel 4GB code at 0x00000000 */
++gdt_ds:
++	.quad	0x00cf92000000ffff	/* kernel 4GB data at 0x00000000 */
++gdt_end:
++	
++gdt_48:
++	.word	gdt_end - gdt - 1	/* limit */
++	.long	0			/* base - filled in by code above */
++
++idt_48:
++	.word	0			/* limit */
++	.long	0			/* base */
--- /dev/null
+++ work/patches/linux-2.6.16.29/linux-2.6.19-rc1-kexec-xen-i386.patch	2006-10-23 11:36:17.000000000 +0900
@@ -0,0 +1,54 @@
+--- 0004/arch/i386/kernel/machine_kexec.c
++++ work/arch/i386/kernel/machine_kexec.c	2006-10-11 18:34:06.000000000 +0900
+@@ -20,6 +20,10 @@
+ #include <asm/desc.h>
+ #include <asm/system.h>
+ 
++#ifdef CONFIG_XEN
++#include <xen/interface/kexec.h>
++#endif
++
+ #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+ static u32 kexec_pgd[1024] PAGE_ALIGNED;
+ #ifdef CONFIG_X86_PAE
+@@ -29,6 +33,40 @@ static u32 kexec_pmd1[1024] PAGE_ALIGNED
+ static u32 kexec_pte0[1024] PAGE_ALIGNED;
+ static u32 kexec_pte1[1024] PAGE_ALIGNED;
+ 
++#ifdef CONFIG_XEN
++
++#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT)
++
++#if PAGES_NR > KEXEC_XEN_NO_PAGES
++#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break
++#endif
++
++#if PA_CONTROL_PAGE != 0
++#error PA_CONTROL_PAGE is non zero - Xen support will break
++#endif
++
++void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
++{
++	void *control_page;
++
++	memset(xki->page_list, 0, sizeof(xki->page_list));
++
++	control_page = page_address(image->control_code_page);
++	memcpy(control_page, relocate_kernel, PAGE_SIZE);
++
++	xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
++	xki->page_list[PA_PGD] = __ma(kexec_pgd);
++#ifdef CONFIG_X86_PAE
++	xki->page_list[PA_PMD_0] = __ma(kexec_pmd0);
++	xki->page_list[PA_PMD_1] = __ma(kexec_pmd1);
++#endif
++	xki->page_list[PA_PTE_0] = __ma(kexec_pte0);
++	xki->page_list[PA_PTE_1] = __ma(kexec_pte1);
++
++}
++
++#endif /* CONFIG_XEN */
++
+ /*
+  * A architecture hook called to validate the
+  * proposed image and prepare the control pages
--- 0004/patches/linux-2.6.16.29/series
+++ work/patches/linux-2.6.16.29/series	2006-10-23 11:36:16.000000000 +0900
@@ -1,6 +1,9 @@
 kexec-generic.patch
 git-2efe55a9cec8418f0e0cde3dc3787a42fddc4411.patch
 git-2a8a3d5b65e86ec1dfef7d268c64a909eab94af7.patch
+git-3566561bfadffcb5dbc85d576be80c0dbf2cccc9.patch
+linux-2.6.19-rc1-kexec-move_segment_code-i386.patch
+linux-2.6.19-rc1-kexec-xen-i386.patch
 blktap-aio-16_03_06.patch
 device_bind.patch
 fix-hz-suspend.patch
--- 0001/xen/arch/x86/x86_32/entry.S
+++ work/xen/arch/x86/x86_32/entry.S	2006-10-23 11:36:16.000000000 +0900
@@ -672,6 +672,7 @@ ENTRY(hypercall_table)
         .long do_hvm_op
         .long do_sysctl             /* 35 */
         .long do_domctl
+        .long do_kexec_op
         .rept NR_hypercalls-((.-hypercall_table)/4)
         .long do_ni_hypercall
         .endr
@@ -714,6 +715,7 @@ ENTRY(hypercall_args_table)
         .byte 2 /* do_hvm_op            */
         .byte 1 /* do_sysctl            */  /* 35 */
         .byte 1 /* do_domctl            */
+        .byte 2 /* do_kexec_op          */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
--- 0004/xen/include/asm-x86/x86_32/elf.h
+++ work/xen/include/asm-x86/x86_32/elf.h	2006-10-23 11:36:17.000000000 +0900
@@ -1,14 +1,39 @@
+/*
+ * Based heavily on include/asm-i386/elf.h and
+ * include/asm-i386/system.h from Linux 2.6.16
+ */
+
 #ifndef __X86_32_ELF_H__
 #define __X86_32_ELF_H__
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#define ELF_NGREG 17
 
-#define ELF_NGREG 1	   /* XXX: Define to be at least as large as
-			      however many register slots are needed when
-			      crash notes are written during crash dump */
+/* XXX: Xen doesn't have orig_eax.  For kdump, on a dom0 crash, the values
+ * for the crashing CPU could be passed down from dom0, but is that
+ * necessary?
+ * Also, I'm not sure why fs and gs are derived from the CPU
+ * rather than regs */
 
-#define ELF_CORE_COPY_REGS(pr_reg, regs)                                \
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+#define ELF_CORE_COPY_REGS(pr_reg, regs) do {                           \
+    unsigned i;                                                         \
+    pr_reg[0] = regs->ebx;                                              \
+    pr_reg[1] = regs->ecx;                                              \
+    pr_reg[2] = regs->edx;                                              \
+    pr_reg[3] = regs->esi;                                              \
+    pr_reg[4] = regs->edi;                                              \
+    pr_reg[5] = regs->ebp;                                              \
+    pr_reg[6] = regs->eax;                                              \
+    pr_reg[7] = regs->ds;                                               \
+    pr_reg[8] = regs->es;                                               \
+    asm volatile("mov %%fs,%0":"=rm" (i)); pr_reg[9]  = i;              \
+    asm volatile("mov %%gs,%0":"=rm" (i)); pr_reg[10] = i;              \
+    pr_reg[11] = 0; /* regs->orig_eax; */                               \
+    pr_reg[12] = regs->eip;                                             \
+    pr_reg[13] = regs->cs;                                              \
+    pr_reg[14] = regs->eflags;                                          \
+    pr_reg[15] = regs->esp;                                             \
+    pr_reg[16] = regs->ss;                                              \
+} while(0)
 
 #endif /* __X86_32_ELF_H__ */
 
--- 0004/xen/include/asm-x86/x86_32/kexec.h
+++ work/xen/include/asm-x86/x86_32/kexec.h	2006-10-23 11:36:17.000000000 +0900
@@ -1,36 +1,92 @@
-#ifndef __X86_32_KEXEC_H__
-#define __X86_32_KEXEC_H__
+/******************************************************************************
+ * kexec.h
+ * 
+ * Based heavily on machine_kexec.c and kexec.h from Linux 2.6.19-rc1
+ *
+ */
+  
+#ifndef __X86_KEXEC_X86_32_H__
+#define __X86_KEXEC_X86_32_H__
 
-#include <xen/lib.h>       /* for printk() used in stub */
 #include <xen/types.h>
-#include <public/xen.h>
 #include <xen/kexec.h>
+#include <asm/fixmap.h>
+#include <asm/processor.h>
 
+/* Build newregs from oldregs, then overwrite esp and ss: the CPU does not
+ * save ss and esp on stack if execution is already running in kernel
+ * mode at the time of NMI occurrence. This code fixes it.
+ */
 static inline void crash_fixup_ss_esp(struct cpu_user_regs *newregs,
-                                      struct cpu_user_regs *oldregs)
+                    struct cpu_user_regs *oldregs)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
-    return;
+    memcpy(newregs, oldregs, sizeof(*newregs)); /* start from a full copy */
+    newregs->esp = (unsigned long)&(oldregs->esp); /* address of oldregs' esp slot */
+    __asm__ __volatile__(
+            "xorl %%eax, %%eax\n\t"  /* clear eax before the 16-bit move */
+            "movw %%ss, %%ax\n\t"    /* current ss into the low half of eax */
+            :"=a"(newregs->ss));     /* eax is the asm output */
 }
-
+  
+/*
+ * Fill newregs for a crash dump.  If oldregs is non-NULL (coming via a
+ * kernel mode exception) it is copied and its ss/esp are fixed up; if it
+ * is NULL (coming via panic) the current CPU registers are read instead.
+ */
 static inline void crash_setup_regs(struct cpu_user_regs *newregs,
-                                    struct cpu_user_regs *oldregs)
+			     struct cpu_user_regs *oldregs)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    if (oldregs)
+        crash_fixup_ss_esp(newregs, oldregs);
+    else { /* no saved frame supplied: read each register directly */
+        __asm__ __volatile__("movl %%ebx,%0" : "=m"(newregs->ebx));
+        __asm__ __volatile__("movl %%ecx,%0" : "=m"(newregs->ecx));
+        __asm__ __volatile__("movl %%edx,%0" : "=m"(newregs->edx));
+        __asm__ __volatile__("movl %%esi,%0" : "=m"(newregs->esi));
+        __asm__ __volatile__("movl %%edi,%0" : "=m"(newregs->edi));
+        __asm__ __volatile__("movl %%ebp,%0" : "=m"(newregs->ebp));
+        __asm__ __volatile__("movl %%eax,%0" : "=m"(newregs->eax));
+        __asm__ __volatile__("movl %%esp,%0" : "=m"(newregs->esp));
+        __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(newregs->ss)); /* segment registers are moved through ax */
+        __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(newregs->cs));
+        __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(newregs->ds));
+        __asm__ __volatile__("movw %%es, %%ax;" :"=a"(newregs->es));
+        __asm__ __volatile__("pushfl; popl %0" :"=m"(newregs->eflags));
+        /* fs and gs are not read in this branch. */
+        newregs->eip = (unsigned long)current_text_addr(); /* presumably an address inside this function - confirm */
+    }
 }
 
+/*
+ * From Linux 2.6.16's include/asm-i386/mach-xen/asm/ptrace.h
+ *
+ * user_mode(regs) reports whether bit 1 of the saved CS selector is set,
+ * i.e. whether the selector's RPL is 2 or 3.  Unlike the user_mode_vm()
+ * check that the original Linux comment describes, EFLAGS is not
+ * examined here, so V8086 mode is not detected by this test.
+ * NOTE(review): confirm RPL 2/3 is the intended notion of "user" here.
+ */
 static inline int user_mode(struct cpu_user_regs *regs)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
-    return -1;
+    return (regs->cs & 2) != 0;
 }
 
+typedef asmlinkage void (*relocate_new_kernel_t)( /* routine called by machine_kexec() */
+               unsigned long indirection_page,
+               unsigned long page_list,    /* machine_kexec() passes the array's address */
+               unsigned long start_address,
+               unsigned int has_pae);      /* machine_kexec() passes cpu_has_pae */
+
 static inline void machine_kexec(xen_kexec_image_t *image)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    relocate_new_kernel_t rnk;
+    /* Call the routine whose address is stored in image->page_list[1]. */
+    rnk = (relocate_new_kernel_t) image->page_list[1];
+    (*rnk)(image->indirection_page, (unsigned long)image->page_list,
+           image->start_address, (unsigned int)cpu_has_pae); /* cast matches has_pae's type */
 }
 
-#endif /* __X86_32_KEXEC_H__ */
+#endif /* __X86_KEXEC_X86_32_H__ */
 
 /*
  * Local variables:

  parent reply	other threads:[~2006-10-23  9:05 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-10-23  9:05 [PATCH 00/04] Kexec / Kdump: Release 20061023 (xen-unstable-11856) Magnus Damm
2006-10-23  9:05 ` [PATCH 01/04] Kexec / Kdump: Generic code Magnus Damm
2006-10-23  9:05 ` [PATCH 02/04] Kexec / Kdump: Code shared between x86_32 and x86_64 Magnus Damm
2006-10-23  9:05 ` Magnus Damm [this message]
2006-10-23  9:05 ` [PATCH 04/04] Kexec / Kdump: x86_64 specific code Magnus Damm
2006-10-25 10:10 ` [PATCH 00/04] Kexec / Kdump: Release 20061023 (xen-unstable-11856) Keir Fraser
2006-10-25 11:25   ` Magnus Damm
  -- strict thread matches above, loose matches on Subject: below --
2006-11-30  7:58 [PATCH 00/04] Kexec / Kdump: Release 20061130 (xen-unstable-12621) Magnus Damm
2006-11-30  7:58 ` [PATCH 03/04] Kexec / Kdump: x86_32 specific code Magnus Damm
2006-11-22  7:10 [PATCH 00/04] Kexec / Kdump: Release 20061122 (xen-unstable-12502) Magnus Damm
2006-11-22  7:11 ` [PATCH 03/04] Kexec / Kdump: x86_32 specific code Magnus Damm
2006-11-08 13:48 [PATCH 00/04] Kexec / Kdump: Release 20061108 (xen-unstable-12281) Magnus Damm
2006-11-08 13:49 ` [PATCH 03/04] Kexec / Kdump: x86_32 specific code Magnus Damm
2006-10-30 10:03 [PATCH 00/04] Kexec / Kdump: Release 20061030 (xen-unstable-12025) Magnus Damm
2006-10-30 10:04 ` [PATCH 03/04] Kexec / Kdump: x86_32 specific code Magnus Damm
2006-10-16  8:33 [PATCH 00/04] Kexec / Kdump: Release 20061016 (xen-unstable-11760) Magnus Damm
2006-10-16  8:33 ` [PATCH 03/04] Kexec / Kdump: x86_32 specific code Magnus Damm

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20061023090540.26706.58041.sendpatchset@localhost \
    --to=magnus@valinux.co.jp \
    --cc=Keir.Fraser@cl.cam.ac.uk \
    --cc=horms@verge.net.au \
    --cc=m+Ian.Pratt@cl.cam.ac.uk \
    --cc=magnus.damm@gmail.com \
    --cc=moriwaka@valinux.co.jp \
    --cc=takebe_akio@jp.fujitsu.com \
    --cc=xen-devel@lists.xensource.com \
    --cc=yamahata@valinux.co.jp \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.