All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH]: kexec: framework and i386
@ 2006-04-07  7:42 Horms
  2006-04-07 15:00 ` Don Zickus
                   ` (2 more replies)
  0 siblings, 3 replies; 68+ messages in thread
From: Horms @ 2006-04-07  7:42 UTC (permalink / raw)
  To: xen-devel; +Cc: Magnus Damm

kexec: framework and i386

Here is a first cut of kexec for dom0/xen, which will actually
kexec the physical machine from xen. The approach taken is
to move the architecture-dependent kexec code into a new hypercall.

Some notes:
  * machine_kexec_cleanup() and machine_kexec_prepare() don't do
    anything in i386. So while this patch adds a framework for them,
    I am not sure what parameters are needed at this stage.
  * Only works for UP, as machine_shutdown is not implemented yet
  * kexecing into xen does not seem to work, I think that 
    kexec-tools needs updating, but I have not investigated yet
  * I don't believe that kdump works yet
  * This patch was prepared against xen-unstable.hg 9514
    As of today (9574) two new hypercalls have been added.
    I rediffed and moved the kexec hypercall to 33. However
    this exceeds hypercall_NR, which is currently 32. 
    I tried increasing this, but the dom0 now crashes 
    in entry.S on init. Even after rebuilding both xen and the kernel
    completely from scratch after a make distclean. Help!!

Prepared with the assistance of my colleague Magnus Damm

Signed-Off-By: Horms <horms@verge.net.au>

--- from-0001/linux-2.6-xen-sparse/arch/i386/Kconfig
+++ to-work/linux-2.6-xen-sparse/arch/i386/Kconfig	2006-04-03 15:13:38.000000000 +0900
@@ -726,7 +726,7 @@ source kernel/Kconfig.hz
 
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && !X86_XEN
+	depends on EXPERIMENTAL
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
--- /dev/null
+++ to-work/linux-2.6-xen-sparse/arch/i386/kernel/crash-xen.c	2006-04-03 15:13:38.000000000 +0900
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific (i386-xen) functions for kexec based crash dumps.
+ *
+ * Created by: Horms <horms@verge.net.au>
+ *
+ */
+
+#include <linux/kernel.h> /* For printk */
+
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+	/* XXX: This should do something */
+	printk("xen-kexec: Need to turn of other CPUS in "
+	       "machine_crash_shutdown()\n");
+}
--- /dev/null
+++ to-work/linux-2.6-xen-sparse/arch/i386/kernel/machine_kexec-xen.c	2006-04-07 12:59:51.000000000 +0900
@@ -0,0 +1,80 @@
+/*
+ * machine_kexec-xen.c - handle transition of Linux booting another kernel
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Loosely based on arch/i386/kernel/machine_kexec-xen.c
+ */
+
+#include <linux/kexec.h>
+#include <xen/interface/kexec.h>
+#include <linux/mm.h>
+#include <asm/hypercall.h>
+
+const extern unsigned char relocate_new_kernel[];
+extern unsigned int relocate_new_kernel_size;
+static kexec_arg_t hypercall_arg;
+
+/*
+ * An architecture hook called to validate the
+ * proposed image and prepare the control pages
+ * as needed.  The pages for KEXEC_CONTROL_CODE_SIZE
+ * have been allocated, but the segments have not yet
+ * been copied into the kernel.
+ *
+ * Do whatever setup is needed on image and the
+ * reboot code buffer to allow us to avoid allocations
+ * later.
+ *
+ * Currently nothing.
+ */
+int machine_kexec_prepare(struct kimage *image)
+{
+    return 0;
+}
+
+/*
+ * Undo anything leftover by machine_kexec_prepare
+ * when an image is freed.
+ */
+void machine_kexec_cleanup(struct kimage *image)
+{
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+	kimage_entry_t *ptr, entry;
+
+	/* 
+	 * Translate addresses inside head from physical to machine
+	 * In practice, this only needs to change the pointer to
+	 * indirection pages as non-indirected pages are relative.
+	 */
+	ptr = &image->head;
+	while ((entry = *ptr) && !(entry & IND_DONE)) {
+		if (!(entry & IND_DESTINATION))
+			*ptr = phys_to_machine(entry & PAGE_MASK) |
+				(entry & ~PAGE_MASK);
+
+		if (entry & IND_INDIRECTION)
+			ptr = __va(entry & PAGE_MASK);
+		else
+			ptr++;
+	}
+
+	/* Set up arguments to hypercall */
+	hypercall_arg.u.kexec.indirection_page = image->head;
+	hypercall_arg.u.kexec.reboot_code_buffer = 
+		pfn_to_mfn(page_to_pfn(image->control_code_page)) << PAGE_SHIFT;
+	hypercall_arg.u.kexec.start_address = image->start;
+	hypercall_arg.u.kexec.relocate_new_kernel = relocate_new_kernel;
+	hypercall_arg.u.kexec.relocate_new_kernel_size = 
+		relocate_new_kernel_size;
+
+	/* Let Xen do the rest of the work */
+	HYPERVISOR_kexec(KEXEC_CMD_kexec, &hypercall_arg);
+}
--- from-0001/linux-2.6-xen-sparse/drivers/xen/core/reboot.c
+++ to-work/linux-2.6-xen-sparse/drivers/xen/core/reboot.c	2006-04-03 15:13:38.000000000 +0900
@@ -38,6 +38,11 @@ extern void ctrl_alt_del(void);
  */
 #define SHUTDOWN_HALT      4
 
+void machine_shutdown(void) 
+{
+	printk("machine_shutdown: does nothing\n");
+}
+
 void machine_emergency_restart(void)
 {
 	/* We really want to get pending console data out before we die. */
--- from-0001/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
+++ to-work/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h	2006-04-06 11:00:03.000000000 +0900
@@ -37,6 +37,8 @@
 # error "please don't include this file directly"
 #endif
 
+#include <xen/interface/kexec.h>
+
 #define __STR(x) #x
 #define STR(x) __STR(x)
 
@@ -329,6 +331,13 @@ HYPERVISOR_nmi_op(
 	return _hypercall2(int, nmi_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_kexec(
+	unsigned long op, kexec_arg_t * arg)
+{
+	return _hypercall2(int, kexec_op, op, arg); 
+}
+
 #endif /* __HYPERCALL_H__ */
 
 /*
Binary files /dev/null and to-work/linux-2.6.16-xen/kernel/.kexec.c.swp differ
--- from-0001/xen/arch/x86/x86_32/Makefile
+++ to-work/xen/arch/x86/x86_32/Makefile	2006-04-03 16:25:31.000000000 +0900
@@ -5,6 +5,7 @@ obj-y += entry.o
 obj-y += mm.o
 obj-y += seg_fixup.o
 obj-y += traps.o
+obj-y += machine_kexec.o
 
 obj-$(supervisor_mode_kernel) += supervisor_mode_kernel.o
 
--- from-0001/xen/arch/x86/x86_32/entry.S
+++ to-work/xen/arch/x86/x86_32/entry.S	2006-04-04 13:02:36.000000000 +0900
@@ -648,6 +648,7 @@ ENTRY(hypercall_table)
         .long do_acm_op
         .long do_nmi_op
         .long do_arch_sched_op
+        .long do_kexec             /* 30 */
         .rept NR_hypercalls-((.-hypercall_table)/4)
         .long do_ni_hypercall
         .endr
@@ -683,6 +684,7 @@ ENTRY(hypercall_args_table)
         .byte 1 /* do_acm_op            */
         .byte 2 /* do_nmi_op            */
         .byte 2 /* do_arch_sched_op     */
+        .byte 2 /* do_kexec             */  /* 30 */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
--- /dev/null
+++ to-work/xen/arch/x86/x86_32/machine_kexec.c	2006-04-07 12:44:16.000000000 +0900
@@ -0,0 +1,168 @@
+/******************************************************************************
+ * arch/x86/machine_kexec.c
+ * 
+ * Created By: Horms
+ *
+ * Based heavily on arch/i386/machine_kexec.c from Linux 2.6.16
+ */
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/domain_page.h> 
+#include <xen/timer.h>
+#include <xen/sched.h>
+#include <asm/page.h> 
+#include <asm/flushtlb.h>
+#include <public/xen.h>
+#include <public/kexec.h>
+
+typedef asmlinkage void (*relocate_new_kernel_t)(
+                    unsigned long indirection_page,
+                    unsigned long reboot_code_buffer,
+                    unsigned long start_address,
+                    unsigned int has_pae);
+
+#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+
+#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L2_ATTR (_PAGE_PRESENT)
+
+#ifndef CONFIG_X86_PAE
+
+static u32 pgtable_level1[L1_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+
+static void identity_map_page(unsigned long address)
+{
+    unsigned long mfn;
+    u32 *pgtable_level2;
+
+    /* Find the current page table */
+    mfn = read_cr3() >> PAGE_SHIFT;
+    pgtable_level2 = map_domain_page(mfn);
+
+    /* Identity map the page table entry */
+    pgtable_level1[l1_table_offset(address)] = address | L0_ATTR;
+    pgtable_level2[l2_table_offset(address)] = __pa(pgtable_level1) | L1_ATTR;
+
+    /* Flush the tlb so the new mapping takes effect.
+     * Global tlb entries are not flushed but that is not an issue.
+     */
+    write_cr3(mfn << PAGE_SHIFT);
+
+    unmap_domain_page(pgtable_level2);
+}
+
+#else
+static u64 pgtable_level1[L1_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+static u64 pgtable_level2[L2_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+
+static void identity_map_page(unsigned long address)
+{
+    int mfn;
+    intpte_t *pgtable_level3;
+
+    /* Find the current page table */
+    mfn = read_cr3() >> PAGE_SHIFT;
+    pgtable_level3 = map_domain_page(mfn);
+
+    /* Identity map the page table entry */
+    pgtable_level1[l1_table_offset(address)] = address | L0_ATTR;
+    pgtable_level2[l2_table_offset(address)] = __pa(pgtable_level1) | L1_ATTR;
+    set_64bit(&pgtable_level3[l3_table_offset(address)],
+	      __pa(pgtable_level2) | L2_ATTR);
+
+    /* Flush the tlb so the new mapping takes effect.
+     * Global tlb entries are not flushed but that is not an issue.
+     */
+    load_cr3(mfn << PAGE_SHIFT);
+
+    unmap_domain_page(pgtable_level3);
+}
+#endif
+
+static void kexec_load_segments(void)
+{
+#define __SSTR(X) #X
+#define SSTR(X) __SSTR(X)
+    __asm__ __volatile__ (
+        "\tljmp $"SSTR(__HYPERVISOR_CS)",$1f\n"
+        "\t1:\n"
+        "\tmovl $"SSTR(__HYPERVISOR_DS)",%%eax\n"
+        "\tmovl %%eax,%%ds\n"
+        "\tmovl %%eax,%%es\n"
+        "\tmovl %%eax,%%fs\n"
+        "\tmovl %%eax,%%gs\n"
+        "\tmovl %%eax,%%ss\n"
+        ::: "eax", "memory");
+#undef SSTR
+#undef __SSTR
+}
+
+#define kexec_load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
+static void kexec_set_idt(void *newidt, __u16 limit)
+{
+    struct Xgt_desc_struct curidt;
+
+    /* ia32 supports unaligned loads & stores */
+    curidt.size    = limit;
+    curidt.address = (unsigned long)newidt;
+    
+    kexec_load_idt(&curidt);
+
+};
+
+#define kexec_load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
+static void kexec_set_gdt(void *newgdt, __u16 limit)
+{
+    struct Xgt_desc_struct curgdt;
+
+    /* ia32 supports unaligned loads & stores */
+    curgdt.size    = limit;
+    curgdt.address = (unsigned long)newgdt;
+
+    kexec_load_gdt(&curgdt);
+};
+
+int machine_kexec_prepare(struct kexec_arg *arg)
+{
+	return 0;
+}
+
+void machine_kexec_cleanup(struct kexec_arg *arg)
+{
+}
+
+void machine_kexec(struct kexec_arg *arg)
+{
+    relocate_new_kernel_t rnk;
+
+    local_irq_disable();
+
+    identity_map_page(arg->u.kexec.reboot_code_buffer);
+
+    copy_from_user((void *)arg->u.kexec.reboot_code_buffer, 
+           arg->u.kexec.relocate_new_kernel,
+           arg->u.kexec.relocate_new_kernel_size);
+
+    kexec_load_segments();
+
+    kexec_set_gdt(__va(0),0);
+
+    kexec_set_idt(__va(0),0);
+
+    rnk = (relocate_new_kernel_t) arg->u.kexec.reboot_code_buffer;
+
+    (*rnk)(arg->u.kexec.indirection_page, arg->u.kexec.reboot_code_buffer, 
+           arg->u.kexec.start_address, cpu_has_pae);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- from-0001/xen/common/Makefile
+++ to-work/xen/common/Makefile	2006-04-03 15:13:38.000000000 +0900
@@ -24,6 +24,7 @@ obj-y += trace.o
 obj-y += timer.o
 obj-y += vsprintf.o
 obj-y += xmalloc.o
+obj-y += kexec.o
 
 obj-$(perfc)       += perfc.o
 obj-$(crash_debug) += gdbstub.o
--- /dev/null
+++ to-work/xen/common/kexec.c	2006-04-07 13:06:54.000000000 +0900
@@ -0,0 +1,54 @@
+/*
+ * Architecture independent kexec code for Xen
+ *
+ * At this stage, just a switch for the kexec hypercall into
+ * architecture dependent code.
+ *
+ * Created By: Horms <horms@verge.net.au>
+ */
+
+#include <xen/lib.h>
+#include <xen/errno.h>
+#include <xen/guest_access.h>
+#include <public/xen.h>
+#include <public/kexec.h>
+
+extern int machine_kexec_prepare(struct kexec_arg *arg);
+extern void machine_kexec_cleanup(struct kexec_arg *arg);
+extern void machine_kexec(struct kexec_arg *arg);
+
+int do_kexec(unsigned long op, 
+             GUEST_HANDLE(kexec_arg_t) uarg)
+{
+    struct kexec_arg arg;
+
+    if ( unlikely(copy_from_guest(&arg, uarg, 1) != 0) )
+    {
+        printk("do_kexec: __copy_from_guest failed");
+        return -EFAULT;
+    }
+
+	switch(op) {
+	case KEXEC_CMD_kexec:
+		machine_kexec(&arg);
+        return -EINVAL; /* Not Reached */
+	case KEXEC_CMD_kexec_prepare:
+		return machine_kexec_prepare(&arg);
+	case KEXEC_CMD_kexec_cleanup:
+		machine_kexec_cleanup(&arg);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
--- from-0001/xen/include/asm-x86/hypercall.h
+++ to-work/xen/include/asm-x86/hypercall.h	2006-04-07 13:05:06.000000000 +0900
@@ -6,6 +6,7 @@
 #define __ASM_X86_HYPERCALL_H__
 
 #include <public/physdev.h>
+#include <public/kexec.h>
 
 extern long
 do_set_trap_table(
@@ -79,6 +80,11 @@ extern long
 arch_do_vcpu_op(
     int cmd, struct vcpu *v, GUEST_HANDLE(void) arg);
 
+extern int
+do_kexec(
+    unsigned long op, 
+    GUEST_HANDLE(kexec_arg_t) uarg);
+
 #ifdef __x86_64__
 
 extern long
--- /dev/null
+++ to-work/xen/include/public/kexec.h	2006-04-07 12:44:43.000000000 +0900
@@ -0,0 +1,39 @@
+/*
+ * kexec.h: Xen kexec
+ *
+ * Created By: Horms <horms@verge.net.au>
+ */
+
+#ifndef _XEN_PUBLIC_KEXEC_H
+#define _XEN_PUBLIC_KEXEC_H
+
+/*
+ * Scratch space for passing arguments to the kexec hypercall
+ */
+typedef struct kexec_arg {
+    union {
+        struct {
+            unsigned long data; /* Not sure what this should be yet */
+        } helper;
+        struct {
+            unsigned long indirection_page;
+            unsigned long reboot_code_buffer;
+            unsigned long start_address;
+            const char *relocate_new_kernel;
+            unsigned int relocate_new_kernel_size;
+        } kexec;
+    } u;
+} kexec_arg_t;
+DEFINE_GUEST_HANDLE(kexec_arg_t);
+
+#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- from-0001/xen/include/public/xen.h
+++ to-work/xen/include/public/xen.h	2006-04-04 13:29:54.000000000 +0900
@@ -60,6 +60,7 @@
 #define __HYPERVISOR_acm_op               27
 #define __HYPERVISOR_nmi_op               28
 #define __HYPERVISOR_sched_op             29
+#define __HYPERVISOR_kexec_op             30
 
 /* 
  * VIRTUAL INTERRUPTS
@@ -206,6 +207,13 @@ DEFINE_GUEST_HANDLE(mmuext_op_t);
 #define VMASST_TYPE_writable_pagetables  2
 #define MAX_VMASST_TYPE 2
 
+/*
+ * Commands to HYPERVISOR_kexec().
+ */
+#define KEXEC_CMD_kexec                 0
+#define KEXEC_CMD_kexec_prepare         1
+#define KEXEC_CMD_kexec_cleanup         2
+
 #ifndef __ASSEMBLY__
 
 typedef uint16_t domid_t;
diff -r 0010df11836d buildconfigs/linux-defconfig_xen_x86_32
--- a/buildconfigs/linux-defconfig_xen_x86_32	Fri Apr  7 00:32:54 2006 +0100
+++ b/buildconfigs/linux-defconfig_xen_x86_32	Fri Apr  7 14:54:45 2006 +0900
@@ -184,6 +184,7 @@ CONFIG_HZ_100=y
 # CONFIG_HZ_250 is not set
 # CONFIG_HZ_1000 is not set
 CONFIG_HZ=100
+CONFIG_KEXEC=y
 # CONFIG_CRASH_DUMP is not set
 CONFIG_PHYSICAL_START=0x100000
 CONFIG_HOTPLUG_CPU=y

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH]: kexec: framework and i386
  2006-04-07  7:42 [PATCH]: kexec: framework and i386 Horms
@ 2006-04-07 15:00 ` Don Zickus
  2006-04-10  5:09   ` Hirokazu Takahashi
  2006-04-07 15:09 ` Gerd Hoffmann
  2006-04-12  9:12 ` Horms
  2 siblings, 1 reply; 68+ messages in thread
From: Don Zickus @ 2006-04-07 15:00 UTC (permalink / raw)
  To: Horms; +Cc: Magnus Damm, xen-devel

On Fri, Apr 07, 2006 at 04:42:36PM +0900, Horms wrote:
> kexec: framework and i386
> 
> Here is a first cut of kexec for dom0/xen, which will actually
> kexec the physical machine from xen. The approach taken is
> to move the architecture-dependant kexec code into a new hypercall.
> 
> Some notes:
>   * machine_kexec_cleanup() and machine_kexec_prepare() don't do
>     anything in i386. So while this patch adds a framework for them,
>     I am not sure what parameters are needs at this stage.
>   * Only works for UP, as machine_shutdown is not implemented yet
>   * kexecing into xen does not seem to work, I think that 
>     kexec-tools needs updating, but I have not investigated yet
>   * I don't believe that kdump works yet
>   * This patch was prepared against xen-unstable.hg 9514
>     As of today (9574) two new hypercalls have been added.
>     I rediffed and moved the kexec hypercall to 33. However
>     this exceedes hypercall_NR, which is currently 32. 
>     I tried increasing this, but the dom0 now crashes 
>     in entry.S on init. Even after rebuilding both xen and the kernel
>     completely from scratch after a make distclean. Help!!
> 

I was looking at doing the same but focusing more on kdump initially.
However, the more I understood kexec/kdump and the more I understood the
hypervisor and xend, I realized they both were solving the same problem in
two different ways.  

Instead I was trying to focus on a domain0 failover/backup copy.  By
utilizing xend to set up all the infrastructure of loading the
image/initrd, all I had to do was set a flag in the hypervisor letting
it know this was a second copy of another domain0.  

Upon reboot/crash, the hypervisor could then look to see if there is a
second copy of a domain0 and if so run that copy (which would perform the
same functionality as kexec AND kdump - minus the memory hole).  

This has the advantage (if done correctly) of not having to reboot the
domainU kernels (which is a _huge_ win).  The only penalty is dealing with
the couple of seconds when the domain0 switches block/net driver control
to the other domain0 and any dropped transactions. 

The infrastructure in xen is there, I am slowly weeding through the lower
layers to set the right bits and such.  Unfortunately, I can't commit all
my time to this little project but this is the direction I am trying to
head towards.  (Any help would be great!)

Like I said, this is my 2cents.  I just thought this approach would be a
better fit with xen, than trying to drag the whole kexec/kdump layer
inside the hypervisor.  Opinions are welcomed.

Cheers,
Don

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH]: kexec: framework and i386
  2006-04-07  7:42 [PATCH]: kexec: framework and i386 Horms
  2006-04-07 15:00 ` Don Zickus
@ 2006-04-07 15:09 ` Gerd Hoffmann
  2006-04-08  4:39   ` Horms
  2006-04-12  9:12 ` Horms
  2 siblings, 1 reply; 68+ messages in thread
From: Gerd Hoffmann @ 2006-04-07 15:09 UTC (permalink / raw)
  To: Horms; +Cc: Magnus Damm, xen-devel

  Hi,

> Here is a first cut of kexec for dom0/xen, which will actually
> kexec the physical machine from xen. The approach taken is
> to move the architecture-dependant kexec code into a new hypercall.

First you need some more security checks.  On a first quick look it
seems you can zap and takeover the whole machine from within a domU by
kexec-booting the machine.

Second I think we'll need a new kexec flag to indicate we'll go zap the
physical machine, not the virtual machine.  I'm looking into the latter,
and I think we'll be able to do both at some point in the future.  Maybe
it is enough to care about dom0 (physical machine kexec) vs. domU
(virtual machine kexec) only though.  We certainly don't want to allow
domUs kexec the whole machine, and virtual machine kexec for dom0
doesn't make that much sense given how tight xen and dom0 work hand-in-hand.

>   * kexecing into xen does not seem to work, I think that 
>     kexec-tools needs updating, but I have not investigated yet

Yep, actually _a lot_ of the kexec magic happens in userspace.

cheers,

  Gerd

-- 
Gerd 'just married' Hoffmann <kraxel@suse.de>
I'm the hacker formerly known as Gerd Knorr.
http://www.suse.de/~kraxel/just-married.jpeg

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH]: kexec: framework and i386
  2006-04-07 15:09 ` Gerd Hoffmann
@ 2006-04-08  4:39   ` Horms
  0 siblings, 0 replies; 68+ messages in thread
From: Horms @ 2006-04-08  4:39 UTC (permalink / raw)
  To: Gerd Hoffmann; +Cc: Magnus Damm, xen-devel

On Fri, Apr 07, 2006 at 05:09:15PM +0200, Gerd Hoffmann wrote:
>   Hi,
> 
> > Here is a first cut of kexec for dom0/xen, which will actually
> > kexec the physical machine from xen. The approach taken is
> > to move the architecture-dependant kexec code into a new hypercall.
> 
> First you need some more security checks.  On a first quick look it
> seems you can zap and takeover the whole machine from within a domU by
> kexec-booting the machine.

Yes, I think you are right, I had completely forgotten about that.

> Second I think we'll need a new kexec flag to indicate we'll go zap the
> physical machine, not the virtual machine.  I'm looking into the later,
> and I think we'll be able to do both at some point in the future.  Maybe
> it is enougth to care about dom0 (physical machine kexec) vs. domU
> (virtual machine kexec) only though.  We certainly don't want allow
> domUs kexec the whole machine, and virtual machine kexec for dom0
> doesn't make that much sense given how tight xen and dom0 work hand-in-hand.

Sounds fine by me. The focus of what I was trying to achieve is to zap
the entire physical machine, which is what the current code does. I am
actually most interested in kdump, though it's not working yet. In any
case a flag makes perfect sense. Though it might make sense to add it
when more flexible incarnations of kexec are added.

> >   * kexecing into xen does not seem to work, I think that 
> >     kexec-tools needs updating, but I have not investigated yet
> 
> Yep, actually _alot_ of the kexec magic happens in userspace.

Yes, I became aware of that along the way. I'm pretty confident that
the way I have done things, if you fixed up user-space kexec so
that linux -> xen worked, then xen -> xen would also work.

-- 
Horms

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH]: kexec: framework and i386
  2006-04-07 15:00 ` Don Zickus
@ 2006-04-10  5:09   ` Hirokazu Takahashi
  2006-04-10 15:38     ` Don Zickus
  0 siblings, 1 reply; 68+ messages in thread
From: Hirokazu Takahashi @ 2006-04-10  5:09 UTC (permalink / raw)
  To: dzickus; +Cc: magnus, horms, xen-devel

Hi Don,

> > kexec: framework and i386
> > 
> > Here is a first cut of kexec for dom0/xen, which will actually
> > kexec the physical machine from xen. The approach taken is
> > to move the architecture-dependant kexec code into a new hypercall.
> > 
> > Some notes:
> >   * machine_kexec_cleanup() and machine_kexec_prepare() don't do
> >     anything in i386. So while this patch adds a framework for them,
> >     I am not sure what parameters are needs at this stage.
> >   * Only works for UP, as machine_shutdown is not implemented yet
> >   * kexecing into xen does not seem to work, I think that 
> >     kexec-tools needs updating, but I have not investigated yet
> >   * I don't believe that kdump works yet
> >   * This patch was prepared against xen-unstable.hg 9514
> >     As of today (9574) two new hypercalls have been added.
> >     I rediffed and moved the kexec hypercall to 33. However
> >     this exceedes hypercall_NR, which is currently 32. 
> >     I tried increasing this, but the dom0 now crashes 
> >     in entry.S on init. Even after rebuilding both xen and the kernel
> >     completely from scratch after a make distclean. Help!!
> > 
> 
> I was looking at doing the same but focusing more on kdump initially.
> However, the more I understood kexec/kdump and the more I understood the
> hypervisor and xend, I realized they both were solving the same problem in
> two different ways.  
> 
> Instead I was trying to focus on a domain0 failover/backup copy.  By
> utilizing xend to set up all the infrastructure of loading the
> image/initrd, I all I had to do was set a flag in the hypervisor letting
> it know this was a second copy of another domain0.  
> 
> Upon reboot/crash, the hypervisor could then look to see if there is a
> second copy of a domain0 and if so run that copy (which would perform the
> same functionality as kexec AND kdump - minus the memory hole).  
> 
> This has the advantage (if done correctly) of not having to reboot the
> domainU kernels (which is a _huge_ win).  The only penalty is dealing with
> the couple of seconds when the domain0 switches block/net driver control
> to the other domain0 and any dropped transactions. 
> 
> The infrastructure in xen is there, I am slowing weeding through the lower
> layers to set the right bits and such.  Unfortunately, I can't commit all
> my time to this little project but this is the direction I am trying to
> head towards.  (Any help would be great!)
> 
> Like I said, this is my 2cents.  I just thought this approach would be a
> better fit with xen, than trying to drag the whole kexec/kdump layer
> inside the hypervisor.  Opinions are welcomed.
> 
> Cheers,
> Don


Would you let me confirm my understanding is correct?

You prefer the kexec/kdump approach for taking over from a crashed domain0
over the HA approach where a backup domain stands by.
This is because the former can reset the whole hardware,
while that would be harder with the latter, right?


Thanks,
Hirokazu Takahashi.

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH]: kexec: framework and i386
  2006-04-10  5:09   ` Hirokazu Takahashi
@ 2006-04-10 15:38     ` Don Zickus
  2006-04-11  1:44       ` Horms
  0 siblings, 1 reply; 68+ messages in thread
From: Don Zickus @ 2006-04-10 15:38 UTC (permalink / raw)
  To: Hirokazu Takahashi; +Cc: magnus, horms, xen-devel

On Mon, Apr 10, 2006 at 02:09:17PM +0900, Hirokazu Takahashi wrote:
> Hi Don,
> 
> > > kexec: framework and i386
> > > 
> > > Here is a first cut of kexec for dom0/xen, which will actually
> > > kexec the physical machine from xen. The approach taken is
> > > to move the architecture-dependant kexec code into a new hypercall.
> > > 
> > > Some notes:
> > >   * machine_kexec_cleanup() and machine_kexec_prepare() don't do
> > >     anything in i386. So while this patch adds a framework for them,
> > >     I am not sure what parameters are needs at this stage.
> > >   * Only works for UP, as machine_shutdown is not implemented yet
> > >   * kexecing into xen does not seem to work, I think that 
> > >     kexec-tools needs updating, but I have not investigated yet
> > >   * I don't believe that kdump works yet
> > >   * This patch was prepared against xen-unstable.hg 9514
> > >     As of today (9574) two new hypercalls have been added.
> > >     I rediffed and moved the kexec hypercall to 33. However
> > >     this exceedes hypercall_NR, which is currently 32. 
> > >     I tried increasing this, but the dom0 now crashes 
> > >     in entry.S on init. Even after rebuilding both xen and the kernel
> > >     completely from scratch after a make distclean. Help!!
> > > 
> > 
> > I was looking at doing the same but focusing more on kdump initially.
> > However, the more I understood kexec/kdump and the more I understood the
> > hypervisor and xend, I realized they both were solving the same problem in
> > two different ways.  
> > 
> > Instead I was trying to focus on a domain0 failover/backup copy.  By
> > utilizing xend to set up all the infrastructure of loading the
> > image/initrd, I all I had to do was set a flag in the hypervisor letting
> > it know this was a second copy of another domain0.  
> > 
> > Upon reboot/crash, the hypervisor could then look to see if there is a
> > second copy of a domain0 and if so run that copy (which would perform the
> > same functionality as kexec AND kdump - minus the memory hole).  
> > 
> > This has the advantage (if done correctly) of not having to reboot the
> > domainU kernels (which is a _huge_ win).  The only penalty is dealing with
> > the couple of seconds when the domain0 switches block/net driver control
> > to the other domain0 and any dropped transactions. 
> > 
> > The infrastructure in xen is there, I am slowing weeding through the lower
> > layers to set the right bits and such.  Unfortunately, I can't commit all
> > my time to this little project but this is the direction I am trying to
> > head towards.  (Any help would be great!)
> > 
> > Like I said, this is my 2cents.  I just thought this approach would be a
> > better fit with xen, than trying to drag the whole kexec/kdump layer
> > inside the hypervisor.  Opinions are welcomed.
> > 
> > Cheers,
> > Don
> 
> 
> Would you let me confirm my understanding is correct?
> 
> You prefer kexec/kdump approach to take over a crashed domain0
> than HA approach where the backup domain stands by.
> This is because the former can reset its whole hardware
> while it would be harder with the latter, right?
> 
Actually the opposite.  I prefer the HA approach over kexec/kdump.  It
seemed like it gave more flexibility (reset dom0 or the whole machine).  

As much as I would like to see kexec/kdump in xen, for some reason it just
doesn't make sense to me.  


Cheers,
Don

> 
> Thanks,
> Hirokazu Takahashi.

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH]: kexec: framework and i386
  2006-04-10 15:38     ` Don Zickus
@ 2006-04-11  1:44       ` Horms
  2006-04-12 15:56         ` Don Zickus
  0 siblings, 1 reply; 68+ messages in thread
From: Horms @ 2006-04-11  1:44 UTC (permalink / raw)
  To: Don Zickus; +Cc: Hirokazu Takahashi, magnus, xen-devel

Hi Don, Hi all,

The key reason why I think that kexec/kdump does make sense for xen, at
least to some extent, is for the case where the hypervisor goes into a
bad state, and you actually want to get rid of it and kdump into
something else for forensics. There is also the advantage that by
kexecing xen, you get access to the entire physical machine, either for
crash-dump analysis, or because *gasp* you want to get out of xen for
some other crazy reason :) And, on hardware that takes forever and a day
to reboot, I believe that doing a kexec will be quite useful for
hypervisor development.

I would also like to note, that while my patch does involve moving parts
of kexec/kdump into the hypervisor, and more similar parts need to be
added in order to support other architectures, it is by no means all of
kexec/kdump.

-- 
Horms

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH]: kexec: framework and i386
  2006-04-07  7:42 [PATCH]: kexec: framework and i386 Horms
  2006-04-07 15:00 ` Don Zickus
  2006-04-07 15:09 ` Gerd Hoffmann
@ 2006-04-12  9:12 ` Horms
  2006-04-17  6:06   ` Horms
  2 siblings, 1 reply; 68+ messages in thread
From: Horms @ 2006-04-12  9:12 UTC (permalink / raw)
  To: xen-devel; +Cc: Magnus Damm


kexec: framework and i386

Hi, 

here is a second take at this patch. The main changes over
the predecessor are that kdump now works, mfns are used instead
of pfns (was wrong before), and some code has been moved about.
The code still uses the basic approach of moving architecture
specific operations into the hypervisor.

Some notes:
  * machine_kexec_cleanup() and machine_kexec_prepare() don't do
    anything in i386. So while this patch adds a framework for them,
    I am not sure what parameters are needed at this stage.
  * Only works for UP, as machine_shutdown is not implemented yet
  * kexecing into xen does not seem to work, I think that 
    kexec-tools needs updating, but I have not investigated yet
  * I don't believe that kdump works yet
  * This patch was prepared against xen-unstable.hg 9514
    As of today (9574) two new hypercalls have been added.
    I rediffed and moved the kexec hypercall to 33. However
    this exceeds hypercall_NR, which is currently 32. 
    I tried increasing this, but the dom0 now crashes 
    in entry.S on init. Even after rebuilding both xen and the kernel
    completely from scratch after a make distclean. Help!!

Prepared with the assistance of my colleague Magnus Damm

Signed-Off-By: Horms <horms@verge.net.au>

--- from-0002/buildconfigs/linux-defconfig_xen_x86_32
+++ to-work/buildconfigs/linux-defconfig_xen_x86_32	2006-04-10 12:29:46.000000000 +0900
@@ -183,6 +183,7 @@ CONFIG_HZ_100=y
 # CONFIG_HZ_250 is not set
 # CONFIG_HZ_1000 is not set
 CONFIG_HZ=100
+CONFIG_KEXEC=y
 # CONFIG_CRASH_DUMP is not set
 CONFIG_PHYSICAL_START=0x100000
 CONFIG_HOTPLUG_CPU=y
--- from-0001/linux-2.6-xen-sparse/arch/i386/Kconfig
+++ to-work/linux-2.6-xen-sparse/arch/i386/Kconfig	2006-04-10 12:29:46.000000000 +0900
@@ -726,7 +726,7 @@ source kernel/Kconfig.hz
 
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && !X86_XEN
+	depends on EXPERIMENTAL
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
--- from-0001/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
+++ to-work/linux-2.6-xen-sparse/arch/i386/kernel/Makefile	2006-04-10 12:29:46.000000000 +0900
@@ -92,7 +92,7 @@ include $(srctree)/scripts/Makefile.xen
 
 obj-y += fixup.o
 microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o
-n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o
+n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o machine_kexec.o crash.o
 
 obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
 obj-y := $(call cherrypickxen, $(obj-y))
--- from-0001/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile
+++ to-work/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile	2006-04-10 12:29:46.000000000 +0900
@@ -59,7 +59,7 @@ pci-dma-y			+= ../../i386/kernel/pci-dma
 microcode-$(subst m,y,$(CONFIG_MICROCODE))  := ../../i386/kernel/microcode-xen.o
 quirks-y			:= ../../i386/kernel/quirks-xen.o
 
-n-obj-xen := i8259.o reboot.o i8237.o smpboot.o trampoline.o
+n-obj-xen := i8259.o reboot.o i8237.o smpboot.o trampoline.o machine_kexec.o crash.o
 
 include $(srctree)/scripts/Makefile.xen
 
--- from-0001/linux-2.6-xen-sparse/drivers/xen/core/reboot.c
+++ to-work/linux-2.6-xen-sparse/drivers/xen/core/reboot.c	2006-04-10 12:29:46.000000000 +0900
@@ -17,6 +17,11 @@
 #include <linux/kthread.h>
 #include <xen/gnttab.h>
 #include <xen/xencons.h>
+#include <linux/kexec.h>
+#include <xen/interface/kexec.h>
+#include <linux/mm.h>
+#include <asm/hypercall.h>
+
 
 #if defined(__i386__) || defined(__x86_64__)
 /*
@@ -38,6 +43,86 @@ extern void ctrl_alt_del(void);
  */
 #define SHUTDOWN_HALT      4
 
+void machine_shutdown(void) 
+{
+	printk("machine_shutdown: does nothing\n");
+}
+
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+	/* XXX: This should do something */
+	printk("xen-kexec: Need to turn of other CPUS in "
+	       "machine_crash_shutdown()\n");
+}
+
+const extern unsigned char relocate_new_kernel[];
+extern unsigned int relocate_new_kernel_size;
+static kexec_arg_t hypercall_arg;
+
+/*
+ * A architecture hook called to validate the
+ * proposed image and prepare the control pages
+ * as needed.  The pages for KEXEC_CONTROL_CODE_SIZE
+ * have been allocated, but the segments have yet
+ * been copied into the kernel.
+ *
+ * Do what every setup is needed on image and the
+ * reboot code buffer to allow us to avoid allocations
+ * later.
+ *
+ * Currently nothing.
+ */
+int machine_kexec_prepare(struct kimage *image)
+{
+    return 0;
+}
+
+/*
+ * Undo anything leftover by machine_kexec_prepare
+ * when an image is freed.
+ */
+void machine_kexec_cleanup(struct kimage *image)
+{
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+	kimage_entry_t *ptr, entry;
+
+	/* 
+	 * Translate addresses inside head from physcical to machine
+	 * In practice, this only needs to change the pointer to
+	 * indirection pages as non-indirected pages are relative.
+	 */
+	ptr = &image->head;
+	while ((entry = *ptr) && !(entry & IND_DONE)) {
+		if (!(entry & IND_DESTINATION))
+			*ptr = phys_to_machine(entry & PAGE_MASK) |
+				(entry & ~PAGE_MASK);
+
+		if (entry & IND_INDIRECTION)
+			ptr = __va(entry & PAGE_MASK);
+		else
+			ptr++;
+	}
+
+	/* Set up arguments to hypercall */
+	hypercall_arg.u.kexec.indirection_page = image->head;
+	hypercall_arg.u.kexec.reboot_code_buffer = 
+		pfn_to_mfn(page_to_pfn(image->control_code_page)) << PAGE_SHIFT;
+	hypercall_arg.u.kexec.start_address = image->start;
+	hypercall_arg.u.kexec.relocate_new_kernel = relocate_new_kernel;
+	hypercall_arg.u.kexec.relocate_new_kernel_size = 
+		relocate_new_kernel_size;
+
+	/* Let Xen do the rest of the work */
+	HYPERVISOR_kexec(KEXEC_CMD_kexec, &hypercall_arg);
+}
+
 void machine_emergency_restart(void)
 {
 	/* We really want to get pending console data out before we die. */
--- from-0001/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
+++ to-work/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h	2006-04-10 12:29:46.000000000 +0900
@@ -37,6 +37,8 @@
 # error "please don't include this file directly"
 #endif
 
+#include <xen/interface/kexec.h>
+
 #define __STR(x) #x
 #define STR(x) __STR(x)
 
@@ -329,6 +331,13 @@ HYPERVISOR_nmi_op(
 	return _hypercall2(int, nmi_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_kexec(
+	unsigned long op, kexec_arg_t * arg)
+{
+	return _hypercall2(int, kexec_op, op, arg); 
+}
+
 #endif /* __HYPERCALL_H__ */
 
 /*
--- from-0001/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h
+++ to-work/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h	2006-04-10 12:29:46.000000000 +0900
@@ -41,6 +41,8 @@
 # error "please don't include this file directly"
 #endif
 
+#include <xen/interface/kexec.h>
+
 #define __STR(x) #x
 #define STR(x) __STR(x)
 
@@ -330,6 +332,13 @@ HYPERVISOR_nmi_op(
 	return _hypercall2(int, nmi_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_kexec(
+	unsigned long op, kexec_arg_t * arg)
+{
+	return _hypercall2(int, kexec_op, op, arg); 
+}
+
 #endif /* __HYPERCALL_H__ */
 
 /*
--- from-0001/xen/arch/x86/x86_32/Makefile
+++ to-work/xen/arch/x86/x86_32/Makefile	2006-04-10 12:29:46.000000000 +0900
@@ -5,6 +5,7 @@ obj-y += entry.o
 obj-y += mm.o
 obj-y += seg_fixup.o
 obj-y += traps.o
+obj-y += machine_kexec.o
 
 obj-$(supervisor_mode_kernel) += supervisor_mode_kernel.o
 
--- from-0001/xen/arch/x86/x86_32/entry.S
+++ to-work/xen/arch/x86/x86_32/entry.S	2006-04-10 12:29:46.000000000 +0900
@@ -648,6 +648,7 @@ ENTRY(hypercall_table)
         .long do_acm_op
         .long do_nmi_op
         .long do_arch_sched_op
+        .long do_kexec             /* 30 */
         .rept NR_hypercalls-((.-hypercall_table)/4)
         .long do_ni_hypercall
         .endr
@@ -683,6 +684,7 @@ ENTRY(hypercall_args_table)
         .byte 1 /* do_acm_op            */
         .byte 2 /* do_nmi_op            */
         .byte 2 /* do_arch_sched_op     */
+        .byte 2 /* do_kexec             */  /* 30 */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
--- /dev/null
+++ to-work/xen/arch/x86/x86_32/machine_kexec.c	2006-04-10 12:29:46.000000000 +0900
@@ -0,0 +1,168 @@
+/******************************************************************************
+ * arch/x86/machine_kexec.c
+ * 
+ * Created By: Horms
+ *
+ * Based heavily on arch/i386/machine_kexec.c from Linux 2.6.16
+ */
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/domain_page.h> 
+#include <xen/timer.h>
+#include <xen/sched.h>
+#include <asm/page.h> 
+#include <asm/flushtlb.h>
+#include <public/xen.h>
+#include <public/kexec.h>
+
+typedef asmlinkage void (*relocate_new_kernel_t)(
+                    unsigned long indirection_page,
+                    unsigned long reboot_code_buffer,
+                    unsigned long start_address,
+                    unsigned int has_pae);
+
+#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+
+#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L2_ATTR (_PAGE_PRESENT)
+
+#ifndef CONFIG_X86_PAE
+
+static u32 pgtable_level1[L1_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+
+static void identity_map_page(unsigned long address)
+{
+    unsigned long mfn;
+    u32 *pgtable_level2;
+
+    /* Find the current page table */
+    mfn = read_cr3() >> PAGE_SHIFT;
+    pgtable_level2 = map_domain_page(mfn);
+
+    /* Identity map the page table entry */
+    pgtable_level1[l1_table_offset(address)] = address | L0_ATTR;
+    pgtable_level2[l2_table_offset(address)] = __pa(pgtable_level1) | L1_ATTR;
+
+    /* Flush the tlb so the new mapping takes effect.
+     * Global tlb entries are not flushed but that is not an issue.
+     */
+    write_cr3(mfn << PAGE_SHIFT);
+
+    unmap_domain_page(pgtable_level2);
+}
+
+#else
+static u64 pgtable_level1[L1_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+static u64 pgtable_level2[L2_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+
+static void identity_map_page(unsigned long address)
+{
+    int mfn;
+    intpte_t *pgtable_level3;
+
+    /* Find the current page table */
+    mfn = read_cr3() >> PAGE_SHIFT;
+    pgtable_level3 = map_domain_page(mfn);
+
+    /* Identity map the page table entry */
+    pgtable_level1[l1_table_offset(address)] = address | L0_ATTR;
+    pgtable_level2[l2_table_offset(address)] = __pa(pgtable_level1) | L1_ATTR;
+    set_64bit(&pgtable_level3[l3_table_offset(address)],
+	      __pa(pgtable_level2) | L2_ATTR);
+
+    /* Flush the tlb so the new mapping takes effect.
+     * Global tlb entries are not flushed but that is not an issue.
+     */
+    load_cr3(mfn << PAGE_SHIFT);
+
+    unmap_domain_page(pgtable_level3);
+}
+#endif
+
+static void kexec_load_segments(void)
+{
+#define __SSTR(X) #X
+#define SSTR(X) __SSTR(X)
+    __asm__ __volatile__ (
+        "\tljmp $"SSTR(__HYPERVISOR_CS)",$1f\n"
+        "\t1:\n"
+        "\tmovl $"SSTR(__HYPERVISOR_DS)",%%eax\n"
+        "\tmovl %%eax,%%ds\n"
+        "\tmovl %%eax,%%es\n"
+        "\tmovl %%eax,%%fs\n"
+        "\tmovl %%eax,%%gs\n"
+        "\tmovl %%eax,%%ss\n"
+        ::: "eax", "memory");
+#undef SSTR
+#undef __SSTR
+}
+
+#define kexec_load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
+static void kexec_set_idt(void *newidt, __u16 limit)
+{
+    struct Xgt_desc_struct curidt;
+
+    /* ia32 supports unaliged loads & stores */
+    curidt.size    = limit;
+    curidt.address = (unsigned long)newidt;
+    
+    kexec_load_idt(&curidt);
+
+};
+
+#define kexec_load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
+static void kexec_set_gdt(void *newgdt, __u16 limit)
+{
+    struct Xgt_desc_struct curgdt;
+
+    /* ia32 supports unaligned loads & stores */
+    curgdt.size    = limit;
+    curgdt.address = (unsigned long)newgdt;
+
+    kexec_load_gdt(&curgdt);
+};
+
+int machine_kexec_prepare(struct kexec_arg *arg)
+{
+	return 0;
+}
+
+void machine_kexec_cleanup(struct kexec_arg *arg)
+{
+}
+
+void machine_kexec(struct kexec_arg *arg)
+{
+    relocate_new_kernel_t rnk;
+
+    local_irq_disable();
+
+    identity_map_page(arg->u.kexec.reboot_code_buffer);
+
+    copy_from_user((void *)arg->u.kexec.reboot_code_buffer, 
+           arg->u.kexec.relocate_new_kernel,
+           arg->u.kexec.relocate_new_kernel_size);
+
+    kexec_load_segments();
+
+    kexec_set_gdt(__va(0),0);
+
+    kexec_set_idt(__va(0),0);
+
+    rnk = (relocate_new_kernel_t) arg->u.kexec.reboot_code_buffer;
+
+    (*rnk)(arg->u.kexec.indirection_page, arg->u.kexec.reboot_code_buffer, 
+           arg->u.kexec.start_address, cpu_has_pae);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- from-0001/xen/common/Makefile
+++ to-work/xen/common/Makefile	2006-04-10 12:29:46.000000000 +0900
@@ -24,6 +24,7 @@ obj-y += trace.o
 obj-y += timer.o
 obj-y += vsprintf.o
 obj-y += xmalloc.o
+obj-y += kexec.o
 
 obj-$(perfc)       += perfc.o
 obj-$(crash_debug) += gdbstub.o
--- /dev/null
+++ to-work/xen/common/kexec.c	2006-04-10 12:38:29.000000000 +0900
@@ -0,0 +1,58 @@
+/*
+ * Achitecture independent kexec code for Xen
+ *
+ * At this statge, just a switch for the kexec hypercall into
+ * architecture dependent code.
+ *
+ * Created By: Horms <horms@verge.net.au>
+ */
+
+#include <xen/lib.h>
+#include <xen/errno.h>
+#include <xen/guest_access.h>
+#include <xen/sched.h>
+#include <public/xen.h>
+#include <public/kexec.h>
+
+extern int machine_kexec_prepare(struct kexec_arg *arg);
+extern void machine_kexec_cleanup(struct kexec_arg *arg);
+extern void machine_kexec(struct kexec_arg *arg);
+
+int do_kexec(unsigned long op, 
+             GUEST_HANDLE(kexec_arg_t) uarg)
+{
+    struct kexec_arg arg;
+
+    if ( !IS_PRIV(current->domain) )  
+        return -EPERM;
+
+    if ( unlikely(copy_from_guest(&arg, uarg, 1) != 0) )
+    {
+        printk("do_kexec: __copy_from_guest failed");
+        return -EFAULT;
+    }
+
+	switch(op) {
+	case KEXEC_CMD_kexec:
+		machine_kexec(&arg);
+        return -EINVAL; /* Not Reached */
+	case KEXEC_CMD_kexec_prepare:
+		return machine_kexec_prepare(&arg);
+	case KEXEC_CMD_kexec_cleanup:
+		machine_kexec_cleanup(&arg);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
--- from-0001/xen/include/asm-x86/hypercall.h
+++ to-work/xen/include/asm-x86/hypercall.h	2006-04-10 12:29:46.000000000 +0900
@@ -6,6 +6,7 @@
 #define __ASM_X86_HYPERCALL_H__
 
 #include <public/physdev.h>
+#include <public/kexec.h>
 
 extern long
 do_set_trap_table(
@@ -79,6 +80,11 @@ extern long
 arch_do_vcpu_op(
     int cmd, struct vcpu *v, GUEST_HANDLE(void) arg);
 
+extern int
+do_kexec(
+    unsigned long op, 
+    GUEST_HANDLE(kexec_arg_t) uarg);
+
 #ifdef __x86_64__
 
 extern long
--- /dev/null
+++ to-work/xen/include/public/kexec.h	2006-04-10 12:29:46.000000000 +0900
@@ -0,0 +1,39 @@
+/*
+ * kexec.h: Xen kexec
+ *
+ * Created By: Horms <horms@verge.net.au>
+ */
+
+#ifndef _XEN_PUBLIC_KEXEC_H
+#define _XEN_PUBLIC_KEXEC_H
+
+/*
+ * Scratch space for passing arguments to the kexec hypercall
+ */
+typedef struct kexec_arg {
+    union {
+        struct {
+            unsigned long data; /* Not sure what this should be yet */
+        } helper;
+        struct {
+            unsigned long indirection_page;
+            unsigned long reboot_code_buffer;
+            unsigned long start_address;
+            const char *relocate_new_kernel;
+            unsigned int relocate_new_kernel_size;
+        } kexec;
+    } u;
+} kexec_arg_t;
+DEFINE_GUEST_HANDLE(kexec_arg_t);
+
+#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- from-0001/xen/include/public/xen.h
+++ to-work/xen/include/public/xen.h	2006-04-10 12:29:46.000000000 +0900
@@ -60,6 +60,7 @@
 #define __HYPERVISOR_acm_op               27
 #define __HYPERVISOR_nmi_op               28
 #define __HYPERVISOR_sched_op             29
+#define __HYPERVISOR_kexec_op             30
 
 /* 
  * VIRTUAL INTERRUPTS
@@ -206,6 +207,13 @@ DEFINE_GUEST_HANDLE(mmuext_op_t);
 #define VMASST_TYPE_writable_pagetables  2
 #define MAX_VMASST_TYPE 2
 
+/*
+ * Commands to HYPERVISOR_kexec().
+ */
+#define KEXEC_CMD_kexec                 0
+#define KEXEC_CMD_kexec_prepare         1
+#define KEXEC_CMD_kexec_cleanup         2
+
 #ifndef __ASSEMBLY__
 
 typedef uint16_t domid_t;

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH]: kexec: framework and i386
  2006-04-11  1:44       ` Horms
@ 2006-04-12 15:56         ` Don Zickus
  0 siblings, 0 replies; 68+ messages in thread
From: Don Zickus @ 2006-04-12 15:56 UTC (permalink / raw)
  To: Horms; +Cc: Hirokazu Takahashi, magnus, xen-devel

On Tue, Apr 11, 2006 at 10:44:37AM +0900, Horms wrote:
> Hi Don, Hi all,
> 
> The key reason why I think that kexec/kdump does makes sense for xen, at
> least to some extent, is for the case where the hypervisor goes into a
> bad state, and you actually want to get rid of it and kdump into
> something else for forensics. There is also the advantage that by
> kexecing xen, you get access to the entire physical machine, either for
> crash-dump analysis, or because *gasp* you want to get out of xen for
> some other crazy reason :) And, on hardware that takes forever and a day
> to reboot, I believe that doing a kexec will be quite useful for
> hypervisor development.

I guess I never thought about it from the hypervisor perspective.  ;)
Part of my concern was that the hypervisor had a bunch of this
functionality built-in (like mapping memory and loading cpu context).

However, after re-reading some of the kexec code, you don't use the
hypervisor to load a new kernel into memory?  And I don't know enough
about the low level bits to understand if a hypercall to load vcpu context
would be useful.  

> 
> I would also like to note, that while my patch does involve moving parts
> of kexec/kdump into the hypervisor, and more similar parts need to be
> added in order to support other architectures, it is by no means all of
> kexec/kdump.

I understand what you are saying now.  The first patch you sent I skimmed
through and immediately thought you were trying to move most parts down
into the hypervisor.  Upon reviewing it again, it doesn't seem as
intrusive.  :)

Cheers,
Don

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH]: kexec: framework and i386
  2006-04-12  9:12 ` Horms
@ 2006-04-17  6:06   ` Horms
  2006-04-21  1:28     ` [PATCH]: kexec: framework and i386 (Take IV) Horms
  2006-04-21  6:10     ` Re: [PATCH]: kexec: framework and i386 Akio Takebe
  0 siblings, 2 replies; 68+ messages in thread
From: Horms @ 2006-04-17  6:06 UTC (permalink / raw)
  To: xen-devel; +Cc: Magnus Damm

On Wed, Apr 12, 2006 at 06:12:30PM +0900, Horms wrote:
> 
> kexec: framework and i386

[snip]

>   * This patch was prepared against xen-unstable.hg 9514
>     As of today (9574) two new hypercalls have been added.
>     I rediffed and moved the kexec hypercall to 33. However
>     this exceedes hypercall_NR, which is currently 32. 
>     I tried increasing this, but the dom0 now crashes 
>     in entry.S on init. Even after rebuilding both xen and the kernel
>     completely from scratch after a make distclean. Help!!

Hi,

I am a bit concerned that this patch is going to start rotting if I
can't at least track the current xen-unstable.hg, or better still get it
merged.

I would really appreciate it if someone could take a moment to comment on
the hypercall problem. Is adding a new hypercall, as the current patch
does, the best way? If so could someone point me to how to increase the
hypercall table size. If not, is it best to piggyback off the dom0_op
hypercall? Or is there some other preferred option?

-- 
Horms

^ permalink raw reply	[flat|nested] 68+ messages in thread

* [PATCH]: kexec: framework and i386 (Take IV)
  2006-04-17  6:06   ` Horms
@ 2006-04-21  1:28     ` Horms
  2006-04-21  6:10     ` Re: [PATCH]: kexec: framework and i386 Akio Takebe
  1 sibling, 0 replies; 68+ messages in thread
From: Horms @ 2006-04-21  1:28 UTC (permalink / raw)
  To: xen-devel; +Cc: Magnus Damm

Hi, 

here is the latest update of the kexec xen/dom0 patch.

-- 
Horms

kexec: framework and i386

This is an implementation of kexec for dom0/xen, that allows
kexecing of the physical machine from xen. The approach taken is
to move the architecture-dependent kexec code into a new hypercall.

Some notes:
  * machine_kexec_cleanup() and machine_kexec_prepare() don't do
    anything in i386. So while this patch adds a framework for them,
    I am not sure what parameters are needed at this stage.
  * Only works for UP, as machine_shutdown is not implemented yet
  * kexecing into xen does not seem to work, I think that 
    kexec-tools needs updating, but I have not investigated yet
  * Kdump works by first copying the kernel into dom0 segments
    and relocating them later in xen, the same way that kexec does.
    The only difference is that the relocation is made into
    an area reserved by xen
  * Kdump reservation is made using the xen command line parameters,
    kdump_megabytes and kdump_megabytes_base, rather than
    the linux option crashkernel, which is now ignored.
    Two parameters are used instead of one to simplify parsing.
    This can be cleaned up later if desired. But the reservation
    seems to need to be made by xen to make sure that it happens
    early enough.
  * This patch uses dom0_op for hypercalls

Highlights since the previous posted version:
  * Use dom0_op instead of a new kexec hypercall
    - the hypercall table is currently full, so there is nowhere to
      put a new kexec hypercall
    - This kexec patch makes sense for dom0 at this stage
  * Kernel notes are filled in for kdump
    - UP only, this patch does not support SMP kdump yet
  * Share x86 code between x86_64 and x86_32 
    (though x86_64 is not finished and not included in this patch)
  * Doesn't break x86_64 build

Prepared by Horms and Magnus Damm

Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>
Signed-Off-By: Horms <horms@verge.net.au>

 linux-2.6-xen-sparse/arch/i386/Kconfig                |    2 
 linux-2.6-xen-sparse/arch/i386/kernel/Makefile        |    2 
 linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c     |   26 ++
 linux-2.6-xen-sparse/drivers/xen/core/Makefile        |    1 
 linux-2.6-xen-sparse/drivers/xen/core/crash.c         |   98 +++++++++
 linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c |   78 +++++++
 linux-2.6-xen-sparse/drivers/xen/core/reboot.c        |    7 
 ref-linux-2.6.16/drivers/base/cpu.c                   |    4 
 ref-linux-2.6.16/kernel/kexec.c                       |   52 ++++-
 xen/arch/x86/Makefile                                 |    1 
 xen/arch/x86/dom0_ops.c                               |   33 +++
 xen/arch/x86/machine_kexec.c                          |  174 +++++++++++++++++
 xen/arch/x86/setup.c                                  |   75 ++++++-
 xen/common/page_alloc.c                               |   33 ++-
 xen/include/public/dom0_ops.h                         |   23 ++
 xen/include/public/xen.h                              |    8 
 xen/include/xen/mm.h                                  |    1 
 17 files changed, 585 insertions(+), 33 deletions(-)

--- x/linux-2.6-xen-sparse/arch/i386/Kconfig
+++ x/linux-2.6-xen-sparse/arch/i386/Kconfig
@@ -726,7 +726,7 @@ source kernel/Kconfig.hz
 
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && !X86_XEN
+	depends on EXPERIMENTAL
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
--- x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
+++ x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
@@ -92,7 +92,7 @@ include $(srctree)/scripts/Makefile.xen
 
 obj-y += fixup.o
 microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o
-n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o
+n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o machine_kexec.o crash.o
 
 obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
 obj-y := $(call cherrypickxen, $(obj-y))
--- x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
+++ x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
@@ -68,6 +68,10 @@
 #include "setup_arch_pre.h"
 #include <bios_ebda.h>
 
+#ifdef CONFIG_XEN
+#include <xen/interface/dom0_ops.h>
+#endif
+
 /* Forward Declaration. */
 void __init find_max_pfn(void);
 
@@ -932,6 +936,7 @@ static void __init parse_cmdline_early (
 		 * after a kernel panic.
 		 */
 		else if (!memcmp(from, "crashkernel=", 12)) {
+#ifndef CONFIG_XEN
 			unsigned long size, base;
 			size = memparse(from+12, &from);
 			if (*from == '@') {
@@ -942,6 +947,10 @@ static void __init parse_cmdline_early (
 				crashk_res.start = base;
 				crashk_res.end   = base + size - 1;
 			}
+#else
+			printk("Ignoring crashkernel command line, "
+			       "parameter will be supplied by xen\n");
+#endif
 		}
 #endif
 #ifdef CONFIG_PROC_VMCORE
@@ -1318,9 +1327,23 @@ void __init setup_bootmem_allocator(void
 	}
 #endif
 #ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
 	if (crashk_res.start != crashk_res.end)
 		reserve_bootmem(crashk_res.start,
 			crashk_res.end - crashk_res.start + 1);
+#else
+	{
+		dom0_op_t op;
+		op.cmd = DOM0_KEXEC;
+		op.u.kexec.op = KEXEC_CMD_reserve;
+		BUG_ON(HYPERVISOR_dom0_op(&op));
+		if (op.u.kexec.u.reserve.size) {
+			crashk_res.start = op.u.kexec.u.reserve.start;
+			crashk_res.end = op.u.kexec.u.reserve.start + 
+				op.u.kexec.u.reserve.size - 1;
+		}
+	}
+#endif
 #endif
 
 	if (!xen_feature(XENFEAT_auto_translated_physmap))
@@ -1395,6 +1418,9 @@ legacy_init_iomem_resources(struct resou
 		res->end = map[i].end - 1;
 		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 		request_resource(&iomem_resource, res);
+#ifdef CONFIG_KEXEC
+        request_resource(res, &crashk_res);
+#endif
 	}
 
 	free_bootmem(__pa(map), PAGE_SIZE);
--- x/linux-2.6-xen-sparse/drivers/xen/core/Makefile
+++ x/linux-2.6-xen-sparse/drivers/xen/core/Makefile
@@ -9,3 +9,4 @@ obj-$(CONFIG_NET)     += skbuff.o
 obj-$(CONFIG_SMP)     += smpboot.o
 obj-$(CONFIG_SYSFS)   += hypervisor_sysfs.o
 obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o
+obj-$(CONFIG_KEXEC)   += machine_kexec.o crash.o
--- /dev/null
+++ x/linux-2.6-xen-sparse/drivers/xen/core/crash.c
@@ -0,0 +1,98 @@
+/*
+ * Architecture specific (i386-xen) functions for kexec based crash dumps.
+ *
+ * Created by: Horms <horms@verge.net.au>
+ *
+ */
+
+#include <linux/kernel.h> /* For printk */
+
+/* XXX: final_note(), crash_save_this_cpu() and crash_save_self()
+ * are copied from arch/i386/kernel/crash.c, might be good to either
+ * the original functions non-static and use them, or just
+ * merge this this into that file. 
+ */
+#include <linux/elf.h>     /* For struct elf_note */
+#include <linux/elfcore.h> /* For struct elf_prstatus */
+#include <linux/kexec.h>   /* crash_notes */
+
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+							       size_t data_len)
+{
+	struct elf_note note;
+
+	note.n_namesz = strlen(name) + 1;
+	note.n_descsz = data_len;
+	note.n_type   = type;
+	memcpy(buf, &note, sizeof(note));
+	buf += (sizeof(note) +3)/4;
+	memcpy(buf, name, note.n_namesz);
+	buf += (note.n_namesz + 3)/4;
+	memcpy(buf, data, note.n_descsz);
+	buf += (note.n_descsz + 3)/4;
+
+	return buf;
+}
+
+static void final_note(u32 *buf)
+{
+	struct elf_note note;
+
+	note.n_namesz = 0;
+	note.n_descsz = 0;
+	note.n_type   = 0;
+	memcpy(buf, &note, sizeof(note));
+}
+
+static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
+{
+	struct elf_prstatus prstatus;
+	u32 *buf;
+
+	if ((cpu < 0) || (cpu >= NR_CPUS))
+		return;
+
+	/* Using ELF notes here is opportunistic.
+	 * I need a well defined structure format
+	 * for the data I pass, and I need tags
+	 * on the data to indicate what information I have
+	 * squirrelled away.  ELF notes happen to provide
+	 * all of that that no need to invent something new.
+	 */
+	buf = (u32*)per_cpu_ptr(crash_notes, cpu);
+	if (!buf)
+		return;
+	memset(&prstatus, 0, sizeof(prstatus));
+	prstatus.pr_pid = current->pid;
+	elf_core_copy_regs(&prstatus.pr_reg, regs);
+	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
+				sizeof(prstatus));
+	final_note(buf);
+}
+
+static void crash_save_self(struct pt_regs *regs)
+{
+	int cpu;
+
+	cpu = smp_processor_id();
+	crash_save_this_cpu(regs, cpu);
+}
+
+
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+	/* XXX: This should do something */
+	printk("xen-kexec: Need to turn of other CPUS in "
+	       "machine_crash_shutdown()\n");
+	crash_save_self(regs);
+}
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- /dev/null
+++ x/linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c
@@ -0,0 +1,78 @@
+/*
+ * machine_kexec.c - handle transition of Linux booting another kernel
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Losely based on arch/i386/kernel/machine_kexec.c
+ */
+
+#include <linux/kexec.h>
+#include <xen/interface/dom0_ops.h>
+#include <linux/mm.h>
+#include <asm/hypercall.h>
+
+const extern unsigned char relocate_new_kernel[];
+extern unsigned int relocate_new_kernel_size;
+
+/*
+ * A architecture hook called to validate the
+ * proposed image and prepare the control pages
+ * as needed.  The pages for KEXEC_CONTROL_CODE_SIZE
+ * have been allocated, but the segments have yet
+ * been copied into the kernel.
+ *
+ * Do what every setup is needed on image and the
+ * reboot code buffer to allow us to avoid allocations
+ * later.
+ *
+ * Currently nothing.
+ */
+int machine_kexec_prepare(struct kimage *image)
+{
+    struct dom0_op op;
+    op.cmd = DOM0_KEXEC;
+    op.u.kexec.op = KEXEC_CMD_kexec_prepare;
+    op.u.kexec.u.helper.data = 0;
+    return HYPERVISOR_dom0_op(&op);
+}
+
+/*
+ * Undo anything leftover by machine_kexec_prepare
+ * when an image is freed.
+ */
+void machine_kexec_cleanup(struct kimage *image)
+{
+    struct dom0_op op;
+    op.cmd = DOM0_KEXEC;
+    op.u.kexec.op = KEXEC_CMD_kexec_cleanup;
+    op.u.kexec.u.helper.data = 0;
+    HYPERVISOR_dom0_op(&op);
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+    struct dom0_op op;
+    op.cmd = DOM0_KEXEC;
+    op.u.kexec.op = KEXEC_CMD_kexec;
+    op.u.kexec.u.kexec.indirection_page = image->head;
+    op.u.kexec.u.kexec.reboot_code_buffer = 
+            pfn_to_mfn(page_to_pfn(image->control_code_page)) << PAGE_SHIFT;
+    op.u.kexec.u.kexec.start_address = image->start;
+    op.u.kexec.u.kexec.relocate_new_kernel = relocate_new_kernel;
+    op.u.kexec.u.kexec.relocate_new_kernel_size = relocate_new_kernel_size;
+    HYPERVISOR_dom0_op(&op);
+}
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- x/linux-2.6-xen-sparse/drivers/xen/core/reboot.c
+++ x/linux-2.6-xen-sparse/drivers/xen/core/reboot.c
@@ -370,6 +370,13 @@ static int __init setup_shutdown_event(v
 
 subsys_initcall(setup_shutdown_event);
 
+#ifdef CONFIG_KEXEC
+void machine_shutdown(void) 
+{
+ 	printk("machine_shutdown: does nothing\n");
+}
+#endif
+
 /*
  * Local variables:
  *  c-file-style: "linux"
--- x/ref-linux-2.6.16/drivers/base/cpu.c
+++ x/ref-linux-2.6.16/drivers/base/cpu.c
@@ -101,7 +101,11 @@ static ssize_t show_crash_notes(struct s
 	 * boot up and this data does not change there after. Hence this
 	 * operation should be safe. No locking required.
 	 */
+#ifndef CONFIG_XEN
 	addr = __pa(per_cpu_ptr(crash_notes, cpunum));
+#else
+	addr = virt_to_machine(per_cpu_ptr(crash_notes, cpunum));
+#endif
 	rc = sprintf(buf, "%Lx\n", addr);
 	return rc;
 }
--- x/ref-linux-2.6.16/kernel/kexec.c
+++ x/ref-linux-2.6.16/kernel/kexec.c
@@ -38,6 +38,20 @@ struct resource crashk_res = {
 	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
 };
 
+/* Kexec needs to know about the actually physical addresss.
+ * But in xen, a physical address is a pseudo-physical addresss. */
+#ifndef CONFIG_XEN
+#define kexec_page_to_pfn(page)  page_to_pfn(page)
+#define kexec_pfn_to_page(pfn)   pfn_to_page(pfn)
+#define kexec_virt_to_phys(addr) virt_to_phys(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(addr)
+#else
+#define kexec_page_to_pfn(page)  pfn_to_mfn(page_to_pfn(page))
+#define kexec_pfn_to_page(pfn)   pfn_to_page(mfn_to_pfn(pfn))
+#define kexec_virt_to_phys(addr) virt_to_machine(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
+#endif
+
 int kexec_should_crash(struct task_struct *p)
 {
 	if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
@@ -403,7 +417,7 @@ static struct page *kimage_alloc_normal_
 		pages = kimage_alloc_pages(GFP_KERNEL, order);
 		if (!pages)
 			break;
-		pfn   = page_to_pfn(pages);
+		pfn   = kexec_page_to_pfn(pages);
 		epfn  = pfn + count;
 		addr  = pfn << PAGE_SHIFT;
 		eaddr = epfn << PAGE_SHIFT;
@@ -437,6 +451,7 @@ static struct page *kimage_alloc_normal_
 	return pages;
 }
 
+#ifndef CONFIG_XEN
 static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
 						      unsigned int order)
 {
@@ -490,7 +505,7 @@ static struct page *kimage_alloc_crash_c
 		}
 		/* If I don't overlap any segments I have found my hole! */
 		if (i == image->nr_segments) {
-			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
+			pages = kexec_pfn_to_page(hole_start >> PAGE_SHIFT);
 			break;
 		}
 	}
@@ -517,6 +532,13 @@ struct page *kimage_alloc_control_pages(
 
 	return pages;
 }
+#else /* CONFIG_XEN */
+struct page *kimage_alloc_control_pages(struct kimage *image,
+					 unsigned int order)
+{
+	return kimage_alloc_normal_control_pages(image, order);
+}
+#endif /* CONFIG_XEN */
 
 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
 {
@@ -532,7 +554,7 @@ static int kimage_add_entry(struct kimag
 			return -ENOMEM;
 
 		ind_page = page_address(page);
-		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+		*image->entry = kexec_virt_to_phys(ind_page) | IND_INDIRECTION;
 		image->entry = ind_page;
 		image->last_entry = ind_page +
 				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
@@ -593,13 +615,13 @@ static int kimage_terminate(struct kimag
 #define for_each_kimage_entry(image, ptr, entry) \
 	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
 		ptr = (entry & IND_INDIRECTION)? \
-			phys_to_virt((entry & PAGE_MASK)): ptr +1)
+			kexec_phys_to_virt((entry & PAGE_MASK)): ptr +1)
 
 static void kimage_free_entry(kimage_entry_t entry)
 {
 	struct page *page;
 
-	page = pfn_to_page(entry >> PAGE_SHIFT);
+	page = kexec_pfn_to_page(entry >> PAGE_SHIFT);
 	kimage_free_pages(page);
 }
 
@@ -686,7 +708,7 @@ static struct page *kimage_alloc_page(st
 	 * have a match.
 	 */
 	list_for_each_entry(page, &image->dest_pages, lru) {
-		addr = page_to_pfn(page) << PAGE_SHIFT;
+		addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
 		if (addr == destination) {
 			list_del(&page->lru);
 			return page;
@@ -701,12 +723,12 @@ static struct page *kimage_alloc_page(st
 		if (!page)
 			return NULL;
 		/* If the page cannot be used file it away */
-		if (page_to_pfn(page) >
+		if (kexec_page_to_pfn(page) >
 				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
 			list_add(&page->lru, &image->unuseable_pages);
 			continue;
 		}
-		addr = page_to_pfn(page) << PAGE_SHIFT;
+		addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
 
 		/* If it is the destination page we want use it */
 		if (addr == destination)
@@ -729,7 +751,7 @@ static struct page *kimage_alloc_page(st
 			struct page *old_page;
 
 			old_addr = *old & PAGE_MASK;
-			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+			old_page = kexec_pfn_to_page(old_addr >> PAGE_SHIFT);
 			copy_highpage(page, old_page);
 			*old = addr | (*old & ~PAGE_MASK);
 
@@ -779,7 +801,7 @@ static int kimage_load_normal_segment(st
 			result  = -ENOMEM;
 			goto out;
 		}
-		result = kimage_add_page(image, page_to_pfn(page)
+		result = kimage_add_page(image, kexec_page_to_pfn(page)
 								<< PAGE_SHIFT);
 		if (result < 0)
 			goto out;
@@ -811,6 +833,7 @@ out:
 	return result;
 }
 
+#ifndef CONFIG_XEN
 static int kimage_load_crash_segment(struct kimage *image,
 					struct kexec_segment *segment)
 {
@@ -833,7 +856,7 @@ static int kimage_load_crash_segment(str
 		char *ptr;
 		size_t uchunk, mchunk;
 
-		page = pfn_to_page(maddr >> PAGE_SHIFT);
+		page = kexec_pfn_to_page(maddr >> PAGE_SHIFT);
 		if (page == 0) {
 			result  = -ENOMEM;
 			goto out;
@@ -881,6 +904,13 @@ static int kimage_load_segment(struct ki
 
 	return result;
 }
+#else /* CONFIG_XEN */
+static int kimage_load_segment(struct kimage *image,
+				struct kexec_segment *segment)
+{
+	return kimage_load_normal_segment(image, segment);
+}
+#endif /* CONFIG_XEN */
 
 /*
  * Exec Kernel system call: for obvious reasons only root may call it.
--- x/xen/arch/x86/Makefile
+++ x/xen/arch/x86/Makefile
@@ -38,6 +38,7 @@ obj-y += trampoline.o
 obj-y += traps.o
 obj-y += usercopy.o
 obj-y += x86_emulate.o
+obj-y += machine_kexec.o
 
 ifneq ($(pae),n)
 obj-$(x86_32) += shadow.o shadow_public.o shadow_guest32.o
--- x/xen/arch/x86/dom0_ops.c
+++ x/xen/arch/x86/dom0_ops.c
@@ -29,6 +29,13 @@
 #include <asm/mtrr.h>
 #include "cpu/mtrr/mtrr.h"
 
+extern int machine_kexec_prepare(struct dom0_kexec *arg);
+extern void machine_kexec_cleanup(struct dom0_kexec *arg);
+extern void machine_kexec(struct dom0_kexec *arg);
+
+extern unsigned int opt_kdump_megabytes;
+extern unsigned int opt_kdump_megabytes_base;
+
 #define TRC_DOM0OP_ENTER_BASE  0x00020000
 #define TRC_DOM0OP_LEAVE_BASE  0x00030000
 
@@ -445,6 +452,39 @@ long arch_do_dom0_op(struct dom0_op *op,
     }
     break;
 
+    case DOM0_KEXEC:
+        /* Sub-commands are the KEXEC_CMD_* values from public/xen.h. */
+        switch ( op->u.kexec.op )
+        {
+        case KEXEC_CMD_kexec:
+            machine_kexec(&op->u.kexec);
+            ret = -EINVAL; /* Only reached if the kexec did not happen */
+            break;
+        case KEXEC_CMD_kexec_prepare:
+            ret = machine_kexec_prepare(&op->u.kexec);
+            break;
+        case KEXEC_CMD_kexec_cleanup:
+            machine_kexec_cleanup(&op->u.kexec);
+            ret = 0;
+            break;
+        case KEXEC_CMD_reserve:
+            /* Report the kdump region, in bytes; widen before shifting. */
+            op->u.kexec.u.reserve.size = (unsigned long)opt_kdump_megabytes << 20;
+            op->u.kexec.u.reserve.start = (unsigned long)opt_kdump_megabytes_base << 20;
+            if ( unlikely(copy_to_guest(u_dom0_op, op, 1) != 0) )
+            {
+                printk("arch_do_dom0_op: kexec: copy_to_guest failed\n");
+                return -EFAULT;
+            }
+            ret = 0;
+            break;
+        default:
+            /* Unknown sub-command: fail explicitly rather than returning
+             * whatever value ret already held. */
+            ret = -EINVAL;
+            break;
+        }
+    break;
+
     default:
         ret = -ENOSYS;
         break;
--- /dev/null
+++ x/xen/arch/x86/machine_kexec.c
@@ -0,0 +1,174 @@
+/******************************************************************************
+ * arch/x86/machine_kexec.c
+ * 
+ * Created By: Horms
+ *
+ * Based heavily on arch/i386/machine_kexec.c from Linux 2.6.16
+ */
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/domain_page.h> 
+#include <xen/timer.h>
+#include <xen/sched.h>
+#include <asm/page.h> 
+#include <asm/flushtlb.h>
+#include <public/xen.h>
+#include <public/dom0_ops.h>
+
+#ifdef CONFIG_X86_32
+
+typedef asmlinkage void (*relocate_new_kernel_t)(
+                    unsigned long indirection_page,
+                    unsigned long reboot_code_buffer,
+                    unsigned long start_address,
+                    unsigned int has_pae);
+
+#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+
+#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L2_ATTR (_PAGE_PRESENT)
+
+#ifndef CONFIG_X86_PAE
+
+static u32 pgtable_level1[L1_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+/* Non-PAE: identity-map the page at 'address' in the page tables CR3 points to. */
+static void identity_map_page(unsigned long address)
+{
+    unsigned long mfn;
+    u32 *pgtable_level2;
+
+    /* Find the current page table: its frame number is taken from CR3 */
+    mfn = read_cr3() >> PAGE_SHIFT;
+    pgtable_level2 = map_domain_page(mfn);
+
+    /* Identity map the page; this overwrites the L2 slot covering 'address' */
+    pgtable_level1[l1_table_offset(address)] = address | L0_ATTR;
+    pgtable_level2[l2_table_offset(address)] = __pa(pgtable_level1) | L1_ATTR;
+
+    /* Flush the tlb so the new mapping takes effect.
+     * Global tlb entries are not flushed but that is not an issue.
+     */
+    write_cr3(mfn << PAGE_SHIFT);
+
+    unmap_domain_page(pgtable_level2);
+}
+
+#else
+static u64 pgtable_level1[L1_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+static u64 pgtable_level2[L2_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+/* PAE: identity-map the page at 'address' in the page tables CR3 points to. */
+static void identity_map_page(unsigned long address)
+{
+    unsigned long mfn; /* unsigned, so 'mfn << PAGE_SHIFT' cannot overflow */
+    intpte_t *pgtable_level3;
+
+    /* Find the current page table: its frame number is taken from CR3 */
+    mfn = read_cr3() >> PAGE_SHIFT;
+    pgtable_level3 = map_domain_page(mfn);
+
+    /* Identity map the page; this overwrites the L3 slot covering 'address' */
+    pgtable_level1[l1_table_offset(address)] = address | L0_ATTR;
+    pgtable_level2[l2_table_offset(address)] = __pa(pgtable_level1) | L1_ATTR;
+    set_64bit(&pgtable_level3[l3_table_offset(address)],
+              __pa(pgtable_level2) | L2_ATTR);
+
+    /* Flush the tlb so the new mapping takes effect.
+     * Global tlb entries are not flushed but that is not an issue.
+     */
+    load_cr3(mfn << PAGE_SHIFT);
+
+    unmap_domain_page(pgtable_level3);
+}
+#endif
+
+static void kexec_load_segments(void) /* reload every segment register */
+{
+#define __SSTR(X) #X
+#define SSTR(X) __SSTR(X) /* two steps, so X is macro-expanded before # */
+    __asm__ __volatile__ (
+        "\tljmp $"SSTR(__HYPERVISOR_CS)",$1f\n" /* far jump reloads %cs */
+        "\t1:\n"
+        "\tmovl $"SSTR(__HYPERVISOR_DS)",%%eax\n"
+        "\tmovl %%eax,%%ds\n" /* same selector for all data segments... */
+        "\tmovl %%eax,%%es\n"
+        "\tmovl %%eax,%%fs\n"
+        "\tmovl %%eax,%%gs\n"
+        "\tmovl %%eax,%%ss\n" /* ...and for the stack segment */
+        ::: "eax", "memory");
+#undef SSTR
+#undef __SSTR
+}
+
+#define kexec_load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
+/* Build a descriptor from base 'newidt' and 'limit', then load it with lidt. */
+static void kexec_set_idt(void *newidt, __u16 limit)
+{
+    struct Xgt_desc_struct curidt;
+
+    /* ia32 supports unaligned loads & stores */
+    curidt.size    = limit;
+    curidt.address = (unsigned long)newidt;
+
+    kexec_load_idt(&curidt);
+}
+
+#define kexec_load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
+/* Build a descriptor from base 'newgdt' and 'limit', then load it with lgdt. */
+static void kexec_set_gdt(void *newgdt, __u16 limit)
+{
+    struct Xgt_desc_struct curgdt;
+
+    /* ia32 supports unaligned loads & stores */
+    curgdt.size    = limit;
+    curgdt.address = (unsigned long)newgdt;
+    kexec_load_gdt(&curgdt);
+}
+
+#endif
+
+int machine_kexec_prepare(struct dom0_kexec *arg)
+{
+    return 0; /* nothing to prepare on i386 yet; 'arg' is unused */
+}
+/* Counterpart of machine_kexec_prepare(); currently there is nothing to undo. */
+void machine_kexec_cleanup(struct dom0_kexec *arg)
+{
+}
+
+void machine_kexec(struct dom0_kexec *arg)
+{
+#ifdef CONFIG_X86_32
+    relocate_new_kernel_t rnk;
+
+    local_irq_disable();
+    identity_map_page(arg->u.kexec.reboot_code_buffer);
+    /* Never jump into a partially copied stub: re-enable IRQs and return. */
+    if ( copy_from_user((void *)arg->u.kexec.reboot_code_buffer,
+                        arg->u.kexec.relocate_new_kernel,
+                        arg->u.kexec.relocate_new_kernel_size) != 0 )
+    {
+        local_irq_enable();
+        return;
+    }
+    /* From here on the descriptor tables are replaced; no return is expected. */
+    kexec_load_segments();
+    kexec_set_gdt(__va(0),0);
+    kexec_set_idt(__va(0),0);
+
+    rnk = (relocate_new_kernel_t) arg->u.kexec.reboot_code_buffer;
+    (*rnk)(arg->u.kexec.indirection_page, arg->u.kexec.reboot_code_buffer,
+           arg->u.kexec.start_address, cpu_has_pae);
+#endif
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/arch/x86/setup.c
+++ x/xen/arch/x86/setup.c
@@ -37,6 +37,11 @@ static unsigned int opt_xenheap_megabyte
 integer_param("xenheap_megabytes", opt_xenheap_megabytes);
 #endif
 
+unsigned int opt_kdump_megabytes = 0;
+integer_param("kdump_megabytes", opt_kdump_megabytes);
+unsigned int opt_kdump_megabytes_base = 0;
+integer_param("kdump_megabytes_base", opt_kdump_megabytes_base);
+
 /* opt_nosmp: If true, secondary processors are ignored. */
 static int opt_nosmp = 0;
 boolean_param("nosmp", opt_nosmp);
@@ -159,6 +164,20 @@ void discard_initial_images(void)
     init_domheap_pages(initial_images_start, initial_images_end);
 }
 
+void __init move_memory(unsigned long dst,
+                        unsigned long src_start, unsigned long src_end)
+{
+    /* Relocate [src_start, src_end) to dst; memmove copes with overlap. */
+    unsigned long nr_bytes = src_end - src_start;
+
+#if defined(CONFIG_X86_32)
+    /* use low mapping for both destination and source */
+    memmove((void *)dst, (void *)src_start, nr_bytes);
+#elif defined(CONFIG_X86_64)
+    memmove(__va(dst), __va(src_start), nr_bytes);
+#endif
+}
+
 void __init __start_xen(multiboot_info_t *mbi)
 {
     char *cmdline;
@@ -289,15 +308,8 @@ void __init __start_xen(multiboot_info_t
         initial_images_start = xenheap_phys_end;
     initial_images_end = initial_images_start + modules_length;
 
-#if defined(CONFIG_X86_32)
-    memmove((void *)initial_images_start,  /* use low mapping */
-            (void *)mod[0].mod_start,      /* use low mapping */
-            mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
-#elif defined(CONFIG_X86_64)
-    memmove(__va(initial_images_start),
-            __va(mod[0].mod_start),
-            mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
-#endif
+    move_memory(initial_images_start, 
+                mod[0].mod_start, mod[mbi->mods_count-1].mod_end);
 
     /* Initialise boot-time allocator with all RAM situated after modules. */
     xenheap_phys_start = init_boot_allocator(__pa(&_end));
@@ -344,6 +356,59 @@ void __init __start_xen(multiboot_info_t
 #endif
     }
 
+    if (opt_kdump_megabytes) {
+        unsigned long kdump_start, kdump_size, k;
+
+        /* mark image pages as free for now */
+
+        init_boot_pages(initial_images_start, initial_images_end);
+
+        /* Widen before shifting so the byte values are not computed in 32 bits. */
+        kdump_start = (unsigned long)opt_kdump_megabytes_base << 20;
+        kdump_size = (unsigned long)opt_kdump_megabytes << 20;
+
+        printk("Kdump: %luMB (%lukB) at 0x%lx\n",
+               kdump_size >> 20,
+               kdump_size >> 10,
+               kdump_start);
+
+        if ((kdump_start & ~PAGE_MASK) || (kdump_size & ~PAGE_MASK))
+            panic("Kdump parameters not page aligned\n");
+
+        /*
+         * alloc_boot_pages_at() returns 0 on failure, which would be
+         * indistinguishable from a successful reservation at page 0.
+         */
+        if (!kdump_start)
+            panic("Kdump: kdump_megabytes_base must be non-zero\n");
+
+        kdump_start >>= PAGE_SHIFT;
+        kdump_size >>= PAGE_SHIFT;
+
+        /* allocate pages for Kdump memory area */
+
+        k = alloc_boot_pages_at(kdump_size, kdump_start);
+
+        if (k != kdump_start)
+            panic("Unable to reserve Kdump memory\n");
+
+        /* allocate pages for relocated initial images */
+
+        k = ((initial_images_end - initial_images_start) & ~PAGE_MASK) ? 1 : 0;
+        k += (initial_images_end - initial_images_start) >> PAGE_SHIFT;
+
+        k = alloc_boot_pages(k, 1);
+
+        if (!k)
+            panic("Unable to allocate initial images memory\n");
+
+        move_memory(k << PAGE_SHIFT, initial_images_start, initial_images_end);
+
+        initial_images_end -= initial_images_start;
+        initial_images_start = k << PAGE_SHIFT;
+        initial_images_end += initial_images_start;
+    }
+
     memguard_init();
 
     printk("System RAM: %luMB (%lukB)\n", 
--- x/xen/common/page_alloc.c
+++ x/xen/common/page_alloc.c
@@ -212,24 +212,35 @@ void init_boot_pages(paddr_t ps, paddr_t
     }
 }
 
+/* Claim exactly [pfn_at, pfn_at + nr_pfns); returns pfn_at, or 0 on failure. */
+unsigned long alloc_boot_pages_at(unsigned long nr_pfns, unsigned long pfn_at)
+{
+    unsigned long i;
+
+    /* Reject ranges that wrap around or extend beyond max_page. */
+    if ( (pfn_at + nr_pfns) < pfn_at || (pfn_at + nr_pfns) > max_page )
+        return 0;
+
+    for ( i = 0; i < nr_pfns; i++ )
+        if ( allocated_in_map(pfn_at + i) )
+            return 0;
+
+    map_alloc(pfn_at, nr_pfns);
+    return pfn_at;
+}
+
 unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align)
 {
-    unsigned long pg, i;
+    unsigned long pg, i = 0;
 
     for ( pg = 0; (pg + nr_pfns) < max_page; pg += pfn_align )
     {
-        for ( i = 0; i < nr_pfns; i++ )
-            if ( allocated_in_map(pg + i) )
-                 break;
-
-        if ( i == nr_pfns )
-        {
-            map_alloc(pg, nr_pfns);
-            return pg;
-        }
+        i = alloc_boot_pages_at(nr_pfns, pg);
+        if (i != 0)
+            break;
     }
 
-    return 0;
+    return i;
 }
 
 
--- x/xen/include/public/dom0_ops.h
+++ x/xen/include/public/dom0_ops.h
@@ -472,6 +472,28 @@ typedef struct dom0_hypercall_init {
 } dom0_hypercall_init_t;
 DEFINE_GUEST_HANDLE(dom0_hypercall_init_t);
 
+#define DOM0_KEXEC   49
+typedef struct dom0_kexec{
+    unsigned long op; /* one of the KEXEC_CMD_* values */
+    union {
+        struct {
+            unsigned long data; /* Not sure what this should be yet */
+        } helper;
+        struct { /* arguments for KEXEC_CMD_kexec */
+            unsigned long indirection_page;
+            unsigned long reboot_code_buffer; /* identity-mapped, then jumped to */
+            unsigned long start_address;
+            const char *relocate_new_kernel; /* guest pointer to the stub code */
+            unsigned int relocate_new_kernel_size; /* stub length in bytes */
+        } kexec;
+        struct { /* filled in by Xen for KEXEC_CMD_reserve */
+            unsigned long size; /* in bytes */
+            unsigned long start; /* byte address */
+        } reserve;
+    } u;
+} dom0_kexec_t;
+DEFINE_GUEST_HANDLE(dom0_kexec_t);
+
 typedef struct dom0_op {
     uint32_t cmd;
     uint32_t interface_version; /* DOM0_INTERFACE_VERSION */
@@ -513,6 +535,7 @@ typedef struct dom0_op {
         struct dom0_irq_permission    irq_permission;
         struct dom0_iomem_permission  iomem_permission;
         struct dom0_hypercall_init    hypercall_init;
+        struct dom0_kexec             kexec;
         uint8_t                       pad[128];
     } u;
 } dom0_op_t;
--- x/xen/include/public/xen.h
+++ x/xen/include/public/xen.h
@@ -215,6 +215,14 @@ DEFINE_GUEST_HANDLE(mmuext_op_t);
 #define VMASST_TYPE_writable_pagetables  2
 #define MAX_VMASST_TYPE 2
 
+/*
+ * Operations for kexec.
+ */
+#define KEXEC_CMD_kexec                 0
+#define KEXEC_CMD_kexec_prepare         1
+#define KEXEC_CMD_kexec_cleanup         2
+#define KEXEC_CMD_reserve               3
+
 #ifndef __ASSEMBLY__
 
 typedef uint16_t domid_t;
--- x/xen/include/xen/mm.h
+++ x/xen/include/xen/mm.h
@@ -40,6 +40,7 @@ struct page_info;
 paddr_t init_boot_allocator(paddr_t bitmap_start);
 void init_boot_pages(paddr_t ps, paddr_t pe);
 unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align);
+unsigned long alloc_boot_pages_at(unsigned long nr_pfns, unsigned long pfn_at);
 void end_boot_allocator(void);
 
 /* Generic allocator. These functions are *not* interrupt-safe. */

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Re: [PATCH]: kexec: framework and i386
  2006-04-17  6:06   ` Horms
  2006-04-21  1:28     ` [PATCH]: kexec: framework and i386 (Take IV) Horms
@ 2006-04-21  6:10     ` Akio Takebe
  2006-04-21  6:55       ` horms-home
  2006-04-23 14:45       ` Mark Williamson
  1 sibling, 2 replies; 68+ messages in thread
From: Akio Takebe @ 2006-04-21  6:10 UTC (permalink / raw)
  To: Horms, xen-devel, mark.williamson; +Cc: Magnus Damm

Hi, Horms and Mark

This is good work!
It is very necessary for debugging hypervisor or domain0.
I'm not clear at some points.

1. Is this feature available on uni-processor machine?
2. Could you explain more detail usage?

Mark, what do you think about this kdump implementation?

Best Regards,

Akio Takebe

>On Wed, Apr 12, 2006 at 06:12:30PM +0900, Horms wrote:
>> 
>> kexec: framework and i386
>
>[snip]
>
>>   * This patch was prepared against xen-unstable.hg 9514
>>     As of today (9574) two new hypercalls have been added.
>>     I rediffed and moved the kexec hypercall to 33. However
>>     this exceedes hypercall_NR, which is currently 32. 
>>     I tried increasing this, but the dom0 now crashes 
>>     in entry.S on init. Even after rebuilding both xen and the kernel
>>     completely from scratch after a make distclean. Help!!
>
>Hi,
>
>I am a bit concerned that this patch is going to start rotting if I
>can't at least track the current xen-unstable.hg, or better still get it
>merged.
>
>I would really appreciate it if someone could take moments to comment on
>the hypercall problem. Is adding a new hypercall, as the current patch
>does, the best way? If so could someone point me to how to increase the
>hypercall table size. If not, is it best to piggyback of the dom0_op
>hypercall? Or is there some other prefered option?
>
>-- 
>Horms
>
>_______________________________________________
>Xen-devel mailing list
>Xen-devel@lists.xensource.com
>http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Re: [PATCH]: kexec: framework and i386
  2006-04-21  6:10     ` Re: [PATCH]: kexec: framework and i386 Akio Takebe
@ 2006-04-21  6:55       ` horms-home
  2006-04-21  7:53         ` Akio Takebe
  2006-04-23 14:45       ` Mark Williamson
  1 sibling, 1 reply; 68+ messages in thread
From: horms-home @ 2006-04-21  6:55 UTC (permalink / raw)
  To: Akio Takebe; +Cc: Magnus Damm, xen-devel, mark.williamson

On Fri, Apr 21, 2006 at 03:10:33PM +0900, Akio Takebe wrote:
> Hi, Horms and Mark
> 
> This is good work!
> It is very necessary for debugging hypervisor or domain0.
> I'm not clear at some points.
> 
> 1. Is this feature available on uni-processor machine?

At this stage only uni-processor is supported. However, the framework
supports SMP, and Magnus and I are planning to fill in those bits soon.
Actually, this is the next thing on my list of things to do.

> 2. Could you explain more detail usage?

The usage is mostly the same as kexec in the Linux kernel.  You can
kexec from xen into another kernel by using kexec -l, kexec -e, as per
linux. 

kexec -l /boot/vmlinux --append "ro root=/dev/hda..."
kexec -e

And you can load a kernel that will be run on system crash using
kexec -p. This is discussed at some length in Documentation/kdump/kdump.txt
of the linux source tree. Those instructions can be followed
verbatim for xen.

The main difference with kdump is that instead of using the crashkernel
command line option to linux, you use the kdump_megabytes and
kdump_megabytes_base command line options to xen. When running xen the
crashkernel linux command line option is ignored.

The reason for moving the option from linux to xen is that it seems that
the memory needs to be reserved by xen before it starts dom0. The reason
that there are two options instead of one is for simplicity.  Linux
provides infrastructure to read the more complex compound option, xen
does not. This can be changed if it is a problem.

In summary, for kdump "linux crashkernel=64M@16M"
becomes "xen kdump_megabytes=64 kdump_megabytes_base=16"

The other main point to note for users is that while the xen -> linux
transition is possible, xen -> xen and linux -> xen are currently not
possible.  This is because kexec-tools, the user-space component of
kexec, does not understand enough about xen, and thus needs to be
enhanced in order to make this possible.

> Mark, what do you think about this kdump implementation?
> 
> Best Regards,
> 
> Akio Takebe

-- 
Horms

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Re: [PATCH]: kexec: framework and i386
  2006-04-21  6:55       ` horms-home
@ 2006-04-21  7:53         ` Akio Takebe
  0 siblings, 0 replies; 68+ messages in thread
From: Akio Takebe @ 2006-04-21  7:53 UTC (permalink / raw)
  To: horms-home; +Cc: Akio Takebe, Magnus Damm, xen-devel, mark.williamson

Hi, Horms

Thank you for your kind explanation.

I have a small question.
Is this kdump invoked only when domain0 panics?
Is it also invoked when the Xen hypervisor itself panics?
I think the dump feature needs to be callable from the NMI handler
and from panic(), as in the linux code.

Best Regards,

Akio Takebe

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Re: [PATCH]: kexec: framework and i386
  2006-04-21  6:10     ` Re: [PATCH]: kexec: framework and i386 Akio Takebe
  2006-04-21  6:55       ` horms-home
@ 2006-04-23 14:45       ` Mark Williamson
  2006-04-24  1:10         ` Akio Takebe
  1 sibling, 1 reply; 68+ messages in thread
From: Mark Williamson @ 2006-04-23 14:45 UTC (permalink / raw)
  To: Akio Takebe; +Cc: Magnus Damm, Horms, xen-devel

> This is good work!
> It is very necessary for debugging hypervisor or domain0.

Agreed.

> Mark, what do you think about this kdump implementation?

I think kdump, in general, is a nifty solution for supporting crashdumps, 
since using a separate kernel for crashdumps gives you the best possible 
opportunity to complete them successfully.

Integrating with the Linux crashdump infrastructure seems like a good idea.  
It doesn't stop other dom0 OSes using their own crashdump infrastructure, but 
with kdump we have a chance of getting a dump even if Xen itself crashes 
(whilst admittedly rare, such crashes are something you'd want to get 
debugged quickly!)

So, essentially, I think the idea is good - I'll try and take a look through 
the code.

Cheers,
Mark

> Best Regards,
>
> Akio Takebe
>
> >On Wed, Apr 12, 2006 at 06:12:30PM +0900, Horms wrote:
> >> kexec: framework and i386
> >
> >[snip]
> >
> >>   * This patch was prepared against xen-unstable.hg 9514
> >>     As of today (9574) two new hypercalls have been added.
> >>     I rediffed and moved the kexec hypercall to 33. However
> >>     this exceedes hypercall_NR, which is currently 32.
> >>     I tried increasing this, but the dom0 now crashes
> >>     in entry.S on init. Even after rebuilding both xen and the kernel
> >>     completely from scratch after a make distclean. Help!!
> >
> >Hi,
> >
> >I am a bit concerned that this patch is going to start rotting if I
> >can't at least track the current xen-unstable.hg, or better still get it
> >merged.
> >
> >I would really appreciate it if someone could take moments to comment on
> >the hypercall problem. Is adding a new hypercall, as the current patch
> >does, the best way? If so could someone point me to how to increase the
> >hypercall table size. If not, is it best to piggyback of the dom0_op
> >hypercall? Or is there some other prefered option?
> >
> >--
> >Horms
> >
> >_______________________________________________
> >Xen-devel mailing list
> >Xen-devel@lists.xensource.com
> >http://lists.xensource.com/xen-devel

-- 
Dave: Just a question. What use is a unicyle with no seat?  And no pedals!
Mark: To answer a question with a question: What use is a skateboard?
Dave: Skateboards have wheels.
Mark: My wheel has a wheel!

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Re: [PATCH]: kexec: framework and i386
  2006-04-23 14:45       ` Mark Williamson
@ 2006-04-24  1:10         ` Akio Takebe
  2006-04-24  1:53           ` Hypercall number assignment convension (was Re: Re: [PATCH]: kexec: framework and i386) Isaku Yamahata
  0 siblings, 1 reply; 68+ messages in thread
From: Akio Takebe @ 2006-04-24  1:10 UTC (permalink / raw)
  To: Mark Williamson; +Cc: Magnus Damm, Horms, Akio Takebe, xen-devel

Hi, Mark

Thank you for your comments!

>I think kdump, in general, is a nifty solution for supporting crashdumps, 
>since using a separate kernel for crashdumps gives you the best possible 
>opportunity to complete them successfully.
>
>Integrating with the Linux crashdump infrastructure seems like a good idea.  
>It doesn't stop other dom0 OSes using their own crashdump infrastructure, 
>but 
>with kdump we have a chance of getting a dump even if Xen itself crashes 
>(whilst admittedly rare, such crashes are something you'd want to get 
>debugged quickly!)
>
Yes, I also agree.

>So, essentially, I think the idea is good - I'll try and take a look through 
>the code.
>
I also try to read a new patch.
http://lists.xensource.com/archives/html/xen-devel/2006-04/msg00968.html

Can the kexec hypercall use Hypercall 11 or 30?
I think using dom0_op rather than a new hypercall is a good idea,
because kexec is rarely used. :-)

Best Regards,

Akio Takebe

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Hypercall number assignment convension (was Re: Re: [PATCH]: kexec: framework and i386)
  2006-04-24  1:10         ` Akio Takebe
@ 2006-04-24  1:53           ` Isaku Yamahata
  2006-04-24  7:32             ` Keir Fraser
  0 siblings, 1 reply; 68+ messages in thread
From: Isaku Yamahata @ 2006-04-24  1:53 UTC (permalink / raw)
  To: Akio Takebe; +Cc: Magnus Damm, Horms, Mark Williamson, xen-devel



On Mon, Apr 24, 2006 at 10:10:36AM +0900, Akio Takebe wrote:

> Can kexec hypecll use Hypercall 11 or 30?
> I think using not hypercall but dom0_op is good idea 
> because using kexec is rare. :-)

I think Rusty's xen share also had a similar problem caused by
a hypercall number conflict.
Xen/ia64 with the virtual physical model also needs a hypercall number
for its own use.
Currently it is large enough (=256) that it is unlikely to be used by xen/x86.

Is there any convention about how to allocate hypercall numbers?
At least hypercall numbers for arch-specific purposes and
experimental purposes should be defined.
-- 
yamahata

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Hypercall number assignment convension (was Re: Re: [PATCH]: kexec: framework and i386)
  2006-04-24  1:53           ` Hypercall number assignment convension (was Re: Re: [PATCH]: kexec: framework and i386) Isaku Yamahata
@ 2006-04-24  7:32             ` Keir Fraser
  2006-04-24 11:20               ` Muli Ben-Yehuda
                                 ` (2 more replies)
  0 siblings, 3 replies; 68+ messages in thread
From: Keir Fraser @ 2006-04-24  7:32 UTC (permalink / raw)
  To: Isaku Yamahata
  Cc: Akio Takebe, Magnus Damm, xen-devel, Mark Williamson, Horms


On 24 Apr 2006, at 02:53, Isaku Yamahata wrote:

> I think Rusty's xen share also had a similar problem caused by
> the hypercall number conflict.
> Xen/ia64 with virtual physical model also needs a hypercall number
> for its own use.
> Currently it large enough (=256) that it is unlikly to be used by 
> xen/x86.
>
> Is there any convension about how to take hypercall number?
> At least hypercall numbers for arch-specific purpose and
> experimental purpose should be defined.

The list of __HYPERVISOR_* defines in public/xen.h in the main xen 
repository is the canonical place. For hypercalls in our tree, simply 
grabbing the next number in sequence usually makes sense. I'm not sure 
whether having structure to the hypercall numbers makes sense (e.g., a 
range for arch-specific usage) -- if so then maybe allocating from 64 
upwards would make sense.

  -- Keir

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Hypercall number assignment convension (was Re: Re: [PATCH]: kexec: framework and i386)
  2006-04-24  7:32             ` Keir Fraser
@ 2006-04-24 11:20               ` Muli Ben-Yehuda
  2006-04-25  0:11               ` Horms
  2006-04-26  2:09               ` Hypercall number assignment convension (was Re: Re: [PATCH]: kexec: framework and i386) Isaku Yamahata
  2 siblings, 0 replies; 68+ messages in thread
From: Muli Ben-Yehuda @ 2006-04-24 11:20 UTC (permalink / raw)
  To: Keir Fraser
  Cc: xen-devel, Akio Takebe, Isaku Yamahata, Magnus Damm, Horms,
	Mark Williamson

On Mon, Apr 24, 2006 at 08:32:09AM +0100, Keir Fraser wrote:

> The list of __HYPERVISOR_* defines in public/xen.h in the main xen 
> repository is the canonical place. For hypercalls in our tree, simply 
> grabbing the next number in sequence usually makes sense. I'm not sure 
> whether having structure to the hypercall numbers makes sense (e.g., a 
> range for arch-specific usage) -- if so then maybe allocating from 64 
> upwards would make sense.

Won't having discontiguous regions of hypercalls break the NR_hypercalls
masking check?

Cheers,
Muli

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Hypercall number assignment convension (was Re: Re: [PATCH]: kexec: framework and i386)
  2006-04-24  7:32             ` Keir Fraser
  2006-04-24 11:20               ` Muli Ben-Yehuda
@ 2006-04-25  0:11               ` Horms
  2006-04-25  9:57                 ` Keir Fraser
  2006-04-26  2:09               ` Hypercall number assignment convension (was Re: Re: [PATCH]: kexec: framework and i386) Isaku Yamahata
  2 siblings, 1 reply; 68+ messages in thread
From: Horms @ 2006-04-25  0:11 UTC (permalink / raw)
  To: Keir Fraser
  Cc: Isaku Yamahata, Magnus Damm, xen-devel, Mark Williamson,
	Akio Takebe

On Mon, Apr 24, 2006 at 08:32:09AM +0100, Keir Fraser wrote:
> 
> On 24 Apr 2006, at 02:53, Isaku Yamahata wrote:
> 
> >I think Rusty's xen share also had a similar problem caused by
> >the hypercall number conflict.
> >Xen/ia64 with virtual physical model also needs a hypercall number
> >for its own use.
> >Currently it large enough (=256) that it is unlikly to be used by 
> >xen/x86.
> >
> >Is there any convension about how to take hypercall number?
> >At least hypercall numbers for arch-specific purpose and
> >experimental purpose should be defined.
> 
> The list of __HYPERVISOR_* defines in public/xen.h in the main xen 
> repository is the canonical place. For hypercalls in our tree, simply 
> grabbing the next number in sequence usually makes sense. I'm not sure 
> whether having structure to the hypercall numbers makes sense (e.g., a 
> range for arch-specific usage) -- if so then maybe allocating from 64 
> upwards would make sense.

There is a small problem, in that for x86_32 at least the hypercall
table is currently full with 32 entries (well, the last time I checked
anyway), and my attempts to extend it were futile. Could you give me
some advice on how to increase its size?

-- 
Horms

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Hypercall number assignment convension (was Re: Re: [PATCH]: kexec: framework and i386)
  2006-04-25  0:11               ` Horms
@ 2006-04-25  9:57                 ` Keir Fraser
  2006-04-26  6:08                   ` [PATCH]: kexec: framework and i386 Take V Horms
  0 siblings, 1 reply; 68+ messages in thread
From: Keir Fraser @ 2006-04-25  9:57 UTC (permalink / raw)
  To: Horms; +Cc: Isaku Yamahata, Magnus Damm, xen-devel, Mark Williamson,
	Akio Takebe


On 25 Apr 2006, at 01:11, Horms wrote:

> There is a small problem, in that for x86_32 at least the hypercall
> table is currently full with 32 entries (well, the last time I checked
> anyway), and my attempts to extend it were futile. Could you give me
> some advice on how to increase its size?

Double NR_hypercalls in include/asm-x86/config.h.

  -- Keir

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Hypercall number assignment convension (was Re: Re: [PATCH]: kexec: framework and i386)
  2006-04-24  7:32             ` Keir Fraser
  2006-04-24 11:20               ` Muli Ben-Yehuda
  2006-04-25  0:11               ` Horms
@ 2006-04-26  2:09               ` Isaku Yamahata
  2 siblings, 0 replies; 68+ messages in thread
From: Isaku Yamahata @ 2006-04-26  2:09 UTC (permalink / raw)
  To: Keir Fraser; +Cc: Mark Williamson, Magnus Damm, xen-devel, Akio Takebe, Horms

[-- Attachment #1: Type: text/plain, Size: 697 bytes --]


On Mon, Apr 24, 2006 at 08:32:09AM +0100, Keir Fraser wrote:

> The list of __HYPERVISOR_* defines in public/xen.h in the main xen 
> repository is the canonical place. For hypercalls in our tree, simply 
> grabbing the next number in sequence usually makes sense. I'm not sure 
> whether having structure to the hypercall numbers makes sense (e.g., a 
> range for arch-specific usage) -- if so then maybe allocating from 64 
> upwards would make sense.

Actually xen/ia64 requires only one hypercall number for now.
I attached the patches to take one.
I'm not sure what name you prefer, so I attached two patches.
Please apply whichever you prefer (or invent whatever name you like).

-- 
yamahata

[-- Attachment #2: take_arch_specific_hypercall_number.patch --]
[-- Type: text/plain, Size: 532 bytes --]

take arch-specific purpose hypercall number.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>

diff -r 1ad06bd6832d xen/include/public/xen.h
--- a/xen/include/public/xen.h	Tue Apr 25 18:22:11 2006 +0100
+++ b/xen/include/public/xen.h	Wed Apr 26 11:00:04 2006 +0900
@@ -62,6 +62,7 @@
 #define __HYPERVISOR_sched_op             29
 #define __HYPERVISOR_callback_op          30
 #define __HYPERVISOR_xenoprof_op          31
+#define __HYPERVISOR_arch_specific_0      32 /* arch-specific purpose */
 
 /* 
  * VIRTUAL INTERRUPTS

[-- Attachment #3: take_ia64_dom0vp_op.patch --]
[-- Type: text/plain, Size: 512 bytes --]

take ia64 specific hypercall number.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>

diff -r 1ad06bd6832d xen/include/public/xen.h
--- a/xen/include/public/xen.h	Tue Apr 25 18:22:11 2006 +0100
+++ b/xen/include/public/xen.h	Wed Apr 26 11:01:19 2006 +0900
@@ -62,6 +62,7 @@
 #define __HYPERVISOR_sched_op             29
 #define __HYPERVISOR_callback_op          30
 #define __HYPERVISOR_xenoprof_op          31
+#define __HYPERVISOR_ia64_dom0vp_op       32 /* ia64 only */
 
 /* 
  * VIRTUAL INTERRUPTS

[-- Attachment #4: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH]: kexec: framework and i386 Take V
  2006-04-25  9:57                 ` Keir Fraser
@ 2006-04-26  6:08                   ` Horms
  2006-05-02  8:17                     ` [PATCH]: kexec: framework and i386 (Take VI) Simon Horman [Horms]
  0 siblings, 1 reply; 68+ messages in thread
From: Horms @ 2006-04-26  6:08 UTC (permalink / raw)
  To: Keir Fraser
  Cc: Isaku Yamahata, Magnus Damm, xen-devel, Mark Williamson,
	Akio Takebe

On Tue, Apr 25, 2006 at 10:57:11AM +0100, Keir Fraser wrote:
> 
> On 25 Apr 2006, at 01:11, Horms wrote:
> 
> >There is a small problem, in that for x86_32 at least the hypercall
> >table is currently full with 32 entries (well, the last time I checked
> >anyway), and my attempts to extend it were futile. Could you give me
> >some advice on how to increase its size?
> 
> Double NR_hypercalls in include/asm-x86/config.h.


Thanks, the new version of the kexec patch below does just that.
I did try that before, but for some reason it didn't work,
most likely because it was Friday afternoon at the time.

Also, this patch takes hypercall 32, which conflicts with Yamahata-san's
ia64 hypercall. Should I move to 33 and submit a patch that just does
hypercall reservation as he did?

-- 
Horms                                           http://www.vergenet.net/~horms/

kexec: framework and i386

This is an implementation of kexec for dom0/xen, that allows
kexecing of the physical machine from xen. The approach taken is
to move the architecture-dependent kexec code into a new hypercall.

Some notes:
  * machine_kexec_cleanup() and machine_kexec_prepare() don't do
    anything in i386. So while this patch adds a framework for them,
    I am not sure what parameters are needed at this stage.
  * Only works for UP, as machine_shutdown is not implemented yet
  * kexecing into xen does not seem to work, I think that 
    kexec-tools needs updating, but I have not investigated yet
  * Kdump works by first copying the kernel into dom0 segments
    and relocating them later in xen, the same way that kexec does.
    The only difference is that the relocation is made into
    an area reserved by xen
  * Kdump reservation is made using the xen command line parameters,
    kdump_megabytes and kdump_megabytes_base, rather than
    the linux option crashkernel, which is now ignored.
    Two parameters are used instead of one to simplify parsing.
    This can be cleaned up later if desired. But the reservation
    seems to need to be made by xen to make sure that it happens
    early enough.
  * This patch uses a new kexec hypercall

Highlights since the previous posted version:
 
  * Use new kexec hypercall instead of dom0_op
    - hypercall table is expanded to 64 entries

Prepared by Horms and Magnus Damm

Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>
Signed-Off-By: Horms <horms@verge.net.au>

 linux-2.6-xen-sparse/arch/i386/Kconfig                         |    2 
 linux-2.6-xen-sparse/arch/i386/kernel/Makefile                 |    2 
 linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c              |   24 +
 linux-2.6-xen-sparse/drivers/xen/core/Makefile                 |    1 
 linux-2.6-xen-sparse/drivers/xen/core/crash.c                  |   98 +++++
 linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c          |   73 ++++
 linux-2.6-xen-sparse/drivers/xen/core/reboot.c                 |    7 
 linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h |   10 
 ref-linux-2.6.16/drivers/base/cpu.c                            |    4 
 ref-linux-2.6.16/kernel/kexec.c                                |   52 ++
 xen/arch/x86/Makefile                                          |    1 
 xen/arch/x86/dom0_ops.c                                        |    3 
 xen/arch/x86/machine_kexec.c                                   |  174 ++++++++++
 xen/arch/x86/setup.c                                           |   75 +++-
 xen/arch/x86/x86_32/entry.S                                    |    2 
 xen/common/Makefile                                            |    1 
 xen/common/kexec.c                                             |   71 ++++
 xen/common/page_alloc.c                                        |   33 +
 xen/include/asm-x86/config.h                                   |    2 
 xen/include/asm-x86/hypercall.h                                |    5 
 xen/include/public/kexec.h                                     |   43 ++
 xen/include/public/xen.h                                       |    9 
 xen/include/xen/mm.h                                           |    1 
 23 files changed, 659 insertions(+), 34 deletions(-)

--- x/linux-2.6-xen-sparse/arch/i386/Kconfig
+++ x/linux-2.6-xen-sparse/arch/i386/Kconfig
@@ -726,7 +726,7 @@ source kernel/Kconfig.hz
 
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && !X86_XEN
+	depends on EXPERIMENTAL
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
--- x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
+++ x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
@@ -89,7 +89,7 @@ include $(srctree)/scripts/Makefile.xen
 
 obj-y += fixup.o
 microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o
-n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o
+n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o machine_kexec.o crash.o
 
 obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
 obj-y := $(call cherrypickxen, $(obj-y))
--- x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
+++ x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
@@ -68,6 +68,10 @@
 #include "setup_arch_pre.h"
 #include <bios_ebda.h>
 
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
 /* Forward Declaration. */
 void __init find_max_pfn(void);
 
@@ -932,6 +936,7 @@ static void __init parse_cmdline_early (
 		 * after a kernel panic.
 		 */
 		else if (!memcmp(from, "crashkernel=", 12)) {
+#ifndef CONFIG_XEN
 			unsigned long size, base;
 			size = memparse(from+12, &from);
 			if (*from == '@') {
@@ -942,6 +947,10 @@ static void __init parse_cmdline_early (
 				crashk_res.start = base;
 				crashk_res.end   = base + size - 1;
 			}
+#else
+			printk("Ignoring crashkernel command line, "
+			       "parameter will be supplied by xen\n");
+#endif
 		}
 #endif
 #ifdef CONFIG_PROC_VMCORE
@@ -1318,9 +1327,21 @@ void __init setup_bootmem_allocator(void
 	}
 #endif
 #ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
 	if (crashk_res.start != crashk_res.end)
 		reserve_bootmem(crashk_res.start,
 			crashk_res.end - crashk_res.start + 1);
+#else
+	{
+		struct kexec_arg xen_kexec_arg;
+		BUG_ON(HYPERVISOR_kexec(KEXEC_CMD_reserve, &xen_kexec_arg));
+		if (xen_kexec_arg.u.reserve.size) {
+			crashk_res.start = xen_kexec_arg.u.reserve.start;
+			crashk_res.end = xen_kexec_arg.u.reserve.start + 
+				xen_kexec_arg.u.reserve.size - 1;
+		}
+	}
+#endif
 #endif
 
 	if (!xen_feature(XENFEAT_auto_translated_physmap))
@@ -1395,6 +1416,9 @@ legacy_init_iomem_resources(struct resou
 		res->end = map[i].end - 1;
 		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 		request_resource(&iomem_resource, res);
+#ifdef CONFIG_KEXEC
+        request_resource(res, &crashk_res);
+#endif
 	}
 
 	free_bootmem(__pa(map), PAGE_SIZE);
--- x/linux-2.6-xen-sparse/drivers/xen/core/Makefile
+++ x/linux-2.6-xen-sparse/drivers/xen/core/Makefile
@@ -9,3 +9,4 @@ obj-$(CONFIG_NET)     += skbuff.o
 obj-$(CONFIG_SMP)     += smpboot.o
 obj-$(CONFIG_SYSFS)   += hypervisor_sysfs.o
 obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o
+obj-$(CONFIG_KEXEC)   += machine_kexec.o crash.o
--- /dev/null
+++ x/linux-2.6-xen-sparse/drivers/xen/core/crash.c
@@ -0,0 +1,98 @@
+/*
+ * Architecture specific (i386-xen) functions for kexec based crash dumps.
+ *
+ * Created by: Horms <horms@verge.net.au>
+ *
+ */
+
+#include <linux/kernel.h> /* For printk */
+
+/* XXX: final_note(), crash_save_this_cpu() and crash_save_self()
+ * are copied from arch/i386/kernel/crash.c. It might be good to either
+ * make the original functions non-static and use them, or just
+ * merge this into that file.
+ */
+#include <linux/elf.h>     /* For struct elf_note */
+#include <linux/elfcore.h> /* For struct elf_prstatus */
+#include <linux/kexec.h>   /* crash_notes */
+
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+							       size_t data_len)
+{
+	struct elf_note note;
+
+	note.n_namesz = strlen(name) + 1;
+	note.n_descsz = data_len;
+	note.n_type   = type;
+	memcpy(buf, &note, sizeof(note));
+	buf += (sizeof(note) +3)/4;
+	memcpy(buf, name, note.n_namesz);
+	buf += (note.n_namesz + 3)/4;
+	memcpy(buf, data, note.n_descsz);
+	buf += (note.n_descsz + 3)/4;
+
+	return buf;
+}
+
+static void final_note(u32 *buf)
+{
+	struct elf_note note;
+
+	note.n_namesz = 0;
+	note.n_descsz = 0;
+	note.n_type   = 0;
+	memcpy(buf, &note, sizeof(note));
+}
+
+static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
+{
+	struct elf_prstatus prstatus;
+	u32 *buf;
+
+	if ((cpu < 0) || (cpu >= NR_CPUS))
+		return;
+
+	/* Using ELF notes here is opportunistic.
+	 * I need a well defined structure format
+	 * for the data I pass, and I need tags
+	 * on the data to indicate what information I have
+	 * squirrelled away.  ELF notes happen to provide
+	 * all of that, so there is no need to invent something new.
+	 */
+	buf = (u32*)per_cpu_ptr(crash_notes, cpu);
+	if (!buf)
+		return;
+	memset(&prstatus, 0, sizeof(prstatus));
+	prstatus.pr_pid = current->pid;
+	elf_core_copy_regs(&prstatus.pr_reg, regs);
+	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
+				sizeof(prstatus));
+	final_note(buf);
+}
+
+static void crash_save_self(struct pt_regs *regs)
+{
+	int cpu;
+
+	cpu = smp_processor_id();
+	crash_save_this_cpu(regs, cpu);
+}
+
+
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+	/* XXX: This should do something */
+	printk("xen-kexec: Need to turn of other CPUS in "
+	       "machine_crash_shutdown()\n");
+	crash_save_self(regs);
+}
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- /dev/null
+++ x/linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c
@@ -0,0 +1,73 @@
+/*
+ * machine_kexec.c - handle transition of Linux booting another kernel
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Loosely based on arch/i386/kernel/machine_kexec.c
+ */
+
+#include <linux/kexec.h>
+#include <xen/interface/kexec.h>
+#include <linux/mm.h>
+#include <asm/hypercall.h>
+
+const extern unsigned char relocate_new_kernel[];
+extern unsigned int relocate_new_kernel_size;
+
+/*
+ * An architecture hook called to validate the
+ * proposed image and prepare the control pages
+ * as needed.  The pages for KEXEC_CONTROL_CODE_SIZE
+ * have been allocated, but the segments have not yet
+ * been copied into the kernel.
+ *
+ * Do whatever setup is needed on image and the
+ * reboot code buffer to allow us to avoid allocations
+ * later.
+ *
+ * Currently nothing.
+ */
+int machine_kexec_prepare(struct kimage *image)
+{
+	kexec_arg_t hypercall_arg;
+       	hypercall_arg.u.helper.data = NULL;
+	return HYPERVISOR_kexec(KEXEC_CMD_kexec_prepare, &hypercall_arg);
+}
+
+/*
+ * Undo anything leftover by machine_kexec_prepare
+ * when an image is freed.
+ */
+void machine_kexec_cleanup(struct kimage *image)
+{
+	kexec_arg_t hypercall_arg;
+	hypercall_arg.u.helper.data = NULL;
+	HYPERVISOR_kexec(KEXEC_CMD_kexec_cleanup, &hypercall_arg);
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+	kexec_arg_t hypercall_arg;
+	hypercall_arg.u.kexec.indirection_page = image->head;
+	hypercall_arg.u.kexec.reboot_code_buffer = 
+		pfn_to_mfn(page_to_pfn(image->control_code_page)) << PAGE_SHIFT;
+	hypercall_arg.u.kexec.start_address = image->start;
+	hypercall_arg.u.kexec.relocate_new_kernel = relocate_new_kernel;
+	hypercall_arg.u.kexec.relocate_new_kernel_size = 
+		relocate_new_kernel_size;
+	HYPERVISOR_kexec(KEXEC_CMD_kexec, &hypercall_arg);
+}
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- x/linux-2.6-xen-sparse/drivers/xen/core/reboot.c
+++ x/linux-2.6-xen-sparse/drivers/xen/core/reboot.c
@@ -370,6 +370,13 @@ static int __init setup_shutdown_event(v
 
 subsys_initcall(setup_shutdown_event);
 
+#ifdef CONFIG_KEXEC
+void machine_shutdown(void) 
+{
+ 	printk("machine_shutdown: does nothing\n");
+}
+#endif
+
 /*
  * Local variables:
  *  c-file-style: "linux"
--- x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
+++ x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
@@ -37,6 +37,8 @@
 # error "please don't include this file directly"
 #endif
 
+#include <xen/interface/kexec.h>
+
 #define __STR(x) #x
 #define STR(x) __STR(x)
 
@@ -343,6 +345,14 @@ HYPERVISOR_xenoprof_op(
 	return _hypercall2(int, xenoprof_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_kexec(
+	unsigned long op, kexec_arg_t * arg)
+{
+	return _hypercall2(int, kexec_op, op, arg); 
+}
+
+
 
 #endif /* __HYPERCALL_H__ */
 
--- x/ref-linux-2.6.16/drivers/base/cpu.c
+++ x/ref-linux-2.6.16/drivers/base/cpu.c
@@ -101,7 +101,11 @@ static ssize_t show_crash_notes(struct s
 	 * boot up and this data does not change there after. Hence this
 	 * operation should be safe. No locking required.
 	 */
+#ifndef CONFIG_XEN
 	addr = __pa(per_cpu_ptr(crash_notes, cpunum));
+#else
+	addr = virt_to_machine(per_cpu_ptr(crash_notes, cpunum));
+#endif
 	rc = sprintf(buf, "%Lx\n", addr);
 	return rc;
 }
--- x/ref-linux-2.6.16/kernel/kexec.c
+++ x/ref-linux-2.6.16/kernel/kexec.c
@@ -38,6 +38,20 @@ struct resource crashk_res = {
 	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
 };
 
+/* Kexec needs to know about the actual physical address.
+ * But in xen, a physical address is a pseudo-physical address. */
+#ifndef CONFIG_XEN
+#define kexec_page_to_pfn(page)  page_to_pfn(page)
+#define kexec_pfn_to_page(pfn)   pfn_to_page(pfn)
+#define kexec_virt_to_phys(addr) virt_to_phys(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(addr)
+#else
+#define kexec_page_to_pfn(page)  pfn_to_mfn(page_to_pfn(page))
+#define kexec_pfn_to_page(pfn)   pfn_to_page(mfn_to_pfn(pfn))
+#define kexec_virt_to_phys(addr) virt_to_machine(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
+#endif
+
 int kexec_should_crash(struct task_struct *p)
 {
 	if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
@@ -403,7 +417,7 @@ static struct page *kimage_alloc_normal_
 		pages = kimage_alloc_pages(GFP_KERNEL, order);
 		if (!pages)
 			break;
-		pfn   = page_to_pfn(pages);
+		pfn   = kexec_page_to_pfn(pages);
 		epfn  = pfn + count;
 		addr  = pfn << PAGE_SHIFT;
 		eaddr = epfn << PAGE_SHIFT;
@@ -437,6 +451,7 @@ static struct page *kimage_alloc_normal_
 	return pages;
 }
 
+#ifndef CONFIG_XEN
 static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
 						      unsigned int order)
 {
@@ -490,7 +505,7 @@ static struct page *kimage_alloc_crash_c
 		}
 		/* If I don't overlap any segments I have found my hole! */
 		if (i == image->nr_segments) {
-			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
+			pages = kexec_pfn_to_page(hole_start >> PAGE_SHIFT);
 			break;
 		}
 	}
@@ -517,6 +532,13 @@ struct page *kimage_alloc_control_pages(
 
 	return pages;
 }
+#else /* !CONFIG_XEN */
+struct page *kimage_alloc_control_pages(struct kimage *image,
+					 unsigned int order)
+{
+	return kimage_alloc_normal_control_pages(image, order);
+}
+#endif
 
 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
 {
@@ -532,7 +554,7 @@ static int kimage_add_entry(struct kimag
 			return -ENOMEM;
 
 		ind_page = page_address(page);
-		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+		*image->entry = kexec_virt_to_phys(ind_page) | IND_INDIRECTION;
 		image->entry = ind_page;
 		image->last_entry = ind_page +
 				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
@@ -593,13 +615,13 @@ static int kimage_terminate(struct kimag
 #define for_each_kimage_entry(image, ptr, entry) \
 	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
 		ptr = (entry & IND_INDIRECTION)? \
-			phys_to_virt((entry & PAGE_MASK)): ptr +1)
+			kexec_phys_to_virt((entry & PAGE_MASK)): ptr +1)
 
 static void kimage_free_entry(kimage_entry_t entry)
 {
 	struct page *page;
 
-	page = pfn_to_page(entry >> PAGE_SHIFT);
+	page = kexec_pfn_to_page(entry >> PAGE_SHIFT);
 	kimage_free_pages(page);
 }
 
@@ -686,7 +708,7 @@ static struct page *kimage_alloc_page(st
 	 * have a match.
 	 */
 	list_for_each_entry(page, &image->dest_pages, lru) {
-		addr = page_to_pfn(page) << PAGE_SHIFT;
+		addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
 		if (addr == destination) {
 			list_del(&page->lru);
 			return page;
@@ -701,12 +723,12 @@ static struct page *kimage_alloc_page(st
 		if (!page)
 			return NULL;
 		/* If the page cannot be used file it away */
-		if (page_to_pfn(page) >
+		if (kexec_page_to_pfn(page) >
 				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
 			list_add(&page->lru, &image->unuseable_pages);
 			continue;
 		}
-		addr = page_to_pfn(page) << PAGE_SHIFT;
+		addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
 
 		/* If it is the destination page we want use it */
 		if (addr == destination)
@@ -729,7 +751,7 @@ static struct page *kimage_alloc_page(st
 			struct page *old_page;
 
 			old_addr = *old & PAGE_MASK;
-			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+			old_page = kexec_pfn_to_page(old_addr >> PAGE_SHIFT);
 			copy_highpage(page, old_page);
 			*old = addr | (*old & ~PAGE_MASK);
 
@@ -779,7 +801,7 @@ static int kimage_load_normal_segment(st
 			result  = -ENOMEM;
 			goto out;
 		}
-		result = kimage_add_page(image, page_to_pfn(page)
+		result = kimage_add_page(image, kexec_page_to_pfn(page)
 								<< PAGE_SHIFT);
 		if (result < 0)
 			goto out;
@@ -811,6 +833,7 @@ out:
 	return result;
 }
 
+#ifndef CONFIG_XEN
 static int kimage_load_crash_segment(struct kimage *image,
 					struct kexec_segment *segment)
 {
@@ -833,7 +856,7 @@ static int kimage_load_crash_segment(str
 		char *ptr;
 		size_t uchunk, mchunk;
 
-		page = pfn_to_page(maddr >> PAGE_SHIFT);
+		page = kexec_pfn_to_page(maddr >> PAGE_SHIFT);
 		if (page == 0) {
 			result  = -ENOMEM;
 			goto out;
@@ -881,6 +904,13 @@ static int kimage_load_segment(struct ki
 
 	return result;
 }
+#else /* CONFIG_XEN */
+static int kimage_load_segment(struct kimage *image,
+				struct kexec_segment *segment)
+{
+	return kimage_load_normal_segment(image, segment);
+}
+#endif
 
 /*
  * Exec Kernel system call: for obvious reasons only root may call it.
--- x/xen/arch/x86/Makefile
+++ x/xen/arch/x86/Makefile
@@ -38,6 +38,7 @@ obj-y += trampoline.o
 obj-y += traps.o
 obj-y += usercopy.o
 obj-y += x86_emulate.o
+obj-y += machine_kexec.o
 
 ifneq ($(pae),n)
 obj-$(x86_32) += shadow.o shadow_public.o shadow_guest32.o
--- x/xen/arch/x86/dom0_ops.c
+++ x/xen/arch/x86/dom0_ops.c
@@ -29,6 +29,9 @@
 #include <asm/mtrr.h>
 #include "cpu/mtrr/mtrr.h"
 
+extern unsigned int opt_kdump_megabytes;
+extern unsigned int opt_kdump_megabytes_base;
+
 #define TRC_DOM0OP_ENTER_BASE  0x00020000
 #define TRC_DOM0OP_LEAVE_BASE  0x00030000
 
--- /dev/null
+++ x/xen/arch/x86/machine_kexec.c
@@ -0,0 +1,174 @@
+/******************************************************************************
+ * arch/x86/machine_kexec.c
+ * 
+ * Created By: Horms
+ *
+ * Based heavily on arch/i386/machine_kexec.c from Linux 2.6.16
+ */
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/domain_page.h> 
+#include <xen/timer.h>
+#include <xen/sched.h>
+#include <asm/page.h> 
+#include <asm/flushtlb.h>
+#include <public/xen.h>
+#include <public/kexec.h>
+
+#ifdef CONFIG_X86_32
+
+typedef asmlinkage void (*relocate_new_kernel_t)(
+                    unsigned long indirection_page,
+                    unsigned long reboot_code_buffer,
+                    unsigned long start_address,
+                    unsigned int has_pae);
+
+#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+
+#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L2_ATTR (_PAGE_PRESENT)
+
+#ifndef CONFIG_X86_PAE
+
+static u32 pgtable_level1[L1_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+
+static void identity_map_page(unsigned long address)
+{
+    unsigned long mfn;
+    u32 *pgtable_level2;
+
+    /* Find the current page table */
+    mfn = read_cr3() >> PAGE_SHIFT;
+    pgtable_level2 = map_domain_page(mfn);
+
+    /* Identity map the page table entry */
+    pgtable_level1[l1_table_offset(address)] = address | L0_ATTR;
+    pgtable_level2[l2_table_offset(address)] = __pa(pgtable_level1) | L1_ATTR;
+
+    /* Flush the tlb so the new mapping takes effect.
+     * Global tlb entries are not flushed but that is not an issue.
+     */
+    write_cr3(mfn << PAGE_SHIFT);
+
+    unmap_domain_page(pgtable_level2);
+}
+
+#else
+static u64 pgtable_level1[L1_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+static u64 pgtable_level2[L2_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+
+static void identity_map_page(unsigned long address)
+{
+    int mfn;
+    intpte_t *pgtable_level3;
+
+    /* Find the current page table */
+    mfn = read_cr3() >> PAGE_SHIFT;
+    pgtable_level3 = map_domain_page(mfn);
+
+    /* Identity map the page table entry */
+    pgtable_level1[l1_table_offset(address)] = address | L0_ATTR;
+    pgtable_level2[l2_table_offset(address)] = __pa(pgtable_level1) | L1_ATTR;
+    set_64bit(&pgtable_level3[l3_table_offset(address)],
+	      __pa(pgtable_level2) | L2_ATTR);
+
+    /* Flush the tlb so the new mapping takes effect.
+     * Global tlb entries are not flushed but that is not an issue.
+     */
+    load_cr3(mfn << PAGE_SHIFT);
+
+    unmap_domain_page(pgtable_level3);
+}
+#endif
+
+static void kexec_load_segments(void)
+{
+#define __SSTR(X) #X
+#define SSTR(X) __SSTR(X)
+    __asm__ __volatile__ (
+        "\tljmp $"SSTR(__HYPERVISOR_CS)",$1f\n"
+        "\t1:\n"
+        "\tmovl $"SSTR(__HYPERVISOR_DS)",%%eax\n"
+        "\tmovl %%eax,%%ds\n"
+        "\tmovl %%eax,%%es\n"
+        "\tmovl %%eax,%%fs\n"
+        "\tmovl %%eax,%%gs\n"
+        "\tmovl %%eax,%%ss\n"
+        ::: "eax", "memory");
+#undef SSTR
+#undef __SSTR
+}
+
+#define kexec_load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
+static void kexec_set_idt(void *newidt, __u16 limit)
+{
+    struct Xgt_desc_struct curidt;
+
+    /* ia32 supports unaligned loads & stores */
+    curidt.size    = limit;
+    curidt.address = (unsigned long)newidt;
+    
+    kexec_load_idt(&curidt);
+
+};
+
+#define kexec_load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
+static void kexec_set_gdt(void *newgdt, __u16 limit)
+{
+    struct Xgt_desc_struct curgdt;
+
+    /* ia32 supports unaligned loads & stores */
+    curgdt.size    = limit;
+    curgdt.address = (unsigned long)newgdt;
+
+    kexec_load_gdt(&curgdt);
+};
+
+#endif
+
+int machine_kexec_prepare(struct kexec_arg *arg)
+{
+	return 0;
+}
+
+void machine_kexec_cleanup(struct kexec_arg *arg)
+{
+}
+
+void machine_kexec(struct kexec_arg *arg)
+{
+#ifdef CONFIG_X86_32
+    relocate_new_kernel_t rnk;
+
+    local_irq_disable();
+
+    identity_map_page(arg->u.kexec.reboot_code_buffer);
+
+    copy_from_user((void *)arg->u.kexec.reboot_code_buffer, 
+           arg->u.kexec.relocate_new_kernel,
+           arg->u.kexec.relocate_new_kernel_size);
+
+    kexec_load_segments();
+
+    kexec_set_gdt(__va(0),0);
+
+    kexec_set_idt(__va(0),0);
+
+    rnk = (relocate_new_kernel_t) arg->u.kexec.reboot_code_buffer;
+
+    (*rnk)(arg->u.kexec.indirection_page, arg->u.kexec.reboot_code_buffer, 
+           arg->u.kexec.start_address, cpu_has_pae);
+#endif
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/arch/x86/setup.c
+++ x/xen/arch/x86/setup.c
@@ -38,6 +38,11 @@ static unsigned int opt_xenheap_megabyte
 integer_param("xenheap_megabytes", opt_xenheap_megabytes);
 #endif
 
+unsigned int opt_kdump_megabytes = 0;
+integer_param("kdump_megabytes", opt_kdump_megabytes);
+unsigned int opt_kdump_megabytes_base = 0;
+integer_param("kdump_megabytes_base", opt_kdump_megabytes_base);
+
 /* opt_nosmp: If true, secondary processors are ignored. */
 static int opt_nosmp = 0;
 boolean_param("nosmp", opt_nosmp);
@@ -192,6 +197,20 @@ static void percpu_free_unused_areas(voi
                        __pa(__per_cpu_end));
 }
 
+void __init move_memory(unsigned long dst, 
+                          unsigned long src_start, unsigned long src_end)
+{
+#if defined(CONFIG_X86_32)
+    memmove((void *)dst,  /* use low mapping */
+            (void *)src_start,      /* use low mapping */
+            src_end - src_start);
+#elif defined(CONFIG_X86_64)
+    memmove(__va(dst),
+            __va(src_start),
+            src_end - src_start);
+#endif
+}
+
 void __init __start_xen(multiboot_info_t *mbi)
 {
     char __cmdline[] = "", *cmdline = __cmdline;
@@ -327,15 +346,8 @@ void __init __start_xen(multiboot_info_t
         initial_images_start = xenheap_phys_end;
     initial_images_end = initial_images_start + modules_length;
 
-#if defined(CONFIG_X86_32)
-    memmove((void *)initial_images_start,  /* use low mapping */
-            (void *)mod[0].mod_start,      /* use low mapping */
-            mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
-#elif defined(CONFIG_X86_64)
-    memmove(__va(initial_images_start),
-            __va(mod[0].mod_start),
-            mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
-#endif
+    move_memory(initial_images_start, 
+                mod[0].mod_start, mod[mbi->mods_count-1].mod_end);
 
     /* Initialise boot-time allocator with all RAM situated after modules. */
     xenheap_phys_start = init_boot_allocator(__pa(&_end));
@@ -383,6 +395,51 @@ void __init __start_xen(multiboot_info_t
 #endif
     }
 
+    if (opt_kdump_megabytes) {
+        unsigned long kdump_start, kdump_size, k;
+
+        /* mark images pages as free for now */
+
+        init_boot_pages(initial_images_start, initial_images_end);
+
+        kdump_start = opt_kdump_megabytes_base << 20;
+        kdump_size = opt_kdump_megabytes << 20;
+
+        printk("Kdump: %luMB (%lukB) at 0x%lx\n", 
+               kdump_size >> 20,
+               kdump_size >> 10,
+               kdump_start);
+
+        if ((kdump_start & ~PAGE_MASK) || (kdump_size & ~PAGE_MASK))
+            panic("Kdump parameters not page aligned\n");
+
+        kdump_start >>= PAGE_SHIFT;
+        kdump_size >>= PAGE_SHIFT;
+
+        /* allocate pages for Kdump memory area */
+
+        k = alloc_boot_pages_at(kdump_size, kdump_start);
+
+        if (k != kdump_start)
+            panic("Unable to reserve Kdump memory\n");
+
+        /* allocate pages for relocated initial images */
+
+        k = ((initial_images_end - initial_images_start) & ~PAGE_MASK) ? 1 : 0;
+        k += (initial_images_end - initial_images_start) >> PAGE_SHIFT;
+
+        k = alloc_boot_pages(k, 1);
+
+        if (!k)
+            panic("Unable to allocate initial images memory\n");
+
+        move_memory(k << PAGE_SHIFT, initial_images_start, initial_images_end);
+
+        initial_images_end -= initial_images_start;
+        initial_images_start = k << PAGE_SHIFT;
+        initial_images_end += initial_images_start;
+    }        
+
     memguard_init();
 
     printk("System RAM: %luMB (%lukB)\n", 
--- x/xen/arch/x86/x86_32/entry.S
+++ x/xen/arch/x86/x86_32/entry.S
@@ -646,6 +646,7 @@ ENTRY(hypercall_table)
         .long do_arch_sched_op
         .long do_callback_op        /* 30 */
         .long do_xenoprof_op
+        .long do_kexec
         .rept NR_hypercalls-((.-hypercall_table)/4)
         .long do_ni_hypercall
         .endr
@@ -683,6 +684,7 @@ ENTRY(hypercall_args_table)
         .byte 2 /* do_arch_sched_op     */
         .byte 2 /* do_callback_op       */  /* 30 */
         .byte 2 /* do_xenoprof_op       */
+        .byte 2 /* do_kexec       */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
--- x/xen/common/Makefile
+++ x/xen/common/Makefile
@@ -7,6 +7,7 @@ obj-y += event_channel.o
 obj-y += grant_table.o
 obj-y += kernel.o
 obj-y += keyhandler.o
+obj-y += kexec.o
 obj-y += lib.o
 obj-y += memory.o
 obj-y += multicall.o
--- /dev/null
+++ x/xen/common/kexec.c
@@ -0,0 +1,95 @@
+/*
+ * Architecture independent kexec code for Xen
+ *
+ * At this stage, just a switch for the kexec hypercall into
+ * architecture dependent code.
+ *
+ * Created By: Horms <horms@verge.net.au>
+ */
+
+#include <xen/lib.h>
+#include <xen/errno.h>
+#include <xen/guest_access.h>
+#include <xen/sched.h>
+#include <xen/string.h>
+#include <public/xen.h>
+#include <public/kexec.h>
+
+extern int machine_kexec_prepare(struct kexec_arg *arg);
+extern void machine_kexec_cleanup(struct kexec_arg *arg);
+extern void machine_kexec(struct kexec_arg *arg);
+
+extern unsigned int opt_kdump_megabytes;
+extern unsigned int opt_kdump_megabytes_base;
+
+/*
+ * Entry point for the kexec hypercall.
+ *
+ * op   - one of the KEXEC_CMD_* operations
+ * uarg - guest handle to a kexec_arg_t; copied in for every op except
+ *        KEXEC_CMD_reserve, copied back out for KEXEC_CMD_reserve
+ *
+ * Returns -EPERM if IS_PRIV() rejects the calling domain, -EFAULT if the
+ * argument cannot be copied from or to the guest, -EINVAL for an unknown
+ * op (or if machine_kexec() returns), the result of
+ * machine_kexec_prepare() for KEXEC_CMD_kexec_prepare, and 0 otherwise.
+ */
+int do_kexec(unsigned long op,
+             GUEST_HANDLE(kexec_arg_t) uarg)
+{
+    struct kexec_arg arg;
+
+    if ( !IS_PRIV(current->domain) )
+        return -EPERM;
+
+    /*
+     * Start from a zeroed argument: KEXEC_CMD_reserve copies the whole
+     * structure back to the guest but only fills in u.reserve, so any
+     * other bytes would otherwise carry uninitialised stack contents.
+     */
+    memset(&arg, 0, sizeof(arg));
+
+    if ( op != KEXEC_CMD_reserve &&
+         unlikely(copy_from_guest(&arg, uarg, 1) != 0) )
+    {
+        printk("do_kexec: copy_from_guest failed\n");
+        return -EFAULT;
+    }
+
+    switch ( op )
+    {
+    case KEXEC_CMD_kexec:
+        machine_kexec(&arg);
+        return -EINVAL; /* Not Reached */
+    case KEXEC_CMD_kexec_prepare:
+        return machine_kexec_prepare(&arg);
+    case KEXEC_CMD_kexec_cleanup:
+        machine_kexec_cleanup(&arg);
+        return 0;
+    case KEXEC_CMD_reserve:
+        /* Widen before shifting so the byte counts are computed as long. */
+        arg.u.reserve.size = (unsigned long)opt_kdump_megabytes << 20;
+        arg.u.reserve.start = (unsigned long)opt_kdump_megabytes_base << 20;
+        if ( unlikely(copy_to_guest(uarg, &arg, 1) != 0) )
+        {
+            printk("do_kexec: copy_to_guest failed\n");
+            return -EFAULT;
+        }
+        return 0;
+    default:
+        break;
+    }
+
+    return -EINVAL;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
--- x/xen/common/page_alloc.c
+++ x/xen/common/page_alloc.c
@@ -212,6 +212,30 @@ void init_boot_pages(paddr_t ps, paddr_t
     }
 }
 
+/*
+ * Allocate the nr_pfns boot pages starting exactly at page frame pfn_at.
+ *
+ * Returns pfn_at once the whole range has been marked allocated, or 0 if
+ * the range does not fit below max_page (the same bound that
+ * alloc_boot_pages() searches within) or any page in it is already
+ * allocated.  Callers cannot tell success at pfn 0 from failure.
+ */
+unsigned long alloc_boot_pages_at(unsigned long nr_pfns, unsigned long pfn_at)
+{
+    unsigned long i;
+
+    /* Overflow-safe form of (pfn_at + nr_pfns) >= max_page. */
+    if ( (nr_pfns >= max_page) || (pfn_at >= (max_page - nr_pfns)) )
+        return 0;
+
+    for ( i = 0; i < nr_pfns; i++ )
+        if ( allocated_in_map(pfn_at + i) )
+            return 0;
+
+    map_alloc(pfn_at, nr_pfns);
+    return pfn_at;
+}
+
 unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align)
 {
     unsigned long pg, i;
--- x/xen/include/asm-x86/config.h
+++ x/xen/include/asm-x86/config.h
@@ -66,7 +66,7 @@
 #define barrier() __asm__ __volatile__("": : :"memory")
 
 /* A power-of-two value greater than or equal to number of hypercalls. */
-#define NR_hypercalls 32
+#define NR_hypercalls 64
 
 #if NR_hypercalls & (NR_hypercalls - 1)
 #error "NR_hypercalls must be a power-of-two value"
--- x/xen/include/asm-x86/hypercall.h
+++ x/xen/include/asm-x86/hypercall.h
@@ -6,6 +6,7 @@
 #define __ASM_X86_HYPERCALL_H__
 
 #include <public/physdev.h>
+#include <public/kexec.h>
 
 extern long
 do_set_trap_table(
@@ -79,6 +80,10 @@ extern long
 arch_do_vcpu_op(
     int cmd, struct vcpu *v, GUEST_HANDLE(void) arg);
 
+extern int
+do_kexec(
+    unsigned long op, GUEST_HANDLE(kexec_arg_t) uarg);
+
 #ifdef __x86_64__
 
 extern long
--- /dev/null
+++ x/xen/include/public/kexec.h
@@ -0,0 +1,43 @@
+/*
+ * kexec.h: Xen kexec
+ *
+ * Created By: Horms <horms@verge.net.au>
+ */
+
+#ifndef _XEN_PUBLIC_KEXEC_H
+#define _XEN_PUBLIC_KEXEC_H
+
+/*
+ * Scratch space for passing arguments to the kexec hypercall
+ */
+typedef struct kexec_arg {
+    union {
+        struct {
+            unsigned long data; /* Not sure what this should be yet */
+        } helper;
+        struct {
+            unsigned long indirection_page;
+            unsigned long reboot_code_buffer;
+            unsigned long start_address;
+            const char *relocate_new_kernel;
+            unsigned int relocate_new_kernel_size;
+        } kexec;
+        struct {
+            unsigned long size;
+            unsigned long start;
+        } reserve;
+    } u;
+} kexec_arg_t;
+DEFINE_GUEST_HANDLE(kexec_arg_t);
+
+#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/include/public/xen.h
+++ x/xen/include/public/xen.h
@@ -62,6 +62,7 @@
 #define __HYPERVISOR_sched_op             29
 #define __HYPERVISOR_callback_op          30
 #define __HYPERVISOR_xenoprof_op          31
+#define __HYPERVISOR_kexec_op             32
 
 /* 
  * VIRTUAL INTERRUPTS
@@ -215,6 +216,14 @@ DEFINE_GUEST_HANDLE(mmuext_op_t);
 #define VMASST_TYPE_writable_pagetables  2
 #define MAX_VMASST_TYPE 2
 
+/*
+ * Operations for kexec.
+ */
+#define KEXEC_CMD_kexec                 0
+#define KEXEC_CMD_kexec_prepare         1
+#define KEXEC_CMD_kexec_cleanup         2
+#define KEXEC_CMD_reserve               3
+
 #ifndef __ASSEMBLY__
 
 typedef uint16_t domid_t;
--- x/xen/include/xen/mm.h
+++ x/xen/include/xen/mm.h
@@ -40,6 +40,7 @@ struct page_info;
 paddr_t init_boot_allocator(paddr_t bitmap_start);
 void init_boot_pages(paddr_t ps, paddr_t pe);
 unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align);
+unsigned long alloc_boot_pages_at(unsigned long nr_pfns, unsigned long pfn_at);
 void end_boot_allocator(void);
 
 /* Generic allocator. These functions are *not* interrupt-safe. */

^ permalink raw reply	[flat|nested] 68+ messages in thread

* [PATCH]: kexec: framework and i386 (Take VI)
  2006-04-26  6:08                   ` [PATCH]: kexec: framework and i386 Take V Horms
@ 2006-05-02  8:17                     ` Simon Horman [Horms]
  2006-05-03  7:16                       ` Akio Takebe
  2006-05-06  8:44                       ` Akio Takebe
  0 siblings, 2 replies; 68+ messages in thread
From: Simon Horman [Horms] @ 2006-05-02  8:17 UTC (permalink / raw)
  To: Keir Fraser
  Cc: Isaku Yamahata, Magnus Damm, xen-devel, Mark Williamson,
	Akio Takebe

Hi, 

I will be out of the office until next Monday, so here is the latest and
greatest before I go. Tested against 9896, should also work fine
with tip (9903).

-- 
Horms                                           http://www.vergenet.net/~horms/

kexec: framework and i386

This is an implementation of kexec for dom0/xen, that allows
kexecing of the physical machine from xen. The approach taken is
to move the architecture-dependent kexec code into a new hypercall.

Some notes:
  * machine_kexec_cleanup() and machine_kexec_prepare() don't do
    anything in i386. So while this patch adds a framework for them,
    I am not sure what parameters are needed at this stage.
  * Only works for UP, as machine_shutdown is not implemented yet
  * kexecing into xen does not seem to work, I think that 
    kexec-tools needs updating, but I have not investigated yet
  * Kdump works by first copying the kernel into dom0 segments
    and relocating them later in xen, the same way that kexec does
    The only difference is that the relocation is made into
    an area reserved by xen
  * Kdump reservation is made using the xen command line parameters,
    kdump_megabytes and kdump_megabytes_base, rather than
    the linux option crashkernel, which is now ignored.
    Two parameters are used instead of one to simplify parsing.
    This can be cleaned up later if desired. But the reservation
    seems to need to be made by xen to make sure that it happens
    early enough.
  * This patch uses a new kexec hypercall

Highlights since the previous posted version:
 
  * SMP kexec (not kdump yet)
  * Split x86_32 specific xen code out

Prepared by Horms and Magnus Damm

Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>
Signed-Off-By: Horms <horms@verge.net.au>

 linux-2.6-xen-sparse/arch/i386/Kconfig                         |    2 
 linux-2.6-xen-sparse/arch/i386/kernel/Makefile                 |    2 
 linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c              |   24 +
 linux-2.6-xen-sparse/drivers/xen/core/Makefile                 |    1 
 linux-2.6-xen-sparse/drivers/xen/core/crash.c                  |   98 ++++
 linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c          |   73 +++
 linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h |   10 
 ref-linux-2.6.16/drivers/base/cpu.c                            |    4 
 ref-linux-2.6.16/kernel/kexec.c                                |   52 +-
 xen/arch/x86/Makefile                                          |    1 
 xen/arch/x86/dom0_ops.c                                        |    3 
 xen/arch/x86/machine_kexec.c                                   |   27 +
 xen/arch/x86/setup.c                                           |   75 +++
 xen/arch/x86/x86_32/Makefile                                   |    1 
 xen/arch/x86/x86_32/entry.S                                    |    2 
 xen/arch/x86/x86_32/machine_kexec.c                            |  206 ++++++++++
 xen/arch/x86/x86_64/Makefile                                   |    1 
 xen/arch/x86/x86_64/machine_kexec.c                            |   24 +
 xen/common/Makefile                                            |    1 
 xen/common/kexec.c                                             |   73 +++
 xen/common/page_alloc.c                                        |   33 +
 xen/include/asm-x86/hypercall.h                                |    5 
 xen/include/public/kexec.h                                     |   46 ++
 xen/include/public/xen.h                                       |    9 
 xen/include/xen/mm.h                                           |    1 
 25 files changed, 741 insertions(+), 33 deletions(-)

--- x/linux-2.6-xen-sparse/arch/i386/Kconfig
+++ x/linux-2.6-xen-sparse/arch/i386/Kconfig
@@ -726,7 +726,7 @@ source kernel/Kconfig.hz
 
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && !X86_XEN
+	depends on EXPERIMENTAL
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
--- x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
+++ x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
@@ -89,7 +89,7 @@ include $(srctree)/scripts/Makefile.xen
 
 obj-y += fixup.o
 microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o
-n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o
+n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o machine_kexec.o crash.o
 
 obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
 obj-y := $(call cherrypickxen, $(obj-y))
--- x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
+++ x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
@@ -68,6 +68,10 @@
 #include "setup_arch_pre.h"
 #include <bios_ebda.h>
 
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
 /* Forward Declaration. */
 void __init find_max_pfn(void);
 
@@ -932,6 +936,7 @@ static void __init parse_cmdline_early (
 		 * after a kernel panic.
 		 */
 		else if (!memcmp(from, "crashkernel=", 12)) {
+#ifndef CONFIG_XEN
 			unsigned long size, base;
 			size = memparse(from+12, &from);
 			if (*from == '@') {
@@ -942,6 +947,10 @@ static void __init parse_cmdline_early (
 				crashk_res.start = base;
 				crashk_res.end   = base + size - 1;
 			}
+#else
+			printk("Ignoring crashkernel command line, "
+			       "parameter will be supplied by xen\n");
+#endif
 		}
 #endif
 #ifdef CONFIG_PROC_VMCORE
@@ -1318,9 +1327,21 @@ void __init setup_bootmem_allocator(void
 	}
 #endif
 #ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
 	if (crashk_res.start != crashk_res.end)
 		reserve_bootmem(crashk_res.start,
 			crashk_res.end - crashk_res.start + 1);
+#else
+	{
+		struct kexec_arg xen_kexec_arg;
+		BUG_ON(HYPERVISOR_kexec(KEXEC_CMD_reserve, &xen_kexec_arg));
+		if (xen_kexec_arg.u.reserve.size) {
+			crashk_res.start = xen_kexec_arg.u.reserve.start;
+			crashk_res.end = xen_kexec_arg.u.reserve.start + 
+				xen_kexec_arg.u.reserve.size - 1;
+		}
+	}
+#endif
 #endif
 
 	if (!xen_feature(XENFEAT_auto_translated_physmap))
@@ -1395,6 +1416,9 @@ legacy_init_iomem_resources(struct resou
 		res->end = map[i].end - 1;
 		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 		request_resource(&iomem_resource, res);
+#ifdef CONFIG_KEXEC
+        request_resource(res, &crashk_res);
+#endif
 	}
 
 	free_bootmem(__pa(map), PAGE_SIZE);
--- x/linux-2.6-xen-sparse/drivers/xen/core/Makefile
+++ x/linux-2.6-xen-sparse/drivers/xen/core/Makefile
@@ -9,3 +9,4 @@ obj-$(CONFIG_NET)     += skbuff.o
 obj-$(CONFIG_SMP)     += smpboot.o
 obj-$(CONFIG_SYSFS)   += hypervisor_sysfs.o
 obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o
+obj-$(CONFIG_KEXEC)   += machine_kexec.o crash.o
--- /dev/null
+++ x/linux-2.6-xen-sparse/drivers/xen/core/crash.c
@@ -0,0 +1,98 @@
+/*
+ * Architecture specific (i386-xen) functions for kexec based crash dumps.
+ *
+ * Created by: Horms <horms@verge.net.au>
+ *
+ */
+
+#include <linux/kernel.h> /* For printk */
+
+/* XXX: final_note(), crash_save_this_cpu() and crash_save_self()
+ * are copied from arch/i386/kernel/crash.c, might be good to either
+ * make the original functions non-static and use them, or just
+ * merge this into that file.
+ */
+#include <linux/elf.h>     /* For struct elf_note */
+#include <linux/elfcore.h> /* For struct elf_prstatus */
+#include <linux/kexec.h>   /* crash_notes */
+
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+							       size_t data_len)
+{
+	struct elf_note note;
+
+	note.n_namesz = strlen(name) + 1;
+	note.n_descsz = data_len;
+	note.n_type   = type;
+	memcpy(buf, &note, sizeof(note));
+	buf += (sizeof(note) +3)/4;
+	memcpy(buf, name, note.n_namesz);
+	buf += (note.n_namesz + 3)/4;
+	memcpy(buf, data, note.n_descsz);
+	buf += (note.n_descsz + 3)/4;
+
+	return buf;
+}
+
+static void final_note(u32 *buf)
+{
+	struct elf_note note;
+
+	note.n_namesz = 0;
+	note.n_descsz = 0;
+	note.n_type   = 0;
+	memcpy(buf, &note, sizeof(note));
+}
+
+static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
+{
+	struct elf_prstatus prstatus;
+	u32 *buf;
+
+	if ((cpu < 0) || (cpu >= NR_CPUS))
+		return;
+
+	/* Using ELF notes here is opportunistic.
+	 * I need a well defined structure format
+	 * for the data I pass, and I need tags
+	 * on the data to indicate what information I have
+	 * squirrelled away.  ELF notes happen to provide
+	 * all of that, so there is no need to invent something new.
+	 */
+	buf = (u32*)per_cpu_ptr(crash_notes, cpu);
+	if (!buf)
+		return;
+	memset(&prstatus, 0, sizeof(prstatus));
+	prstatus.pr_pid = current->pid;
+	elf_core_copy_regs(&prstatus.pr_reg, regs);
+	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
+				sizeof(prstatus));
+	final_note(buf);
+}
+
+static void crash_save_self(struct pt_regs *regs)
+{
+	int cpu;
+
+	cpu = smp_processor_id();
+	crash_save_this_cpu(regs, cpu);
+}
+
+
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+	/* XXX: This should do something */
+	printk("xen-kexec: Need to turn of other CPUS in "
+	       "machine_crash_shutdown()\n");
+	crash_save_self(regs);
+}
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- /dev/null
+++ x/linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c
@@ -0,0 +1,88 @@
+/*
+ * machine_kexec.c - handle transition of Linux booting another kernel
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Loosely based on arch/i386/kernel/machine_kexec.c
+ */
+
+#include <linux/kernel.h>
+#include <linux/kexec.h>
+#include <xen/interface/kexec.h>
+#include <linux/mm.h>
+#include <asm/hypercall.h>
+
+extern const unsigned char relocate_new_kernel[];
+extern unsigned int relocate_new_kernel_size;
+
+/*
+ * An architecture hook called to validate the
+ * proposed image and prepare the control pages
+ * as needed.  The pages for KEXEC_CONTROL_CODE_SIZE
+ * have been allocated, but the segments have not yet
+ * been copied into the kernel.
+ *
+ * Do whatever setup is needed on image and the
+ * reboot code buffer to allow us to avoid allocations
+ * later.
+ *
+ * Currently this only issues KEXEC_CMD_kexec_prepare and
+ * returns the hypercall's result.
+ */
+int machine_kexec_prepare(struct kimage *image)
+{
+	kexec_arg_t hypercall_arg;
+
+	/* u.helper.data is an unsigned long, so use 0 rather than NULL */
+	hypercall_arg.u.helper.data = 0;
+	return HYPERVISOR_kexec(KEXEC_CMD_kexec_prepare, &hypercall_arg);
+}
+
+/*
+ * Undo anything leftover by machine_kexec_prepare
+ * when an image is freed.
+ */
+void machine_kexec_cleanup(struct kimage *image)
+{
+	kexec_arg_t hypercall_arg;
+
+	hypercall_arg.u.helper.data = 0;
+	HYPERVISOR_kexec(KEXEC_CMD_kexec_cleanup, &hypercall_arg);
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ *
+ * Hands the image's indirection page, control code page (converted
+ * with pfn_to_mfn), start address and the relocation stub to Xen
+ * through KEXEC_CMD_kexec.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+	kexec_arg_t hypercall_arg;
+
+	hypercall_arg.u.kexec.indirection_page = image->head;
+	hypercall_arg.u.kexec.reboot_code_buffer =
+		pfn_to_mfn(page_to_pfn(image->control_code_page)) << PAGE_SHIFT;
+	hypercall_arg.u.kexec.start_address = image->start;
+	/* the field is declared const char *, the stub is unsigned char[] */
+	hypercall_arg.u.kexec.relocate_new_kernel =
+		(const char *)relocate_new_kernel;
+	hypercall_arg.u.kexec.relocate_new_kernel_size =
+		relocate_new_kernel_size;
+	HYPERVISOR_kexec(KEXEC_CMD_kexec, &hypercall_arg);
+
+	/* NORET_TYPE: must not fall off the end if the hypercall returns */
+	panic("KEXEC_CMD_kexec hypercall should not return\n");
+}
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
+++ x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
@@ -37,6 +37,8 @@
 # error "please don't include this file directly"
 #endif
 
+#include <xen/interface/kexec.h>
+
 #define __STR(x) #x
 #define STR(x) __STR(x)
 
@@ -357,6 +359,14 @@ HYPERVISOR_xenoprof_op(
 	return _hypercall2(int, xenoprof_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_kexec(
+	unsigned long op, kexec_arg_t * arg)
+{
+	return _hypercall2(int, kexec_op, op, arg); 
+}
+
+
 
 #endif /* __HYPERCALL_H__ */
 
--- x/ref-linux-2.6.16/drivers/base/cpu.c
+++ x/ref-linux-2.6.16/drivers/base/cpu.c
@@ -101,7 +101,11 @@ static ssize_t show_crash_notes(struct s
 	 * boot up and this data does not change there after. Hence this
 	 * operation should be safe. No locking required.
 	 */
+#ifndef CONFIG_XEN
 	addr = __pa(per_cpu_ptr(crash_notes, cpunum));
+#else
+	addr = virt_to_machine(per_cpu_ptr(crash_notes, cpunum));
+#endif
 	rc = sprintf(buf, "%Lx\n", addr);
 	return rc;
 }
--- x/ref-linux-2.6.16/kernel/kexec.c
+++ x/ref-linux-2.6.16/kernel/kexec.c
@@ -38,6 +38,20 @@ struct resource crashk_res = {
 	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
 };
 
+/* Kexec needs to know about the actual physical address.
+ * But in xen, a physical address is a pseudo-physical address. */
+#ifndef CONFIG_XEN
+#define kexec_page_to_pfn(page)  page_to_pfn(page)
+#define kexec_pfn_to_page(pfn)   pfn_to_page(pfn)
+#define kexec_virt_to_phys(addr) virt_to_phys(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(addr)
+#else
+#define kexec_page_to_pfn(page)  pfn_to_mfn(page_to_pfn(page))
+#define kexec_pfn_to_page(pfn)   pfn_to_page(mfn_to_pfn(pfn))
+#define kexec_virt_to_phys(addr) virt_to_machine(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
+#endif
+
 int kexec_should_crash(struct task_struct *p)
 {
 	if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
@@ -403,7 +417,7 @@ static struct page *kimage_alloc_normal_
 		pages = kimage_alloc_pages(GFP_KERNEL, order);
 		if (!pages)
 			break;
-		pfn   = page_to_pfn(pages);
+		pfn   = kexec_page_to_pfn(pages);
 		epfn  = pfn + count;
 		addr  = pfn << PAGE_SHIFT;
 		eaddr = epfn << PAGE_SHIFT;
@@ -437,6 +451,7 @@ static struct page *kimage_alloc_normal_
 	return pages;
 }
 
+#ifndef CONFIG_XEN
 static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
 						      unsigned int order)
 {
@@ -490,7 +505,7 @@ static struct page *kimage_alloc_crash_c
 		}
 		/* If I don't overlap any segments I have found my hole! */
 		if (i == image->nr_segments) {
-			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
+			pages = kexec_pfn_to_page(hole_start >> PAGE_SHIFT);
 			break;
 		}
 	}
@@ -517,6 +532,13 @@ struct page *kimage_alloc_control_pages(
 
 	return pages;
 }
+#else /* CONFIG_XEN */
+struct page *kimage_alloc_control_pages(struct kimage *image,
+					 unsigned int order)
+{
+	return kimage_alloc_normal_control_pages(image, order);
+}
+#endif
 
 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
 {
@@ -532,7 +554,7 @@ static int kimage_add_entry(struct kimag
 			return -ENOMEM;
 
 		ind_page = page_address(page);
-		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+		*image->entry = kexec_virt_to_phys(ind_page) | IND_INDIRECTION;
 		image->entry = ind_page;
 		image->last_entry = ind_page +
 				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
@@ -593,13 +615,13 @@ static int kimage_terminate(struct kimag
 #define for_each_kimage_entry(image, ptr, entry) \
 	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
 		ptr = (entry & IND_INDIRECTION)? \
-			phys_to_virt((entry & PAGE_MASK)): ptr +1)
+			kexec_phys_to_virt((entry & PAGE_MASK)): ptr +1)
 
 static void kimage_free_entry(kimage_entry_t entry)
 {
 	struct page *page;
 
-	page = pfn_to_page(entry >> PAGE_SHIFT);
+	page = kexec_pfn_to_page(entry >> PAGE_SHIFT);
 	kimage_free_pages(page);
 }
 
@@ -686,7 +708,7 @@ static struct page *kimage_alloc_page(st
 	 * have a match.
 	 */
 	list_for_each_entry(page, &image->dest_pages, lru) {
-		addr = page_to_pfn(page) << PAGE_SHIFT;
+		addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
 		if (addr == destination) {
 			list_del(&page->lru);
 			return page;
@@ -701,12 +723,12 @@ static struct page *kimage_alloc_page(st
 		if (!page)
 			return NULL;
 		/* If the page cannot be used file it away */
-		if (page_to_pfn(page) >
+		if (kexec_page_to_pfn(page) >
 				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
 			list_add(&page->lru, &image->unuseable_pages);
 			continue;
 		}
-		addr = page_to_pfn(page) << PAGE_SHIFT;
+		addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
 
 		/* If it is the destination page we want use it */
 		if (addr == destination)
@@ -729,7 +751,7 @@ static struct page *kimage_alloc_page(st
 			struct page *old_page;
 
 			old_addr = *old & PAGE_MASK;
-			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+			old_page = kexec_pfn_to_page(old_addr >> PAGE_SHIFT);
 			copy_highpage(page, old_page);
 			*old = addr | (*old & ~PAGE_MASK);
 
@@ -779,7 +801,7 @@ static int kimage_load_normal_segment(st
 			result  = -ENOMEM;
 			goto out;
 		}
-		result = kimage_add_page(image, page_to_pfn(page)
+		result = kimage_add_page(image, kexec_page_to_pfn(page)
 								<< PAGE_SHIFT);
 		if (result < 0)
 			goto out;
@@ -811,6 +833,7 @@ out:
 	return result;
 }
 
+#ifndef CONFIG_XEN
 static int kimage_load_crash_segment(struct kimage *image,
 					struct kexec_segment *segment)
 {
@@ -833,7 +856,7 @@ static int kimage_load_crash_segment(str
 		char *ptr;
 		size_t uchunk, mchunk;
 
-		page = pfn_to_page(maddr >> PAGE_SHIFT);
+		page = kexec_pfn_to_page(maddr >> PAGE_SHIFT);
 		if (page == 0) {
 			result  = -ENOMEM;
 			goto out;
@@ -881,6 +904,13 @@ static int kimage_load_segment(struct ki
 
 	return result;
 }
+#else /* CONFIG_XEN */
+static int kimage_load_segment(struct kimage *image,
+				struct kexec_segment *segment)
+{
+	return kimage_load_normal_segment(image, segment);
+}
+#endif
 
 /*
  * Exec Kernel system call: for obvious reasons only root may call it.
--- x/xen/arch/x86/Makefile
+++ x/xen/arch/x86/Makefile
@@ -39,6 +39,7 @@ obj-y += trampoline.o
 obj-y += traps.o
 obj-y += usercopy.o
 obj-y += x86_emulate.o
+obj-y += machine_kexec.o
 
 ifneq ($(pae),n)
 obj-$(x86_32) += shadow.o shadow_public.o shadow_guest32.o
--- x/xen/arch/x86/dom0_ops.c
+++ x/xen/arch/x86/dom0_ops.c
@@ -29,6 +29,9 @@
 #include <asm/mtrr.h>
 #include "cpu/mtrr/mtrr.h"
 
+extern unsigned int opt_kdump_megabytes;
+extern unsigned int opt_kdump_megabytes_base;
+
 #define TRC_DOM0OP_ENTER_BASE  0x00020000
 #define TRC_DOM0OP_LEAVE_BASE  0x00030000
 
--- /dev/null
+++ x/xen/arch/x86/machine_kexec.c
@@ -0,0 +1,27 @@
+/******************************************************************************
+ * arch/x86/machine_kexec.c
+ * 
+ * Created By: Horms
+ *
+ */
+
+#include <public/kexec.h>
+
+int machine_kexec_prepare(struct kexec_arg *arg)
+{
+	return 0;
+}
+
+void machine_kexec_cleanup(struct kexec_arg *arg)
+{
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/arch/x86/setup.c
+++ x/xen/arch/x86/setup.c
@@ -38,6 +38,11 @@ static unsigned int opt_xenheap_megabyte
 integer_param("xenheap_megabytes", opt_xenheap_megabytes);
 #endif
 
+unsigned int opt_kdump_megabytes = 0;
+integer_param("kdump_megabytes", opt_kdump_megabytes);
+unsigned int opt_kdump_megabytes_base = 0;
+integer_param("kdump_megabytes_base", opt_kdump_megabytes_base);
+
 /* opt_nosmp: If true, secondary processors are ignored. */
 static int opt_nosmp = 0;
 boolean_param("nosmp", opt_nosmp);
@@ -192,6 +197,20 @@ static void percpu_free_unused_areas(voi
                        __pa(__per_cpu_end));
 }
 
+void __init move_memory(unsigned long dst, 
+                          unsigned long src_start, unsigned long src_end)
+{
+#if defined(CONFIG_X86_32)
+    memmove((void *)dst,  /* use low mapping */
+            (void *)src_start,      /* use low mapping */
+            src_end - src_start);
+#elif defined(CONFIG_X86_64)
+    memmove(__va(dst),
+            __va(src_start),
+            src_end - src_start);
+#endif
+}
+
 void __init __start_xen(multiboot_info_t *mbi)
 {
     char __cmdline[] = "", *cmdline = __cmdline;
@@ -327,15 +346,8 @@ void __init __start_xen(multiboot_info_t
         initial_images_start = xenheap_phys_end;
     initial_images_end = initial_images_start + modules_length;
 
-#if defined(CONFIG_X86_32)
-    memmove((void *)initial_images_start,  /* use low mapping */
-            (void *)mod[0].mod_start,      /* use low mapping */
-            mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
-#elif defined(CONFIG_X86_64)
-    memmove(__va(initial_images_start),
-            __va(mod[0].mod_start),
-            mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
-#endif
+    move_memory(initial_images_start, 
+                mod[0].mod_start, mod[mbi->mods_count-1].mod_end);
 
     /* Initialise boot-time allocator with all RAM situated after modules. */
     xenheap_phys_start = init_boot_allocator(__pa(&_end));
@@ -383,6 +395,51 @@ void __init __start_xen(multiboot_info_t
 #endif
     }
 
+    if (opt_kdump_megabytes) {
+        unsigned long kdump_start, kdump_size, k;
+
+        /* mark images pages as free for now */
+
+        init_boot_pages(initial_images_start, initial_images_end);
+
+        kdump_start = opt_kdump_megabytes_base << 20;
+        kdump_size = opt_kdump_megabytes << 20;
+
+        printk("Kdump: %luMB (%lukB) at 0x%lx\n", 
+               kdump_size >> 20,
+               kdump_size >> 10,
+               kdump_start);
+
+        if ((kdump_start & ~PAGE_MASK) || (kdump_size & ~PAGE_MASK))
+            panic("Kdump parameters not page aligned\n");
+
+        kdump_start >>= PAGE_SHIFT;
+        kdump_size >>= PAGE_SHIFT;
+
+        /* allocate pages for Kdump memory area */
+
+        k = alloc_boot_pages_at(kdump_size, kdump_start);
+
+        if (k != kdump_start)
+            panic("Unable to reserve Kdump memory\n");
+
+        /* allocate pages for relocated initial images */
+
+        k = ((initial_images_end - initial_images_start) & ~PAGE_MASK) ? 1 : 0;
+        k += (initial_images_end - initial_images_start) >> PAGE_SHIFT;
+
+        k = alloc_boot_pages(k, 1);
+
+        if (!k)
+            panic("Unable to allocate initial images memory\n");
+
+        move_memory(k << PAGE_SHIFT, initial_images_start, initial_images_end);
+
+        initial_images_end -= initial_images_start;
+        initial_images_start = k << PAGE_SHIFT;
+        initial_images_end += initial_images_start;
+    }        
+
     memguard_init();
 
     printk("System RAM: %luMB (%lukB)\n", 
--- x/xen/arch/x86/x86_32/Makefile
+++ x/xen/arch/x86/x86_32/Makefile
@@ -3,5 +3,6 @@ obj-y += entry.o
 obj-y += mm.o
 obj-y += seg_fixup.o
 obj-y += traps.o
+obj-y += machine_kexec.o
 
 obj-$(supervisor_mode_kernel) += supervisor_mode_kernel.o
--- x/xen/arch/x86/x86_32/entry.S
+++ x/xen/arch/x86/x86_32/entry.S
@@ -648,6 +648,7 @@ ENTRY(hypercall_table)
         .long do_xenoprof_op
         .long do_event_channel_op
         .long do_physdev_op
+        .long do_kexec
         .rept NR_hypercalls-((.-hypercall_table)/4)
         .long do_ni_hypercall
         .endr
@@ -687,6 +688,7 @@ ENTRY(hypercall_args_table)
         .byte 2 /* do_xenoprof_op       */
         .byte 2 /* do_event_channel_op  */
         .byte 2 /* do_physdev_op        */
+        .byte 2 /* do_kexec             */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
--- /dev/null
+++ x/xen/arch/x86/x86_32/machine_kexec.c
@@ -0,0 +1,206 @@
+/******************************************************************************
+ * arch/x86/x86_32/machine_kexec.c
+ * 
+ * Created By: Horms
+ *
+ * Based heavily on arch/i386/machine_kexec.c from Linux 2.6.16
+ */
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/domain_page.h> 
+#include <xen/timer.h>
+#include <xen/sched.h>
+#include <xen/reboot.h>
+#include <xen/console.h>
+#include <asm/page.h> 
+#include <asm/flushtlb.h>
+#include <public/xen.h>
+#include <public/kexec.h>
+
+static void __machine_kexec(struct kexec_arg *arg);
+
+typedef asmlinkage void (*relocate_new_kernel_t)(
+                    unsigned long indirection_page,
+                    unsigned long reboot_code_buffer,
+                    unsigned long start_address,
+                    unsigned int has_pae);
+
+#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+
+#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L2_ATTR (_PAGE_PRESENT)
+
+#ifndef CONFIG_X86_PAE
+
+static u32 pgtable_level1[L1_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+
+static void identity_map_page(unsigned long address)
+{
+    unsigned long mfn;
+    u32 *pgtable_level2;
+
+    /* Find the current page table */
+    mfn = read_cr3() >> PAGE_SHIFT;
+    pgtable_level2 = map_domain_page(mfn);
+
+    /* Identity map the page table entry */
+    pgtable_level1[l1_table_offset(address)] = address | L0_ATTR;
+    pgtable_level2[l2_table_offset(address)] = __pa(pgtable_level1) | L1_ATTR;
+
+    /* Flush the tlb so the new mapping takes effect.
+     * Global tlb entries are not flushed but that is not an issue.
+     */
+    write_cr3(mfn << PAGE_SHIFT);
+
+    unmap_domain_page(pgtable_level2);
+}
+
+#else
+static u64 pgtable_level1[L1_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+static u64 pgtable_level2[L2_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+
+static void identity_map_page(unsigned long address)
+{
+    int mfn;
+    intpte_t *pgtable_level3;
+
+    /* Find the current page table */
+    mfn = read_cr3() >> PAGE_SHIFT;
+    pgtable_level3 = map_domain_page(mfn);
+
+    /* Identity map the page table entry */
+    pgtable_level1[l1_table_offset(address)] = address | L0_ATTR;
+    pgtable_level2[l2_table_offset(address)] = __pa(pgtable_level1) | L1_ATTR;
+    set_64bit(&pgtable_level3[l3_table_offset(address)],
+	      __pa(pgtable_level2) | L2_ATTR);
+
+    /* Flush the tlb so the new mapping takes effect.
+     * Global tlb entries are not flushed but that is not an issue.
+     */
+    load_cr3(mfn << PAGE_SHIFT);
+
+    unmap_domain_page(pgtable_level3);
+}
+#endif
+
+static void kexec_load_segments(void)
+{
+#define __SSTR(X) #X
+#define SSTR(X) __SSTR(X)
+    __asm__ __volatile__ (
+        "\tljmp $"SSTR(__HYPERVISOR_CS)",$1f\n"
+        "\t1:\n"
+        "\tmovl $"SSTR(__HYPERVISOR_DS)",%%eax\n"
+        "\tmovl %%eax,%%ds\n"
+        "\tmovl %%eax,%%es\n"
+        "\tmovl %%eax,%%fs\n"
+        "\tmovl %%eax,%%gs\n"
+        "\tmovl %%eax,%%ss\n"
+        ::: "eax", "memory");
+#undef SSTR
+#undef __SSTR
+}
+
+#define kexec_load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
+static void kexec_set_idt(void *newidt, __u16 limit)
+{
+    struct Xgt_desc_struct curidt;
+
+    /* ia32 supports unaligned loads & stores */
+    curidt.size    = limit;
+    curidt.address = (unsigned long)newidt;
+    
+    kexec_load_idt(&curidt);
+
+};
+
+#define kexec_load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
+static void kexec_set_gdt(void *newgdt, __u16 limit)
+{
+    struct Xgt_desc_struct curgdt;
+
+    /* ia32 supports unaligned loads & stores */
+    curgdt.size    = limit;
+    curgdt.address = (unsigned long)newgdt;
+
+    kexec_load_gdt(&curgdt);
+};
+
+static void __machine_shutdown(void *data)
+{
+    struct kexec_arg *arg = (struct kexec_arg *)data;
+
+    printk("__machine_shutdown: cpu=%u\n", smp_processor_id());
+
+    watchdog_disable();
+    console_start_sync();
+
+    smp_send_stop();
+
+#ifdef CONFIG_X86_IO_APIC
+    disable_IO_APIC();
+#endif   
+
+    __machine_kexec(arg);
+}
+
+void machine_shutdown(struct kexec_arg *arg)
+{
+    int reboot_cpu_id;
+    cpumask_t reboot_cpu;
+
+
+    reboot_cpu_id = 0;
+
+    if (!cpu_isset(reboot_cpu_id, cpu_online_map))
+        reboot_cpu_id = smp_processor_id();
+    
+    if (reboot_cpu_id != smp_processor_id()) {
+        cpus_clear(reboot_cpu);
+        cpu_set(reboot_cpu_id, reboot_cpu);
+        on_selected_cpus(reboot_cpu, __machine_shutdown, arg, 1, 0);
+	for (;;)
+		; /* nothing */
+    }
+    else
+        __machine_shutdown(arg);
+    BUG();
+}
+
+static void __machine_kexec(struct kexec_arg *arg)
+{
+    relocate_new_kernel_t rnk;
+
+    local_irq_disable();
+
+    identity_map_page(arg->u.kexec.reboot_code_buffer);
+
+    copy_from_user((void *)arg->u.kexec.reboot_code_buffer, 
+           arg->u.kexec.relocate_new_kernel,
+           arg->u.kexec.relocate_new_kernel_size);
+
+    kexec_load_segments();
+    kexec_set_gdt(__va(0),0);
+    kexec_set_idt(__va(0),0);
+
+    rnk = (relocate_new_kernel_t) arg->u.kexec.reboot_code_buffer;
+    (*rnk)(arg->u.kexec.indirection_page, arg->u.kexec.reboot_code_buffer, 
+           arg->u.kexec.start_address, cpu_has_pae);
+}
+
+void machine_kexec(struct kexec_arg *arg)
+{
+    machine_shutdown(arg);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/arch/x86/x86_64/Makefile
+++ x/xen/arch/x86/x86_64/Makefile
@@ -1,3 +1,4 @@
 obj-y += entry.o
 obj-y += mm.o
 obj-y += traps.o
+obj-y += machine_kexec.o
--- /dev/null
+++ x/xen/arch/x86/x86_64/machine_kexec.c
@@ -0,0 +1,24 @@
+/******************************************************************************
+ * arch/x86/x86_64/machine_kexec.c
+ * 
+ * Created By: Horms
+ *
+ * Based heavily on arch/i386/machine_kexec.c from Linux 2.6.16
+ */
+
+#include <public/kexec.h>
+
+void machine_kexec(struct kexec_arg *arg)
+{
+    printk("machine_kexec: not implemented\n");
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/common/Makefile
+++ x/xen/common/Makefile
@@ -7,6 +7,7 @@ obj-y += event_channel.o
 obj-y += grant_table.o
 obj-y += kernel.o
 obj-y += keyhandler.o
+obj-y += kexec.o
 obj-y += lib.o
 obj-y += memory.o
 obj-y += multicall.o
--- /dev/null
+++ x/xen/common/kexec.c
@@ -0,0 +1,73 @@
+/*
+ * Architecture independent kexec code for Xen
+ *
+ * At this stage, just a switch for the kexec hypercall into
+ * architecture dependent code.
+ *
+ * Created By: Horms <horms@verge.net.au>
+ */
+
+#include <xen/lib.h>
+#include <xen/errno.h>
+#include <xen/guest_access.h>
+#include <xen/sched.h>
+#include <public/xen.h>
+#include <public/kexec.h>
+
+extern int machine_kexec_prepare(struct kexec_arg *arg);
+extern void machine_kexec_cleanup(struct kexec_arg *arg);
+extern void machine_kexec(struct kexec_arg *arg);
+
+extern unsigned int opt_kdump_megabytes;
+extern unsigned int opt_kdump_megabytes_base;
+
+int do_kexec(unsigned long op, 
+             XEN_GUEST_HANDLE(kexec_arg_t) uarg)
+{
+    struct kexec_arg arg;
+
+    if ( !IS_PRIV(current->domain) )  
+        return -EPERM;
+
+    if (op == KEXEC_CMD_reserve)
+    {
+	arg.u.reserve.size = opt_kdump_megabytes << 20;
+	arg.u.reserve.start = opt_kdump_megabytes_base << 20;
+	if ( unlikely(copy_to_guest(uarg, &arg, 1) != 0) )
+	{
+		printk("do_kexec: copy_to_guest failed");
+		return -EFAULT;
+	}
+	return 0;
+    }
+
+    if ( unlikely(copy_from_guest(&arg, uarg, 1) != 0) )
+    {
+        printk("do_kexec: __copy_from_guest failed");
+        return -EFAULT;
+    }
+
+    switch(op) {
+    case KEXEC_CMD_kexec:
+        machine_kexec(&arg);
+        return -EINVAL; /* Not Reached */
+    case KEXEC_CMD_kexec_prepare:
+        return machine_kexec_prepare(&arg);
+    case KEXEC_CMD_kexec_cleanup:
+        machine_kexec_cleanup(&arg);
+        return 0;
+    }
+
+    return -EINVAL;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
--- x/xen/common/page_alloc.c
+++ x/xen/common/page_alloc.c
@@ -212,24 +212,35 @@ void init_boot_pages(paddr_t ps, paddr_t
     }
 }
 
+unsigned long alloc_boot_pages_at(unsigned long nr_pfns, unsigned long pfn_at)
+{
+    unsigned long i;
+
+    for ( i = 0; i < nr_pfns; i++ )
+        if ( allocated_in_map(pfn_at + i) )
+             break;
+
+    if ( i == nr_pfns )
+    {
+        map_alloc(pfn_at, nr_pfns);
+        return pfn_at;
+    }
+
+    return 0;
+}
+
 unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align)
 {
-    unsigned long pg, i;
+    unsigned long pg, i = 0;
 
     for ( pg = 0; (pg + nr_pfns) < max_page; pg += pfn_align )
     {
-        for ( i = 0; i < nr_pfns; i++ )
-            if ( allocated_in_map(pg + i) )
-                 break;
-
-        if ( i == nr_pfns )
-        {
-            map_alloc(pg, nr_pfns);
-            return pg;
-        }
+        i = alloc_boot_pages_at(nr_pfns, pg);
+        if (i != 0)
+            break;
     }
 
-    return 0;
+    return i;
 }
 
 
--- x/xen/include/asm-x86/hypercall.h
+++ x/xen/include/asm-x86/hypercall.h
@@ -6,6 +6,7 @@
 #define __ASM_X86_HYPERCALL_H__
 
 #include <public/physdev.h>
+#include <public/kexec.h>
 
 extern long
 do_event_channel_op_compat(
@@ -87,6 +88,10 @@ extern long
 arch_do_vcpu_op(
     int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg);
 
+extern int
+do_kexec(
+    unsigned long op, XEN_GUEST_HANDLE(kexec_arg_t) uarg);
+
 #ifdef __x86_64__
 
 extern long
--- /dev/null
+++ x/xen/include/public/kexec.h
@@ -0,0 +1,46 @@
+/*
+ * kexec.h: Xen kexec public
+ *
+ * Created By: Horms <horms@verge.net.au>
+ */
+
+#ifndef _XEN_PUBLIC_KEXEC_H
+#define _XEN_PUBLIC_KEXEC_H
+
+#include <xen/types.h>
+#include <public/xen.h>
+
+/*
+ * Scratch space for passing arguments to the kexec hypercall
+ */
+typedef struct kexec_arg {
+    union {
+        struct {
+            unsigned long data; /* Not sure what this should be yet */
+        } helper;
+        struct {
+            unsigned long indirection_page;
+            unsigned long reboot_code_buffer;
+            unsigned long start_address;
+            const char *relocate_new_kernel;
+            unsigned int relocate_new_kernel_size;
+        } kexec;
+        struct {
+            unsigned long size;
+            unsigned long start;
+        } reserve;
+    } u;
+} kexec_arg_t;
+DEFINE_XEN_GUEST_HANDLE(kexec_arg_t);
+
+#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/include/public/xen.h
+++ x/xen/include/public/xen.h
@@ -64,6 +64,7 @@
 #define __HYPERVISOR_xenoprof_op          31
 #define __HYPERVISOR_event_channel_op     32
 #define __HYPERVISOR_physdev_op           33
+#define __HYPERVISOR_kexec_op             34
 
 /* Architecture-specific hypercall definitions. */
 #define __HYPERVISOR_arch_0               48
@@ -238,6 +239,14 @@ DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
 #define VMASST_TYPE_writable_pagetables  2
 #define MAX_VMASST_TYPE 2
 
+/*
+ * Operations for kexec.
+ */
+#define KEXEC_CMD_kexec                 0
+#define KEXEC_CMD_kexec_prepare         1
+#define KEXEC_CMD_kexec_cleanup         2
+#define KEXEC_CMD_reserve               3
+
 #ifndef __ASSEMBLY__
 
 typedef uint16_t domid_t;
--- x/xen/include/xen/mm.h
+++ x/xen/include/xen/mm.h
@@ -40,6 +40,7 @@ struct page_info;
 paddr_t init_boot_allocator(paddr_t bitmap_start);
 void init_boot_pages(paddr_t ps, paddr_t pe);
 unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align);
+unsigned long alloc_boot_pages_at(unsigned long nr_pfns, unsigned long pfn_at);
 void end_boot_allocator(void);
 
 /* Generic allocator. These functions are *not* interrupt-safe. */

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH]: kexec: framework and i386 (Take VI)
  2006-05-02  8:17                     ` [PATCH]: kexec: framework and i386 (Take VI) Simon Horman [Horms]
@ 2006-05-03  7:16                       ` Akio Takebe
  2006-05-05  1:03                         ` horms
  2006-05-15  8:29                         ` Akio Takebe
  2006-05-06  8:44                       ` Akio Takebe
  1 sibling, 2 replies; 68+ messages in thread
From: Akio Takebe @ 2006-05-03  7:16 UTC (permalink / raw)
  To: Simon Horman [Horms], Keir Fraser
  Cc: Isaku Yamahata, Magnus Damm, xen-devel, Mark Williamson,
	Akio Takebe

Hi, Simon and Magnus

I have one question.
When Xen panics, it seems that kexec is not called.
kexec is called only when dom0 panics.
But in the case of nmi=dom0, can we use kexec by pushing the NMI button?
Am I right?

I'll use your patch soon, and report. :-)

Best Regards,

Akio Takebe

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH]: kexec: framework and i386 (Take VI)
  2006-05-03  7:16                       ` Akio Takebe
@ 2006-05-05  1:03                         ` horms
  2006-05-06  8:46                           ` Akio Takebe
  2006-05-15  8:29                         ` Akio Takebe
  1 sibling, 1 reply; 68+ messages in thread
From: horms @ 2006-05-05  1:03 UTC (permalink / raw)
  To: Akio Takebe; +Cc: Isaku Yamahata, xen-devel, Mark Williamson, Magnus Damm

On Wed, May 03, 2006 at 04:16:22PM +0900, Akio Takebe wrote:
> Hi, Simon and Magnus
> 
> I have one question.
> When Xen is panic, I seemed kexec is not called.
> Only when dom0 is panic, kexec is called.

That is a good point.

> But in the case of nmi=dom0, can we use kexec by pushing NMI button?
> Am I righit?

Probably, I will have to investigate a little further.
Though, I'm not sure that I have ever seen an NMI button.
Are you thinking about the INIT button on some ia64 boxes?
That is a bit different to NMI on x86.

> I'll use your patch soon, and report. :-)

Thanks

-- 
Horms                                           http://www.vergenet.net/~horms/

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH]: kexec: framework and i386 (Take VI)
  2006-05-02  8:17                     ` [PATCH]: kexec: framework and i386 (Take VI) Simon Horman [Horms]
  2006-05-03  7:16                       ` Akio Takebe
@ 2006-05-06  8:44                       ` Akio Takebe
  2006-05-07  4:45                         ` Horms
  1 sibling, 1 reply; 68+ messages in thread
From: Akio Takebe @ 2006-05-06  8:44 UTC (permalink / raw)
  To: Simon Horman [Horms], Keir Fraser
  Cc: Isaku Yamahata, Magnus Damm, xen-devel, Mark Williamson,
	Akio Takebe

Hi, Horms

Why did you modify ref-linux-2.6.16/{drivers/base/cpu.c, kernel/kexec.c}?
I tried to apply your kexec patch, but it failed to apply.
How do you apply the patch?

If you need to modify these files, I think you can put a patch
in patches/linux-2.6.16/ instead.

Best Regards,

Akio Takebe

>Hi, 
>
>I will be out of the office until next Monday, so here is the latest and
>greatest before I go. Tested against 9896, should also work fine
>with tip (9903).
>
>-- 
>Horms                                           http://www.vergenet.net/~
>horms/
>
>kexec: framework and i386
>
>This is an implementation of kexec for dom0/xen, that allows
>kexecing of the physical machine from xen. The approach taken is
>to move the architecture-dependant kexec code into a new hypercall.
>
>Some notes:
>  * machine_kexec_cleanup() and machine_kexec_prepare() don't do
>    anything in i386. So while this patch adds a framework for them,
>    I am not sure what parameters are needs at this stage.
>  * Only works for UP, as machine_shutdown is not implemented yet
>  * kexecing into xen does not seem to work, I think that 
>    kexec-tools needs updating, but I have not investigated yet
>  * Kdump works by first copying the kernel into dom0 segments
>    and relocating them later in xen, the same way that kexec does
>    The only difference is that the relocation is made into
>    an area reserved by xen
>  * Kdump reservation is made using the xen command line parameters,
>    kdump_megabytes and kdump_megabytes_base, rather than
>    the linux option crashkernel, which is now ignored.
>    Two parameters are used instead of one to simplify parsing.
>    This can be cleaned up later if desired. But the reservation
>    seems to need to be made by xen to make sure that it happens
>    early enough.
>  * This patch uses a new kexec hypercall
>
>Highlights since the previous posted version:
> 
>  * SMP kexec (not kdump yet)
>  * Split x86_32 specific xen code out
>
>Prepared by Horms and Magnus Damm
>
>Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>
>Signed-Off-By: Horms <horms@verge.net.au>
>
> linux-2.6-xen-sparse/arch/i386/Kconfig                         |    2 
> linux-2.6-xen-sparse/arch/i386/kernel/Makefile                 |    2 
> linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c              |   24 +
> linux-2.6-xen-sparse/drivers/xen/core/Makefile                 |    1 
> linux-2.6-xen-sparse/drivers/xen/core/crash.c                  |   98 ++++
> linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c          |   73 +++
> linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h |   10 
> ref-linux-2.6.16/drivers/base/cpu.c                            |    4 
> ref-linux-2.6.16/kernel/kexec.c                                |   52 +-
> xen/arch/x86/Makefile                                          |    1 
> xen/arch/x86/dom0_ops.c                                        |    3 
> xen/arch/x86/machine_kexec.c                                   |   27 +
> xen/arch/x86/setup.c                                           |   75 +++
> xen/arch/x86/x86_32/Makefile                                   |    1 
> xen/arch/x86/x86_32/entry.S                                    |    2 
> xen/arch/x86/x86_32/machine_kexec.c                            |  206 ++++
>++++++
> xen/arch/x86/x86_64/Makefile                                   |    1 
> xen/arch/x86/x86_64/machine_kexec.c                            |   24 +
> xen/common/Makefile                                            |    1 
> xen/common/kexec.c                                             |   73 +++
> xen/common/page_alloc.c                                        |   33 +
> xen/include/asm-x86/hypercall.h                                |    5 
> xen/include/public/kexec.h                                     |   46 ++
> xen/include/public/xen.h                                       |    9 
> xen/include/xen/mm.h                                           |    1 
> 25 files changed, 741 insertions(+), 33 deletions(-)
>
>--- x/linux-2.6-xen-sparse/arch/i386/Kconfig
>+++ x/linux-2.6-xen-sparse/arch/i386/Kconfig
>@@ -726,7 +726,7 @@ source kernel/Kconfig.hz
> 
> config KEXEC
> 	bool "kexec system call (EXPERIMENTAL)"
>-	depends on EXPERIMENTAL && !X86_XEN
>+	depends on EXPERIMENTAL
> 	help
> 	  kexec is a system call that implements the ability to shutdown your
> 	  current kernel, and to start another kernel.  It is like a reboot
>--- x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
>+++ x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
>@@ -89,7 +89,7 @@ include $(srctree)/scripts/Makefile.xen
> 
> obj-y += fixup.o
> microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o
>-n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o
>+n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o machine_kexec
>.o crash.o
> 
> obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
> obj-y := $(call cherrypickxen, $(obj-y))
>--- x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
>+++ x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
>@@ -68,6 +68,10 @@
> #include "setup_arch_pre.h"
> #include <bios_ebda.h>
> 
>+#ifdef CONFIG_XEN
>+#include <xen/interface/kexec.h>
>+#endif
>+
> /* Forward Declaration. */
> void __init find_max_pfn(void);
> 
>@@ -932,6 +936,7 @@ static void __init parse_cmdline_early (
> 		 * after a kernel panic.
> 		 */
> 		else if (!memcmp(from, "crashkernel=", 12)) {
>+#ifndef CONFIG_XEN
> 			unsigned long size, base;
> 			size = memparse(from+12, &from);
> 			if (*from == '@') {
>@@ -942,6 +947,10 @@ static void __init parse_cmdline_early (
> 				crashk_res.start = base;
> 				crashk_res.end   = base + size - 1;
> 			}
>+#else
>+			printk("Ignoring crashkernel command line, "
>+			       "parameter will be supplied by xen\n");
>+#endif
> 		}
> #endif
> #ifdef CONFIG_PROC_VMCORE
>@@ -1318,9 +1327,21 @@ void __init setup_bootmem_allocator(void
> 	}
> #endif
> #ifdef CONFIG_KEXEC
>+#ifndef CONFIG_XEN
> 	if (crashk_res.start != crashk_res.end)
> 		reserve_bootmem(crashk_res.start,
> 			crashk_res.end - crashk_res.start + 1);
>+#else
>+	{
>+		struct kexec_arg xen_kexec_arg;
>+		BUG_ON(HYPERVISOR_kexec(KEXEC_CMD_reserve, &xen_kexec_arg));
>+		if (xen_kexec_arg.u.reserve.size) {
>+			crashk_res.start = xen_kexec_arg.u.reserve.start;
>+			crashk_res.end = xen_kexec_arg.u.reserve.start + 
>+				xen_kexec_arg.u.reserve.size - 1;
>+		}
>+	}
>+#endif
> #endif
> 
> 	if (!xen_feature(XENFEAT_auto_translated_physmap))
>@@ -1395,6 +1416,9 @@ legacy_init_iomem_resources(struct resou
> 		res->end = map[i].end - 1;
> 		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
> 		request_resource(&iomem_resource, res);
>+#ifdef CONFIG_KEXEC
>+        request_resource(res, &crashk_res);
>+#endif
> 	}
> 
> 	free_bootmem(__pa(map), PAGE_SIZE);
>--- x/linux-2.6-xen-sparse/drivers/xen/core/Makefile
>+++ x/linux-2.6-xen-sparse/drivers/xen/core/Makefile
>@@ -9,3 +9,4 @@ obj-$(CONFIG_NET)     += skbuff.o
> obj-$(CONFIG_SMP)     += smpboot.o
> obj-$(CONFIG_SYSFS)   += hypervisor_sysfs.o
> obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o
>+obj-$(CONFIG_KEXEC)   += machine_kexec.o crash.o
>--- /dev/null
>+++ x/linux-2.6-xen-sparse/drivers/xen/core/crash.c
>@@ -0,0 +1,98 @@
>+/*
>+ * Architecture specific (i386-xen) functions for kexec based crash dumps.
>+ *
>+ * Created by: Horms <horms@verge.net.au>
>+ *
>+ */
>+
>+#include <linux/kernel.h> /* For printk */
>+
>+/* XXX: final_note(), crash_save_this_cpu() and crash_save_self()
>+ * are copied from arch/i386/kernel/crash.c; it might be good to either
>+ * make the original functions non-static and use them, or just
>+ * merge this into that file.
>+ */
>+#include <linux/elf.h>     /* For struct elf_note */
>+#include <linux/elfcore.h> /* For struct elf_prstatus */
>+#include <linux/kexec.h>   /* crash_notes */
>+
>+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
>+							       size_t data_len)
>+{
>+	struct elf_note note;
>+
>+	note.n_namesz = strlen(name) + 1;
>+	note.n_descsz = data_len;
>+	note.n_type   = type;
>+	memcpy(buf, &note, sizeof(note));
>+	buf += (sizeof(note) +3)/4;
>+	memcpy(buf, name, note.n_namesz);
>+	buf += (note.n_namesz + 3)/4;
>+	memcpy(buf, data, note.n_descsz);
>+	buf += (note.n_descsz + 3)/4;
>+
>+	return buf;
>+}
>+
>+static void final_note(u32 *buf)
>+{
>+	struct elf_note note;
>+
>+	note.n_namesz = 0;
>+	note.n_descsz = 0;
>+	note.n_type   = 0;
>+	memcpy(buf, &note, sizeof(note));
>+}
>+
>+static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
>+{
>+	struct elf_prstatus prstatus;
>+	u32 *buf;
>+
>+	if ((cpu < 0) || (cpu >= NR_CPUS))
>+		return;
>+
>+	/* Using ELF notes here is opportunistic.
>+	 * I need a well defined structure format
>+	 * for the data I pass, and I need tags
>+	 * on the data to indicate what information I have
>+	 * squirrelled away.  ELF notes happen to provide
>+	 * all of that that no need to invent something new.
>+	 */
>+	buf = (u32*)per_cpu_ptr(crash_notes, cpu);
>+	if (!buf)
>+		return;
>+	memset(&prstatus, 0, sizeof(prstatus));
>+	prstatus.pr_pid = current->pid;
>+	elf_core_copy_regs(&prstatus.pr_reg, regs);
>+	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
>+				sizeof(prstatus));
>+	final_note(buf);
>+}
>+
>+static void crash_save_self(struct pt_regs *regs)
>+{
>+	int cpu;
>+
>+	cpu = smp_processor_id();
>+	crash_save_this_cpu(regs, cpu);
>+}
>+
>+
>+void machine_crash_shutdown(struct pt_regs *regs)
>+{
>+	/* XXX: This should do something */
>+	printk("xen-kexec: Need to turn of other CPUS in "
>+	       "machine_crash_shutdown()\n");
>+	crash_save_self(regs);
>+}
>+
>+/*
>+ * Local variables:
>+ *  c-file-style: "linux"
>+ *  indent-tabs-mode: t
>+ *  c-indent-level: 8
>+ *  c-basic-offset: 8
>+ *  tab-width: 8
>+ * End:
>+ */
>--- /dev/null
>+++ x/linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c
>@@ -0,0 +1,73 @@
>+/*
>+ * machine_kexec.c - handle transition of Linux booting another kernel
>+ *
>+ * Created By: Horms <horms@verge.net.au>
>+ *
>+ * Loosely based on arch/i386/kernel/machine_kexec.c
>+ */
>+
>+#include <linux/kexec.h>
>+#include <xen/interface/kexec.h>
>+#include <linux/mm.h>
>+#include <asm/hypercall.h>
>+
>+const extern unsigned char relocate_new_kernel[];
>+extern unsigned int relocate_new_kernel_size;
>+
>+/*
>+ * A architecture hook called to validate the
>+ * proposed image and prepare the control pages
>+ * as needed.  The pages for KEXEC_CONTROL_CODE_SIZE
>+ * have been allocated, but the segments have yet
>+ * been copied into the kernel.
>+ *
>+ * Do what every setup is needed on image and the
>+ * reboot code buffer to allow us to avoid allocations
>+ * later.
>+ *
>+ * Currently nothing.
>+ */
>+int machine_kexec_prepare(struct kimage *image)
>+{
>+	kexec_arg_t hypercall_arg;
>+       	hypercall_arg.u.helper.data = NULL;
>+	return HYPERVISOR_kexec(KEXEC_CMD_kexec_prepare, &hypercall_arg);
>+}
>+
>+/*
>+ * Undo anything leftover by machine_kexec_prepare
>+ * when an image is freed.
>+ */
>+void machine_kexec_cleanup(struct kimage *image)
>+{
>+	kexec_arg_t hypercall_arg;
>+	hypercall_arg.u.helper.data = NULL;
>+	HYPERVISOR_kexec(KEXEC_CMD_kexec_cleanup, &hypercall_arg);
>+}
>+
>+/*
>+ * Do not allocate memory (or fail in any way) in machine_kexec().
>+ * We are past the point of no return, committed to rebooting now.
>+ */
>+NORET_TYPE void machine_kexec(struct kimage *image)
>+{
>+	kexec_arg_t hypercall_arg;
>+	hypercall_arg.u.kexec.indirection_page = image->head;
>+	hypercall_arg.u.kexec.reboot_code_buffer = 
>+		pfn_to_mfn(page_to_pfn(image->control_code_page)) << PAGE_SHIFT;
>+	hypercall_arg.u.kexec.start_address = image->start;
>+	hypercall_arg.u.kexec.relocate_new_kernel = relocate_new_kernel;
>+	hypercall_arg.u.kexec.relocate_new_kernel_size = 
>+		relocate_new_kernel_size;
>+	HYPERVISOR_kexec(KEXEC_CMD_kexec, &hypercall_arg);
>+}
>+
>+/*
>+ * Local variables:
>+ *  c-file-style: "linux"
>+ *  indent-tabs-mode: t
>+ *  c-indent-level: 8
>+ *  c-basic-offset: 8
>+ *  tab-width: 8
>+ * End:
>+ */
>--- x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
>+++ x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
>@@ -37,6 +37,8 @@
> # error "please don't include this file directly"
> #endif
> 
>+#include <xen/interface/kexec.h>
>+
> #define __STR(x) #x
> #define STR(x) __STR(x)
> 
>@@ -357,6 +359,14 @@ HYPERVISOR_xenoprof_op(
> 	return _hypercall2(int, xenoprof_op, op, arg);
> }
> 
>+static inline int
>+HYPERVISOR_kexec(
>+	unsigned long op, kexec_arg_t * arg)
>+{
>+	return _hypercall2(int, kexec_op, op, arg); 
>+}
>+
>+
> 
> #endif /* __HYPERCALL_H__ */
> 
>--- x/ref-linux-2.6.16/drivers/base/cpu.c
>+++ x/ref-linux-2.6.16/drivers/base/cpu.c
>@@ -101,7 +101,11 @@ static ssize_t show_crash_notes(struct s
> 	 * boot up and this data does not change there after. Hence this
> 	 * operation should be safe. No locking required.
> 	 */
>+#ifndef CONFIG_XEN
> 	addr = __pa(per_cpu_ptr(crash_notes, cpunum));
>+#else
>+	addr = virt_to_machine(per_cpu_ptr(crash_notes, cpunum));
>+#endif
> 	rc = sprintf(buf, "%Lx\n", addr);
> 	return rc;
> }
>--- x/ref-linux-2.6.16/kernel/kexec.c
>+++ x/ref-linux-2.6.16/kernel/kexec.c
>@@ -38,6 +38,20 @@ struct resource crashk_res = {
> 	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
> };
> 
>+/* Kexec needs to know about the actual physical address.
>+ * But in xen, a physical address is a pseudo-physical address. */
>+#ifndef CONFIG_XEN
>+#define kexec_page_to_pfn(page)  page_to_pfn(page)
>+#define kexec_pfn_to_page(pfn)   pfn_to_page(pfn)
>+#define kexec_virt_to_phys(addr) virt_to_phys(addr)
>+#define kexec_phys_to_virt(addr) phys_to_virt(addr)
>+#else
>+#define kexec_page_to_pfn(page)  pfn_to_mfn(page_to_pfn(page))
>+#define kexec_pfn_to_page(pfn)   pfn_to_page(mfn_to_pfn(pfn))
>+#define kexec_virt_to_phys(addr) virt_to_machine(addr)
>+#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
>+#endif
>+
> int kexec_should_crash(struct task_struct *p)
> {
> 	if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
>@@ -403,7 +417,7 @@ static struct page *kimage_alloc_normal_
> 		pages = kimage_alloc_pages(GFP_KERNEL, order);
> 		if (!pages)
> 			break;
>-		pfn   = page_to_pfn(pages);
>+		pfn   = kexec_page_to_pfn(pages);
> 		epfn  = pfn + count;
> 		addr  = pfn << PAGE_SHIFT;
> 		eaddr = epfn << PAGE_SHIFT;
>@@ -437,6 +451,7 @@ static struct page *kimage_alloc_normal_
> 	return pages;
> }
> 
>+#ifndef CONFIG_XEN
> static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
> 						      unsigned int order)
> {
>@@ -490,7 +505,7 @@ static struct page *kimage_alloc_crash_c
> 		}
> 		/* If I don't overlap any segments I have found my hole! */
> 		if (i == image->nr_segments) {
>-			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
>+			pages = kexec_pfn_to_page(hole_start >> PAGE_SHIFT);
> 			break;
> 		}
> 	}
>@@ -517,6 +532,13 @@ struct page *kimage_alloc_control_pages(
> 
> 	return pages;
> }
>+#else /* !CONFIG_XEN */
>+struct page *kimage_alloc_control_pages(struct kimage *image,
>+					 unsigned int order)
>+{
>+	return kimage_alloc_normal_control_pages(image, order);
>+}
>+#endif
> 
> static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
> {
>@@ -532,7 +554,7 @@ static int kimage_add_entry(struct kimag
> 			return -ENOMEM;
> 
> 		ind_page = page_address(page);
>-		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
>+		*image->entry = kexec_virt_to_phys(ind_page) | IND_INDIRECTION;
> 		image->entry = ind_page;
> 		image->last_entry = ind_page +
> 				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
>@@ -593,13 +615,13 @@ static int kimage_terminate(struct kimag
> #define for_each_kimage_entry(image, ptr, entry) \
> 	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
> 		ptr = (entry & IND_INDIRECTION)? \
>-			phys_to_virt((entry & PAGE_MASK)): ptr +1)
>+			kexec_phys_to_virt((entry & PAGE_MASK)): ptr +1)
> 
> static void kimage_free_entry(kimage_entry_t entry)
> {
> 	struct page *page;
> 
>-	page = pfn_to_page(entry >> PAGE_SHIFT);
>+	page = kexec_pfn_to_page(entry >> PAGE_SHIFT);
> 	kimage_free_pages(page);
> }
> 
>@@ -686,7 +708,7 @@ static struct page *kimage_alloc_page(st
> 	 * have a match.
> 	 */
> 	list_for_each_entry(page, &image->dest_pages, lru) {
>-		addr = page_to_pfn(page) << PAGE_SHIFT;
>+		addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
> 		if (addr == destination) {
> 			list_del(&page->lru);
> 			return page;
>@@ -701,12 +723,12 @@ static struct page *kimage_alloc_page(st
> 		if (!page)
> 			return NULL;
> 		/* If the page cannot be used file it away */
>-		if (page_to_pfn(page) >
>+		if (kexec_page_to_pfn(page) >
> 				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
> 			list_add(&page->lru, &image->unuseable_pages);
> 			continue;
> 		}
>-		addr = page_to_pfn(page) << PAGE_SHIFT;
>+		addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
> 
> 		/* If it is the destination page we want use it */
> 		if (addr == destination)
>@@ -729,7 +751,7 @@ static struct page *kimage_alloc_page(st
> 			struct page *old_page;
> 
> 			old_addr = *old & PAGE_MASK;
>-			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
>+			old_page = kexec_pfn_to_page(old_addr >> PAGE_SHIFT);
> 			copy_highpage(page, old_page);
> 			*old = addr | (*old & ~PAGE_MASK);
> 
>@@ -779,7 +801,7 @@ static int kimage_load_normal_segment(st
> 			result  = -ENOMEM;
> 			goto out;
> 		}
>-		result = kimage_add_page(image, page_to_pfn(page)
>+		result = kimage_add_page(image, kexec_page_to_pfn(page)
> 								<< PAGE_SHIFT);
> 		if (result < 0)
> 			goto out;
>@@ -811,6 +833,7 @@ out:
> 	return result;
> }
> 
>+#ifndef CONFIG_XEN
> static int kimage_load_crash_segment(struct kimage *image,
> 					struct kexec_segment *segment)
> {
>@@ -833,7 +856,7 @@ static int kimage_load_crash_segment(str
> 		char *ptr;
> 		size_t uchunk, mchunk;
> 
>-		page = pfn_to_page(maddr >> PAGE_SHIFT);
>+		page = kexec_pfn_to_page(maddr >> PAGE_SHIFT);
> 		if (page == 0) {
> 			result  = -ENOMEM;
> 			goto out;
>@@ -881,6 +904,13 @@ static int kimage_load_segment(struct ki
> 
> 	return result;
> }
>+#else /* CONFIG_XEN */
>+static int kimage_load_segment(struct kimage *image,
>+				struct kexec_segment *segment)
>+{
>+	return kimage_load_normal_segment(image, segment);
>+}
>+#endif
> 
> /*
>  * Exec Kernel system call: for obvious reasons only root may call it.
>--- x/xen/arch/x86/Makefile
>+++ x/xen/arch/x86/Makefile
>@@ -39,6 +39,7 @@ obj-y += trampoline.o
> obj-y += traps.o
> obj-y += usercopy.o
> obj-y += x86_emulate.o
>+obj-y += machine_kexec.o
> 
> ifneq ($(pae),n)
> obj-$(x86_32) += shadow.o shadow_public.o shadow_guest32.o
>--- x/xen/arch/x86/dom0_ops.c
>+++ x/xen/arch/x86/dom0_ops.c
>@@ -29,6 +29,9 @@
> #include <asm/mtrr.h>
> #include "cpu/mtrr/mtrr.h"
> 
>+extern unsigned int opt_kdump_megabytes;
>+extern unsigned int opt_kdump_megabytes_base;
>+
> #define TRC_DOM0OP_ENTER_BASE  0x00020000
> #define TRC_DOM0OP_LEAVE_BASE  0x00030000
> 
>--- /dev/null
>+++ x/xen/arch/x86/machine_kexec.c
>@@ -0,0 +1,27 @@
>+/*************************************************************************
>*****
>+ * arch/x86/machine_kexec.c
>+ * 
>+ * Created By: Horms
>+ *
>+ */
>+
>+#include <public/kexec.h>
>+
>+int machine_kexec_prepare(struct kexec_arg *arg)
>+{
>+	return 0;
>+}
>+
>+void machine_kexec_cleanup(struct kexec_arg *arg)
>+{
>+}
>+
>+/*
>+ * Local variables:
>+ * mode: C
>+ * c-set-style: "BSD"
>+ * c-basic-offset: 4
>+ * tab-width: 4
>+ * indent-tabs-mode: nil
>+ * End:
>+ */
>--- x/xen/arch/x86/setup.c
>+++ x/xen/arch/x86/setup.c
>@@ -38,6 +38,11 @@ static unsigned int opt_xenheap_megabyte
> integer_param("xenheap_megabytes", opt_xenheap_megabytes);
> #endif
> 
>+unsigned int opt_kdump_megabytes = 0;
>+integer_param("kdump_megabytes", opt_kdump_megabytes);
>+unsigned int opt_kdump_megabytes_base = 0;
>+integer_param("kdump_megabytes_base", opt_kdump_megabytes_base);
>+
> /* opt_nosmp: If true, secondary processors are ignored. */
> static int opt_nosmp = 0;
> boolean_param("nosmp", opt_nosmp);
>@@ -192,6 +197,20 @@ static void percpu_free_unused_areas(voi
>                        __pa(__per_cpu_end));
> }
> 
>+void __init move_memory(unsigned long dst, 
>+                          unsigned long src_start, unsigned long src_end)
>+{
>+#if defined(CONFIG_X86_32)
>+    memmove((void *)dst,  /* use low mapping */
>+            (void *)src_start,      /* use low mapping */
>+            src_end - src_start);
>+#elif defined(CONFIG_X86_64)
>+    memmove(__va(dst),
>+            __va(src_start),
>+            src_end - src_start);
>+#endif
>+}
>+
> void __init __start_xen(multiboot_info_t *mbi)
> {
>     char __cmdline[] = "", *cmdline = __cmdline;
>@@ -327,15 +346,8 @@ void __init __start_xen(multiboot_info_t
>         initial_images_start = xenheap_phys_end;
>     initial_images_end = initial_images_start + modules_length;
> 
>-#if defined(CONFIG_X86_32)
>-    memmove((void *)initial_images_start,  /* use low mapping */
>-            (void *)mod[0].mod_start,      /* use low mapping */
>-            mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
>-#elif defined(CONFIG_X86_64)
>-    memmove(__va(initial_images_start),
>-            __va(mod[0].mod_start),
>-            mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
>-#endif
>+    move_memory(initial_images_start, 
>+                mod[0].mod_start, mod[mbi->mods_count-1].mod_end);
> 
>     /* Initialise boot-time allocator with all RAM situated after modules.
> */
>     xenheap_phys_start = init_boot_allocator(__pa(&_end));
>@@ -383,6 +395,51 @@ void __init __start_xen(multiboot_info_t
> #endif
>     }
> 
>+    if (opt_kdump_megabytes) {
>+        unsigned long kdump_start, kdump_size, k;
>+
>+        /* mark images pages as free for now */
>+
>+        init_boot_pages(initial_images_start, initial_images_end);
>+
>+        kdump_start = opt_kdump_megabytes_base << 20;
>+        kdump_size = opt_kdump_megabytes << 20;
>+
>+        printk("Kdump: %luMB (%lukB) at 0x%lx\n", 
>+               kdump_size >> 20,
>+               kdump_size >> 10,
>+               kdump_start);
>+
>+        if ((kdump_start & ~PAGE_MASK) || (kdump_size & ~PAGE_MASK))
>+            panic("Kdump parameters not page aligned\n");
>+
>+        kdump_start >>= PAGE_SHIFT;
>+        kdump_size >>= PAGE_SHIFT;
>+
>+        /* allocate pages for Kdump memory area */
>+
>+        k = alloc_boot_pages_at(kdump_size, kdump_start);
>+
>+        if (k != kdump_start)
>+            panic("Unable to reserve Kdump memory\n");
>+
>+        /* allocate pages for relocated initial images */
>+
>+        k = ((initial_images_end - initial_images_start) & ~PAGE_MASK) ? 1
> : 0;
>+        k += (initial_images_end - initial_images_start) >> PAGE_SHIFT;
>+
>+        k = alloc_boot_pages(k, 1);
>+
>+        if (!k)
>+            panic("Unable to allocate initial images memory\n");
>+
>+        move_memory(k << PAGE_SHIFT, initial_images_start, 
>initial_images_end);
>+
>+        initial_images_end -= initial_images_start;
>+        initial_images_start = k << PAGE_SHIFT;
>+        initial_images_end += initial_images_start;
>+    }        
>+
>     memguard_init();
> 
>     printk("System RAM: %luMB (%lukB)\n", 
>--- x/xen/arch/x86/x86_32/Makefile
>+++ x/xen/arch/x86/x86_32/Makefile
>@@ -3,5 +3,6 @@ obj-y += entry.o
> obj-y += mm.o
> obj-y += seg_fixup.o
> obj-y += traps.o
>+obj-y += machine_kexec.o
> 
> obj-$(supervisor_mode_kernel) += supervisor_mode_kernel.o
>--- x/xen/arch/x86/x86_32/entry.S
>+++ x/xen/arch/x86/x86_32/entry.S
>@@ -648,6 +648,7 @@ ENTRY(hypercall_table)
>         .long do_xenoprof_op
>         .long do_event_channel_op
>         .long do_physdev_op
>+        .long do_kexec
>         .rept NR_hypercalls-((.-hypercall_table)/4)
>         .long do_ni_hypercall
>         .endr
>@@ -687,6 +688,7 @@ ENTRY(hypercall_args_table)
>         .byte 2 /* do_xenoprof_op       */
>         .byte 2 /* do_event_channel_op  */
>         .byte 2 /* do_physdev_op        */
>+        .byte 2 /* do_kexec             */
>         .rept NR_hypercalls-(.-hypercall_args_table)
>         .byte 0 /* do_ni_hypercall      */
>         .endr
>--- /dev/null
>+++ x/xen/arch/x86/x86_32/machine_kexec.c
>@@ -0,0 +1,206 @@
>+/*************************************************************************
>*****
>+ * arch/x86/x86_32/machine_kexec.c
>+ * 
>+ * Created By: Horms
>+ *
>+ * Based heavily on arch/i386/machine_kexec.c from Linux 2.6.16
>+ */
>+
>+#include <xen/config.h>
>+#include <xen/types.h>
>+#include <xen/domain_page.h> 
>+#include <xen/timer.h>
>+#include <xen/sched.h>
>+#include <xen/reboot.h>
>+#include <xen/console.h>
>+#include <asm/page.h> 
>+#include <asm/flushtlb.h>
>+#include <public/xen.h>
>+#include <public/kexec.h>
>+
>+static void __machine_kexec(struct kexec_arg *arg);
>+
>+typedef asmlinkage void (*relocate_new_kernel_t)(
>+                    unsigned long indirection_page,
>+                    unsigned long reboot_code_buffer,
>+                    unsigned long start_address,
>+                    unsigned int has_pae);
>+
>+#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
>+
>+#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
>+#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
>+#define L2_ATTR (_PAGE_PRESENT)
>+
>+#ifndef CONFIG_X86_PAE
>+
>+static u32 pgtable_level1[L1_PAGETABLE_ENTRIES] PAGE_ALIGNED;
>+
>+static void identity_map_page(unsigned long address)
>+{
>+    unsigned long mfn;
>+    u32 *pgtable_level2;
>+
>+    /* Find the current page table */
>+    mfn = read_cr3() >> PAGE_SHIFT;
>+    pgtable_level2 = map_domain_page(mfn);
>+
>+    /* Identity map the page table entry */
>+    pgtable_level1[l1_table_offset(address)] = address | L0_ATTR;
>+    pgtable_level2[l2_table_offset(address)] = __pa(pgtable_level1) | 
>L1_ATTR;
>+
>+    /* Flush the tlb so the new mapping takes effect.
>+     * Global tlb entries are not flushed but that is not an issue.
>+     */
>+    write_cr3(mfn << PAGE_SHIFT);
>+
>+    unmap_domain_page(pgtable_level2);
>+}
>+
>+#else
>+static u64 pgtable_level1[L1_PAGETABLE_ENTRIES] PAGE_ALIGNED;
>+static u64 pgtable_level2[L2_PAGETABLE_ENTRIES] PAGE_ALIGNED;
>+
>+static void identity_map_page(unsigned long address)
>+{
>+    int mfn;
>+    intpte_t *pgtable_level3;
>+
>+    /* Find the current page table */
>+    mfn = read_cr3() >> PAGE_SHIFT;
>+    pgtable_level3 = map_domain_page(mfn);
>+
>+    /* Identity map the page table entry */
>+    pgtable_level1[l1_table_offset(address)] = address | L0_ATTR;
>+    pgtable_level2[l2_table_offset(address)] = __pa(pgtable_level1) | 
>L1_ATTR;
>+    set_64bit(&pgtable_level3[l3_table_offset(address)],
>+	      __pa(pgtable_level2) | L2_ATTR);
>+
>+    /* Flush the tlb so the new mapping takes effect.
>+     * Global tlb entries are not flushed but that is not an issue.
>+     */
>+    load_cr3(mfn << PAGE_SHIFT);
>+
>+    unmap_domain_page(pgtable_level3);
>+}
>+#endif
>+
>+static void kexec_load_segments(void)
>+{
>+#define __SSTR(X) #X
>+#define SSTR(X) __SSTR(X)
>+    __asm__ __volatile__ (
>+        "\tljmp $"SSTR(__HYPERVISOR_CS)",$1f\n"
>+        "\t1:\n"
>+        "\tmovl $"SSTR(__HYPERVISOR_DS)",%%eax\n"
>+        "\tmovl %%eax,%%ds\n"
>+        "\tmovl %%eax,%%es\n"
>+        "\tmovl %%eax,%%fs\n"
>+        "\tmovl %%eax,%%gs\n"
>+        "\tmovl %%eax,%%ss\n"
>+        ::: "eax", "memory");
>+#undef SSTR
>+#undef __SSTR
>+}
>+
>+#define kexec_load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
>+static void kexec_set_idt(void *newidt, __u16 limit)
>+{
>+    struct Xgt_desc_struct curidt;
>+
>+    /* ia32 supports unaliged loads & stores */
>+    curidt.size    = limit;
>+    curidt.address = (unsigned long)newidt;
>+    
>+    kexec_load_idt(&curidt);
>+
>+};
>+
>+#define kexec_load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
>+static void kexec_set_gdt(void *newgdt, __u16 limit)
>+{
>+    struct Xgt_desc_struct curgdt;
>+
>+    /* ia32 supports unaligned loads & stores */
>+    curgdt.size    = limit;
>+    curgdt.address = (unsigned long)newgdt;
>+
>+    kexec_load_gdt(&curgdt);
>+};
>+
>+static void __machine_shutdown(void *data)
>+{
>+    struct kexec_arg *arg = (struct kexec_arg *)data;
>+
>+    printk("__machine_shutdown: cpu=%u\n", smp_processor_id());
>+
>+    watchdog_disable();
>+    console_start_sync();
>+
>+    smp_send_stop();
>+
>+#ifdef CONFIG_X86_IO_APIC
>+    disable_IO_APIC();
>+#endif   
>+
>+    __machine_kexec(arg);
>+}
>+
>+void machine_shutdown(struct kexec_arg *arg)
>+{
>+    int reboot_cpu_id;
>+    cpumask_t reboot_cpu;
>+
>+
>+    reboot_cpu_id = 0;
>+
>+    if (!cpu_isset(reboot_cpu_id, cpu_online_map))
>+        reboot_cpu_id = smp_processor_id();
>+    
>+    if (reboot_cpu_id != smp_processor_id()) {
>+        cpus_clear(reboot_cpu);
>+        cpu_set(reboot_cpu_id, reboot_cpu);
>+        on_selected_cpus(reboot_cpu, __machine_shutdown, arg, 1, 0);
>+	for (;;)
>+		; /* nothing */
>+    }
>+    else
>+        __machine_shutdown(arg);
>+    BUG();
>+}
>+
>+static void __machine_kexec(struct kexec_arg *arg)
>+{
>+    relocate_new_kernel_t rnk;
>+
>+    local_irq_disable();
>+
>+    identity_map_page(arg->u.kexec.reboot_code_buffer);
>+
>+    copy_from_user((void *)arg->u.kexec.reboot_code_buffer, 
>+           arg->u.kexec.relocate_new_kernel,
>+           arg->u.kexec.relocate_new_kernel_size);
>+
>+    kexec_load_segments();
>+    kexec_set_gdt(__va(0),0);
>+    kexec_set_idt(__va(0),0);
>+
>+    rnk = (relocate_new_kernel_t) arg->u.kexec.reboot_code_buffer;
>+    (*rnk)(arg->u.kexec.indirection_page, arg->u.kexec.reboot_code_buffer, 
>+           arg->u.kexec.start_address, cpu_has_pae);
>+}
>+
>+void machine_kexec(struct kexec_arg *arg)
>+{
>+    machine_shutdown(arg);
>+}
>+
>+/*
>+ * Local variables:
>+ * mode: C
>+ * c-set-style: "BSD"
>+ * c-basic-offset: 4
>+ * tab-width: 4
>+ * indent-tabs-mode: nil
>+ * End:
>+ */
>--- x/xen/arch/x86/x86_64/Makefile
>+++ x/xen/arch/x86/x86_64/Makefile
>@@ -1,3 +1,4 @@
> obj-y += entry.o
> obj-y += mm.o
> obj-y += traps.o
>+obj-y += machine_kexec.o
>--- /dev/null
>+++ x/xen/arch/x86/x86_64/machine_kexec.c
>@@ -0,0 +1,24 @@
>+/*************************************************************************
>*****
>+ * arch/x86/x86_64/machine_kexec.c
>+ * 
>+ * Created By: Horms
>+ *
>+ * Based heavily on arch/i386/machine_kexec.c from Linux 2.6.16
>+ */
>+
>+#include <public/kexec.h>
>+
>+void machine_kexec(struct kexec_arg *arg)
>+{
>+    printk("machine_kexec: not implemented\n");
>+}
>+
>+/*
>+ * Local variables:
>+ * mode: C
>+ * c-set-style: "BSD"
>+ * c-basic-offset: 4
>+ * tab-width: 4
>+ * indent-tabs-mode: nil
>+ * End:
>+ */
>--- x/xen/common/Makefile
>+++ x/xen/common/Makefile
>@@ -7,6 +7,7 @@ obj-y += event_channel.o
> obj-y += grant_table.o
> obj-y += kernel.o
> obj-y += keyhandler.o
>+obj-y += kexec.o
> obj-y += lib.o
> obj-y += memory.o
> obj-y += multicall.o
>--- /dev/null
>+++ x/xen/common/kexec.c
>@@ -0,0 +1,73 @@
>+/*
>+ * Achitecture independent kexec code for Xen
>+ *
>+ * At this statge, just a switch for the kexec hypercall into
>+ * architecture dependent code.
>+ *
>+ * Created By: Horms <horms@verge.net.au>
>+ */
>+
>+#include <xen/lib.h>
>+#include <xen/errno.h>
>+#include <xen/guest_access.h>
>+#include <xen/sched.h>
>+#include <public/xen.h>
>+#include <public/kexec.h>
>+
>+extern int machine_kexec_prepare(struct kexec_arg *arg);
>+extern void machine_kexec_cleanup(struct kexec_arg *arg);
>+extern void machine_kexec(struct kexec_arg *arg);
>+
>+extern unsigned int opt_kdump_megabytes;
>+extern unsigned int opt_kdump_megabytes_base;
>+
>+int do_kexec(unsigned long op, 
>+             XEN_GUEST_HANDLE(kexec_arg_t) uarg)
>+{
>+    struct kexec_arg arg;
>+
>+    if ( !IS_PRIV(current->domain) )  
>+        return -EPERM;
>+
>+    if (op == KEXEC_CMD_reserve)
>+    {
>+	arg.u.reserve.size = opt_kdump_megabytes << 20;
>+	arg.u.reserve.start = opt_kdump_megabytes_base << 20;
>+	if ( unlikely(copy_to_guest(uarg, &arg, 1) != 0) )
>+	{
>+		printk("do_kexec: copy_to_guest failed");
>+		return -EFAULT;
>+	}
>+	return 0;
>+    }
>+
>+    if ( unlikely(copy_from_guest(&arg, uarg, 1) != 0) )
>+    {
>+        printk("do_kexec: __copy_from_guest failed");
>+        return -EFAULT;
>+    }
>+
>+    switch(op) {
>+    case KEXEC_CMD_kexec:
>+        machine_kexec(&arg);
>+        return -EINVAL; /* Not Reached */
>+    case KEXEC_CMD_kexec_prepare:
>+        return machine_kexec_prepare(&arg);
>+    case KEXEC_CMD_kexec_cleanup:
>+        machine_kexec_cleanup(&arg);
>+        return 0;
>+    }
>+
>+    return -EINVAL;
>+}
>+
>+/*
>+ * Local variables:
>+ * mode: C
>+ * c-set-style: "BSD"
>+ * c-basic-offset: 4
>+ * tab-width: 4
>+ * indent-tabs-mode: nil
>+ * End:
>+ */
>+
>--- x/xen/common/page_alloc.c
>+++ x/xen/common/page_alloc.c
>@@ -212,24 +212,35 @@ void init_boot_pages(paddr_t ps, paddr_t
>     }
> }
> 
>+unsigned long alloc_boot_pages_at(unsigned long nr_pfns, unsigned long 
>pfn_at)
>+{
>+    unsigned long i;
>+
>+    for ( i = 0; i < nr_pfns; i++ )
>+        if ( allocated_in_map(pfn_at + i) )
>+             break;
>+
>+    if ( i == nr_pfns )
>+    {
>+        map_alloc(pfn_at, nr_pfns);
>+        return pfn_at;
>+    }
>+
>+    return 0;
>+}
>+
> unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long 
>pfn_align)
> {
>-    unsigned long pg, i;
>+    unsigned long pg, i = 0;
> 
>     for ( pg = 0; (pg + nr_pfns) < max_page; pg += pfn_align )
>     {
>-        for ( i = 0; i < nr_pfns; i++ )
>-            if ( allocated_in_map(pg + i) )
>-                 break;
>-
>-        if ( i == nr_pfns )
>-        {
>-            map_alloc(pg, nr_pfns);
>-            return pg;
>-        }
>+        i = alloc_boot_pages_at(nr_pfns, pg);
>+        if (i != 0)
>+            break;
>     }
> 
>-    return 0;
>+    return i;
> }
> 
> 
>--- x/xen/include/asm-x86/hypercall.h
>+++ x/xen/include/asm-x86/hypercall.h
>@@ -6,6 +6,7 @@
> #define __ASM_X86_HYPERCALL_H__
> 
> #include <public/physdev.h>
>+#include <public/kexec.h>
> 
> extern long
> do_event_channel_op_compat(
>@@ -87,6 +88,10 @@ extern long
> arch_do_vcpu_op(
>     int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg);
> 
>+extern int
>+do_kexec(
>+    unsigned long op, XEN_GUEST_HANDLE(kexec_arg_t) uarg);
>+
> #ifdef __x86_64__
> 
> extern long
>--- /dev/null
>+++ x/xen/include/public/kexec.h
>@@ -0,0 +1,46 @@
>+/*
>+ * kexec.h: Xen kexec public
>+ *
>+ * Created By: Horms <horms@verge.net.au>
>+ */
>+
>+#ifndef _XEN_PUBLIC_KEXEC_H
>+#define _XEN_PUBLIC_KEXEC_H
>+
>+#include <xen/types.h>
>+#include <public/xen.h>
>+
>+/*
>+ * Scratch space for passing arguments to the kexec hypercall
>+ */
>+typedef struct kexec_arg {
>+    union {
>+        struct {
>+            unsigned long data; /* Not sure what this should be yet */
>+        } helper;
>+        struct {
>+            unsigned long indirection_page;
>+            unsigned long reboot_code_buffer;
>+            unsigned long start_address;
>+            const char *relocate_new_kernel;
>+            unsigned int relocate_new_kernel_size;
>+        } kexec;
>+        struct {
>+            unsigned long size;
>+            unsigned long start;
>+        } reserve;
>+    } u;
>+} kexec_arg_t;
>+DEFINE_XEN_GUEST_HANDLE(kexec_arg_t);
>+
>+#endif
>+
>+/*
>+ * Local variables:
>+ * mode: C
>+ * c-set-style: "BSD"
>+ * c-basic-offset: 4
>+ * tab-width: 4
>+ * indent-tabs-mode: nil
>+ * End:
>+ */
>--- x/xen/include/public/xen.h
>+++ x/xen/include/public/xen.h
>@@ -64,6 +64,7 @@
> #define __HYPERVISOR_xenoprof_op          31
> #define __HYPERVISOR_event_channel_op     32
> #define __HYPERVISOR_physdev_op           33
>+#define __HYPERVISOR_kexec_op             34
> 
> /* Architecture-specific hypercall definitions. */
> #define __HYPERVISOR_arch_0               48
>@@ -238,6 +239,14 @@ DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
> #define VMASST_TYPE_writable_pagetables  2
> #define MAX_VMASST_TYPE 2
> 
>+/*
>+ * Operations for kexec.
>+ */
>+#define KEXEC_CMD_kexec                 0
>+#define KEXEC_CMD_kexec_prepare         1
>+#define KEXEC_CMD_kexec_cleanup         2
>+#define KEXEC_CMD_reserve               3
>+
> #ifndef __ASSEMBLY__
> 
> typedef uint16_t domid_t;
>--- x/xen/include/xen/mm.h
>+++ x/xen/include/xen/mm.h
>@@ -40,6 +40,7 @@ struct page_info;
> paddr_t init_boot_allocator(paddr_t bitmap_start);
> void init_boot_pages(paddr_t ps, paddr_t pe);
> unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long 
>pfn_align);
>+unsigned long alloc_boot_pages_at(unsigned long nr_pfns, unsigned long 
>pfn_at);
> void end_boot_allocator(void);
> 
> /* Generic allocator. These functions are *not* interrupt-safe. */
>
>
>_______________________________________________
>Xen-devel mailing list
>Xen-devel@lists.xensource.com
>http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH]: kexec: framework and i386 (Take VI)
  2006-05-05  1:03                         ` horms
@ 2006-05-06  8:46                           ` Akio Takebe
  2006-05-07  4:46                             ` Horms
  0 siblings, 1 reply; 68+ messages in thread
From: Akio Takebe @ 2006-05-06  8:46 UTC (permalink / raw)
  To: horms; +Cc: xen-devel, Akio Takebe, Magnus Damm, Isaku Yamahata,
	Mark Williamson

Hi, Horms

Thank you for your reply.

>On Wed, May 03, 2006 at 04:16:22PM +0900, Akio Takebe wrote:
>> Hi, Simon and Magnus
>> 
>> I have one question.
>> When Xen is panic, I seemed kexec is not called.
>> Only when dom0 is panic, kexec is called.
>
>That is a good point.
>
>> But in the case of nmi=dom0, can we use kexec by pushing NMI button?
>> Am I righit?
>
>Probably, I will have to investigate a little further.
>Though, I'm not sure that I have ever seen an NMI button.
>Are you thinking about the INIT button on some ia64 boxes?
>That is a bit different to NMI on x86.
I was talking about the NMI button on x86.
Many x86 servers (not PCs) have an NMI button,
just as many ia64 servers have an INIT button.

Best Regards,

Akio Takebe

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH]: kexec: framework and i386 (Take VI)
  2006-05-06  8:44                       ` Akio Takebe
@ 2006-05-07  4:45                         ` Horms
  2006-05-09  4:16                           ` [PATCH]: kexec: framework and i386 (Take VII) Horms
  0 siblings, 1 reply; 68+ messages in thread
From: Horms @ 2006-05-07  4:45 UTC (permalink / raw)
  To: Akio Takebe; +Cc: Isaku Yamahata, xen-devel, Mark Williamson, Magnus Damm

On Sat, May 06, 2006 at 05:44:44PM +0900, Akio Takebe wrote:
> Hi, Horms
> 
> Why you modify ref-linux-2.6.16/kernel/{drivers/base/cpu.c, kernel/kexec.c }?
> I tried to patch your kexec patch, I fail to patch it.
> How do you do patch?

Sorry, the drivers/base/cpu.c portion is just an artifact of xen's build
system which modifies that file on build, but doesn't unmodify it on
distclean. It shouldn't have been included in my patch.

kernel/kexec.c needs to be modified primarily so that mfns are used
instead of pfns. Again, because of strangeness in the xen build
system, this patch is a bit odd as it patches a file not covered by a
xen checkout (even though it's needed for a xen build).  If you run the
following before applying the patch, it should apply.

make prep-kernels clean kclean
make -C linux-2.6.16-xen distclean

> I think you can make a patch in patches/linux-2.6.16/ if you would
> modify these.

Yes, that is probably the best way forward, I'll work on breaking it
out in that manner.

-- 
Horms                                           http://www.vergenet.net/~horms/

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH]: kexec: framework and i386 (Take VI)
  2006-05-06  8:46                           ` Akio Takebe
@ 2006-05-07  4:46                             ` Horms
  2006-05-07  9:45                               ` Akio Takebe
  0 siblings, 1 reply; 68+ messages in thread
From: Horms @ 2006-05-07  4:46 UTC (permalink / raw)
  To: Akio Takebe; +Cc: Isaku Yamahata, xen-devel, Mark Williamson, Magnus Damm

On Sat, May 06, 2006 at 05:46:43PM +0900, Akio Takebe wrote:
> Hi, Horms
> 
> Thank you for your reply.
> 
> >On Wed, May 03, 2006 at 04:16:22PM +0900, Akio Takebe wrote:
> >> Hi, Simon and Magnus
> >> 
> >> I have one question.
> >> When Xen is panic, I seemed kexec is not called.
> >> Only when dom0 is panic, kexec is called.
> >
> >That is a good point.
> >
> >> But in the case of nmi=dom0, can we use kexec by pushing NMI button?
> >> Am I righit?
> >
> >Probably, I will have to investigate a little further.
> >Though, I'm not sure that I have ever seen an NMI button.
> >Are you thinking about the INIT button on some ia64 boxes?
> >That is a bit different to NMI on x86.
> I said about the NMI bottun on x86.
> Many x86 servers (not PC) have a NMI bottun 
> like many ia64 servers have a INIT bottun.

Ok thanks, I haven't seen such a machine.
I'll look into simulating it in software.

-- 
Horms                                           http://www.vergenet.net/~horms/

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH]: kexec: framework and i386 (Take VI)
  2006-05-07  4:46                             ` Horms
@ 2006-05-07  9:45                               ` Akio Takebe
  2006-05-08  9:02                                 ` Ian Campbell
  0 siblings, 1 reply; 68+ messages in thread
From: Akio Takebe @ 2006-05-07  9:45 UTC (permalink / raw)
  To: Horms; +Cc: xen-devel, Akio Takebe, Magnus Damm, Isaku Yamahata,
	Mark Williamson

>
>Ok thanks, I haven't seen such a machine.
>I'll look into simulating it in software.
>
I have x86 server with NMI button.
If necessary, I can test it :-)

Best Regards,

Akio Takebe

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Re: [PATCH]: kexec: framework and i386 (Take VI)
  2006-05-07  9:45                               ` Akio Takebe
@ 2006-05-08  9:02                                 ` Ian Campbell
  2006-05-11 11:35                                   ` horms
  0 siblings, 1 reply; 68+ messages in thread
From: Ian Campbell @ 2006-05-08  9:02 UTC (permalink / raw)
  To: Akio Takebe
  Cc: Isaku Yamahata, Magnus Damm, Horms, Mark Williamson, xen-devel

I didn't get Horms' (I presume that's who is quoted below) original mail
so I'll reply to this one.

> >Ok thanks, I haven't seen such a machine.
> >I'll look into simulating it in software.

There is code in xen/arch/x86/nmi.c:do_nmi_trigger(). You can trigger it
with the 'n' keyhandler.

Ian.

^ permalink raw reply	[flat|nested] 68+ messages in thread

* [PATCH]: kexec: framework and i386 (Take VII)
  2006-05-07  4:45                         ` Horms
@ 2006-05-09  4:16                           ` Horms
  2006-05-09  9:18                             ` [PATCH]: kexec: framework and i386 (Take VIII) Horms
  0 siblings, 1 reply; 68+ messages in thread
From: Horms @ 2006-05-09  4:16 UTC (permalink / raw)
  To: Akio Takebe; +Cc: Isaku Yamahata, xen-devel, Mark Williamson, Magnus Damm

On Sun, May 07, 2006 at 01:45:22PM +0900, Horms wrote:
> On Sat, May 06, 2006 at 05:44:44PM +0900, Akio Takebe wrote:
> 
> > I think you can make a patch in patches/linux-2.6.16/ if you would
> > modify these.
> 
> Yes, that is probably the best way forward, I'll work on breaking it
> out in that manner.

Hi Takebe-san,

here is an updated version of the patch which moves portions into
patches/linux-2.6.16/ as you suggested. It also moves to
xen-unstable 9969 / Linux 2.6.16.13 and has some minor build fixes,
for problems that crept into the previous patch.

-- 
Horms                                           http://www.vergenet.net/~horms/

kexec: framework and i386

This is an implementation of kexec for dom0/xen, which allows
kexecing of the physical machine from xen. The approach taken is
to move the architecture-dependent kexec code into a new hypercall.

Some notes:
  * machine_kexec_cleanup() and machine_kexec_prepare() don't do
    anything in i386. So while this patch adds a framework for them,
    I am not sure what parameters are needed at this stage.
  * Only works for UP, as machine_shutdown is not implemented yet
  * kexecing into xen does not seem to work, I think that 
    kexec-tools needs updating, but I have not investigated yet
  * Kdump works by first copying the kernel into dom0 segments
    and relocating them later in xen, the same way that kexec does.
    The only difference is that the relocation is made into
    an area reserved by xen
  * Kdump reservation is made using the xen command line parameters,
    kdump_megabytes and kdump_megabytes_base, rather than
    the linux option crashkernel, which is now ignored.
    Two parameters are used instead of one to simplify parsing.
    This can be cleaned up later if desired. But the reservation
    seems to need to be made by xen to make sure that it happens
    early enough.
    The tested values are kdump_megabytes=16, kdump_megabytes_base=32
    (kdump_megabytes_base=16 does not seem to work)
  * This patch uses a new kexec hypercall
  * SMP Kexec works, Kdump is next on the list

Highlights since the previous posted version:
 
  * Diff now applies to a xen checkout from hg 
    (previously it assumed that the kernel was unpacked)
    - xen-unstable-hg 9660 / Linux 2.6.16.13
  * Added machine_shutdown, which disappeared in the previous release of
    this patch
  * Fixed include problems in kexec.h

Prepared by Horms and Magnus Damm

Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>
Signed-Off-By: Horms <horms@verge.net.au>

 linux-2.6-xen-sparse/arch/i386/Kconfig                         |    2 
 linux-2.6-xen-sparse/arch/i386/kernel/Makefile                 |    2 
 linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c              |   24 +
 linux-2.6-xen-sparse/drivers/xen/core/Makefile                 |    1 
 linux-2.6-xen-sparse/drivers/xen/core/crash.c                  |   98 ++++
 linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c          |   73 +++
 linux-2.6-xen-sparse/drivers/xen/core/reboot.c                 |    4 
 linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h |   10 
 linux-2.6.16.13/kexec.patch                                    |  175 ++++++++
 xen/arch/x86/Makefile                                          |    1 
 xen/arch/x86/dom0_ops.c                                        |    3 
 xen/arch/x86/machine_kexec.c                                   |   28 +
 xen/arch/x86/setup.c                                           |   75 +++
 xen/arch/x86/x86_32/Makefile                                   |    1 
 xen/arch/x86/x86_32/entry.S                                    |    2 
 xen/arch/x86/x86_32/machine_kexec.c                            |  205 ++++++++++
 xen/arch/x86/x86_64/Makefile                                   |    1 
 xen/arch/x86/x86_64/machine_kexec.c                            |   25 +
 xen/common/Makefile                                            |    1 
 xen/common/kexec.c                                             |   73 +++
 xen/common/page_alloc.c                                        |   33 +
 xen/include/asm-x86/hypercall.h                                |    6 
 xen/include/public/kexec.h                                     |   45 ++
 xen/include/public/xen.h                                       |    9 
 xen/include/xen/mm.h                                           |    1 
 25 files changed, 876 insertions(+), 22 deletions(-)

--- x/linux-2.6-xen-sparse/arch/i386/Kconfig
+++ x/linux-2.6-xen-sparse/arch/i386/Kconfig
@@ -726,7 +726,7 @@ source kernel/Kconfig.hz
 
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && !X86_XEN
+	depends on EXPERIMENTAL
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
--- x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
+++ x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
@@ -89,7 +89,7 @@ include $(srctree)/scripts/Makefile.xen
 
 obj-y += fixup.o
 microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o
-n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o
+n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o machine_kexec.o crash.o
 
 obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
 obj-y := $(call cherrypickxen, $(obj-y))
--- x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
+++ x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
@@ -68,6 +68,10 @@
 #include "setup_arch_pre.h"
 #include <bios_ebda.h>
 
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
 /* Forward Declaration. */
 void __init find_max_pfn(void);
 
@@ -932,6 +936,7 @@ static void __init parse_cmdline_early (
 		 * after a kernel panic.
 		 */
 		else if (!memcmp(from, "crashkernel=", 12)) {
+#ifndef CONFIG_XEN
 			unsigned long size, base;
 			size = memparse(from+12, &from);
 			if (*from == '@') {
@@ -942,6 +947,10 @@ static void __init parse_cmdline_early (
 				crashk_res.start = base;
 				crashk_res.end   = base + size - 1;
 			}
+#else
+			printk("Ignoring crashkernel command line, "
+			       "parameter will be supplied by xen\n");
+#endif
 		}
 #endif
 #ifdef CONFIG_PROC_VMCORE
@@ -1318,9 +1327,21 @@ void __init setup_bootmem_allocator(void
 	}
 #endif
 #ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
 	if (crashk_res.start != crashk_res.end)
 		reserve_bootmem(crashk_res.start,
 			crashk_res.end - crashk_res.start + 1);
+#else
+	{
+		struct kexec_arg xen_kexec_arg;
+		BUG_ON(HYPERVISOR_kexec(KEXEC_CMD_reserve, &xen_kexec_arg));
+		if (xen_kexec_arg.u.reserve.size) {
+			crashk_res.start = xen_kexec_arg.u.reserve.start;
+			crashk_res.end = xen_kexec_arg.u.reserve.start + 
+				xen_kexec_arg.u.reserve.size - 1;
+		}
+	}
+#endif
 #endif
 
 	if (!xen_feature(XENFEAT_auto_translated_physmap))
@@ -1395,6 +1416,9 @@ legacy_init_iomem_resources(struct resou
 		res->end = map[i].end - 1;
 		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 		request_resource(&iomem_resource, res);
+#ifdef CONFIG_KEXEC
+        request_resource(res, &crashk_res);
+#endif
 	}
 
 	free_bootmem(__pa(map), PAGE_SIZE);
--- x/linux-2.6-xen-sparse/drivers/xen/core/Makefile
+++ x/linux-2.6-xen-sparse/drivers/xen/core/Makefile
@@ -9,3 +9,4 @@ obj-$(CONFIG_NET)     += skbuff.o
 obj-$(CONFIG_SMP)     += smpboot.o
 obj-$(CONFIG_SYSFS)   += hypervisor_sysfs.o
 obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o
+obj-$(CONFIG_KEXEC)   += machine_kexec.o crash.o
--- /dev/null
+++ x/linux-2.6-xen-sparse/drivers/xen/core/crash.c
@@ -0,0 +1,98 @@
+/*
+ * Architecture specific (i386-xen) functions for kexec based crash dumps.
+ *
+ * Created by: Horms <horms@verge.net.au>
+ *
+ */
+
+#include <linux/kernel.h> /* For printk */
+
+/* XXX: final_note(), crash_save_this_cpu() and crash_save_self()
+ * are copied from arch/i386/kernel/crash.c. It might be good to either
+ * make the original functions non-static and use them, or just
+ * merge this into that file.
+ */
+#include <linux/elf.h>     /* For struct elf_note */
+#include <linux/elfcore.h> /* For struct elf_prstatus */
+#include <linux/kexec.h>   /* crash_notes */
+
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+							       size_t data_len)
+{
+	struct elf_note note;
+
+	note.n_namesz = strlen(name) + 1;
+	note.n_descsz = data_len;
+	note.n_type   = type;
+	memcpy(buf, &note, sizeof(note));
+	buf += (sizeof(note) +3)/4;
+	memcpy(buf, name, note.n_namesz);
+	buf += (note.n_namesz + 3)/4;
+	memcpy(buf, data, note.n_descsz);
+	buf += (note.n_descsz + 3)/4;
+
+	return buf;
+}
+
+static void final_note(u32 *buf)
+{
+	struct elf_note note;
+
+	note.n_namesz = 0;
+	note.n_descsz = 0;
+	note.n_type   = 0;
+	memcpy(buf, &note, sizeof(note));
+}
+
+static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
+{
+	struct elf_prstatus prstatus;
+	u32 *buf;
+
+	if ((cpu < 0) || (cpu >= NR_CPUS))
+		return;
+
+	/* Using ELF notes here is opportunistic.
+	 * I need a well defined structure format
+	 * for the data I pass, and I need tags
+	 * on the data to indicate what information I have
+	 * squirrelled away.  ELF notes happen to provide
+	 * all of that, so there is no need to invent something new.
+	 */
+	buf = (u32*)per_cpu_ptr(crash_notes, cpu);
+	if (!buf)
+		return;
+	memset(&prstatus, 0, sizeof(prstatus));
+	prstatus.pr_pid = current->pid;
+	elf_core_copy_regs(&prstatus.pr_reg, regs);
+	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
+				sizeof(prstatus));
+	final_note(buf);
+}
+
+static void crash_save_self(struct pt_regs *regs)
+{
+	int cpu;
+
+	cpu = smp_processor_id();
+	crash_save_this_cpu(regs, cpu);
+}
+
+
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+	/* XXX: This should do something */
+	printk("xen-kexec: Need to turn of other CPUS in "
+	       "machine_crash_shutdown()\n");
+	crash_save_self(regs);
+}
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- /dev/null
+++ x/linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c
@@ -0,0 +1,73 @@
+/*
+ * machine_kexec.c - handle transition of Linux booting another kernel
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Loosely based on arch/i386/kernel/machine_kexec.c
+ */
+
+#include <linux/kexec.h>
+#include <xen/interface/kexec.h>
+#include <linux/mm.h>
+#include <asm/hypercall.h>
+
+const extern unsigned char relocate_new_kernel[];
+extern unsigned int relocate_new_kernel_size;
+
+/*
+ * An architecture hook called to validate the
+ * proposed image and prepare the control pages
+ * as needed.  The pages for KEXEC_CONTROL_CODE_SIZE
+ * have been allocated, but the segments have not yet
+ * been copied into the kernel.
+ *
+ * Do whatever setup is needed on image and the
+ * reboot code buffer to allow us to avoid allocations
+ * later.
+ *
+ * Currently nothing.
+ */
+int machine_kexec_prepare(struct kimage *image)
+{
+	kexec_arg_t hypercall_arg;
+       	hypercall_arg.u.helper.data = NULL;
+	return HYPERVISOR_kexec(KEXEC_CMD_kexec_prepare, &hypercall_arg);
+}
+
+/*
+ * Undo anything leftover by machine_kexec_prepare
+ * when an image is freed.
+ */
+void machine_kexec_cleanup(struct kimage *image)
+{
+	kexec_arg_t hypercall_arg;
+	hypercall_arg.u.helper.data = NULL;
+	HYPERVISOR_kexec(KEXEC_CMD_kexec_cleanup, &hypercall_arg);
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+	kexec_arg_t hypercall_arg;
+	hypercall_arg.u.kexec.indirection_page = image->head;
+	hypercall_arg.u.kexec.reboot_code_buffer = 
+		pfn_to_mfn(page_to_pfn(image->control_code_page)) << PAGE_SHIFT;
+	hypercall_arg.u.kexec.start_address = image->start;
+	hypercall_arg.u.kexec.relocate_new_kernel = relocate_new_kernel;
+	hypercall_arg.u.kexec.relocate_new_kernel_size = 
+		relocate_new_kernel_size;
+	HYPERVISOR_kexec(KEXEC_CMD_kexec, &hypercall_arg);
+}
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- x/linux-2.6-xen-sparse/drivers/xen/core/reboot.c
+++ x/linux-2.6-xen-sparse/drivers/xen/core/reboot.c
@@ -66,6 +66,10 @@ void machine_power_off(void)
 	HYPERVISOR_shutdown(SHUTDOWN_poweroff);
 }
 
+#ifdef CONFIG_KEXEC
+void machine_shutdown(void) { }
+#endif
+
 int reboot_thru_bios = 0;	/* for dmi_scan.c */
 EXPORT_SYMBOL(machine_restart);
 EXPORT_SYMBOL(machine_halt);
--- x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
+++ x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
@@ -39,6 +39,8 @@
 # error "please don't include this file directly"
 #endif
 
+#include <xen/interface/kexec.h>
+
 #define __STR(x) #x
 #define STR(x) __STR(x)
 
@@ -359,6 +361,14 @@ HYPERVISOR_xenoprof_op(
 	return _hypercall2(int, xenoprof_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_kexec(
+	unsigned long op, kexec_arg_t * arg)
+{
+	return _hypercall2(int, kexec_op, op, arg); 
+}
+
+
 
 #endif /* __HYPERCALL_H__ */
 
--- x/xen/arch/x86/Makefile
+++ x/xen/arch/x86/Makefile
@@ -39,6 +39,7 @@ obj-y += trampoline.o
 obj-y += traps.o
 obj-y += usercopy.o
 obj-y += x86_emulate.o
+obj-y += machine_kexec.o
 
 ifneq ($(pae),n)
 obj-$(x86_32) += shadow.o shadow_public.o shadow_guest32.o
--- x/xen/arch/x86/dom0_ops.c
+++ x/xen/arch/x86/dom0_ops.c
@@ -29,6 +29,9 @@
 #include <asm/mtrr.h>
 #include "cpu/mtrr/mtrr.h"
 
+extern unsigned int opt_kdump_megabytes;
+extern unsigned int opt_kdump_megabytes_base;
+
 #define TRC_DOM0OP_ENTER_BASE  0x00020000
 #define TRC_DOM0OP_LEAVE_BASE  0x00030000
 
--- /dev/null
+++ x/xen/arch/x86/machine_kexec.c
@@ -0,0 +1,28 @@
+/******************************************************************************
+ * arch/x86/machine_kexec.c
+ * 
+ * Created By: Horms
+ *
+ */
+
+#include <xen/types.h>
+#include <public/kexec.h>
+
+int machine_kexec_prepare(struct kexec_arg *arg)
+{
+	return 0;
+}
+
+void machine_kexec_cleanup(struct kexec_arg *arg)
+{
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/arch/x86/setup.c
+++ x/xen/arch/x86/setup.c
@@ -38,6 +38,11 @@ static unsigned int opt_xenheap_megabyte
 integer_param("xenheap_megabytes", opt_xenheap_megabytes);
 #endif
 
+unsigned int opt_kdump_megabytes = 0;
+integer_param("kdump_megabytes", opt_kdump_megabytes);
+unsigned int opt_kdump_megabytes_base = 0;
+integer_param("kdump_megabytes_base", opt_kdump_megabytes_base);
+
 /* opt_nosmp: If true, secondary processors are ignored. */
 static int opt_nosmp = 0;
 boolean_param("nosmp", opt_nosmp);
@@ -192,6 +197,20 @@ static void percpu_free_unused_areas(voi
                        __pa(__per_cpu_end));
 }
 
+void __init move_memory(unsigned long dst, 
+                          unsigned long src_start, unsigned long src_end)
+{
+#if defined(CONFIG_X86_32)
+    memmove((void *)dst,  /* use low mapping */
+            (void *)src_start,      /* use low mapping */
+            src_end - src_start);
+#elif defined(CONFIG_X86_64)
+    memmove(__va(dst),
+            __va(src_start),
+            src_end - src_start);
+#endif
+}
+
 void __init __start_xen(multiboot_info_t *mbi)
 {
     char __cmdline[] = "", *cmdline = __cmdline;
@@ -327,15 +346,8 @@ void __init __start_xen(multiboot_info_t
         initial_images_start = xenheap_phys_end;
     initial_images_end = initial_images_start + modules_length;
 
-#if defined(CONFIG_X86_32)
-    memmove((void *)initial_images_start,  /* use low mapping */
-            (void *)mod[0].mod_start,      /* use low mapping */
-            mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
-#elif defined(CONFIG_X86_64)
-    memmove(__va(initial_images_start),
-            __va(mod[0].mod_start),
-            mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
-#endif
+    move_memory(initial_images_start, 
+                mod[0].mod_start, mod[mbi->mods_count-1].mod_end);
 
     /* Initialise boot-time allocator with all RAM situated after modules. */
     xenheap_phys_start = init_boot_allocator(__pa(&_end));
@@ -383,6 +395,51 @@ void __init __start_xen(multiboot_info_t
 #endif
     }
 
+    if (opt_kdump_megabytes) {
+        unsigned long kdump_start, kdump_size, k;
+
+        /* mark images pages as free for now */
+
+        init_boot_pages(initial_images_start, initial_images_end);
+
+        kdump_start = opt_kdump_megabytes_base << 20;
+        kdump_size = opt_kdump_megabytes << 20;
+
+        printk("Kdump: %luMB (%lukB) at 0x%lx\n", 
+               kdump_size >> 20,
+               kdump_size >> 10,
+               kdump_start);
+
+        if ((kdump_start & ~PAGE_MASK) || (kdump_size & ~PAGE_MASK))
+            panic("Kdump parameters not page aligned\n");
+
+        kdump_start >>= PAGE_SHIFT;
+        kdump_size >>= PAGE_SHIFT;
+
+        /* allocate pages for Kdump memory area */
+
+        k = alloc_boot_pages_at(kdump_size, kdump_start);
+
+        if (k != kdump_start)
+            panic("Unable to reserve Kdump memory\n");
+
+        /* allocate pages for relocated initial images */
+
+        k = ((initial_images_end - initial_images_start) & ~PAGE_MASK) ? 1 : 0;
+        k += (initial_images_end - initial_images_start) >> PAGE_SHIFT;
+
+        k = alloc_boot_pages(k, 1);
+
+        if (!k)
+            panic("Unable to allocate initial images memory\n");
+
+        move_memory(k << PAGE_SHIFT, initial_images_start, initial_images_end);
+
+        initial_images_end -= initial_images_start;
+        initial_images_start = k << PAGE_SHIFT;
+        initial_images_end += initial_images_start;
+    }        
+
     memguard_init();
 
     printk("System RAM: %luMB (%lukB)\n", 
--- x/xen/arch/x86/x86_32/Makefile
+++ x/xen/arch/x86/x86_32/Makefile
@@ -3,5 +3,6 @@ obj-y += entry.o
 obj-y += mm.o
 obj-y += seg_fixup.o
 obj-y += traps.o
+obj-y += machine_kexec.o
 
 obj-$(supervisor_mode_kernel) += supervisor_mode_kernel.o
--- x/xen/arch/x86/x86_32/entry.S
+++ x/xen/arch/x86/x86_32/entry.S
@@ -648,6 +648,7 @@ ENTRY(hypercall_table)
         .long do_xenoprof_op
         .long do_event_channel_op
         .long do_physdev_op
+        .long do_kexec
         .rept NR_hypercalls-((.-hypercall_table)/4)
         .long do_ni_hypercall
         .endr
@@ -687,6 +688,7 @@ ENTRY(hypercall_args_table)
         .byte 2 /* do_xenoprof_op       */
         .byte 2 /* do_event_channel_op  */
         .byte 2 /* do_physdev_op        */
+        .byte 2 /* do_kexec             */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
--- /dev/null
+++ x/xen/arch/x86/x86_32/machine_kexec.c
@@ -0,0 +1,205 @@
+/******************************************************************************
+ * arch/x86/x86_32/machine_kexec.c
+ * 
+ * Created By: Horms
+ *
+ * Based heavily on arch/i386/machine_kexec.c from Linux 2.6.16
+ */
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/domain_page.h> 
+#include <xen/timer.h>
+#include <xen/sched.h>
+#include <xen/reboot.h>
+#include <xen/console.h>
+#include <asm/page.h> 
+#include <asm/flushtlb.h>
+#include <public/kexec.h>
+
+static void __machine_kexec(struct kexec_arg *arg);
+
+typedef asmlinkage void (*relocate_new_kernel_t)(
+                    unsigned long indirection_page,
+                    unsigned long reboot_code_buffer,
+                    unsigned long start_address,
+                    unsigned int has_pae);
+
+#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+
+#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L2_ATTR (_PAGE_PRESENT)
+
+#ifndef CONFIG_X86_PAE
+
+static u32 pgtable_level1[L1_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+
+static void identity_map_page(unsigned long address)
+{
+    unsigned long mfn;
+    u32 *pgtable_level2;
+
+    /* Find the current page table */
+    mfn = read_cr3() >> PAGE_SHIFT;
+    pgtable_level2 = map_domain_page(mfn);
+
+    /* Identity map the page table entry */
+    pgtable_level1[l1_table_offset(address)] = address | L0_ATTR;
+    pgtable_level2[l2_table_offset(address)] = __pa(pgtable_level1) | L1_ATTR;
+
+    /* Flush the tlb so the new mapping takes effect.
+     * Global tlb entries are not flushed but that is not an issue.
+     */
+    write_cr3(mfn << PAGE_SHIFT);
+
+    unmap_domain_page(pgtable_level2);
+}
+
+#else
+static u64 pgtable_level1[L1_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+static u64 pgtable_level2[L2_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+
+static void identity_map_page(unsigned long address)
+{
+    int mfn;
+    intpte_t *pgtable_level3;
+
+    /* Find the current page table */
+    mfn = read_cr3() >> PAGE_SHIFT;
+    pgtable_level3 = map_domain_page(mfn);
+
+    /* Identity map the page table entry */
+    pgtable_level1[l1_table_offset(address)] = address | L0_ATTR;
+    pgtable_level2[l2_table_offset(address)] = __pa(pgtable_level1) | L1_ATTR;
+    set_64bit(&pgtable_level3[l3_table_offset(address)],
+	      __pa(pgtable_level2) | L2_ATTR);
+
+    /* Flush the tlb so the new mapping takes effect.
+     * Global tlb entries are not flushed but that is not an issue.
+     */
+    load_cr3(mfn << PAGE_SHIFT);
+
+    unmap_domain_page(pgtable_level3);
+}
+#endif
+
+static void kexec_load_segments(void)
+{
+#define __SSTR(X) #X
+#define SSTR(X) __SSTR(X)
+    __asm__ __volatile__ (
+        "\tljmp $"SSTR(__HYPERVISOR_CS)",$1f\n"
+        "\t1:\n"
+        "\tmovl $"SSTR(__HYPERVISOR_DS)",%%eax\n"
+        "\tmovl %%eax,%%ds\n"
+        "\tmovl %%eax,%%es\n"
+        "\tmovl %%eax,%%fs\n"
+        "\tmovl %%eax,%%gs\n"
+        "\tmovl %%eax,%%ss\n"
+        ::: "eax", "memory");
+#undef SSTR
+#undef __SSTR
+}
+
+#define kexec_load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
+static void kexec_set_idt(void *newidt, __u16 limit)
+{
+    struct Xgt_desc_struct curidt;
+
+    /* ia32 supports unaligned loads & stores */
+    curidt.size    = limit;
+    curidt.address = (unsigned long)newidt;
+    
+    kexec_load_idt(&curidt);
+
+};
+
+#define kexec_load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
+static void kexec_set_gdt(void *newgdt, __u16 limit)
+{
+    struct Xgt_desc_struct curgdt;
+
+    /* ia32 supports unaligned loads & stores */
+    curgdt.size    = limit;
+    curgdt.address = (unsigned long)newgdt;
+
+    kexec_load_gdt(&curgdt);
+};
+
+static void __machine_shutdown(void *data)
+{
+    struct kexec_arg *arg = (struct kexec_arg *)data;
+
+    printk("__machine_shutdown: cpu=%u\n", smp_processor_id());
+
+    watchdog_disable();
+    console_start_sync();
+
+    smp_send_stop();
+
+#ifdef CONFIG_X86_IO_APIC
+    disable_IO_APIC();
+#endif   
+
+    __machine_kexec(arg);
+}
+
+void machine_shutdown(struct kexec_arg *arg)
+{
+    int reboot_cpu_id;
+    cpumask_t reboot_cpu;
+
+
+    reboot_cpu_id = 0;
+
+    if (!cpu_isset(reboot_cpu_id, cpu_online_map))
+        reboot_cpu_id = smp_processor_id();
+    
+    if (reboot_cpu_id != smp_processor_id()) {
+        cpus_clear(reboot_cpu);
+        cpu_set(reboot_cpu_id, reboot_cpu);
+        on_selected_cpus(reboot_cpu, __machine_shutdown, arg, 1, 0);
+	for (;;)
+		; /* nothing */
+    }
+    else
+        __machine_shutdown(arg);
+    BUG();
+}
+
+static void __machine_kexec(struct kexec_arg *arg)
+{
+    relocate_new_kernel_t rnk;
+
+    local_irq_disable();
+
+    identity_map_page(arg->u.kexec.reboot_code_buffer);
+
+    copy_from_user((void *)arg->u.kexec.reboot_code_buffer, 
+           arg->u.kexec.relocate_new_kernel,
+           arg->u.kexec.relocate_new_kernel_size);
+
+    kexec_load_segments();
+    kexec_set_gdt(__va(0),0);
+    kexec_set_idt(__va(0),0);
+
+    rnk = (relocate_new_kernel_t) arg->u.kexec.reboot_code_buffer;
+    (*rnk)(arg->u.kexec.indirection_page, arg->u.kexec.reboot_code_buffer, 
+           arg->u.kexec.start_address, cpu_has_pae);
+}
+
+void machine_kexec(struct kexec_arg *arg)
+{
+    machine_shutdown(arg);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/arch/x86/x86_64/Makefile
+++ x/xen/arch/x86/x86_64/Makefile
@@ -1,3 +1,4 @@
 obj-y += entry.o
 obj-y += mm.o
 obj-y += traps.o
+obj-y += machine_kexec.o
--- /dev/null
+++ x/xen/arch/x86/x86_64/machine_kexec.c
@@ -0,0 +1,25 @@
+/******************************************************************************
+ * arch/x86/x86_64/machine_kexec.c
+ * 
+ * Created By: Horms
+ *
+ * Based heavily on arch/i386/machine_kexec.c from Linux 2.6.16
+ */
+
+#include <xen/types.h>
+#include <public/kexec.h>
+
+void machine_kexec(struct kexec_arg *arg)
+{
+    printk("machine_kexec: not implemented\n");
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/common/Makefile
+++ x/xen/common/Makefile
@@ -7,6 +7,7 @@ obj-y += event_channel.o
 obj-y += grant_table.o
 obj-y += kernel.o
 obj-y += keyhandler.o
+obj-y += kexec.o
 obj-y += lib.o
 obj-y += memory.o
 obj-y += multicall.o
--- /dev/null
+++ x/xen/common/kexec.c
@@ -0,0 +1,73 @@
+/*
+ * Architecture independent kexec code for Xen
+ *
+ * At this stage, just a switch for the kexec hypercall into
+ * architecture dependent code.
+ *
+ * Created By: Horms <horms@verge.net.au>
+ */
+
+#include <xen/lib.h>
+#include <xen/errno.h>
+#include <xen/guest_access.h>
+#include <xen/sched.h>
+#include <xen/types.h>
+#include <public/kexec.h>
+
+extern int machine_kexec_prepare(struct kexec_arg *arg);
+extern void machine_kexec_cleanup(struct kexec_arg *arg);
+extern void machine_kexec(struct kexec_arg *arg);
+
+extern unsigned int opt_kdump_megabytes;
+extern unsigned int opt_kdump_megabytes_base;
+
+int do_kexec(unsigned long op, 
+             XEN_GUEST_HANDLE(kexec_arg_t) uarg)
+{
+    struct kexec_arg arg;
+
+    if ( !IS_PRIV(current->domain) )  
+        return -EPERM;
+
+    if (op == KEXEC_CMD_reserve)
+    {
+	arg.u.reserve.size = opt_kdump_megabytes << 20;
+	arg.u.reserve.start = opt_kdump_megabytes_base << 20;
+	if ( unlikely(copy_to_guest(uarg, &arg, 1) != 0) )
+	{
+		printk("do_kexec: copy_to_guest failed");
+		return -EFAULT;
+	}
+	return 0;
+    }
+
+    if ( unlikely(copy_from_guest(&arg, uarg, 1) != 0) )
+    {
+        printk("do_kexec: __copy_from_guest failed");
+        return -EFAULT;
+    }
+
+    switch(op) {
+    case KEXEC_CMD_kexec:
+        machine_kexec(&arg);
+        return -EINVAL; /* Not Reached */
+    case KEXEC_CMD_kexec_prepare:
+        return machine_kexec_prepare(&arg);
+    case KEXEC_CMD_kexec_cleanup:
+        machine_kexec_cleanup(&arg);
+        return 0;
+    }
+
+    return -EINVAL;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
--- x/xen/common/page_alloc.c
+++ x/xen/common/page_alloc.c
@@ -212,24 +212,35 @@ void init_boot_pages(paddr_t ps, paddr_t
     }
 }
 
+unsigned long alloc_boot_pages_at(unsigned long nr_pfns, unsigned long pfn_at)
+{
+    unsigned long i;
+
+    for ( i = 0; i < nr_pfns; i++ )
+        if ( allocated_in_map(pfn_at + i) )
+             break;
+
+    if ( i == nr_pfns )
+    {
+        map_alloc(pfn_at, nr_pfns);
+        return pfn_at;
+    }
+
+    return 0;
+}
+
 unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align)
 {
-    unsigned long pg, i;
+    unsigned long pg, i = 0;
 
     for ( pg = 0; (pg + nr_pfns) < max_page; pg += pfn_align )
     {
-        for ( i = 0; i < nr_pfns; i++ )
-            if ( allocated_in_map(pg + i) )
-                 break;
-
-        if ( i == nr_pfns )
-        {
-            map_alloc(pg, nr_pfns);
-            return pg;
-        }
+        i = alloc_boot_pages_at(nr_pfns, pg);
+        if (i != 0)
+            break;
     }
 
-    return 0;
+    return i;
 }
 
 
--- x/xen/include/asm-x86/hypercall.h
+++ x/xen/include/asm-x86/hypercall.h
@@ -6,6 +6,8 @@
 #define __ASM_X86_HYPERCALL_H__
 
 #include <public/physdev.h>
+#include <xen/types.h>
+#include <public/kexec.h>
 
 extern long
 do_event_channel_op_compat(
@@ -87,6 +89,10 @@ extern long
 arch_do_vcpu_op(
     int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg);
 
+extern int
+do_kexec(
+    unsigned long op, XEN_GUEST_HANDLE(kexec_arg_t) uarg);
+
 #ifdef __x86_64__
 
 extern long
--- /dev/null
+++ x/xen/include/public/kexec.h
@@ -0,0 +1,45 @@
+/*
+ * kexec.h: Xen kexec public
+ *
+ * Created By: Horms <horms@verge.net.au>
+ */
+
+#ifndef _XEN_PUBLIC_KEXEC_H
+#define _XEN_PUBLIC_KEXEC_H
+
+#include "xen.h"
+
+/*
+ * Scratch space for passing arguments to the kexec hypercall
+ */
+typedef struct kexec_arg {
+    union {
+        struct {
+            unsigned long data; /* Not sure what this should be yet */
+        } helper;
+        struct {
+            unsigned long indirection_page;
+            unsigned long reboot_code_buffer;
+            unsigned long start_address;
+            const char *relocate_new_kernel;
+            unsigned int relocate_new_kernel_size;
+        } kexec;
+        struct {
+            unsigned long size;
+            unsigned long start;
+        } reserve;
+    } u;
+} kexec_arg_t;
+DEFINE_XEN_GUEST_HANDLE(kexec_arg_t);
+
+#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/include/public/xen.h
+++ x/xen/include/public/xen.h
@@ -64,6 +64,7 @@
 #define __HYPERVISOR_xenoprof_op          31
 #define __HYPERVISOR_event_channel_op     32
 #define __HYPERVISOR_physdev_op           33
+#define __HYPERVISOR_kexec_op             34
 
 /* Architecture-specific hypercall definitions. */
 #define __HYPERVISOR_arch_0               48
@@ -238,6 +239,14 @@ DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
 #define VMASST_TYPE_writable_pagetables  2
 #define MAX_VMASST_TYPE 2
 
+/*
+ * Operations for kexec.
+ */
+#define KEXEC_CMD_kexec                 0
+#define KEXEC_CMD_kexec_prepare         1
+#define KEXEC_CMD_kexec_cleanup         2
+#define KEXEC_CMD_reserve               3
+
 #ifndef __ASSEMBLY__
 
 typedef uint16_t domid_t;
--- x/xen/include/xen/mm.h
+++ x/xen/include/xen/mm.h
@@ -40,6 +40,7 @@ struct page_info;
 paddr_t init_boot_allocator(paddr_t bitmap_start);
 void init_boot_pages(paddr_t ps, paddr_t pe);
 unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align);
+unsigned long alloc_boot_pages_at(unsigned long nr_pfns, unsigned long pfn_at);
 void end_boot_allocator(void);
 
 /* Generic allocator. These functions are *not* interrupt-safe. */
--- /dev/null	2006-05-08 18:31:14.283785672 +0900
+++ x/linux-2.6.16.13/kexec.patch	2006-05-09 10:19:10.000000000 +0900
@@ -0,0 +1,175 @@
+--- x/ref-linux-2.6.16.13/drivers/base/cpu.c
++++ x/ref-linux-2.6.16.13/drivers/base/cpu.c
+@@ -101,7 +101,11 @@ static ssize_t show_crash_notes(struct s
+ 	 * boot up and this data does not change there after. Hence this
+ 	 * operation should be safe. No locking required.
+ 	 */
++#ifndef CONFIG_XEN
+ 	addr = __pa(per_cpu_ptr(crash_notes, cpunum));
++#else
++	addr = virt_to_machine(per_cpu_ptr(crash_notes, cpunum));
++#endif
+ 	rc = sprintf(buf, "%Lx\n", addr);
+ 	return rc;
+ }
+--- x/ref-linux-2.6.16.13/kernel/kexec.c
++++ x/ref-linux-2.6.16.13/kernel/kexec.c
+@@ -38,6 +38,20 @@ struct resource crashk_res = {
+ 	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
+ };
+ 
++/* Kexec needs to know about the actual physical address.
++ * But in xen, a physical address is a pseudo-physical address. */
++#ifndef CONFIG_XEN
++#define kexec_page_to_pfn(page)  page_to_pfn(page)
++#define kexec_pfn_to_page(pfn)   pfn_to_page(pfn)
++#define kexec_virt_to_phys(addr) virt_to_phys(addr)
++#define kexec_phys_to_virt(addr) phys_to_virt(addr)
++#else
++#define kexec_page_to_pfn(page)  pfn_to_mfn(page_to_pfn(page))
++#define kexec_pfn_to_page(pfn)   pfn_to_page(mfn_to_pfn(pfn))
++#define kexec_virt_to_phys(addr) virt_to_machine(addr)
++#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
++#endif
++
+ int kexec_should_crash(struct task_struct *p)
+ {
+ 	if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
+@@ -403,7 +417,7 @@ static struct page *kimage_alloc_normal_
+ 		pages = kimage_alloc_pages(GFP_KERNEL, order);
+ 		if (!pages)
+ 			break;
+-		pfn   = page_to_pfn(pages);
++		pfn   = kexec_page_to_pfn(pages);
+ 		epfn  = pfn + count;
+ 		addr  = pfn << PAGE_SHIFT;
+ 		eaddr = epfn << PAGE_SHIFT;
+@@ -437,6 +451,7 @@ static struct page *kimage_alloc_normal_
+ 	return pages;
+ }
+ 
++#ifndef CONFIG_XEN
+ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
+ 						      unsigned int order)
+ {
+@@ -490,7 +505,7 @@ static struct page *kimage_alloc_crash_c
+ 		}
+ 		/* If I don't overlap any segments I have found my hole! */
+ 		if (i == image->nr_segments) {
+-			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
++			pages = kexec_pfn_to_page(hole_start >> PAGE_SHIFT);
+ 			break;
+ 		}
+ 	}
+@@ -517,6 +532,13 @@ struct page *kimage_alloc_control_pages(
+ 
+ 	return pages;
+ }
++#else /* !CONFIG_XEN */
++struct page *kimage_alloc_control_pages(struct kimage *image,
++					 unsigned int order)
++{
++	return kimage_alloc_normal_control_pages(image, order);
++}
++#endif
+ 
+ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+ {
+@@ -532,7 +554,7 @@ static int kimage_add_entry(struct kimag
+ 			return -ENOMEM;
+ 
+ 		ind_page = page_address(page);
+-		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
++		*image->entry = kexec_virt_to_phys(ind_page) | IND_INDIRECTION;
+ 		image->entry = ind_page;
+ 		image->last_entry = ind_page +
+ 				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+@@ -593,13 +615,13 @@ static int kimage_terminate(struct kimag
+ #define for_each_kimage_entry(image, ptr, entry) \
+ 	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+ 		ptr = (entry & IND_INDIRECTION)? \
+-			phys_to_virt((entry & PAGE_MASK)): ptr +1)
++			kexec_phys_to_virt((entry & PAGE_MASK)): ptr +1)
+ 
+ static void kimage_free_entry(kimage_entry_t entry)
+ {
+ 	struct page *page;
+ 
+-	page = pfn_to_page(entry >> PAGE_SHIFT);
++	page = kexec_pfn_to_page(entry >> PAGE_SHIFT);
+ 	kimage_free_pages(page);
+ }
+ 
+@@ -686,7 +708,7 @@ static struct page *kimage_alloc_page(st
+ 	 * have a match.
+ 	 */
+ 	list_for_each_entry(page, &image->dest_pages, lru) {
+-		addr = page_to_pfn(page) << PAGE_SHIFT;
++		addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
+ 		if (addr == destination) {
+ 			list_del(&page->lru);
+ 			return page;
+@@ -701,12 +723,12 @@ static struct page *kimage_alloc_page(st
+ 		if (!page)
+ 			return NULL;
+ 		/* If the page cannot be used file it away */
+-		if (page_to_pfn(page) >
++		if (kexec_page_to_pfn(page) >
+ 				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+ 			list_add(&page->lru, &image->unuseable_pages);
+ 			continue;
+ 		}
+-		addr = page_to_pfn(page) << PAGE_SHIFT;
++		addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
+ 
+ 		/* If it is the destination page we want use it */
+ 		if (addr == destination)
+@@ -729,7 +751,7 @@ static struct page *kimage_alloc_page(st
+ 			struct page *old_page;
+ 
+ 			old_addr = *old & PAGE_MASK;
+-			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
++			old_page = kexec_pfn_to_page(old_addr >> PAGE_SHIFT);
+ 			copy_highpage(page, old_page);
+ 			*old = addr | (*old & ~PAGE_MASK);
+ 
+@@ -779,7 +801,7 @@ static int kimage_load_normal_segment(st
+ 			result  = -ENOMEM;
+ 			goto out;
+ 		}
+-		result = kimage_add_page(image, page_to_pfn(page)
++		result = kimage_add_page(image, kexec_page_to_pfn(page)
+ 								<< PAGE_SHIFT);
+ 		if (result < 0)
+ 			goto out;
+@@ -811,6 +833,7 @@ out:
+ 	return result;
+ }
+ 
++#ifndef CONFIG_XEN
+ static int kimage_load_crash_segment(struct kimage *image,
+ 					struct kexec_segment *segment)
+ {
+@@ -833,7 +856,7 @@ static int kimage_load_crash_segment(str
+ 		char *ptr;
+ 		size_t uchunk, mchunk;
+ 
+-		page = pfn_to_page(maddr >> PAGE_SHIFT);
++		page = kexec_pfn_to_page(maddr >> PAGE_SHIFT);
+ 		if (page == 0) {
+ 			result  = -ENOMEM;
+ 			goto out;
+@@ -881,6 +904,13 @@ static int kimage_load_segment(struct ki
+ 
+ 	return result;
+ }
++#else /* CONFIG_XEN */
++static int kimage_load_segment(struct kimage *image,
++				struct kexec_segment *segment)
++{
++	return kimage_load_normal_segment(image, segment);
++}
++#endif
+ 
+ /*
+  * Exec Kernel system call: for obvious reasons only root may call it.

^ permalink raw reply	[flat|nested] 68+ messages in thread

* [PATCH]: kexec: framework and i386 (Take VIII)
  2006-05-09  4:16                           ` [PATCH]: kexec: framework and i386 (Take VII) Horms
@ 2006-05-09  9:18                             ` Horms
  2006-05-09 13:28                               ` Akio Takebe
  0 siblings, 1 reply; 68+ messages in thread
From: Horms @ 2006-05-09  9:18 UTC (permalink / raw)
  To: Akio Takebe; +Cc: Isaku Yamahata, xen-devel, Mark Williamson, Magnus Damm

On Tue, May 09, 2006 at 01:16:32PM +0900, Horms wrote:
> On Sun, May 07, 2006 at 01:45:22PM +0900, Horms wrote:
> > On Sat, May 06, 2006 at 05:44:44PM +0900, Akio Takebe wrote:
> > 
> > > I think you can make a patch in patches/linux-2.6.16/ if you would
> > > modify these.
> > 
> > Yes, that is probably the best way forward, I'll work on breaking it
> > out in that manner.
> 
> Hi Takebe-san,
> 
> here is an updated version of the patch which moves portions into
> patches/linux-2.6.16/ as you suggested. It also moves to
> xen-unstable 9969 / Linux 2.6.16.13 and has some minor build fixes,
> for problems that crept into the previous patch.

Sorry, this morning's patch had the internal patch in the wrong location
and with the wrong diff level.

-- 
Horms                                           http://www.vergenet.net/~horms/

kexec: framework and i386

This is an implementation of kexec for dom0/xen that allows
kexecing of the physical machine from xen. The approach taken is
to move the architecture-dependent kexec code into a new hypercall.

Some notes:
  * machine_kexec_cleanup() and machine_kexec_prepare() don't do
    anything in i386. So while this patch adds a framework for them,
    I am not sure what parameters are needed at this stage.
  * Only works for UP, as machine_shutdown is not implemented yet
  * kexecing into xen does not seem to work. I think that
    kexec-tools needs updating, but I have not investigated yet
  * Kdump works by first copying the kernel into dom0 segments
    and relocating them later in xen, the same way that kexec does.
    The only difference is that the relocation is made into
    an area reserved by xen
  * Kdump reservation is made using the xen command line parameters,
    kdump_megabytes and kdump_megabytes_base, rather than
    the linux option crashkernel, which is now ignored.
    Two parameters are used instead of one to simplify parsing.
    This can be cleaned up later if desired. But the reservation
    seems to need to be made by xen to make sure that it happens
    early enough.
    The tested values are kdump_megabytes=16, kdump_megabytes_base=32
    (kdump_megabytes_base=16 does not seem to work)
  * This patch uses a new kexec hypercall
  * SMP Kexec works, Kdump is next on the list

Highlights since the previous posted version:
 
  * Diff now applies to a xen checkout from hg 
    (previously it assumed that the kernel was unpacked)
    - xen-unstable-hg 9660 / Linux 2.6.16.13
  * Added machine_shutdown, which disappeared in the previous release of
    this patch
  * Fixed include problems in kexec.h

Prepared by Horms and Magnus Damm

Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>
Signed-Off-By: Horms <horms@verge.net.au>

 buildconfigs/linux-defconfig_xen_x86_32                        |    1 
 linux-2.6-xen-sparse/arch/i386/Kconfig                         |    2 
 linux-2.6-xen-sparse/arch/i386/kernel/Makefile                 |    2 
 linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c              |   24 +
 linux-2.6-xen-sparse/drivers/xen/core/Makefile                 |    1 
 linux-2.6-xen-sparse/drivers/xen/core/crash.c                  |   98 ++++
 linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c          |   73 +++
 linux-2.6-xen-sparse/drivers/xen/core/reboot.c                 |    4 
 linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h |   10 
 patches/linux-2.6.16.13/kexec.patch                            |  175 ++++++++
 xen/arch/x86/Makefile                                          |    1 
 xen/arch/x86/dom0_ops.c                                        |    3 
 xen/arch/x86/machine_kexec.c                                   |   28 +
 xen/arch/x86/setup.c                                           |   75 +++
 xen/arch/x86/x86_32/Makefile                                   |    1 
 xen/arch/x86/x86_32/entry.S                                    |    2 
 xen/arch/x86/x86_32/machine_kexec.c                            |  205 ++++++++++
 xen/arch/x86/x86_64/Makefile                                   |    1 
 xen/arch/x86/x86_64/machine_kexec.c                            |   25 +
 xen/common/Makefile                                            |    1 
 xen/common/kexec.c                                             |   73 +++
 xen/common/page_alloc.c                                        |   33 +
 xen/include/asm-x86/hypercall.h                                |    6 
 xen/include/public/kexec.h                                     |   45 ++
 xen/include/public/xen.h                                       |    9 
 xen/include/xen/mm.h                                           |    1 
 26 files changed, 877 insertions(+), 22 deletions(-)

--- x/buildconfigs/linux-defconfig_xen_x86_32
+++ x/buildconfigs/linux-defconfig_xen_x86_32
@@ -184,6 +184,7 @@ CONFIG_MTRR=y
 CONFIG_REGPARM=y
 CONFIG_SECCOMP=y
 CONFIG_HZ_100=y
+CONFIG_KEXEC=y
 # CONFIG_HZ_250 is not set
 # CONFIG_HZ_1000 is not set
 CONFIG_HZ=100
--- x/linux-2.6-xen-sparse/arch/i386/Kconfig
+++ x/linux-2.6-xen-sparse/arch/i386/Kconfig
@@ -726,7 +726,7 @@ source kernel/Kconfig.hz
 
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && !X86_XEN
+	depends on EXPERIMENTAL
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
--- x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
+++ x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
@@ -89,7 +89,7 @@ include $(srctree)/scripts/Makefile.xen
 
 obj-y += fixup.o
 microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o
-n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o
+n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o machine_kexec.o crash.o
 
 obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
 obj-y := $(call cherrypickxen, $(obj-y))
--- x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
+++ x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
@@ -68,6 +68,10 @@
 #include "setup_arch_pre.h"
 #include <bios_ebda.h>
 
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
 /* Forward Declaration. */
 void __init find_max_pfn(void);
 
@@ -932,6 +936,7 @@ static void __init parse_cmdline_early (
 		 * after a kernel panic.
 		 */
 		else if (!memcmp(from, "crashkernel=", 12)) {
+#ifndef CONFIG_XEN
 			unsigned long size, base;
 			size = memparse(from+12, &from);
 			if (*from == '@') {
@@ -942,6 +947,10 @@ static void __init parse_cmdline_early (
 				crashk_res.start = base;
 				crashk_res.end   = base + size - 1;
 			}
+#else
+			printk("Ignoring crashkernel command line, "
+			       "parameter will be supplied by xen\n");
+#endif
 		}
 #endif
 #ifdef CONFIG_PROC_VMCORE
@@ -1318,9 +1327,21 @@ void __init setup_bootmem_allocator(void
 	}
 #endif
 #ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
 	if (crashk_res.start != crashk_res.end)
 		reserve_bootmem(crashk_res.start,
 			crashk_res.end - crashk_res.start + 1);
+#else
+	{
+		struct kexec_arg xen_kexec_arg;
+		BUG_ON(HYPERVISOR_kexec(KEXEC_CMD_reserve, &xen_kexec_arg));
+		if (xen_kexec_arg.u.reserve.size) {
+			crashk_res.start = xen_kexec_arg.u.reserve.start;
+			crashk_res.end = xen_kexec_arg.u.reserve.start + 
+				xen_kexec_arg.u.reserve.size - 1;
+		}
+	}
+#endif
 #endif
 
 	if (!xen_feature(XENFEAT_auto_translated_physmap))
@@ -1395,6 +1416,9 @@ legacy_init_iomem_resources(struct resou
 		res->end = map[i].end - 1;
 		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 		request_resource(&iomem_resource, res);
+#ifdef CONFIG_KEXEC
+        request_resource(res, &crashk_res);
+#endif
 	}
 
 	free_bootmem(__pa(map), PAGE_SIZE);
--- x/linux-2.6-xen-sparse/drivers/xen/core/Makefile
+++ x/linux-2.6-xen-sparse/drivers/xen/core/Makefile
@@ -9,3 +9,4 @@ obj-$(CONFIG_NET)     += skbuff.o
 obj-$(CONFIG_SMP)     += smpboot.o
 obj-$(CONFIG_SYSFS)   += hypervisor_sysfs.o
 obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o
+obj-$(CONFIG_KEXEC)   += machine_kexec.o crash.o
--- /dev/null
+++ x/linux-2.6-xen-sparse/drivers/xen/core/crash.c
@@ -0,0 +1,98 @@
+/*
+ * Architecture specific (i386-xen) functions for kexec based crash dumps.
+ *
+ * Created by: Horms <horms@verge.net.au>
+ *
+ */
+
+#include <linux/kernel.h> /* For printk */
+
+/* XXX: final_note(), crash_save_this_cpu() and crash_save_self()
+ * are copied from arch/i386/kernel/crash.c, might be good to either
+ * make the original functions non-static and use them, or just
+ * merge this into that file.
+ */
+#include <linux/elf.h>     /* For struct elf_note */
+#include <linux/elfcore.h> /* For struct elf_prstatus */
+#include <linux/kexec.h>   /* crash_notes */
+
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+							       size_t data_len)
+{
+	struct elf_note note;
+
+	note.n_namesz = strlen(name) + 1;
+	note.n_descsz = data_len;
+	note.n_type   = type;
+	memcpy(buf, &note, sizeof(note));
+	buf += (sizeof(note) +3)/4;
+	memcpy(buf, name, note.n_namesz);
+	buf += (note.n_namesz + 3)/4;
+	memcpy(buf, data, note.n_descsz);
+	buf += (note.n_descsz + 3)/4;
+
+	return buf;
+}
+
+static void final_note(u32 *buf)
+{
+	struct elf_note note;
+
+	note.n_namesz = 0;
+	note.n_descsz = 0;
+	note.n_type   = 0;
+	memcpy(buf, &note, sizeof(note));
+}
+
+static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
+{
+	struct elf_prstatus prstatus;
+	u32 *buf;
+
+	if ((cpu < 0) || (cpu >= NR_CPUS))
+		return;
+
+	/* Using ELF notes here is opportunistic.
+	 * I need a well defined structure format
+	 * for the data I pass, and I need tags
+	 * on the data to indicate what information I have
+	 * squirrelled away.  ELF notes happen to provide
+	 * all of that that no need to invent something new.
+	 */
+	buf = (u32*)per_cpu_ptr(crash_notes, cpu);
+	if (!buf)
+		return;
+	memset(&prstatus, 0, sizeof(prstatus));
+	prstatus.pr_pid = current->pid;
+	elf_core_copy_regs(&prstatus.pr_reg, regs);
+	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
+				sizeof(prstatus));
+	final_note(buf);
+}
+
+static void crash_save_self(struct pt_regs *regs)
+{
+	int cpu;
+
+	cpu = smp_processor_id();
+	crash_save_this_cpu(regs, cpu);
+}
+
+
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+	/* XXX: This should do something */
+	printk("xen-kexec: Need to turn of other CPUS in "
+	       "machine_crash_shutdown()\n");
+	crash_save_self(regs);
+}
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- /dev/null
+++ x/linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c
@@ -0,0 +1,73 @@
+/*
+ * machine_kexec.c - handle transition of Linux booting another kernel
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Loosely based on arch/i386/kernel/machine_kexec.c
+ */
+
+#include <linux/kexec.h>
+#include <xen/interface/kexec.h>
+#include <linux/mm.h>
+#include <asm/hypercall.h>
+
+const extern unsigned char relocate_new_kernel[];
+extern unsigned int relocate_new_kernel_size;
+
+/*
+ * An architecture hook called to validate the
+ * proposed image and prepare the control pages
+ * as needed.  The pages for KEXEC_CONTROL_CODE_SIZE
+ * have been allocated, but the segments have yet
+ * to be copied into the kernel.
+ *
+ * Do whatever setup is needed on image and the
+ * reboot code buffer to allow us to avoid allocations
+ * later.
+ *
+ * Currently nothing.
+ */
+int machine_kexec_prepare(struct kimage *image)
+{
+	kexec_arg_t hypercall_arg;
+       	hypercall_arg.u.helper.data = NULL;
+	return HYPERVISOR_kexec(KEXEC_CMD_kexec_prepare, &hypercall_arg);
+}
+
+/*
+ * Undo anything leftover by machine_kexec_prepare
+ * when an image is freed.
+ */
+void machine_kexec_cleanup(struct kimage *image)
+{
+	kexec_arg_t hypercall_arg;
+	hypercall_arg.u.helper.data = NULL;
+	HYPERVISOR_kexec(KEXEC_CMD_kexec_cleanup, &hypercall_arg);
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+	kexec_arg_t hypercall_arg;
+	hypercall_arg.u.kexec.indirection_page = image->head;
+	hypercall_arg.u.kexec.reboot_code_buffer = 
+		pfn_to_mfn(page_to_pfn(image->control_code_page)) << PAGE_SHIFT;
+	hypercall_arg.u.kexec.start_address = image->start;
+	hypercall_arg.u.kexec.relocate_new_kernel = relocate_new_kernel;
+	hypercall_arg.u.kexec.relocate_new_kernel_size = 
+		relocate_new_kernel_size;
+	HYPERVISOR_kexec(KEXEC_CMD_kexec, &hypercall_arg);
+}
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- x/linux-2.6-xen-sparse/drivers/xen/core/reboot.c
+++ x/linux-2.6-xen-sparse/drivers/xen/core/reboot.c
@@ -66,6 +66,10 @@ void machine_power_off(void)
 	HYPERVISOR_shutdown(SHUTDOWN_poweroff);
 }
 
+#ifdef CONFIG_KEXEC
+void machine_shutdown(void) { }
+#endif
+
 int reboot_thru_bios = 0;	/* for dmi_scan.c */
 EXPORT_SYMBOL(machine_restart);
 EXPORT_SYMBOL(machine_halt);
--- x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
+++ x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
@@ -39,6 +39,8 @@
 # error "please don't include this file directly"
 #endif
 
+#include <xen/interface/kexec.h>
+
 #define __STR(x) #x
 #define STR(x) __STR(x)
 
@@ -359,6 +361,14 @@ HYPERVISOR_xenoprof_op(
 	return _hypercall2(int, xenoprof_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_kexec(
+	unsigned long op, kexec_arg_t * arg)
+{
+	return _hypercall2(int, kexec_op, op, arg); 
+}
+
+
 
 #endif /* __HYPERCALL_H__ */
 
--- x/xen/arch/x86/Makefile
+++ x/xen/arch/x86/Makefile
@@ -39,6 +39,7 @@ obj-y += trampoline.o
 obj-y += traps.o
 obj-y += usercopy.o
 obj-y += x86_emulate.o
+obj-y += machine_kexec.o
 
 ifneq ($(pae),n)
 obj-$(x86_32) += shadow.o shadow_public.o shadow_guest32.o
--- x/xen/arch/x86/dom0_ops.c
+++ x/xen/arch/x86/dom0_ops.c
@@ -29,6 +29,9 @@
 #include <asm/mtrr.h>
 #include "cpu/mtrr/mtrr.h"
 
+extern unsigned int opt_kdump_megabytes;
+extern unsigned int opt_kdump_megabytes_base;
+
 #define TRC_DOM0OP_ENTER_BASE  0x00020000
 #define TRC_DOM0OP_LEAVE_BASE  0x00030000
 
--- /dev/null
+++ x/xen/arch/x86/machine_kexec.c
@@ -0,0 +1,28 @@
+/******************************************************************************
+ * arch/x86/machine_kexec.c
+ * 
+ * Created By: Horms
+ *
+ */
+
+#include <xen/types.h>
+#include <public/kexec.h>
+
+int machine_kexec_prepare(struct kexec_arg *arg)
+{
+	return 0;
+}
+
+void machine_kexec_cleanup(struct kexec_arg *arg)
+{
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/arch/x86/setup.c
+++ x/xen/arch/x86/setup.c
@@ -38,6 +38,11 @@ static unsigned int opt_xenheap_megabyte
 integer_param("xenheap_megabytes", opt_xenheap_megabytes);
 #endif
 
+unsigned int opt_kdump_megabytes = 0;
+integer_param("kdump_megabytes", opt_kdump_megabytes);
+unsigned int opt_kdump_megabytes_base = 0;
+integer_param("kdump_megabytes_base", opt_kdump_megabytes_base);
+
 /* opt_nosmp: If true, secondary processors are ignored. */
 static int opt_nosmp = 0;
 boolean_param("nosmp", opt_nosmp);
@@ -192,6 +197,20 @@ static void percpu_free_unused_areas(voi
                        __pa(__per_cpu_end));
 }
 
+void __init move_memory(unsigned long dst, 
+                          unsigned long src_start, unsigned long src_end)
+{
+#if defined(CONFIG_X86_32)
+    memmove((void *)dst,  /* use low mapping */
+            (void *)src_start,      /* use low mapping */
+            src_end - src_start);
+#elif defined(CONFIG_X86_64)
+    memmove(__va(dst),
+            __va(src_start),
+            src_end - src_start);
+#endif
+}
+
 void __init __start_xen(multiboot_info_t *mbi)
 {
     char __cmdline[] = "", *cmdline = __cmdline;
@@ -327,15 +346,8 @@ void __init __start_xen(multiboot_info_t
         initial_images_start = xenheap_phys_end;
     initial_images_end = initial_images_start + modules_length;
 
-#if defined(CONFIG_X86_32)
-    memmove((void *)initial_images_start,  /* use low mapping */
-            (void *)mod[0].mod_start,      /* use low mapping */
-            mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
-#elif defined(CONFIG_X86_64)
-    memmove(__va(initial_images_start),
-            __va(mod[0].mod_start),
-            mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
-#endif
+    move_memory(initial_images_start, 
+                mod[0].mod_start, mod[mbi->mods_count-1].mod_end);
 
     /* Initialise boot-time allocator with all RAM situated after modules. */
     xenheap_phys_start = init_boot_allocator(__pa(&_end));
@@ -383,6 +395,51 @@ void __init __start_xen(multiboot_info_t
 #endif
     }
 
+    if (opt_kdump_megabytes) {
+        unsigned long kdump_start, kdump_size, k;
+
+        /* mark images pages as free for now */
+
+        init_boot_pages(initial_images_start, initial_images_end);
+
+        kdump_start = opt_kdump_megabytes_base << 20;
+        kdump_size = opt_kdump_megabytes << 20;
+
+        printk("Kdump: %luMB (%lukB) at 0x%lx\n", 
+               kdump_size >> 20,
+               kdump_size >> 10,
+               kdump_start);
+
+        if ((kdump_start & ~PAGE_MASK) || (kdump_size & ~PAGE_MASK))
+            panic("Kdump parameters not page aligned\n");
+
+        kdump_start >>= PAGE_SHIFT;
+        kdump_size >>= PAGE_SHIFT;
+
+        /* allocate pages for Kdump memory area */
+
+        k = alloc_boot_pages_at(kdump_size, kdump_start);
+
+        if (k != kdump_start)
+            panic("Unable to reserve Kdump memory\n");
+
+        /* allocate pages for relocated initial images */
+
+        k = ((initial_images_end - initial_images_start) & ~PAGE_MASK) ? 1 : 0;
+        k += (initial_images_end - initial_images_start) >> PAGE_SHIFT;
+
+        k = alloc_boot_pages(k, 1);
+
+        if (!k)
+            panic("Unable to allocate initial images memory\n");
+
+        move_memory(k << PAGE_SHIFT, initial_images_start, initial_images_end);
+
+        initial_images_end -= initial_images_start;
+        initial_images_start = k << PAGE_SHIFT;
+        initial_images_end += initial_images_start;
+    }        
+
     memguard_init();
 
     printk("System RAM: %luMB (%lukB)\n", 
--- x/xen/arch/x86/x86_32/Makefile
+++ x/xen/arch/x86/x86_32/Makefile
@@ -3,5 +3,6 @@ obj-y += entry.o
 obj-y += mm.o
 obj-y += seg_fixup.o
 obj-y += traps.o
+obj-y += machine_kexec.o
 
 obj-$(supervisor_mode_kernel) += supervisor_mode_kernel.o
--- x/xen/arch/x86/x86_32/entry.S
+++ x/xen/arch/x86/x86_32/entry.S
@@ -648,6 +648,7 @@ ENTRY(hypercall_table)
         .long do_xenoprof_op
         .long do_event_channel_op
         .long do_physdev_op
+        .long do_kexec
         .rept NR_hypercalls-((.-hypercall_table)/4)
         .long do_ni_hypercall
         .endr
@@ -687,6 +688,7 @@ ENTRY(hypercall_args_table)
         .byte 2 /* do_xenoprof_op       */
         .byte 2 /* do_event_channel_op  */
         .byte 2 /* do_physdev_op        */
+        .byte 2 /* do_kexec             */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
--- /dev/null
+++ x/xen/arch/x86/x86_32/machine_kexec.c
@@ -0,0 +1,205 @@
+/******************************************************************************
+ * arch/x86/x86_32/machine_kexec.c
+ * 
+ * Created By: Horms
+ *
+ * Based heavily on arch/i386/machine_kexec.c from Linux 2.6.16
+ */
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/domain_page.h> 
+#include <xen/timer.h>
+#include <xen/sched.h>
+#include <xen/reboot.h>
+#include <xen/console.h>
+#include <asm/page.h> 
+#include <asm/flushtlb.h>
+#include <public/kexec.h>
+
+static void __machine_kexec(struct kexec_arg *arg);
+
+typedef asmlinkage void (*relocate_new_kernel_t)(
+                    unsigned long indirection_page,
+                    unsigned long reboot_code_buffer,
+                    unsigned long start_address,
+                    unsigned int has_pae);
+
+#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+
+#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L2_ATTR (_PAGE_PRESENT)
+
+#ifndef CONFIG_X86_PAE
+
+static u32 pgtable_level1[L1_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+
+static void identity_map_page(unsigned long address)
+{
+    unsigned long mfn;
+    u32 *pgtable_level2;
+
+    /* Find the current page table */
+    mfn = read_cr3() >> PAGE_SHIFT;
+    pgtable_level2 = map_domain_page(mfn);
+
+    /* Identity map the page table entry */
+    pgtable_level1[l1_table_offset(address)] = address | L0_ATTR;
+    pgtable_level2[l2_table_offset(address)] = __pa(pgtable_level1) | L1_ATTR;
+
+    /* Flush the tlb so the new mapping takes effect.
+     * Global tlb entries are not flushed but that is not an issue.
+     */
+    write_cr3(mfn << PAGE_SHIFT);
+
+    unmap_domain_page(pgtable_level2);
+}
+
+#else
+static u64 pgtable_level1[L1_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+static u64 pgtable_level2[L2_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+
+static void identity_map_page(unsigned long address)
+{
+    int mfn;
+    intpte_t *pgtable_level3;
+
+    /* Find the current page table */
+    mfn = read_cr3() >> PAGE_SHIFT;
+    pgtable_level3 = map_domain_page(mfn);
+
+    /* Identity map the page table entry */
+    pgtable_level1[l1_table_offset(address)] = address | L0_ATTR;
+    pgtable_level2[l2_table_offset(address)] = __pa(pgtable_level1) | L1_ATTR;
+    set_64bit(&pgtable_level3[l3_table_offset(address)],
+	      __pa(pgtable_level2) | L2_ATTR);
+
+    /* Flush the tlb so the new mapping takes effect.
+     * Global tlb entries are not flushed but that is not an issue.
+     */
+    load_cr3(mfn << PAGE_SHIFT);
+
+    unmap_domain_page(pgtable_level3);
+}
+#endif
+
+static void kexec_load_segments(void)
+{
+#define __SSTR(X) #X
+#define SSTR(X) __SSTR(X)
+    __asm__ __volatile__ (
+        "\tljmp $"SSTR(__HYPERVISOR_CS)",$1f\n"
+        "\t1:\n"
+        "\tmovl $"SSTR(__HYPERVISOR_DS)",%%eax\n"
+        "\tmovl %%eax,%%ds\n"
+        "\tmovl %%eax,%%es\n"
+        "\tmovl %%eax,%%fs\n"
+        "\tmovl %%eax,%%gs\n"
+        "\tmovl %%eax,%%ss\n"
+        ::: "eax", "memory");
+#undef SSTR
+#undef __SSTR
+}
+
+#define kexec_load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
+static void kexec_set_idt(void *newidt, __u16 limit)
+{
+    struct Xgt_desc_struct curidt;
+
+    /* ia32 supports unaliged loads & stores */
+    curidt.size    = limit;
+    curidt.address = (unsigned long)newidt;
+    
+    kexec_load_idt(&curidt);
+
+};
+
+#define kexec_load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
+static void kexec_set_gdt(void *newgdt, __u16 limit)
+{
+    struct Xgt_desc_struct curgdt;
+
+    /* ia32 supports unaligned loads & stores */
+    curgdt.size    = limit;
+    curgdt.address = (unsigned long)newgdt;
+
+    kexec_load_gdt(&curgdt);
+};
+
+static void __machine_shutdown(void *data)
+{
+    struct kexec_arg *arg = (struct kexec_arg *)data;
+
+    printk("__machine_shutdown: cpu=%u\n", smp_processor_id());
+
+    watchdog_disable();
+    console_start_sync();
+
+    smp_send_stop();
+
+#ifdef CONFIG_X86_IO_APIC
+    disable_IO_APIC();
+#endif   
+
+    __machine_kexec(arg);
+}
+
+void machine_shutdown(struct kexec_arg *arg)
+{
+    int reboot_cpu_id;
+    cpumask_t reboot_cpu;
+
+
+    reboot_cpu_id = 0;
+
+    if (!cpu_isset(reboot_cpu_id, cpu_online_map))
+        reboot_cpu_id = smp_processor_id();
+    
+    if (reboot_cpu_id != smp_processor_id()) {
+        cpus_clear(reboot_cpu);
+        cpu_set(reboot_cpu_id, reboot_cpu);
+        on_selected_cpus(reboot_cpu, __machine_shutdown, arg, 1, 0);
+	for (;;)
+		; /* nothing */
+    }
+    else
+        __machine_shutdown(arg);
+    BUG();
+}
+
+static void __machine_kexec(struct kexec_arg *arg)
+{
+    relocate_new_kernel_t rnk;
+
+    local_irq_disable();
+
+    identity_map_page(arg->u.kexec.reboot_code_buffer);
+
+    copy_from_user((void *)arg->u.kexec.reboot_code_buffer, 
+           arg->u.kexec.relocate_new_kernel,
+           arg->u.kexec.relocate_new_kernel_size);
+
+    kexec_load_segments();
+    kexec_set_gdt(__va(0),0);
+    kexec_set_idt(__va(0),0);
+
+    rnk = (relocate_new_kernel_t) arg->u.kexec.reboot_code_buffer;
+    (*rnk)(arg->u.kexec.indirection_page, arg->u.kexec.reboot_code_buffer, 
+           arg->u.kexec.start_address, cpu_has_pae);
+}
+
+void machine_kexec(struct kexec_arg *arg)
+{
+    machine_shutdown(arg);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/arch/x86/x86_64/Makefile
+++ x/xen/arch/x86/x86_64/Makefile
@@ -1,3 +1,4 @@
 obj-y += entry.o
 obj-y += mm.o
 obj-y += traps.o
+obj-y += machine_kexec.o
--- /dev/null
+++ x/xen/arch/x86/x86_64/machine_kexec.c
@@ -0,0 +1,25 @@
+/******************************************************************************
+ * arch/x86/x86_64/machine_kexec.c
+ * 
+ * Created By: Horms
+ *
+ * Based heavily on arch/i386/machine_kexec.c from Linux 2.6.16
+ */
+
+#include <xen/types.h>
+#include <public/kexec.h>
+
+void machine_kexec(struct kexec_arg *arg)
+{
+    printk("machine_kexec: not implemented\n");
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/common/Makefile
+++ x/xen/common/Makefile
@@ -7,6 +7,7 @@ obj-y += event_channel.o
 obj-y += grant_table.o
 obj-y += kernel.o
 obj-y += keyhandler.o
+obj-y += kexec.o
 obj-y += lib.o
 obj-y += memory.o
 obj-y += multicall.o
--- /dev/null
+++ x/xen/common/kexec.c
@@ -0,0 +1,73 @@
+/*
+ * Architecture independent kexec code for Xen
+ *
+ * At this stage, just a switch for the kexec hypercall into
+ * architecture dependent code.
+ *
+ * Created By: Horms <horms@verge.net.au>
+ */
+
+#include <xen/lib.h>
+#include <xen/errno.h>
+#include <xen/guest_access.h>
+#include <xen/sched.h>
+#include <xen/types.h>
+#include <public/kexec.h>
+
+extern int machine_kexec_prepare(struct kexec_arg *arg);
+extern void machine_kexec_cleanup(struct kexec_arg *arg);
+extern void machine_kexec(struct kexec_arg *arg);
+
+extern unsigned int opt_kdump_megabytes;
+extern unsigned int opt_kdump_megabytes_base;
+
+int do_kexec(unsigned long op, 
+             XEN_GUEST_HANDLE(kexec_arg_t) uarg)
+{
+    struct kexec_arg arg;
+
+    if ( !IS_PRIV(current->domain) )  
+        return -EPERM;
+
+    if (op == KEXEC_CMD_reserve)
+    {
+	arg.u.reserve.size = opt_kdump_megabytes << 20;
+	arg.u.reserve.start = opt_kdump_megabytes_base << 20;
+	if ( unlikely(copy_to_guest(uarg, &arg, 1) != 0) )
+	{
+		printk("do_kexec: copy_to_guest failed");
+		return -EFAULT;
+	}
+	return 0;
+    }
+
+    if ( unlikely(copy_from_guest(&arg, uarg, 1) != 0) )
+    {
+        printk("do_kexec: __copy_from_guest failed");
+        return -EFAULT;
+    }
+
+    switch(op) {
+    case KEXEC_CMD_kexec:
+        machine_kexec(&arg);
+        return -EINVAL; /* Not Reached */
+    case KEXEC_CMD_kexec_prepare:
+        return machine_kexec_prepare(&arg);
+    case KEXEC_CMD_kexec_cleanup:
+        machine_kexec_cleanup(&arg);
+        return 0;
+    }
+
+    return -EINVAL;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
--- x/xen/common/page_alloc.c
+++ x/xen/common/page_alloc.c
@@ -212,24 +212,35 @@ void init_boot_pages(paddr_t ps, paddr_t
     }
 }
 
+unsigned long alloc_boot_pages_at(unsigned long nr_pfns, unsigned long pfn_at)
+{
+    unsigned long i;
+
+    for ( i = 0; i < nr_pfns; i++ )
+        if ( allocated_in_map(pfn_at + i) )
+             break;
+
+    if ( i == nr_pfns )
+    {
+        map_alloc(pfn_at, nr_pfns);
+        return pfn_at;
+    }
+
+    return 0;
+}
+
 unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align)
 {
-    unsigned long pg, i;
+    unsigned long pg, i = 0;
 
     for ( pg = 0; (pg + nr_pfns) < max_page; pg += pfn_align )
     {
-        for ( i = 0; i < nr_pfns; i++ )
-            if ( allocated_in_map(pg + i) )
-                 break;
-
-        if ( i == nr_pfns )
-        {
-            map_alloc(pg, nr_pfns);
-            return pg;
-        }
+        i = alloc_boot_pages_at(nr_pfns, pg);
+        if (i != 0)
+            break;
     }
 
-    return 0;
+    return i;
 }
 
 
--- x/xen/include/asm-x86/hypercall.h
+++ x/xen/include/asm-x86/hypercall.h
@@ -6,6 +6,8 @@
 #define __ASM_X86_HYPERCALL_H__
 
 #include <public/physdev.h>
+#include <xen/types.h>
+#include <public/kexec.h>
 
 extern long
 do_event_channel_op_compat(
@@ -87,6 +89,10 @@ extern long
 arch_do_vcpu_op(
     int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg);
 
+extern int
+do_kexec(
+    unsigned long op, XEN_GUEST_HANDLE(kexec_arg_t) uarg);
+
 #ifdef __x86_64__
 
 extern long
--- /dev/null
+++ x/xen/include/public/kexec.h
@@ -0,0 +1,45 @@
+/*
+ * kexec.h: Xen kexec public
+ *
+ * Created By: Horms <horms@verge.net.au>
+ */
+
+#ifndef _XEN_PUBLIC_KEXEC_H
+#define _XEN_PUBLIC_KEXEC_H
+
+#include "xen.h"
+
+/*
+ * Scratch space for passing arguments to the kexec hypercall
+ */
+typedef struct kexec_arg {
+    union {
+        struct {
+            unsigned long data; /* Not sure what this should be yet */
+        } helper;
+        struct {
+            unsigned long indirection_page;
+            unsigned long reboot_code_buffer;
+            unsigned long start_address;
+            const char *relocate_new_kernel;
+            unsigned int relocate_new_kernel_size;
+        } kexec;
+        struct {
+            unsigned long size;
+            unsigned long start;
+        } reserve;
+    } u;
+} kexec_arg_t;
+DEFINE_XEN_GUEST_HANDLE(kexec_arg_t);
+
+#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/include/public/xen.h
+++ x/xen/include/public/xen.h
@@ -64,6 +64,7 @@
 #define __HYPERVISOR_xenoprof_op          31
 #define __HYPERVISOR_event_channel_op     32
 #define __HYPERVISOR_physdev_op           33
+#define __HYPERVISOR_kexec_op             34
 
 /* Architecture-specific hypercall definitions. */
 #define __HYPERVISOR_arch_0               48
@@ -238,6 +239,14 @@ DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
 #define VMASST_TYPE_writable_pagetables  2
 #define MAX_VMASST_TYPE 2
 
+/*
+ * Operations for kexec.
+ */
+#define KEXEC_CMD_kexec                 0
+#define KEXEC_CMD_kexec_prepare         1
+#define KEXEC_CMD_kexec_cleanup         2
+#define KEXEC_CMD_reserve               3
+
 #ifndef __ASSEMBLY__
 
 typedef uint16_t domid_t;
--- x/xen/include/xen/mm.h
+++ x/xen/include/xen/mm.h
@@ -40,6 +40,7 @@ struct page_info;
 paddr_t init_boot_allocator(paddr_t bitmap_start);
 void init_boot_pages(paddr_t ps, paddr_t pe);
 unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align);
+unsigned long alloc_boot_pages_at(unsigned long nr_pfns, unsigned long pfn_at);
 void end_boot_allocator(void);
 
 /* Generic allocator. These functions are *not* interrupt-safe. */
--- /dev/null	2006-05-09 15:32:30.399072192 +0900
+++ x/patches/linux-2.6.16.13/kexec.patch	2006-05-09 18:03:46.000000000 +0900
@@ -0,0 +1,175 @@
+--- x/drivers/base/cpu.c
++++ x/drivers/base/cpu.c
+@@ -101,7 +101,11 @@ static ssize_t show_crash_notes(struct s
+ 	 * boot up and this data does not change there after. Hence this
+ 	 * operation should be safe. No locking required.
+ 	 */
++#ifndef CONFIG_XEN
+ 	addr = __pa(per_cpu_ptr(crash_notes, cpunum));
++#else
++	addr = virt_to_machine(per_cpu_ptr(crash_notes, cpunum));
++#endif
+ 	rc = sprintf(buf, "%Lx\n", addr);
+ 	return rc;
+ }
+--- x/kernel/kexec.c
++++ x/kernel/kexec.c
+@@ -38,6 +38,20 @@ struct resource crashk_res = {
+ 	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
+ };
+ 
++/* Kexec needs to know about the actual physical address.
++ * But in xen, a physical address is a pseudo-physical address. */
++#ifndef CONFIG_XEN
++#define kexec_page_to_pfn(page)  page_to_pfn(page)
++#define kexec_pfn_to_page(pfn)   pfn_to_page(pfn)
++#define kexec_virt_to_phys(addr) virt_to_phys(addr)
++#define kexec_phys_to_virt(addr) phys_to_virt(addr)
++#else
++#define kexec_page_to_pfn(page)  pfn_to_mfn(page_to_pfn(page))
++#define kexec_pfn_to_page(pfn)   pfn_to_page(mfn_to_pfn(pfn))
++#define kexec_virt_to_phys(addr) virt_to_machine(addr)
++#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
++#endif
++
+ int kexec_should_crash(struct task_struct *p)
+ {
+ 	if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
+@@ -403,7 +417,7 @@ static struct page *kimage_alloc_normal_
+ 		pages = kimage_alloc_pages(GFP_KERNEL, order);
+ 		if (!pages)
+ 			break;
+-		pfn   = page_to_pfn(pages);
++		pfn   = kexec_page_to_pfn(pages);
+ 		epfn  = pfn + count;
+ 		addr  = pfn << PAGE_SHIFT;
+ 		eaddr = epfn << PAGE_SHIFT;
+@@ -437,6 +451,7 @@ static struct page *kimage_alloc_normal_
+ 	return pages;
+ }
+ 
++#ifndef CONFIG_XEN
+ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
+ 						      unsigned int order)
+ {
+@@ -490,7 +505,7 @@ static struct page *kimage_alloc_crash_c
+ 		}
+ 		/* If I don't overlap any segments I have found my hole! */
+ 		if (i == image->nr_segments) {
+-			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
++			pages = kexec_pfn_to_page(hole_start >> PAGE_SHIFT);
+ 			break;
+ 		}
+ 	}
+@@ -517,6 +532,13 @@ struct page *kimage_alloc_control_pages(
+ 
+ 	return pages;
+ }
++#else /* CONFIG_XEN */
++struct page *kimage_alloc_control_pages(struct kimage *image,
++					 unsigned int order)
++{
++	return kimage_alloc_normal_control_pages(image, order);
++}
++#endif
+ 
+ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+ {
+@@ -532,7 +554,7 @@ static int kimage_add_entry(struct kimag
+ 			return -ENOMEM;
+ 
+ 		ind_page = page_address(page);
+-		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
++		*image->entry = kexec_virt_to_phys(ind_page) | IND_INDIRECTION;
+ 		image->entry = ind_page;
+ 		image->last_entry = ind_page +
+ 				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+@@ -593,13 +615,13 @@ static int kimage_terminate(struct kimag
+ #define for_each_kimage_entry(image, ptr, entry) \
+ 	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+ 		ptr = (entry & IND_INDIRECTION)? \
+-			phys_to_virt((entry & PAGE_MASK)): ptr +1)
++			kexec_phys_to_virt((entry & PAGE_MASK)): ptr +1)
+ 
+ static void kimage_free_entry(kimage_entry_t entry)
+ {
+ 	struct page *page;
+ 
+-	page = pfn_to_page(entry >> PAGE_SHIFT);
++	page = kexec_pfn_to_page(entry >> PAGE_SHIFT);
+ 	kimage_free_pages(page);
+ }
+ 
+@@ -686,7 +708,7 @@ static struct page *kimage_alloc_page(st
+ 	 * have a match.
+ 	 */
+ 	list_for_each_entry(page, &image->dest_pages, lru) {
+-		addr = page_to_pfn(page) << PAGE_SHIFT;
++		addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
+ 		if (addr == destination) {
+ 			list_del(&page->lru);
+ 			return page;
+@@ -701,12 +723,12 @@ static struct page *kimage_alloc_page(st
+ 		if (!page)
+ 			return NULL;
+ 		/* If the page cannot be used file it away */
+-		if (page_to_pfn(page) >
++		if (kexec_page_to_pfn(page) >
+ 				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+ 			list_add(&page->lru, &image->unuseable_pages);
+ 			continue;
+ 		}
+-		addr = page_to_pfn(page) << PAGE_SHIFT;
++		addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
+ 
+ 		/* If it is the destination page we want use it */
+ 		if (addr == destination)
+@@ -729,7 +751,7 @@ static struct page *kimage_alloc_page(st
+ 			struct page *old_page;
+ 
+ 			old_addr = *old & PAGE_MASK;
+-			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
++			old_page = kexec_pfn_to_page(old_addr >> PAGE_SHIFT);
+ 			copy_highpage(page, old_page);
+ 			*old = addr | (*old & ~PAGE_MASK);
+ 
+@@ -779,7 +801,7 @@ static int kimage_load_normal_segment(st
+ 			result  = -ENOMEM;
+ 			goto out;
+ 		}
+-		result = kimage_add_page(image, page_to_pfn(page)
++		result = kimage_add_page(image, kexec_page_to_pfn(page)
+ 								<< PAGE_SHIFT);
+ 		if (result < 0)
+ 			goto out;
+@@ -811,6 +833,7 @@ out:
+ 	return result;
+ }
+ 
++#ifndef CONFIG_XEN
+ static int kimage_load_crash_segment(struct kimage *image,
+ 					struct kexec_segment *segment)
+ {
+@@ -833,7 +856,7 @@ static int kimage_load_crash_segment(str
+ 		char *ptr;
+ 		size_t uchunk, mchunk;
+ 
+-		page = pfn_to_page(maddr >> PAGE_SHIFT);
++		page = kexec_pfn_to_page(maddr >> PAGE_SHIFT);
+ 		if (page == 0) {
+ 			result  = -ENOMEM;
+ 			goto out;
+@@ -881,6 +904,13 @@ static int kimage_load_segment(struct ki
+ 
+ 	return result;
+ }
++#else /* CONFIG_XEN */
++static int kimage_load_segment(struct kimage *image,
++				struct kexec_segment *segment)
++{
++	return kimage_load_normal_segment(image, segment);
++}
++#endif
+ 
+ /*
+  * Exec Kernel system call: for obvious reasons only root may call it.

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH]: kexec: framework and i386 (Take VIII)
  2006-05-09  9:18                             ` [PATCH]: kexec: framework and i386 (Take VIII) Horms
@ 2006-05-09 13:28                               ` Akio Takebe
  2006-05-16 10:43                                 ` Akio Takebe
  0 siblings, 1 reply; 68+ messages in thread
From: Akio Takebe @ 2006-05-09 13:28 UTC (permalink / raw)
  To: Horms; +Cc: xen-devel, Akio Takebe, Magnus Damm, Isaku Yamahata,
	Mark Williamson

>> 
>> Hi Takebe-san,
>> 
>> here is an updated version of the patch which moves portions into
>> patches/linux-2.6.16/ as you suggested. It also moves to
>> xen-unstable 9969 / Linux 2.6.16.13 and has some minor build fixes,
>> for problems that crept into the previous patch.
>
>Sorry, this morning's patch had the internal patch in the wrong location
>and with the wrong diff level.
>

Hi, Horms

Thank you for sending your new patch.
This patch compiles fine. :)
I will try it and report soon.

Best Regards

Akio Takebe

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Re: [PATCH]: kexec: framework and i386 (Take VI)
  2006-05-08  9:02                                 ` Ian Campbell
@ 2006-05-11 11:35                                   ` horms
  0 siblings, 0 replies; 68+ messages in thread
From: horms @ 2006-05-11 11:35 UTC (permalink / raw)
  To: Ian Campbell
  Cc: Isaku Yamahata, Magnus Damm, xen-devel, Akio Takebe,
	Mark Williamson

On Mon, May 08, 2006 at 10:02:37AM +0100, Ian Campbell wrote:
> I didn't get Horms' (I presume that's who is quoted below) original mail
> so I'll reply to this one.
> 
> > >Ok thanks, I haven't seen such a machine.
> > >I'll look into simulating it in software.
> 
> There is code in xen/arch/x86/nmi.c:do_nmi_trigger(). You can trigger it
> with the 'n' keyhandler.

Thanks, I will use that technique.

-- 
Horms                                           http://www.vergenet.net/~horms/

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Re: [PATCH]: kexec: framework and i386 (Take VI)
  2006-05-03  7:16                       ` Akio Takebe
  2006-05-05  1:03                         ` horms
@ 2006-05-15  8:29                         ` Akio Takebe
  1 sibling, 0 replies; 68+ messages in thread
From: Akio Takebe @ 2006-05-15  8:29 UTC (permalink / raw)
  To: Simon Horman [Horms], Keir Fraser
  Cc: Isaku Yamahata, Magnus Damm, xen-devel, Mark Williamson,
	Akio Takebe

Hi,

I tested the NMI button with your patch.
I got a coredump including all memory!
We always use dom0's coredump by using Horms's and Magnus's patch. :-)

FYI, I used the following grub.conf 
and /proc/sys/kernel/unknown_nmi_panic=1

title Xen 3.0 kexec
        root (hd0,0)
        kernel /xen-3.0.gz dom0_mem=256M kdump_megabytes=64 kdump_megabytes_base=32 nmi=dom0 nosmp
        module /vmlinuz-2.6-xen ro root=LABEL=/ rhgb nosmp
        module /initrd-2.6-xen.img

Best Regards,

Akio Takebe

>Hi, Simon and Magnus
>
>I have one question.
>When Xen panics, it seems kexec is not called.
>kexec is called only when dom0 panics.
>But in the case of nmi=dom0, can we use kexec by pushing the NMI button?
>Am I right?
>
>I'll use your patch soon, and report. :-)
>
>Best Regards,
>
>Akio Takebe
>
>
>
>
>_______________________________________________
>Xen-devel mailing list
>Xen-devel@lists.xensource.com
>http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Re: [PATCH]: kexec: framework and i386 (Take VIII)
  2006-05-09 13:28                               ` Akio Takebe
@ 2006-05-16 10:43                                 ` Akio Takebe
  2006-05-16 10:44                                   ` Keir Fraser
  0 siblings, 1 reply; 68+ messages in thread
From: Akio Takebe @ 2006-05-16 10:43 UTC (permalink / raw)
  To: Horms, Keir Fraser, Ian Pratt, Kazuo Moriwaka
  Cc: Isaku Yamahata, Magnus Damm, xen-devel, Mark Williamson,
	Akio Takebe

Hi, Keir

I tried Horms's kexec patch and Kazuo's tools.
And I could get coredump of dom0!
By using this feature, we can debug dom0 with gdbserver-xen
in the same way as domU.
I think that this is very useful.
Xen doesn't have a dump feature yet,
and this feature doesn't affect performance, stability, and so on.
We think this feature is necessary for trouble-shooting xen.

Could Keir apply this feature?
or more comments?

Best Regards,

Akio Takebe

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Re: [PATCH]: kexec: framework and i386 (Take VIII)
  2006-05-16 10:43                                 ` Akio Takebe
@ 2006-05-16 10:44                                   ` Keir Fraser
  2006-05-16 11:03                                     ` Akio Takebe
  0 siblings, 1 reply; 68+ messages in thread
From: Keir Fraser @ 2006-05-16 10:44 UTC (permalink / raw)
  To: Akio Takebe
  Cc: Ian Pratt, Kazuo Moriwaka, xen-devel, Isaku Yamahata, Magnus Damm,
	Horms, Mark Williamson


On 16 May 2006, at 11:43, Akio Takebe wrote:

> I tried Horms's kexec patch and Kazuo's tools.
> And I could get coredump of dom0!
> By using this feature, we can debug dom0 with gdbserver-xen
> to the same way as domU.
> I think that this is very useful.
> Xen don't have dump feature yet,
> and this feature don't affect performace, stability, and so on.
> We think this feature is necessary for trouble-shooting xen.
>
> Could Keir apply this feature?
> or more comments?

Can it kexec to Xen yet?

  -- Keir

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Re: [PATCH]: kexec: framework and i386 (Take VIII)
  2006-05-16 10:44                                   ` Keir Fraser
@ 2006-05-16 11:03                                     ` Akio Takebe
  2006-05-16 12:39                                       ` Keir Fraser
  0 siblings, 1 reply; 68+ messages in thread
From: Akio Takebe @ 2006-05-16 11:03 UTC (permalink / raw)
  To: Keir Fraser
  Cc: Ian Pratt, Kazuo Moriwaka, xen-devel, Akio Takebe, Isaku Yamahata,
	Magnus Damm, Horms, Mark Williamson

Hi,

No, it can kexec only to kdump kernel.
I think we need to update kexec-tools for kexecing Xen.
(e.g. to load xen,dom0 and initrd)
Am I right, Horms?

But this feature is good as a coredump feature.
Even if it cannot kexec to Xen, I believe this is an important feature.

Best Regards,

Akio Takebe

>
>On 16 May 2006, at 11:43, Akio Takebe wrote:
>
>> I tried Horms's kexec patch and Kazuo's tools.
>> And I could get coredump of dom0!
>> By using this feature, we can debug dom0 with gdbserver-xen
>> to the same way as domU.
>> I think that this is very useful.
>> Xen don't have dump feature yet,
>> and this feature don't affect performace, stability, and so on.
>> We think this feature is necessary for trouble-shooting xen.
>>
>> Could Keir apply this feature?
>> or more comments?
>
>Can it kexec to Xen yet?
>
>  -- Keir
>
>
>_______________________________________________
>Xen-devel mailing list
>Xen-devel@lists.xensource.com
>http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Re: [PATCH]: kexec: framework and i386 (Take VIII)
  2006-05-16 11:03                                     ` Akio Takebe
@ 2006-05-16 12:39                                       ` Keir Fraser
  2006-05-17  2:44                                         ` Horms
  0 siblings, 1 reply; 68+ messages in thread
From: Keir Fraser @ 2006-05-16 12:39 UTC (permalink / raw)
  To: Akio Takebe
  Cc: Ian Pratt, Kazuo Moriwaka, xen-devel, Isaku Yamahata, Magnus Damm,
	Horms, Mark Williamson


On 16 May 2006, at 12:03, Akio Takebe wrote:

> No, it can kexec only to kdump kernel.
> I think we need to update kexec-tools for kexecing Xen.
> (e.g. to load xen,dom0 and initrd)
> Am I right, Horms?

kexec-tools support multiboot format these days. So if kexec is added 
to Xen then we should support kexec'ing to Xen, or we need a good 
explanation why we can't.

  -- Keir

> But this feature is good as coredump feature.
> Even if cannot kexec to Xen, I believe this is important feature.

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Re: [PATCH]: kexec: framework and i386 (Take VIII)
  2006-05-16 12:39                                       ` Keir Fraser
@ 2006-05-17  2:44                                         ` Horms
  2006-05-17  4:53                                           ` Horms
  0 siblings, 1 reply; 68+ messages in thread
From: Horms @ 2006-05-17  2:44 UTC (permalink / raw)
  To: Keir Fraser
  Cc: Ian Pratt, Kazuo Moriwaka, xen-devel, Akio Takebe, Isaku Yamahata,
	Magnus Damm, Mark Williamson

On Tue, May 16, 2006 at 01:39:31PM +0100, Keir Fraser wrote:
> 
> On 16 May 2006, at 12:03, Akio Takebe wrote:
> 
> >No, it can kexec only to kdump kernel.
> >I think we need to update kexec-tools for kexecing Xen.
> >(e.g. to load xen,dom0 and initrd)
> >Am I right, Horms?
> 
> kexec-tools support multiboot format these days. So if kexec is added 
> to Xen then we should support kexec'ing to Xen, or we need a good 
> explanation why we can't.

No it can't kexec into xen yet. I haven't looked into this in depth but
I suspect that kexec-tools needs to be updated as Takebe-san suggests.
As you mention kexec-tools does support multiboot so I suspect that it
is not much work. I will look into it and get back to you. I take it
that you would like this to be working before merging?

In semi-related news, I will post an updated version of the patch in the
next day or so. This is able to capture all of xen's CPUs on kdump and
kdump on xen crash. This means that feature-wise in terms of xen/kernel
code the x86_32 port is pretty much complete. I would be really excited
to get this merged so more eyes can go over the code and we can get
some good feedback and testing. 

My colleague Magnus's x86_64 port is well under way; however, we are
having a few problems relating to the approach he has taken to page
table handling on kexec. I am hoping to take a crack at ia64 in the near
future, though I suspect that x86_32 bug fixes and other merge-related work
will delay that a little.

-- 
Horms                                           http://www.vergenet.net/~horms/

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Re: [PATCH]: kexec: framework and i386 (Take VIII)
  2006-05-17  2:44                                         ` Horms
@ 2006-05-17  4:53                                           ` Horms
  2006-05-17  9:52                                             ` Re: [PATCH]: kexec: framework and i386 (Take IX) Horms
  0 siblings, 1 reply; 68+ messages in thread
From: Horms @ 2006-05-17  4:53 UTC (permalink / raw)
  To: Keir Fraser
  Cc: Ian Pratt, Kazuo Moriwaka, xen-devel, Akio Takebe, Isaku Yamahata,
	Magnus Damm, Mark Williamson

On Wed, May 17, 2006 at 11:44:04AM +0900, Horms wrote:
> On Tue, May 16, 2006 at 01:39:31PM +0100, Keir Fraser wrote:
> > 
> > On 16 May 2006, at 12:03, Akio Takebe wrote:
> > 
> > >No, it can kexec only to kdump kernel.
> > >I think we need to update kexec-tools for kexecing Xen.
> > >(e.g. to load xen,dom0 and initrd)
> > >Am I right, Horms?
> > 
> > kexec-tools support multiboot format these days. So if kexec is added 
> > to Xen then we should support kexec'ing to Xen, or we need a good 
> > explanation why we can't.
> 
> No it can't kexec into xen yet.

I'm happy to report that with some more testing, as long as kexec-tools
is compiled with zlib support I can kexec linux->xen and xen->xen.
Actually, zlib might not be necessary if both xen and linux are
uncompressed. In any case, for reference, here is a kexec command line
that works for me.

kexec -l -t multiboot-x86 --append="console=com1 sync_console conswitch=bb com1=115200,8n1,0x3f8 dom0_mem=48000" /root/xen --module="/root/vmlinuz-xen root=/dev/hda1 ro console=ttyS0,115200 clock=pit ip=on apm=power-off" --module=/tmp/initramfs_data.cpio

I will post an updated patch today or tomorrow and at that time I will
outline the immediate targets for further development. For now, I'd
like to ask that you don't merge what I have posted, as there are some
invasive changes coming up with regards to page table handling, but 
I should be able to provide a patch that includes those changes within
the next week as the code is already done, it just needs to be cleaned
up a bit and merged with the kexec patch.

-- 
Horms                                           http://www.vergenet.net/~horms/

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Re: [PATCH]: kexec: framework and i386 (Take IX)
  2006-05-17  4:53                                           ` Horms
@ 2006-05-17  9:52                                             ` Horms
  2006-05-17 10:10                                               ` Keir Fraser
  0 siblings, 1 reply; 68+ messages in thread
From: Horms @ 2006-05-17  9:52 UTC (permalink / raw)
  To: Keir Fraser
  Cc: Ian Pratt, Kazuo Moriwaka, xen-devel, Akio Takebe, Isaku Yamahata,
	Magnus Damm, Mark Williamson

[-- Attachment #1: Type: text/plain, Size: 1390 bytes --]

Hi,

as promised earlier in the day, here is an update on the kexec/kdump
patch. The main changes are that SMP now works, and the dumping of
cpu registers for kdump has been moved into the hypervisor so as to
allow all CPUs to be captured, not just dom0's VCPUs.

Also, as mentioned earlier in the day linux->xen and xen->xen kexec
does work, contrary to what I previously reported.

I have also broken the patch out into generic, x86 and x86_32 patches
which need to be applied in that order. This was done to allow other
architectures to be worked on more easily. By that, I mean it makes it
easier for my colleagues and I to work together. It should also make it
easier to review the code. If a monolithic patch is desired please let
me know as it is very easy for me to produce one.

I hope to make the next round available within the next few (working) days. 
This will change page table handling around a bit (only for kexec/kdump
not for the rest of the time) so as to avoid trampling the page tables,
which is a problem for kdump as it destroys data that might otherwise
be analysed. My colleague Magnus Damm is working on having his approach
adopted by Linux kdump.

Beyond that, Magnus has also been working on a x86_64 port, though
that is not quite working. And I plan to start on ia64 soon.

-- 
Horms                                           http://www.vergenet.net/~horms/


[-- Attachment #2: 51.1-kexec-generic-upstream.patch --]
[-- Type: text/plain, Size: 31293 bytes --]

kexec: framework

This is an implementation of kexec for dom0/xen, that allows
kexecing of the physical machine from xen. The approach taken is
to move the architecture-dependent kexec code into a new hypercall.

This patch only includes the framework; it cannot be used without
architecture-dependent hooks, however the code should compile as is.

 linux-2.6-xen-sparse/drivers/xen/core/Makefile        |    1 
 linux-2.6-xen-sparse/drivers/xen/core/crash.c         |   48 ++
 linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c |  100 ++++++
 linux-2.6-xen-sparse/drivers/xen/core/reboot.c        |    4 
 linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h     |   27 +
 patches/linux-2.6.16.13/kexec.patch                   |  274 +++++++++++++++++
 xen/arch/x86/Makefile                                 |    2 
 xen/arch/x86/crash.c                                  |   26 +
 xen/arch/x86/machine_kexec.c                          |   51 +++
 xen/common/Makefile                                   |    1 
 xen/common/kexec.c                                    |  183 +++++++++++
 xen/common/page_alloc.c                               |   33 +-
 xen/drivers/char/console.c                            |    3 
 xen/include/asm-x86/kexec.h                           |   32 +
 xen/include/public/kexec.h                            |   48 ++
 xen/include/public/xen.h                              |   12 
 xen/include/xen/elfcore.h                             |   73 ++++
 xen/include/xen/kexec.h                               |   33 ++
 xen/include/xen/mm.h                                  |    1 
 19 files changed, 941 insertions(+), 11 deletions(-)

--- x/linux-2.6-xen-sparse/drivers/xen/core/Makefile
+++ x/linux-2.6-xen-sparse/drivers/xen/core/Makefile
@@ -9,3 +9,4 @@ obj-$(CONFIG_NET)     += skbuff.o
 obj-$(CONFIG_SMP)     += smpboot.o
 obj-$(CONFIG_SYSFS)   += hypervisor_sysfs.o
 obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o
+obj-$(CONFIG_KEXEC)   += machine_kexec.o crash.o
--- /dev/null
+++ x/linux-2.6-xen-sparse/drivers/xen/core/crash.c
@@ -0,0 +1,48 @@
+/*
+ * Architecture independent functions for kexec based crash dumps in xen.
+ *
+ * Created by: Horms <horms@verge.net.au>
+ *
+ */
+
+#include <asm/ptrace.h>
+#include <linux/types.h>
+#include <asm/kexec-xen.h>
+#include <asm/hypervisor.h>
+#include <asm/system.h>
+#include <linux/preempt.h>
+#include <linux/smp.h>
+#include <asm/hw_irq.h>
+#include <xen/interface/kexec.h>
+
+/* 
+ * This passes the registers down to the hypervisor and has it kexec().
+ * This is a bit different to the linux implementation which
+ * has this call save registers and stop CPUs and then goes into
+ * machine_kexec() later. But for Xen it makes more sense to
+ * have the kexec hypercall do everything, and this call
+ * has the registers parameter that is needed.
+ * to the hypervisor to allow the hypervisor to kdump itself
+ * on an internal panic 
+ */
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+	kexec_arg_t hypercall_arg;
+	printk("machine_crash_shutdown: %d\n", smp_processor_id());
+	local_irq_disable();
+#ifdef CONFIG_X86_IO_APIC
+	disable_IO_APIC();
+#endif
+	crash_translate_regs(regs, &hypercall_arg.u.regs);
+	HYPERVISOR_kexec(KEXEC_CMD_kexec, KEXEC_TYPE_CRASH, &hypercall_arg);
+}
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- /dev/null
+++ x/linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c
@@ -0,0 +1,100 @@
+/*
+ * drivers/xen/core/machine_kexec.c 
+ * handle transition of Linux booting another kernel
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Loosely based on arch/i386/kernel/machine_kexec.c
+ */
+
+#include <linux/kexec.h>
+#include <xen/interface/kexec.h>
+#include <linux/mm.h>
+#include <asm/hypercall.h>
+#include <asm/kexec-xen.h>
+
+const extern unsigned char relocate_new_kernel[];
+extern unsigned int relocate_new_kernel_size;
+
+/*
+ * An architecture hook called to validate the
+ * proposed image and prepare the control pages
+ * as needed.  The pages for KEXEC_CONTROL_CODE_SIZE
+ * have been allocated, but the segments have not yet
+ * been copied into the kernel.
+ *
+ * Do whatever setup is needed on image and the
+ * reboot code buffer to allow us to avoid allocations
+ * later.
+ *
+ * Currently nothing.
+ */
+int machine_kexec_prepare(struct kimage *image)
+{
+	return HYPERVISOR_kexec(KEXEC_CMD_kexec_prepare, image->type, NULL);
+}
+
+/*
+ * Undo anything leftover by machine_kexec_prepare
+ * when an image is freed.
+ */
+void machine_kexec_cleanup(struct kimage *image)
+{
+	HYPERVISOR_kexec(KEXEC_CMD_kexec_cleanup, image->type, NULL);
+}
+
+
+/*
+ * Load the image into xen so xen can kdump itself
+ * This might have been done in prepare, but prepare
+ * is currently called too early. It might make sense
+ * to move prepare, but for now, just add an extra hook.
+ */
+int machine_kexec_load(struct kimage *image)
+{
+	kexec_arg_t hypercall_arg;
+	hypercall_arg.u.image.indirection_page = image->head;
+	hypercall_arg.u.image.reboot_code_buffer = 
+		pfn_to_mfn(page_to_pfn(image->control_code_page)) << PAGE_SHIFT;
+	hypercall_arg.u.image.start_address = image->start;
+	hypercall_arg.u.image.relocate_new_kernel = relocate_new_kernel;
+	hypercall_arg.u.image.relocate_new_kernel_size = 
+		relocate_new_kernel_size;
+	return HYPERVISOR_kexec(KEXEC_CMD_kexec_load, image->type,
+				&hypercall_arg);
+}
+
+/*
+ * Unload the image that was stored by machine_kexec_load()
+ * This might have been done in machine_kexec_cleanup() but it
+ * is called too late, and it's possible xen could try and kdump
+ * using resources that have been freed.
+ */
+void machine_kexec_unload(struct kimage *image)
+{
+	HYPERVISOR_kexec(KEXEC_CMD_kexec_unload, image->type, NULL);
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ *
+ * This has the hypervisor move to the preferred reboot CPU,
+ * stop all CPUs and kexec. That is it combines machine_shutdown()
+ * and machine_kexec() in Linux kexec terms.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+	printk("machine_kexec\n");
+	HYPERVISOR_kexec(KEXEC_CMD_kexec, image->type, NULL);
+}
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- x/linux-2.6-xen-sparse/drivers/xen/core/reboot.c
+++ x/linux-2.6-xen-sparse/drivers/xen/core/reboot.c
@@ -66,6 +66,10 @@ void machine_power_off(void)
 	HYPERVISOR_shutdown(SHUTDOWN_poweroff);
 }
 
+#ifdef CONFIG_KEXEC
+void machine_shutdown(void) { }
+#endif
+
 int reboot_thru_bios = 0;	/* for dmi_scan.c */
 EXPORT_SYMBOL(machine_restart);
 EXPORT_SYMBOL(machine_halt);
--- /dev/null
+++ x/linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h
@@ -0,0 +1,27 @@
+/*
+ * include/asm-i386/kexec-xen.h
+ *
+ * Created By: Horms <horms@verge.net.au>
+ */
+
+#ifndef _I386_KEXEC_XEN_H
+#define _I386_KEXEC_XEN_H
+
+static inline void crash_translate_regs(struct pt_regs *linux_regs,
+					struct cpu_user_regs *xen_regs)
+{
+	printk("STUB: include/asm-i386/kexec-xen.h: crash_translate_regs: "
+	       "not implemented\n");
+}
+
+#endif /* _I386_KEXEC_XEN_H */
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- x/xen/arch/x86/Makefile
+++ x/xen/arch/x86/Makefile
@@ -39,6 +39,8 @@ obj-y += trampoline.o
 obj-y += traps.o
 obj-y += usercopy.o
 obj-y += x86_emulate.o
+obj-y += machine_kexec.o
+obj-y += crash.o
 
 ifneq ($(pae),n)
 obj-$(x86_32) += shadow.o shadow_public.o shadow_guest32.o
--- /dev/null
+++ x/xen/arch/x86/crash.c
@@ -0,0 +1,26 @@
+/******************************************************************************
+ * arch/x86/crash.c
+ * 
+ * Created By: Horms
+ *
+ * Should be based heavily on arch/i386/kernel/crash.c from Linux 2.6.16
+ */
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/xen.h>
+
+void machine_crash_shutdown(struct cpu_user_regs *regs)
+{
+    printk("STUB: arch/x86/crash.c: machine_crash_shutdown: not implemented\n");
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/arch/x86/machine_kexec.c
@@ -0,0 +1,51 @@
+/******************************************************************************
+ * arch/x86/machine_kexec.c
+ * 
+ * Created By: Horms
+ *
+ */
+
+#include <xen/lib.h>       /* for printk() used in stubs */
+#include <xen/types.h>
+#include <public/kexec.h>
+
+int machine_kexec_prepare(int type, struct kexec_arg *arg)
+{
+    printk("STUB: arch/x86/machine_kexec.c: machine_kexec_prepare: "
+        "not implemented\n");
+    return -1;
+}
+
+void machine_kexec_cleanup(int type, struct kexec_arg *arg)
+{
+    printk("STUB: arch/x86/machine_kexec.c: machine_kexec_cleanup: "
+        "not implemented\n");
+}
+
+void machine_kexec_reserved(struct kexec_arg *arg)
+{
+    printk("STUB: arch/x86/machine_kexec.c: machine_kexec_reserved: "
+        "not implemented\n");
+}
+
+void machine_kexec(struct kexec_arg *arg)
+{
+    printk("STUB: arch/x86/machine_kexec.c: machine_kexec: "
+        "not implemented\n");
+}
+
+void machine_shutdown(struct kexec_arg *arg)
+{
+    printk("STUB: arch/x86/machine_shutdown.c: machine_shutdown: "
+       "not implemented\n");
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/common/Makefile
+++ x/xen/common/Makefile
@@ -7,6 +7,7 @@ obj-y += event_channel.o
 obj-y += grant_table.o
 obj-y += kernel.o
 obj-y += keyhandler.o
+obj-y += kexec.o
 obj-y += lib.o
 obj-y += memory.o
 obj-y += multicall.o
--- /dev/null
+++ x/xen/common/kexec.c
@@ -0,0 +1,183 @@
+/*
+ * common/kexec.c - Architecture independent kexec code for Xen
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Based in part on Linux 2.6.16's kernel/kexec.c
+ */
+
+#include <asm/kexec.h>
+#include <xen/lib.h>
+#include <xen/errno.h>
+#include <xen/guest_access.h>
+#include <xen/sched.h>
+#include <xen/types.h>
+#include <xen/kexec.h>
+#include <public/kexec.h>
+
+DEFINE_PER_CPU (note_buf_t, crash_notes);
+
+static struct kexec_arg kexec_image;
+static int kexec_image_set = 0;
+static struct kexec_arg kexec_crash_image;
+static int kexec_crash_image_set = 0;
+static int kexec_crash_lock = 0;
+
+/* Must call with kexec_crash_lock held */
+void __crash_kexec(struct cpu_user_regs *regs)
+{
+    struct cpu_user_regs fixed_regs;
+
+    if (!kexec_crash_image_set)
+	    return;
+    crash_setup_regs(&fixed_regs, regs);
+    machine_crash_shutdown(&fixed_regs);
+    machine_kexec(&kexec_crash_image); /* Does not return */
+}
+
+void crash_kexec(struct cpu_user_regs *regs)
+{
+    int locked;
+
+    locked = xchg(&kexec_crash_lock, 1);
+    if (locked)
+        return;
+    __crash_kexec(regs);
+    xchg(&kexec_crash_lock, 0);
+}
+
+static int get_crash_note(XEN_GUEST_HANDLE(kexec_arg_t) uarg)
+{
+    struct kexec_arg arg;
+    int locked, cpu;
+    /* In: u.crash_note holds a cpu id.  Out: __pa() of its crash_notes, or 0. */
+    if ( unlikely(copy_from_guest(&arg, uarg, 1) != 0) )
+    {
+        printk("do_kexec: get_crash_note: __copy_from_guest failed\n");
+        return -EFAULT;
+    }
+
+    cpu = (int)arg.u.crash_note;
+    if (cpu < 0)
+        return -EINVAL;
+    /* Ids are zero-based: an id equal to the cpu count is already invalid. */
+    if (cpu >= num_booting_cpus())
+        arg.u.crash_note = 0L;
+    else {
+        locked = xchg(&kexec_crash_lock, 1); /* fail if a crash dump owns it */
+        if (locked)
+        {
+           printk("do_kexec: get_crash_note: in xen-generated kdump\n");
+           return -EFAULT;
+        }
+        arg.u.crash_note = __pa((unsigned long)per_cpu(crash_notes, cpu));
+        xchg(&kexec_crash_lock, 0);
+    }
+
+    if ( unlikely(copy_to_guest(uarg, &arg, 1) != 0) )
+    {
+        printk("do_kexec: get_crash_note: copy_to_guest failed\n");
+        return -EFAULT;
+    }
+
+    return 0;
+}
+
+
+int do_kexec(unsigned long op, int type, XEN_GUEST_HANDLE(kexec_arg_t) uarg)
+{
+    struct kexec_arg *image, tmp_arg;
+    int locked, *image_set, status = -EINVAL;
+
+    if ( !IS_PRIV(current->domain) )  
+        return -EPERM;
+
+    switch (op)
+    {
+    case KEXEC_CMD_kexec_crash_note:
+        return get_crash_note(uarg);
+    case KEXEC_CMD_kexec_reserve:
+        /* Zero first: whatever the arch hook leaves unset is copied out too. */
+        memset(&tmp_arg, 0, sizeof(tmp_arg));
+        machine_kexec_reserved(&tmp_arg);
+        if ( unlikely(copy_to_guest(uarg, &tmp_arg, 1) != 0) )
+        {
+            printk("do_kexec (CMD_kexec_reserve): copy_to_guest failed\n");
+            return -EFAULT;
+        }
+        return 0;
+    }
+
+    if (type == KEXEC_TYPE_CRASH)
+    {
+        image = &kexec_crash_image;
+        image_set = &kexec_crash_image_set;
+        locked = xchg(&kexec_crash_lock, 1);
+        if (locked)
+        {
+           printk("do_kexec: dump is locked\n");
+           return -EFAULT;
+        }
+    }
+    else
+    {
+        image = &kexec_image;
+        image_set = &kexec_image_set;
+    }
+
+    switch(op) {
+    case KEXEC_CMD_kexec:
+        BUG_ON(!*image_set);
+	if (type == KEXEC_TYPE_CRASH) {
+            if ( unlikely(copy_from_guest(&tmp_arg, uarg, 1) != 0) )
+            {
+                printk("do_kexec (CMD_kexec): copy_from_guest failed\n");
+                status = -EFAULT;
+	        break;
+            }
+            __crash_kexec(&tmp_arg.u.regs);
+	}
+	else
+            machine_shutdown(image); /* Does not return */
+        break;
+    case KEXEC_CMD_kexec_prepare:
+        /* Might need to tighten up kexec_crash_lock semantics,
+         * but this currently does nothing. ditto for cleanup */
+        status = machine_kexec_prepare(type, image);
+        break;
+    case KEXEC_CMD_kexec_cleanup:
+        machine_kexec_cleanup(type, image);
+        status = 0;
+        break;
+    case KEXEC_CMD_kexec_load:
+        BUG_ON(*image_set);
+        if ( unlikely(copy_from_guest(image, uarg, 1) != 0) )
+        {
+            printk("do_kexec (CMD_kexec_load): copy_from_guest failed\n");
+            status = -EFAULT;
+	    break;
+        }
+        *image_set = 1;
+        status = 0;
+        break;
+    case KEXEC_CMD_kexec_unload:
+        BUG_ON(!*image_set);
+        *image_set = 0;
+        status = 0;
+        break;
+    }
+
+    if (type == KEXEC_TYPE_CRASH)
+        xchg(&kexec_crash_lock, 0);
+    return status;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/common/page_alloc.c
+++ x/xen/common/page_alloc.c
@@ -212,24 +212,35 @@ void init_boot_pages(paddr_t ps, paddr_t
     }
 }
 
+/* Claim exactly [pfn_at, pfn_at + nr_pfns): returns pfn_at, or 0 on failure. */
+unsigned long alloc_boot_pages_at(unsigned long nr_pfns, unsigned long pfn_at)
+{
+    unsigned long i;
+
+    /* pfn_at is caller-chosen: refuse wrap-around or a range above max_page. */
+    if ( (pfn_at + nr_pfns) < pfn_at || (pfn_at + nr_pfns) > max_page )
+        return 0;
+
+    for ( i = 0; i < nr_pfns; i++ )
+        if ( allocated_in_map(pfn_at + i) )
+            return 0;
+
+    map_alloc(pfn_at, nr_pfns);
+    return pfn_at;
+}
+
 unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align)
 {
-    unsigned long pg, i;
+    unsigned long pg, i = 0;
 
     for ( pg = 0; (pg + nr_pfns) < max_page; pg += pfn_align )
     {
-        for ( i = 0; i < nr_pfns; i++ )
-            if ( allocated_in_map(pg + i) )
-                 break;
-
-        if ( i == nr_pfns )
-        {
-            map_alloc(pg, nr_pfns);
-            return pg;
-        }
+        i = alloc_boot_pages_at(nr_pfns, pg);
+        if (i != 0)
+            break;
     }
 
-    return 0;
+    return i;
 }
 
 
--- x/xen/drivers/char/console.c
+++ x/xen/drivers/char/console.c
@@ -677,6 +677,7 @@ void panic(const char *fmt, ...)
     unsigned long flags;
     static spinlock_t lock = SPIN_LOCK_UNLOCKED;
     extern void machine_restart(char *);
+    extern void crash_kexec(struct cpu_user_regs *regs);
     
     debugtrace_dump();
 
@@ -696,6 +697,8 @@ void panic(const char *fmt, ...)
 
     debugger_trap_immediate();
 
+    crash_kexec(NULL);
+
     watchdog_disable();
     mdelay(5000);
     machine_restart(0);
--- /dev/null
+++ x/xen/include/asm-x86/kexec.h
@@ -0,0 +1,32 @@
+/******************************************************************************
+ * include/asm-x86/kexec.h
+ * 
+ * Created By: Horms
+ *
+ */
+
+#ifndef __X86_KEXEC_H__
+#define __X86_KEXEC_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/xen.h>
+
+static void crash_setup_regs(struct cpu_user_regs *newregs,
+			     struct cpu_user_regs *oldregs)
+{
+    printk("STUB: include/asm-x86/kexec.h: crash_setup_regs: "
+       "not implemented\n");
+}
+
+#endif /* __X86_KEXEC_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/public/kexec.h
@@ -0,0 +1,48 @@
+/*
+ * kexec.h - Public portion
+ *
+ * Created By: Horms <horms@verge.net.au>
+ */
+
+#ifndef _XEN_PUBLIC_KEXEC_H
+#define _XEN_PUBLIC_KEXEC_H
+
+#include "xen.h"
+
+#define KEXEC_TYPE_DEFAULT 0
+#define KEXEC_TYPE_CRASH   1
+
+/*
+ * Scratch space for passing arguments to the kexec hypercall
+ */
+typedef struct kexec_arg {
+    union {
+        struct {
+            unsigned long indirection_page;
+            unsigned long reboot_code_buffer;
+            unsigned long start_address;
+            const char *relocate_new_kernel;
+            unsigned int relocate_new_kernel_size;
+	    unsigned int type;
+        } image;
+	struct cpu_user_regs regs;
+        struct {
+            unsigned long size;
+            unsigned long start;
+        } reserve;
+	unsigned long crash_note;
+    } u;
+} kexec_arg_t;
+DEFINE_XEN_GUEST_HANDLE(kexec_arg_t);
+
+#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/include/public/xen.h
+++ x/xen/include/public/xen.h
@@ -64,6 +64,7 @@
 #define __HYPERVISOR_xenoprof_op          31
 #define __HYPERVISOR_event_channel_op     32
 #define __HYPERVISOR_physdev_op           33
+#define __HYPERVISOR_kexec_op             34
 
 /* Architecture-specific hypercall definitions. */
 #define __HYPERVISOR_arch_0               48
@@ -238,6 +239,17 @@ DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
 #define VMASST_TYPE_writable_pagetables  2
 #define MAX_VMASST_TYPE 2
 
+/*
+ * Operations for kexec.
+ */
+#define KEXEC_CMD_kexec                 0
+#define KEXEC_CMD_kexec_prepare         1
+#define KEXEC_CMD_kexec_cleanup         2
+#define KEXEC_CMD_kexec_load            3
+#define KEXEC_CMD_kexec_unload          4
+#define KEXEC_CMD_kexec_reserve         5
+#define KEXEC_CMD_kexec_crash_note      6
+
 #ifndef __ASSEMBLY__
 
 typedef uint16_t domid_t;
--- /dev/null
+++ x/xen/include/xen/elfcore.h
@@ -0,0 +1,73 @@
+/******************************************************************************
+ * include/xen/elfcore.h
+ * 
+ * Created By: Horms
+ *
+ * Based heavily on include/linux/elfcore.h from Linux 2.6.16
+ * Naming scheme based on include/xen/elf.h (not include/linux/elfcore.h)
+ *
+ */
+
+#ifndef __ELFCOREC_H__
+#define __ELFCOREC_H__
+
+#include <xen/types.h>
+#include <xen/elf.h>
+#include <public/xen.h>
+
+#define NT_PRSTATUS     1
+
+typedef struct
+{
+    int signo;                       /* signal number */
+    int code;                        /* extra code */
+    int errno;                       /* errno */
+} ELF_Signifo;
+
+/* These seem to be the same length on all architectures on Linux */
+typedef int ELF_Pid;
+typedef struct {
+	long tv_sec;
+	long tv_usec;
+} ELF_Timeval;
+typedef unsigned long ELF_Greg;
+#define ELF_NGREG (sizeof (struct cpu_user_regs) / sizeof(ELF_Greg))
+typedef ELF_Greg ELF_Gregset[ELF_NGREG];
+
+/*
+ * Definitions to generate Intel SVR4-like core files.
+ * These mostly have the same names as the SVR4 types with "elf_"
+ * tacked on the front to prevent clashes with linux definitions,
+ * and the typedef forms have been avoided.  This is mostly like
+ * the SVR4 structure, but more Linuxy, with things that Linux does
+ * not support and which gdb doesn't really use excluded.
+ */
+typedef struct
+{
+    ELF_Signifo pr_info;         /* Info associated with signal */
+    short pr_cursig;             /* Current signal */
+    unsigned long pr_sigpend;    /* Set of pending signals */
+    unsigned long pr_sighold;    /* Set of held signals */
+    ELF_Pid pr_pid;
+    ELF_Pid pr_ppid;
+    ELF_Pid pr_pgrp;
+    ELF_Pid pr_sid;
+    ELF_Timeval pr_utime;        /* User time */
+    ELF_Timeval pr_stime;        /* System time */
+    ELF_Timeval pr_cutime;       /* Cumulative user time */
+    ELF_Timeval pr_cstime;       /* Cumulative system time */
+    ELF_Gregset pr_reg;          /* GP registers */
+    int pr_fpvalid;              /* True if math co-processor being used.  */
+} ELF_Prstatus;
+
+#endif /* __ELFCOREC_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/xen/kexec.h
@@ -0,0 +1,33 @@
+/*
+ * include/xen/kexec.h - Internal architecture independent portion
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ */
+
+#include <public/kexec.h>
+
+#define MAX_NOTE_BYTES 1024
+
+typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
+DECLARE_PER_CPU (note_buf_t, crash_notes);
+
+int machine_kexec_prepare(int type, struct kexec_arg *arg);
+void machine_kexec_cleanup(int type, struct kexec_arg *arg);
+void machine_kexec_reserved(struct kexec_arg *arg);
+void machine_kexec(struct kexec_arg *arg);
+void machine_shutdown(struct kexec_arg *arg);
+void machine_crash_shutdown(struct cpu_user_regs *regs);
+
+extern unsigned int opt_kdump_megabytes;
+extern unsigned int opt_kdump_megabytes_base;
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/include/xen/mm.h
+++ x/xen/include/xen/mm.h
@@ -40,6 +40,7 @@ struct page_info;
 paddr_t init_boot_allocator(paddr_t bitmap_start);
 void init_boot_pages(paddr_t ps, paddr_t pe);
 unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align);
+unsigned long alloc_boot_pages_at(unsigned long nr_pfns, unsigned long pfn_at);
 void end_boot_allocator(void);
 
 /* Generic allocator. These functions are *not* interrupt-safe. */
--- /dev/null	2006-05-09 15:32:30.399072192 +0900
+++ x/patches/linux-2.6.16.13/kexec.patch	2006-05-17 18:37:45.000000000 +0900
@@ -0,0 +1,274 @@
+--- x/drivers/base/cpu.c
++++ x/drivers/base/cpu.c
+@@ -80,12 +80,30 @@ void unregister_cpu(struct cpu *cpu, str
+ #else /* ... !CONFIG_HOTPLUG_CPU */
+ static inline void register_cpu_control(struct cpu *cpu)
+ {
++#ifdef CONFIG_XEN
++#include <xen/interface/kexec.h>
++	kexec_arg_t hypercall_arg;
++	hypercall_arg.u.crash_note = (unsigned long)cpu;
++#endif
+ }
+ #endif /* CONFIG_HOTPLUG_CPU */
+ 
+ #ifdef CONFIG_KEXEC
+ #include <linux/kexec.h>
+ 
++#ifdef CONFIG_XEN
++static unsigned long get_crash_notes(int cpu)
++{
++	kexec_arg_t hypercall_arg;
++
++	hypercall_arg.u.crash_note = (unsigned long)cpu;
++	if (HYPERVISOR_kexec(KEXEC_CMD_kexec_crash_note, 0, &hypercall_arg) < 0)
++		return 0L;
++	return hypercall_arg.u.crash_note;
++}
++#endif
++
++/* XXX: This only finds dom0's CPU's */
+ static ssize_t show_crash_notes(struct sys_device *dev, char *buf)
+ {
+ 	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
+@@ -101,7 +119,11 @@ static ssize_t show_crash_notes(struct s
+ 	 * boot up and this data does not change there after. Hence this
+ 	 * operation should be safe. No locking required.
+ 	 */
++#ifndef CONFIG_XEN
+ 	addr = __pa(per_cpu_ptr(crash_notes, cpunum));
++#else
++	addr = (unsigned long long)get_crash_notes(cpunum);
++#endif
+ 	rc = sprintf(buf, "%Lx\n", addr);
+ 	return rc;
+ }
+--- x/include/linux/kexec.h
++++ x/include/linux/kexec.h
+@@ -91,6 +91,10 @@ struct kimage {
+ extern NORET_TYPE void machine_kexec(struct kimage *image) ATTRIB_NORET;
+ extern int machine_kexec_prepare(struct kimage *image);
+ extern void machine_kexec_cleanup(struct kimage *image);
++#ifdef CONFIG_XEN
++extern int machine_kexec_load(struct kimage *image);
++extern void machine_kexec_unload(struct kimage *image);
++#endif
+ extern asmlinkage long sys_kexec_load(unsigned long entry,
+ 					unsigned long nr_segments,
+ 					struct kexec_segment __user *segments,
+--- x/kernel/kexec.c
++++ x/kernel/kexec.c
+@@ -27,8 +27,10 @@
+ #include <asm/system.h>
+ #include <asm/semaphore.h>
+ 
++#ifndef CONFIG_XEN
+ /* Per cpu memory for storing cpu states in case of system crash. */
+ note_buf_t* crash_notes;
++#endif
+ 
+ /* Location of the reserved area for the crash kernel */
+ struct resource crashk_res = {
+@@ -38,6 +40,20 @@ struct resource crashk_res = {
+ 	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
+ };
+ 
++/* Kexec needs to know about the actual physical address.
++ * But in xen, a physical address is a pseudo-physical address. */
++#ifndef CONFIG_XEN
++#define kexec_page_to_pfn(page)  page_to_pfn(page)
++#define kexec_pfn_to_page(pfn)   pfn_to_page(pfn)
++#define kexec_virt_to_phys(addr) virt_to_phys(addr)
++#define kexec_phys_to_virt(addr) phys_to_virt(addr)
++#else
++#define kexec_page_to_pfn(page)  pfn_to_mfn(page_to_pfn(page))
++#define kexec_pfn_to_page(pfn)   pfn_to_page(mfn_to_pfn(pfn))
++#define kexec_virt_to_phys(addr) virt_to_machine(addr)
++#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
++#endif
++
+ int kexec_should_crash(struct task_struct *p)
+ {
+ 	if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
+@@ -403,7 +419,7 @@ static struct page *kimage_alloc_normal_
+ 		pages = kimage_alloc_pages(GFP_KERNEL, order);
+ 		if (!pages)
+ 			break;
+-		pfn   = page_to_pfn(pages);
++		pfn   = kexec_page_to_pfn(pages);
+ 		epfn  = pfn + count;
+ 		addr  = pfn << PAGE_SHIFT;
+ 		eaddr = epfn << PAGE_SHIFT;
+@@ -437,6 +453,7 @@ static struct page *kimage_alloc_normal_
+ 	return pages;
+ }
+ 
++#ifndef CONFIG_XEN
+ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
+ 						      unsigned int order)
+ {
+@@ -490,7 +507,7 @@ static struct page *kimage_alloc_crash_c
+ 		}
+ 		/* If I don't overlap any segments I have found my hole! */
+ 		if (i == image->nr_segments) {
+-			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
++			pages = kexec_pfn_to_page(hole_start >> PAGE_SHIFT);
+ 			break;
+ 		}
+ 	}
+@@ -517,6 +534,13 @@ struct page *kimage_alloc_control_pages(
+ 
+ 	return pages;
+ }
++#else /* !CONFIG_XEN */
++struct page *kimage_alloc_control_pages(struct kimage *image,
++					 unsigned int order)
++{
++	return kimage_alloc_normal_control_pages(image, order);
++}
++#endif
+ 
+ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+ {
+@@ -532,7 +556,7 @@ static int kimage_add_entry(struct kimag
+ 			return -ENOMEM;
+ 
+ 		ind_page = page_address(page);
+-		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
++		*image->entry = kexec_virt_to_phys(ind_page) | IND_INDIRECTION;
+ 		image->entry = ind_page;
+ 		image->last_entry = ind_page +
+ 				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+@@ -593,13 +617,13 @@ static int kimage_terminate(struct kimag
+ #define for_each_kimage_entry(image, ptr, entry) \
+ 	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+ 		ptr = (entry & IND_INDIRECTION)? \
+-			phys_to_virt((entry & PAGE_MASK)): ptr +1)
++			kexec_phys_to_virt((entry & PAGE_MASK)): ptr +1)
+ 
+ static void kimage_free_entry(kimage_entry_t entry)
+ {
+ 	struct page *page;
+ 
+-	page = pfn_to_page(entry >> PAGE_SHIFT);
++	page = kexec_pfn_to_page(entry >> PAGE_SHIFT);
+ 	kimage_free_pages(page);
+ }
+ 
+@@ -611,6 +635,10 @@ static void kimage_free(struct kimage *i
+ 	if (!image)
+ 		return;
+ 
++#ifdef CONFIG_XEN
++	machine_kexec_unload(image);
++#endif
++
+ 	kimage_free_extra_pages(image);
+ 	for_each_kimage_entry(image, ptr, entry) {
+ 		if (entry & IND_INDIRECTION) {
+@@ -686,7 +714,7 @@ static struct page *kimage_alloc_page(st
+ 	 * have a match.
+ 	 */
+ 	list_for_each_entry(page, &image->dest_pages, lru) {
+-		addr = page_to_pfn(page) << PAGE_SHIFT;
++		addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
+ 		if (addr == destination) {
+ 			list_del(&page->lru);
+ 			return page;
+@@ -701,12 +729,12 @@ static struct page *kimage_alloc_page(st
+ 		if (!page)
+ 			return NULL;
+ 		/* If the page cannot be used file it away */
+-		if (page_to_pfn(page) >
++		if (kexec_page_to_pfn(page) >
+ 				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+ 			list_add(&page->lru, &image->unuseable_pages);
+ 			continue;
+ 		}
+-		addr = page_to_pfn(page) << PAGE_SHIFT;
++		addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
+ 
+ 		/* If it is the destination page we want use it */
+ 		if (addr == destination)
+@@ -729,7 +757,7 @@ static struct page *kimage_alloc_page(st
+ 			struct page *old_page;
+ 
+ 			old_addr = *old & PAGE_MASK;
+-			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
++			old_page = kexec_pfn_to_page(old_addr >> PAGE_SHIFT);
+ 			copy_highpage(page, old_page);
+ 			*old = addr | (*old & ~PAGE_MASK);
+ 
+@@ -779,7 +807,7 @@ static int kimage_load_normal_segment(st
+ 			result  = -ENOMEM;
+ 			goto out;
+ 		}
+-		result = kimage_add_page(image, page_to_pfn(page)
++		result = kimage_add_page(image, kexec_page_to_pfn(page)
+ 								<< PAGE_SHIFT);
+ 		if (result < 0)
+ 			goto out;
+@@ -811,6 +839,7 @@ out:
+ 	return result;
+ }
+ 
++#ifndef CONFIG_XEN
+ static int kimage_load_crash_segment(struct kimage *image,
+ 					struct kexec_segment *segment)
+ {
+@@ -833,7 +862,7 @@ static int kimage_load_crash_segment(str
+ 		char *ptr;
+ 		size_t uchunk, mchunk;
+ 
+-		page = pfn_to_page(maddr >> PAGE_SHIFT);
++		page = kexec_pfn_to_page(maddr >> PAGE_SHIFT);
+ 		if (page == 0) {
+ 			result  = -ENOMEM;
+ 			goto out;
+@@ -881,6 +910,13 @@ static int kimage_load_segment(struct ki
+ 
+ 	return result;
+ }
++#else /* CONFIG_XEN */
++static int kimage_load_segment(struct kimage *image,
++				struct kexec_segment *segment)
++{
++	return kimage_load_normal_segment(image, segment);
++}
++#endif
+ 
+ /*
+  * Exec Kernel system call: for obvious reasons only root may call it.
+@@ -991,6 +1027,11 @@ asmlinkage long sys_kexec_load(unsigned 
+ 		if (result)
+ 			goto out;
+ 	}
++#ifdef CONFIG_XEN
++	result = machine_kexec_load(image);
++	if (result)
++		goto out;
++#endif
+ 	/* Install the new kernel, and  Uninstall the old */
+ 	image = xchg(dest_image, image);
+ 
+@@ -1045,7 +1086,6 @@ void crash_kexec(struct pt_regs *regs)
+ 	struct kimage *image;
+ 	int locked;
+ 
+-
+ 	/* Take the kexec_lock here to prevent sys_kexec_load
+ 	 * running on one cpu from replacing the crash kernel
+ 	 * we are using after a panic on a different cpu.
+@@ -1067,6 +1107,7 @@ void crash_kexec(struct pt_regs *regs)
+ 	}
+ }
+ 
++#ifndef CONFIG_XEN
+ static int __init crash_notes_memory_init(void)
+ {
+ 	/* Allocate memory for saving cpu registers. */
+@@ -1079,3 +1120,4 @@ static int __init crash_notes_memory_ini
+ 	return 0;
+ }
+ module_init(crash_notes_memory_init)
++#endif

[-- Attachment #3: 51.2.1-kexec-x86-upstream.patch --]
[-- Type: text/plain, Size: 20199 bytes --]

kexec: x86

This is the x86 component of kexec for xen.
The generic component is a prerequisite for this patch.
The x86_64 or x86_32 (i386) patch is also needed
in order to use this code, however the code should compile as is.

 xen/arch/x86/crash.c                |  171 ++++++++++++++++++++++++++++++++++-
 xen/arch/x86/dom0_ops.c             |    3 
 xen/arch/x86/machine_kexec.c        |   58 +++++++++--
 xen/arch/x86/setup.c                |   75 +++++++++++++--
 xen/arch/x86/x86_32/Makefile        |    1 
 xen/arch/x86/x86_32/machine_kexec.c |   27 +++++
 xen/arch/x86/x86_64/Makefile        |    1 
 xen/arch/x86/x86_64/machine_kexec.c |   28 +++++
 xen/common/kexec.c                  |    3 
 xen/include/asm-x86/elf.h           |   27 +++++
 xen/include/asm-x86/hypercall.h     |    6 +
 xen/include/asm-x86/kexec.h         |   14 +-
 xen/include/asm-x86/x86_32/elf.h    |   30 ++++++
 xen/include/asm-x86/x86_32/kexec.h  |   51 ++++++++++
 xen/include/asm-x86/x86_64/elf.h    |   30 ++++++
 xen/include/asm-x86/x86_64/kexec.h  |   50 ++++++++++
 16 files changed, 542 insertions(+), 33 deletions(-)

--- x/xen/arch/x86/crash.c
+++ x/xen/arch/x86/crash.c
@@ -3,16 +3,181 @@
  * 
  * Created By: Horms
  *
- * Should be based heavily on arch/i386/kernel/crash.c from Linux 2.6.16
+ * Based heavily on arch/i386/kernel/crash.c from Linux 2.6.16
  */
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#include <asm/atomic.h>
+#include <asm/elf.h>
+#include <asm/percpu.h>
+#include <asm/kexec.h>
 #include <xen/types.h>
+#include <xen/irq.h>
+#include <asm/ipi.h>
+#include <asm/nmi.h>
+#include <xen/string.h>
+#include <xen/elf.h>
+#include <xen/elfcore.h>
+#include <xen/smp.h>
+#include <xen/delay.h>
+#include <xen/perfc.h>
+#include <xen/kexec.h>
 #include <public/xen.h>
 
+static int crashing_cpu;
+
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+							       size_t data_len)
+{
+	Elf_Note note;
+
+	note.namesz = strlen(name) + 1;
+	note.descsz = data_len;
+	note.type   = type;
+	memcpy(buf, &note, sizeof(note));
+	buf += (sizeof(note) +3)/4;
+	memcpy(buf, name, note.namesz);
+	buf += (note.namesz + 3)/4;
+	memcpy(buf, data, note.descsz);
+	buf += (note.descsz + 3)/4;
+
+	return buf;
+}
+
+static void final_note(u32 *buf)
+{
+	Elf_Note note;
+
+	note.namesz = 0;
+	note.descsz = 0;
+	note.type   = 0;
+	memcpy(buf, &note, sizeof(note));
+}
+
+static void crash_save_this_cpu(struct cpu_user_regs *regs, int cpu)
+{
+	ELF_Prstatus prstatus;
+	uint32_t *buf;
+
+	printk("crash_save_this_cpu: %d\n",  cpu);
+
+	if ((cpu < 0) || (cpu >= NR_CPUS))
+		return;
+
+	/* Using ELF notes here is opportunistic.
+	 * A well defined structure format with tags is needed
+	 * ELF notes happen to provide this and there is infastructure
+	 * in the Linux kernel to supprot them. In order to make
+	 * crash dumps produced by xen the same, the same
+	 * technique is used here.
+	 */
+
+	/* It should be safe to use per_cpu() here instead of per_cpu_ptr()
+	 * (which does not exist in xen) as kexecing_lock must be held in
+	 * order to get anywhere near here */
+	buf = (uint32_t *)per_cpu(crash_notes, cpu);
+	if (!buf) /* XXX: Can this ever occur? */
+		return;
+	memset(&prstatus, 0, sizeof(prstatus));
+	/* XXX: Xen does not have processes. For the crashing CPU on a dom0
+	 * crash this could be pased down from dom0, but is this
+	 * neccessary?
+	 * prstatus.pr_pid = current->pid; */
+	ELF_CORE_COPY_REGS(prstatus.pr_reg, regs);
+	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
+				sizeof(prstatus));
+	final_note(buf);
+}
+
+static void crash_save_self(struct cpu_user_regs *regs)
+{
+	crash_save_this_cpu(regs, smp_processor_id());
+}
+
+#ifdef CONFIG_SMP
+static atomic_t waiting_for_crash_ipi;
+
+static int crash_nmi_callback(struct cpu_user_regs *regs, int cpu)
+{
+	struct cpu_user_regs fixed_regs;
+
+	/* Don't do anything if this handler is invoked on crashing cpu.
+	 * Otherwise, system will completely hang. Crashing cpu can get
+	 * an NMI if system was initially booted with nmi_watchdog parameter.
+	 */
+	if (cpu == crashing_cpu)
+		return 1;
+	local_irq_disable();
+
+	if (!user_mode(regs)) {
+		crash_fixup_ss_esp(&fixed_regs, regs);
+		regs = &fixed_regs;
+	}
+	crash_save_this_cpu(regs, cpu);
+	disable_local_APIC();
+	atomic_dec(&waiting_for_crash_ipi);
+	/* Assume hlt works */
+	__asm__ __volatile__ ( "hlt" );
+	for(;;);
+
+	return 1;
+
+	/* Need to use this somewhere as Xen builds with -Werror */
+	crash_setup_regs(&fixed_regs, regs);
+}
+
+/*
+ * By using the NMI code instead of a vector we just sneak thru the
+ * word generator coming out with just what we want.  AND it does
+ * not matter if clustered_apic_mode is set or not.
+ */
+static void smp_send_nmi_allbutself(void)
+{
+	cpumask_t allbutself = cpu_online_map;
+    	cpu_clear(smp_processor_id(), allbutself);
+	send_IPI_mask(allbutself, APIC_DM_NMI);
+}
+
+static void nmi_shootdown_cpus(void)
+{
+	unsigned long msecs;
+
+	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
+	/* Would it be better to replace the trap vector here? */
+	set_nmi_callback(crash_nmi_callback);
+	/* Ensure the new callback function is set before sending
+	 * out the NMI
+	 */
+	wmb();
+
+	smp_send_nmi_allbutself();
+
+	msecs = 1000; /* Wait at most a second for the other cpus to stop */
+	while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
+		mdelay(1);
+		msecs--;
+	}
+
+	/* Leave the nmi callback set */
+	disable_local_APIC();
+}
+#else
+static void nmi_shootdown_cpus(void)
+{
+	/* There are no cpus to shootdown */
+}
+#endif
+
 void machine_crash_shutdown(struct cpu_user_regs *regs)
 {
-    printk("STUB: arch/x86/crash.c: machine_crash_shutdown: not implemented\n");
+	printk("machine_crash_shutdown: %d\n", smp_processor_id());
+	local_irq_disable();
+
+	crashing_cpu = smp_processor_id();
+	nmi_shootdown_cpus();
+#ifdef CONFIG_X86_IO_APIC
+        disable_IO_APIC();
+#endif
+	crash_save_self(regs);
 }
 
 /*
--- x/xen/arch/x86/dom0_ops.c
+++ x/xen/arch/x86/dom0_ops.c
@@ -29,6 +29,9 @@
 #include <asm/mtrr.h>
 #include "cpu/mtrr/mtrr.h"
 
+extern unsigned int opt_kdump_megabytes;
+extern unsigned int opt_kdump_megabytes_base;
+
 #define TRC_DOM0OP_ENTER_BASE  0x00020000
 #define TRC_DOM0OP_LEAVE_BASE  0x00030000
 
--- x/xen/arch/x86/machine_kexec.c
+++ x/xen/arch/x86/machine_kexec.c
@@ -5,39 +5,71 @@
  *
  */
 
-#include <xen/lib.h>       /* for printk() used in stubs */
+#include <asm/irq.h>
+#include <asm/page.h>
+#include <asm/flushtlb.h>
+#include <xen/lib.h>
+#include <xen/smp.h>
+#include <xen/nmi.h>
 #include <xen/types.h>
+#include <xen/console.h>
+#include <xen/kexec.h>
 #include <public/kexec.h>
 
 int machine_kexec_prepare(int type, struct kexec_arg *arg)
 {
-    printk("STUB: arch/x86/machine_kexec.c: machine_kexec_prepare: "
-        "not implemented\n");
-    return -1;
+	return 0;
 }
 
 void machine_kexec_cleanup(int type, struct kexec_arg *arg)
 {
-    printk("STUB: arch/x86/machine_kexec.c: machine_kexec_cleanup: "
-        "not implemented\n");
 }
 
 void machine_kexec_reserved(struct kexec_arg *arg)
 {
-    printk("STUB: arch/x86/machine_kexec.c: machine_kexec_reserved: "
-        "not implemented\n");
+    arg->u.reserve.size = opt_kdump_megabytes << 20;
+    arg->u.reserve.start = opt_kdump_megabytes_base << 20;
 }
 
-void machine_kexec(struct kexec_arg *arg)
+static void __machine_shutdown(void *data)
 {
-    printk("STUB: arch/x86/machine_kexec.c: machine_kexec: "
-        "not implemented\n");
+    struct kexec_arg *arg = (struct kexec_arg *)data;
+
+    printk("__machine_shutdown: cpu=%u\n", smp_processor_id());
+
+    watchdog_disable();
+    console_start_sync();
+
+    smp_send_stop();
+
+#ifdef CONFIG_X86_IO_APIC
+    disable_IO_APIC();
+#endif   
+
+    machine_kexec(arg);
 }
 
 void machine_shutdown(struct kexec_arg *arg)
 {
-    printk("STUB: arch/x86/machine_shutdown.c: machine_shutdown: "
-       "not implemented\n");
+    int reboot_cpu_id;
+    cpumask_t reboot_cpu;
+
+
+    reboot_cpu_id = 0;
+
+    if (!cpu_isset(reboot_cpu_id, cpu_online_map))
+        reboot_cpu_id = smp_processor_id();
+    
+    if (reboot_cpu_id != smp_processor_id()) {
+        cpus_clear(reboot_cpu);
+        cpu_set(reboot_cpu_id, reboot_cpu);
+        on_selected_cpus(reboot_cpu, __machine_shutdown, arg, 1, 0);
+	for (;;)
+		; /* nothing */
+    }
+    else
+        __machine_shutdown(arg);
+    BUG();
 }
 
 /*
--- x/xen/arch/x86/setup.c
+++ x/xen/arch/x86/setup.c
@@ -38,6 +38,11 @@ static unsigned int opt_xenheap_megabyte
 integer_param("xenheap_megabytes", opt_xenheap_megabytes);
 #endif
 
+unsigned int opt_kdump_megabytes = 0;
+integer_param("kdump_megabytes", opt_kdump_megabytes);
+unsigned int opt_kdump_megabytes_base = 0;
+integer_param("kdump_megabytes_base", opt_kdump_megabytes_base);
+
 /* opt_nosmp: If true, secondary processors are ignored. */
 static int opt_nosmp = 0;
 boolean_param("nosmp", opt_nosmp);
@@ -192,6 +197,20 @@ static void percpu_free_unused_areas(voi
                        __pa(__per_cpu_end));
 }
 
+void __init move_memory(unsigned long dst, 
+                          unsigned long src_start, unsigned long src_end)
+{
+#if defined(CONFIG_X86_32)
+    memmove((void *)dst,  /* use low mapping */
+            (void *)src_start,      /* use low mapping */
+            src_end - src_start);
+#elif defined(CONFIG_X86_64)
+    memmove(__va(dst),
+            __va(src_start),
+            src_end - src_start);
+#endif
+}
+
 void __init __start_xen(multiboot_info_t *mbi)
 {
     char __cmdline[] = "", *cmdline = __cmdline;
@@ -327,15 +346,8 @@ void __init __start_xen(multiboot_info_t
         initial_images_start = xenheap_phys_end;
     initial_images_end = initial_images_start + modules_length;
 
-#if defined(CONFIG_X86_32)
-    memmove((void *)initial_images_start,  /* use low mapping */
-            (void *)mod[0].mod_start,      /* use low mapping */
-            mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
-#elif defined(CONFIG_X86_64)
-    memmove(__va(initial_images_start),
-            __va(mod[0].mod_start),
-            mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
-#endif
+    move_memory(initial_images_start, 
+                mod[0].mod_start, mod[mbi->mods_count-1].mod_end);
 
     /* Initialise boot-time allocator with all RAM situated after modules. */
     xenheap_phys_start = init_boot_allocator(__pa(&_end));
@@ -383,6 +395,51 @@ void __init __start_xen(multiboot_info_t
 #endif
     }
 
+    if (opt_kdump_megabytes) {
+        unsigned long kdump_start, kdump_size, k;
+
+        /* mark images pages as free for now */
+
+        init_boot_pages(initial_images_start, initial_images_end);
+
+        kdump_start = opt_kdump_megabytes_base << 20;
+        kdump_size = opt_kdump_megabytes << 20;
+
+        printk("Kdump: %luMB (%lukB) at 0x%lx\n", 
+               kdump_size >> 20,
+               kdump_size >> 10,
+               kdump_start);
+
+        if ((kdump_start & ~PAGE_MASK) || (kdump_size & ~PAGE_MASK))
+            panic("Kdump parameters not page aligned\n");
+
+        kdump_start >>= PAGE_SHIFT;
+        kdump_size >>= PAGE_SHIFT;
+
+        /* allocate pages for Kdump memory area */
+
+        k = alloc_boot_pages_at(kdump_size, kdump_start);
+
+        if (k != kdump_start)
+            panic("Unable to reserve Kdump memory\n");
+
+        /* allocate pages for relocated initial images */
+
+        k = ((initial_images_end - initial_images_start) & ~PAGE_MASK) ? 1 : 0;
+        k += (initial_images_end - initial_images_start) >> PAGE_SHIFT;
+
+        k = alloc_boot_pages(k, 1);
+
+        if (!k)
+            panic("Unable to allocate initial images memory\n");
+
+        move_memory(k << PAGE_SHIFT, initial_images_start, initial_images_end);
+
+        initial_images_end -= initial_images_start;
+        initial_images_start = k << PAGE_SHIFT;
+        initial_images_end += initial_images_start;
+    }        
+
     memguard_init();
 
     printk("System RAM: %luMB (%lukB)\n", 
--- x/xen/arch/x86/x86_32/Makefile
+++ x/xen/arch/x86/x86_32/Makefile
@@ -3,5 +3,6 @@ obj-y += entry.o
 obj-y += mm.o
 obj-y += seg_fixup.o
 obj-y += traps.o
+obj-y += machine_kexec.o
 
 obj-$(supervisor_mode_kernel) += supervisor_mode_kernel.o
--- /dev/null
+++ x/xen/arch/x86/x86_32/machine_kexec.c
@@ -0,0 +1,27 @@
+/*
+ * arch/x86/x86_32/machine_kexec.c
+ * Handle transition of Linux booting another kernel
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Should be losely based on arch/i386/kernel/machine_kexec.c
+ */
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <public/kexec.h>
+
+void machine_kexec(struct kexec_arg *arg)
+{
+    printk("STUB: arch/x86/x86_32/machine_kexec.c: machine_kexec: "
+        "not implemented\n");
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/arch/x86/x86_64/Makefile
+++ x/xen/arch/x86/x86_64/Makefile
@@ -1,3 +1,4 @@
 obj-y += entry.o
 obj-y += mm.o
 obj-y += traps.o
+obj-y += machine_kexec.o
--- /dev/null
+++ x/xen/arch/x86/x86_64/machine_kexec.c
@@ -0,0 +1,28 @@
+/*
+ * arch/x86/x86_64/machine_kexec.c
+ * Handle transition of Linux booting another kernel
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Should be losely based on arch/x86_64/kernel/machine_kexec.c
+ */
+
+#include <xen/lib.h>       /* for printk() used in stubs */
+#include <xen/types.h>
+#include <public/kexec.h>
+
+void machine_kexec(struct kexec_arg *arg)
+{
+    printk("STUB: arch/x86/x86_64/machine_kexec.c: machine_kexec: "
+        "not implemented\n");
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/common/kexec.c
+++ x/xen/common/kexec.c
@@ -99,7 +99,8 @@ int do_kexec(unsigned long op, int type,
     case KEXEC_CMD_kexec_crash_note:
         return get_crash_note(uarg);
     case KEXEC_CMD_kexec_reserve:
-	machine_kexec_reserved(&tmp_arg);
+        tmp_arg.u.reserve.size = opt_kdump_megabytes << 20;
+        tmp_arg.u.reserve.start = opt_kdump_megabytes_base << 20;
         if ( unlikely(copy_to_guest(uarg, &tmp_arg, 1) != 0) )
         {
             printk("do_kexec (CMD_kexec_reserve): copy_to_guest failed\n");
--- /dev/null
+++ x/xen/include/asm-x86/elf.h
@@ -0,0 +1,27 @@
+/******************************************************************************
+ * include/asm-x86/elf.h
+ * 
+ * Created By: Horms
+ *
+ */
+
+#ifndef __X86_ELF_H__
+#define __X86_ELF_H__
+
+#ifdef __x86_64__
+#include <asm/x86_64/elf.h>
+#else
+#include <asm/x86_32/elf.h>
+#endif
+
+#endif /* __X86_ELF_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/include/asm-x86/hypercall.h
+++ x/xen/include/asm-x86/hypercall.h
@@ -6,6 +6,8 @@
 #define __ASM_X86_HYPERCALL_H__
 
 #include <public/physdev.h>
+#include <xen/types.h>
+#include <public/kexec.h>
 
 extern long
 do_event_channel_op_compat(
@@ -87,6 +89,10 @@ extern long
 arch_do_vcpu_op(
     int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg);
 
+extern int
+do_kexec(
+    unsigned long op, XEN_GUEST_HANDLE(kexec_arg_t) uarg);
+
 #ifdef __x86_64__
 
 extern long
--- x/xen/include/asm-x86/kexec.h
+++ x/xen/include/asm-x86/kexec.h
@@ -8,16 +8,16 @@
 #ifndef __X86_KEXEC_H__
 #define __X86_KEXEC_H__
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#include <asm/processor.h>
 #include <xen/types.h>
+#include <xen/string.h>
 #include <public/xen.h>
 
-static void crash_setup_regs(struct cpu_user_regs *newregs,
-			     struct cpu_user_regs *oldregs)
-{
-    printk("STUB: include/asm-x86/kexec.h: crash_setup_regs: "
-       "not implemented\n");
-}
+#ifdef __x86_64__
+#include <asm/x86_64/kexec.h>
+#else
+#include <asm/x86_32/kexec.h>
+#endif
 
 #endif /* __X86_KEXEC_H__ */
 
--- /dev/null
+++ x/xen/include/asm-x86/x86_32/elf.h
@@ -0,0 +1,30 @@
+/******************************************************************************
+ * include/asm-x86/x86_32/elf.h
+ * 
+ * Created By: Horms
+ *
+ * Should pull be based on include/asm-i386/elf.h:ELF_CORE_COPY_REGS
+ * from Linux 2.6.16
+ */
+
+#ifndef __X86_ELF_X86_32_H__
+#define __X86_ELF_X86_32_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+
+#define ELF_CORE_COPY_REGS(pr_reg, regs)                                \
+    printk("STUB: include/asm-x86/x86_32/kexec.h: ELF_CORE_COPY_REGS: " \
+       "not implemented\n")
+  
+
+#endif /* __X86_ELF_X86_32_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/asm-x86/x86_32/kexec.h
@@ -0,0 +1,51 @@
+/******************************************************************************
+ * include/asm-x86/x86_32/kexec.h
+ * 
+ * Created By: Horms
+ *
+ * Should be based heavily on include/asm-i386/kexec.h from Linux 2.6.16
+ *
+ */
+
+#ifndef __X86_32_KEXEC_H__
+#define __X86_32_KEXEC_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/xen.h>
+
+static void crash_fixup_ss_esp(struct cpu_user_regs *newregs,
+		    struct cpu_user_regs *oldregs)
+{
+    printk("STUB: include/asm-x86/x86_32/kexec.h: crash_fixup_ss_esp: "
+       "not implemented\n");
+    return;
+    crash_fixup_ss_esp(newregs, oldregs);
+}
+
+static void crash_setup_regs(struct cpu_user_regs *newregs,
+			     struct cpu_user_regs *oldregs)
+{
+    printk("STUB: include/asm-x86/x86_32/kexec.h: crash_setup_regs: "
+       "not implemented\n");
+}
+
+static inline int user_mode(struct cpu_user_regs *regs)
+{
+    printk("STUB: include/asm-x86/x86_32/kexec.h: user_mode: "
+       "not implemented\n");
+    return -1;
+}
+
+
+#endif /* __X86_32_KEXEC_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/asm-x86/x86_64/elf.h
@@ -0,0 +1,30 @@
+/******************************************************************************
+ * include/asm-x86/x86_64/elf.h
+ * 
+ * Created By: Horms
+ *
+ * Should pull be based on include/asm-x86_64/elf.h:ELF_CORE_COPY_REGS
+ * from Linux 2.6.16
+ */
+
+#ifndef __X86_ELF_X86_64_H__
+#define __X86_ELF_X86_64_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+
+#define ELF_CORE_COPY_REGS(pr_reg, regs)                                \
+    printk("STUB: include/asm-x86/x86_64/kexec.h: ELF_CORE_COPY_REGS: " \
+       "not implemented\n")
+  
+
+#endif /* __X86_ELF_X86_64_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/asm-x86/x86_64/kexec.h
@@ -0,0 +1,50 @@
+/******************************************************************************
+ * include/asm-x86/x86_64/kexec.h
+ * 
+ * Created By: Horms
+ *
+ * Should be based heavily on include/asm-x86_64/kexec.h from Linux 2.6.16
+ *
+ */
+
+#ifndef __X86_64_KEXEC_H__
+#define __X86_64_KEXEC_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/xen.h>
+
+static void crash_fixup_ss_esp(struct cpu_user_regs *newregs,
+		    struct cpu_user_regs *oldregs)
+{
+    printk("STUB: include/asm-x86/x86_64/kexec.h: crash_fixup_ss_esp: "
+       "not implemented\n");
+}
+
+static void crash_setup_regs(struct cpu_user_regs *newregs,
+			     struct cpu_user_regs *oldregs)
+{
+    printk("STUB: include/asm-x86/x86_64/kexec.h: crash_setup_regs: "
+       "not implemented\n");
+    return;
+    crash_fixup_ss_esp(newregs, oldregs);
+}
+
+static inline int user_mode(struct cpu_user_regs *regs)
+{
+    printk("STUB: include/asm-x86/x86_64/kexec.h: user_mode: "
+       "not implemented\n");
+    return -1;
+}
+
+#endif /* __X86_64_KEXEC_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */

[-- Attachment #4: 51.2.1.1-kexec-x86_32-upstream.patch --]
[-- Type: text/plain, Size: 18623 bytes --]

kexec: x86_32

This is the x86_32 component of kexec for xen.
The x86 component is a prerequisite for this patch.

 buildconfigs/linux-defconfig_xen_x86_32                        |    1 
 linux-2.6-xen-sparse/arch/i386/Kconfig                         |    2 
 linux-2.6-xen-sparse/arch/i386/kernel/Makefile                 |    2 
 linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c              |   25 +
 linux-2.6-xen-sparse/drivers/xen/core/crash.c                  |    2 
 linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h              |   19 +
 linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h |   10 
 patches/linux-2.6.16.13/kexec-x86_32.patch                     |   12 
 xen/arch/x86/machine_kexec.c                                   |    6 
 xen/arch/x86/x86_32/entry.S                                    |    2 
 xen/arch/x86/x86_32/machine_kexec.c                            |  143 +++++++++-
 xen/include/asm-x86/x86_32/elf.h                               |   34 +-
 xen/include/asm-x86/x86_32/kexec.h                             |   68 +++-
 xen/include/xen/kexec.h                                        |    1 
 14 files changed, 281 insertions(+), 46 deletions(-)

--- x/buildconfigs/linux-defconfig_xen_x86_32
+++ x/buildconfigs/linux-defconfig_xen_x86_32
@@ -184,6 +184,7 @@ CONFIG_MTRR=y
 CONFIG_REGPARM=y
 CONFIG_SECCOMP=y
 CONFIG_HZ_100=y
+CONFIG_KEXEC=y
 # CONFIG_HZ_250 is not set
 # CONFIG_HZ_1000 is not set
 CONFIG_HZ=100
--- x/linux-2.6-xen-sparse/arch/i386/Kconfig
+++ x/linux-2.6-xen-sparse/arch/i386/Kconfig
@@ -726,7 +726,7 @@ source kernel/Kconfig.hz
 
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && !X86_XEN
+	depends on EXPERIMENTAL
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
--- x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
+++ x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
@@ -89,7 +89,7 @@ include $(srctree)/scripts/Makefile.xen
 
 obj-y += fixup.o
 microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o
-n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o
+n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o machine_kexec.o crash.o
 
 obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
 obj-y := $(call cherrypickxen, $(obj-y))
--- x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
+++ x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
@@ -68,6 +68,10 @@
 #include "setup_arch_pre.h"
 #include <bios_ebda.h>
 
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
 /* Forward Declaration. */
 void __init find_max_pfn(void);
 
@@ -932,6 +936,7 @@ static void __init parse_cmdline_early (
 		 * after a kernel panic.
 		 */
 		else if (!memcmp(from, "crashkernel=", 12)) {
+#ifndef CONFIG_XEN
 			unsigned long size, base;
 			size = memparse(from+12, &from);
 			if (*from == '@') {
@@ -942,6 +947,10 @@ static void __init parse_cmdline_early (
 				crashk_res.start = base;
 				crashk_res.end   = base + size - 1;
 			}
+#else
+			printk("Ignoring crashkernel command line, "
+			       "parameter will be supplied by xen\n");
+#endif
 		}
 #endif
 #ifdef CONFIG_PROC_VMCORE
@@ -1318,9 +1327,22 @@ void __init setup_bootmem_allocator(void
 	}
 #endif
 #ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
 	if (crashk_res.start != crashk_res.end)
 		reserve_bootmem(crashk_res.start,
 			crashk_res.end - crashk_res.start + 1);
+#else
+	{
+		struct kexec_arg xen_kexec_arg;
+		BUG_ON(HYPERVISOR_kexec(KEXEC_CMD_kexec_reserve, 0,
+					&xen_kexec_arg));
+		if (xen_kexec_arg.u.reserve.size) {
+			crashk_res.start = xen_kexec_arg.u.reserve.start;
+			crashk_res.end = xen_kexec_arg.u.reserve.start + 
+				xen_kexec_arg.u.reserve.size - 1;
+		}
+	}
+#endif
 #endif
 
 	if (!xen_feature(XENFEAT_auto_translated_physmap))
@@ -1395,6 +1417,9 @@ legacy_init_iomem_resources(struct resou
 		res->end = map[i].end - 1;
 		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 		request_resource(&iomem_resource, res);
+#ifdef CONFIG_KEXEC
+        request_resource(res, &crashk_res);
+#endif
 	}
 
 	free_bootmem(__pa(map), PAGE_SIZE);
--- x/linux-2.6-xen-sparse/drivers/xen/core/crash.c
+++ x/linux-2.6-xen-sparse/drivers/xen/core/crash.c
@@ -1,5 +1,5 @@
 /*
- * Architecture independent functions for kexec based crash dumps in xen.
+ * Architecture specific (i386-xen) functions for kexec based crash dumps.
  *
  * Created by: Horms <horms@verge.net.au>
  *
--- x/linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h
+++ x/linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h
@@ -7,11 +7,26 @@
 #ifndef _I386_KEXEC_XEN_H
 #define _I386_KEXEC_XEN_H
 
+#include <asm/ptrace.h>
+#include <asm/types.h>
+#include <xen/interface/arch-x86_32.h>
+
 static inline void crash_translate_regs(struct pt_regs *linux_regs,
 					struct cpu_user_regs *xen_regs)
 {
-	printk("STUB: include/asm-i386/kexec-xen.h: crash_translate_regs: "
-	       "not implemented\n");
+	xen_regs->ebx    = linux_regs->ebx;
+	xen_regs->ecx    = linux_regs->ecx;
+	xen_regs->edx    = linux_regs->edx;
+	xen_regs->esi    = linux_regs->esi;
+	xen_regs->edi    = linux_regs->edi;
+	xen_regs->ebp    = linux_regs->ebp;
+	xen_regs->eax    = linux_regs->eax;
+	xen_regs->esp    = linux_regs->esp;
+	xen_regs->ss     = linux_regs->xss;
+	xen_regs->cs     = linux_regs->xcs;
+	xen_regs->ds     = linux_regs->xds;
+	xen_regs->es     = linux_regs->xes;
+	xen_regs->eflags = linux_regs->eflags;
 }
 
 #endif /* _I386_KEXEC_XEN_H */
--- x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
+++ x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
@@ -39,6 +39,8 @@
 # error "please don't include this file directly"
 #endif
 
+#include <xen/interface/kexec.h>
+
 #define __STR(x) #x
 #define STR(x) __STR(x)
 
@@ -359,6 +361,14 @@ HYPERVISOR_xenoprof_op(
 	return _hypercall2(int, xenoprof_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_kexec(
+	unsigned long op, int type, kexec_arg_t * arg)
+{
+	return _hypercall3(int, kexec_op, op, type, arg); 
+}
+
+
 
 #endif /* __HYPERCALL_H__ */
 
--- x/xen/arch/x86/machine_kexec.c
+++ x/xen/arch/x86/machine_kexec.c
@@ -25,12 +25,6 @@ void machine_kexec_cleanup(int type, str
 {
 }
 
-void machine_kexec_reserved(struct kexec_arg *arg)
-{
-    arg->u.reserve.size = opt_kdump_megabytes << 20;
-    arg->u.reserve.start = opt_kdump_megabytes_base << 20;
-}
-
 static void __machine_shutdown(void *data)
 {
     struct kexec_arg *arg = (struct kexec_arg *)data;
--- x/xen/arch/x86/x86_32/entry.S
+++ x/xen/arch/x86/x86_32/entry.S
@@ -648,6 +648,7 @@ ENTRY(hypercall_table)
         .long do_xenoprof_op
         .long do_event_channel_op
         .long do_physdev_op
+        .long do_kexec
         .rept NR_hypercalls-((.-hypercall_table)/4)
         .long do_ni_hypercall
         .endr
@@ -687,6 +688,7 @@ ENTRY(hypercall_args_table)
         .byte 2 /* do_xenoprof_op       */
         .byte 2 /* do_event_channel_op  */
         .byte 2 /* do_physdev_op        */
+        .byte 3 /* do_kexec             */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
--- x/xen/arch/x86/x86_32/machine_kexec.c
+++ x/xen/arch/x86/x86_32/machine_kexec.c
@@ -1,19 +1,146 @@
-/*
+/******************************************************************************
  * arch/x86/x86_32/machine_kexec.c
- * Handle transition of Linux booting another kernel
- *
- * Created By: Horms <horms@verge.net.au>
+ * 
+ * Created By: Horms
  *
- * Should be losely based on arch/i386/kernel/machine_kexec.c
+ * Based heavily on arch/i386/machine_kexec.c from Linux 2.6.16
  */
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/domain_page.h> 
+#include <xen/timer.h>
+#include <xen/sched.h>
+#include <xen/reboot.h>
+#include <asm/page.h> 
+#include <asm/flushtlb.h>
 #include <public/kexec.h>
 
+typedef asmlinkage void (*relocate_new_kernel_t)(
+                    unsigned long indirection_page,
+                    unsigned long reboot_code_buffer,
+                    unsigned long start_address,
+                    unsigned int has_pae);
+
+#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+
+#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L2_ATTR (_PAGE_PRESENT)
+
+#ifndef CONFIG_X86_PAE
+
+static u32 pgtable_level1[L1_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+
+static void identity_map_page(unsigned long address)
+{
+    unsigned long mfn;
+    u32 *pgtable_level2;
+
+    /* Find the current page table */
+    mfn = read_cr3() >> PAGE_SHIFT;
+    pgtable_level2 = map_domain_page(mfn);
+
+    /* Identity map the page table entry */
+    pgtable_level1[l1_table_offset(address)] = address | L0_ATTR;
+    pgtable_level2[l2_table_offset(address)] = __pa(pgtable_level1) | L1_ATTR;
+
+    /* Flush the tlb so the new mapping takes effect.
+     * Global tlb entries are not flushed but that is not an issue.
+     */
+    write_cr3(mfn << PAGE_SHIFT);
+
+    unmap_domain_page(pgtable_level2);
+}
+
+#else
+static u64 pgtable_level1[L1_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+static u64 pgtable_level2[L2_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+
+static void identity_map_page(unsigned long address)
+{
+    int mfn;
+    intpte_t *pgtable_level3;
+
+    /* Find the current page table */
+    mfn = read_cr3() >> PAGE_SHIFT;
+    pgtable_level3 = map_domain_page(mfn);
+
+    /* Identity map the page table entry */
+    pgtable_level1[l1_table_offset(address)] = address | L0_ATTR;
+    pgtable_level2[l2_table_offset(address)] = __pa(pgtable_level1) | L1_ATTR;
+    set_64bit(&pgtable_level3[l3_table_offset(address)],
+	      __pa(pgtable_level2) | L2_ATTR);
+
+    /* Flush the tlb so the new mapping takes effect.
+     * Global tlb entries are not flushed but that is not an issue.
+     */
+    load_cr3(mfn << PAGE_SHIFT);
+
+    unmap_domain_page(pgtable_level3);
+}
+#endif
+
+static void kexec_load_segments(void)
+{
+#define __SSTR(X) #X
+#define SSTR(X) __SSTR(X)
+    __asm__ __volatile__ (
+        "\tljmp $"SSTR(__HYPERVISOR_CS)",$1f\n"
+        "\t1:\n"
+        "\tmovl $"SSTR(__HYPERVISOR_DS)",%%eax\n"
+        "\tmovl %%eax,%%ds\n"
+        "\tmovl %%eax,%%es\n"
+        "\tmovl %%eax,%%fs\n"
+        "\tmovl %%eax,%%gs\n"
+        "\tmovl %%eax,%%ss\n"
+        ::: "eax", "memory");
+#undef SSTR
+#undef __SSTR
+}
+
+#define kexec_load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
+static void kexec_set_idt(void *newidt, __u16 limit)
+{
+    struct Xgt_desc_struct curidt;
+
+    /* ia32 supports unaliged loads & stores */
+    curidt.size    = limit;
+    curidt.address = (unsigned long)newidt;
+    
+    kexec_load_idt(&curidt);
+
+};
+
+#define kexec_load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
+static void kexec_set_gdt(void *newgdt, __u16 limit)
+{
+    struct Xgt_desc_struct curgdt;
+
+    /* ia32 supports unaligned loads & stores */
+    curgdt.size    = limit;
+    curgdt.address = (unsigned long)newgdt;
+
+    kexec_load_gdt(&curgdt);
+};
+
 void machine_kexec(struct kexec_arg *arg)
 {
-    printk("STUB: arch/x86/x86_32/machine_kexec.c: machine_kexec: "
-        "not implemented\n");
+    relocate_new_kernel_t rnk;
+
+    local_irq_disable();
+
+    identity_map_page(arg->u.image.reboot_code_buffer);
+
+    copy_from_user((void *)arg->u.image.reboot_code_buffer, 
+           arg->u.image.relocate_new_kernel,
+           arg->u.image.relocate_new_kernel_size);
+
+    kexec_load_segments();
+    kexec_set_gdt(__va(0),0);
+    kexec_set_idt(__va(0),0);
+
+    rnk = (relocate_new_kernel_t) arg->u.image.reboot_code_buffer;
+    (*rnk)(arg->u.image.indirection_page, arg->u.image.reboot_code_buffer, 
+           arg->u.image.start_address, cpu_has_pae);
 }
 
 /*
--- x/xen/include/asm-x86/x86_32/elf.h
+++ x/xen/include/asm-x86/x86_32/elf.h
@@ -3,19 +3,39 @@
  * 
  * Created By: Horms
  *
- * Should pull be based on include/asm-i386/elf.h:ELF_CORE_COPY_REGS
- * from Linux 2.6.16
+ * Based heavily on include/asm-i386/elf.h and 
+ * include/asm-i386/system.h from Linux 2.6.16
  */
 
 #ifndef __X86_ELF_X86_32_H__
 #define __X86_ELF_X86_32_H__
 
-#include <xen/lib.h>       /* for printk() used in stub */
+/* XXX: Xen doesn't have orig_eax.  For kdump, on a dom0 crash, the values
+ * for the crashing CPU could could be passed down from dom0, but is that
+ * neccessary?
+ * Also, I'm not sure why fs and gs are derived from the CPU
+ * rather than regs */
 
-#define ELF_CORE_COPY_REGS(pr_reg, regs)                                \
-    printk("STUB: include/asm-x86/x86_32/kexec.h: ELF_CORE_COPY_REGS: " \
-       "not implemented\n")
-  
+#define ELF_CORE_COPY_REGS(pr_reg, regs) do {                           \
+    unsigned i;                                                         \
+    pr_reg[0] = regs->ebx;                                              \
+    pr_reg[1] = regs->ecx;                                              \
+    pr_reg[2] = regs->edx;                                              \
+    pr_reg[3] = regs->esi;                                              \
+    pr_reg[4] = regs->edi;                                              \
+    pr_reg[5] = regs->ebp;                                              \
+    pr_reg[6] = regs->eax;                                              \
+    pr_reg[7] = regs->ds;                                               \
+    pr_reg[8] = regs->es;                                               \
+    asm volatile("mov %%fs,%0":"=rm" (i)); pr_reg[9]  = i;              \
+    asm volatile("mov %%gs,%0":"=rm" (i)); pr_reg[10] = i;              \
+    pr_reg[11] = 0; /* regs->orig_eax; */                               \
+    pr_reg[12] = regs->eip;                                             \
+    pr_reg[13] = regs->cs;                                              \
+    pr_reg[14] = regs->eflags;                                          \
+    pr_reg[15] = regs->esp;                                             \
+    pr_reg[16] = regs->ss;                                              \
+} while(0);
 
 #endif /* __X86_ELF_X86_32_H__ */
 
--- x/xen/include/asm-x86/x86_32/kexec.h
+++ x/xen/include/asm-x86/x86_32/kexec.h
@@ -3,42 +3,72 @@
  * 
  * Created By: Horms
  *
- * Should be based heavily on include/asm-i386/kexec.h from Linux 2.6.16
- *
+ * Based heavily on include/asm-i386/kexec.h from Linux 2.6.16
  */
 
-#ifndef __X86_32_KEXEC_H__
-#define __X86_32_KEXEC_H__
-
-#include <xen/lib.h>       /* for printk() used in stub */
-#include <xen/types.h>
-#include <public/xen.h>
+#ifndef __X86_KEXEC_X86_32_H__
+#define __X86_KEXEC_X86_32_H__
 
+/* CPU does not save ss and esp on stack if execution is already
+ * running in kernel mode at the time of NMI occurrence. This code
+ * fixes it.
+ */
 static void crash_fixup_ss_esp(struct cpu_user_regs *newregs,
-		    struct cpu_user_regs *oldregs)
+                    struct cpu_user_regs *oldregs)
 {
-    printk("STUB: include/asm-x86/x86_32/kexec.h: crash_fixup_ss_esp: "
-       "not implemented\n");
-    return;
-    crash_fixup_ss_esp(newregs, oldregs);
+    memcpy(newregs, oldregs, sizeof(*newregs));
+    newregs->esp = (unsigned long)&(oldregs->esp);
+    __asm__ __volatile__(
+            "xorl %%eax, %%eax\n\t"
+            "movw %%ss, %%ax\n\t"
+            :"=a"(newregs->ss));
 }
 
+/*
+ * This function is responsible for capturing register states if coming
+ * via panic otherwise just fix up the ss and esp if coming via kernel
+ * mode exception.
+ */
 static void crash_setup_regs(struct cpu_user_regs *newregs,
 			     struct cpu_user_regs *oldregs)
 {
-    printk("STUB: include/asm-x86/x86_32/kexec.h: crash_setup_regs: "
-       "not implemented\n");
+    if (oldregs)
+        crash_fixup_ss_esp(newregs, oldregs);
+    else {
+        __asm__ __volatile__("movl %%ebx,%0" : "=m"(newregs->ebx));
+        __asm__ __volatile__("movl %%ecx,%0" : "=m"(newregs->ecx));
+        __asm__ __volatile__("movl %%edx,%0" : "=m"(newregs->edx));
+        __asm__ __volatile__("movl %%esi,%0" : "=m"(newregs->esi));
+        __asm__ __volatile__("movl %%edi,%0" : "=m"(newregs->edi));
+        __asm__ __volatile__("movl %%ebp,%0" : "=m"(newregs->ebp));
+        __asm__ __volatile__("movl %%eax,%0" : "=m"(newregs->eax));
+        __asm__ __volatile__("movl %%esp,%0" : "=m"(newregs->esp));
+        __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(newregs->ss));
+        __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(newregs->cs));
+        __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(newregs->ds));
+        __asm__ __volatile__("movw %%es, %%ax;" :"=a"(newregs->es));
+        __asm__ __volatile__("pushfl; popl %0" :"=m"(newregs->eflags));
+
+        newregs->eip = (unsigned long)current_text_addr();
+    }
 }
 
+/*              
+ * From Linux 2.6.16's include/asm-i386/mach-xen/asm/ptrace.h
+ *
+ * user_mode_vm(regs) determines whether a register set came from user mode.
+ * This is true if V8086 mode was enabled OR if the register set was from
+ * protected mode with RPL-3 CS value.  This tricky test checks that with
+ * one comparison.  Many places in the kernel can bypass this full check
+ * if they have already ruled out V8086 mode, so user_mode(regs) can be used.
+ */
 static inline int user_mode(struct cpu_user_regs *regs)
 {
-    printk("STUB: include/asm-x86/x86_32/kexec.h: user_mode: "
-       "not implemented\n");
-    return -1;
+    return (regs->cs & 2) != 0;
 }
 
 
-#endif /* __X86_32_KEXEC_H__ */
+#endif /* __X86_KEXEC_X86_32_H__ */
 
 /*
  * Local variables:
--- x/xen/include/xen/kexec.h
+++ x/xen/include/xen/kexec.h
@@ -14,7 +14,6 @@ DECLARE_PER_CPU (note_buf_t, crash_notes
 
 int machine_kexec_prepare(int type, struct kexec_arg *arg);
 void machine_kexec_cleanup(int type, struct kexec_arg *arg);
-void machine_kexec_reserved(struct kexec_arg *arg);
 void machine_kexec(struct kexec_arg *arg);
 void machine_shutdown(struct kexec_arg *arg);
 void machine_crash_shutdown(struct cpu_user_regs *regs);
--- /dev/null	2006-05-09 15:32:30.399072192 +0900
+++ x/patches/linux-2.6.16.13/kexec-x86_32.patch	2006-05-17 18:37:45.000000000 +0900
@@ -0,0 +1,12 @@
+--- x/arch/i386/kernel/crash.c
++++ x/arch/i386/kernel/crash.c
+@@ -175,9 +175,5 @@ void machine_crash_shutdown(struct pt_re
+ 	/* Make a note of crashing cpu. Will be used in NMI callback.*/
+ 	crashing_cpu = smp_processor_id();
+ 	nmi_shootdown_cpus();
+-	lapic_shutdown();
+-#if defined(CONFIG_X86_IO_APIC)
+-	disable_IO_APIC();
+-#endif
+ 	crash_save_self(regs);
+ }

[-- Attachment #5: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Re: [PATCH]: kexec: framework and i386 (Take IX)
  2006-05-17  9:52                                             ` Re: [PATCH]: kexec: framework and i386 (Take IX) Horms
@ 2006-05-17 10:10                                               ` Keir Fraser
  2006-05-18  3:37                                                 ` Horms
  0 siblings, 1 reply; 68+ messages in thread
From: Keir Fraser @ 2006-05-17 10:10 UTC (permalink / raw)
  To: Horms
  Cc: Ian Pratt, Kazuo Moriwaka, xen-devel, Akio Takebe, Isaku Yamahata,
	Magnus Damm, Mark Williamson


On 17 May 2006, at 10:52, Horms wrote:

> as promised earlier in the day, here is an update on the kexec/kdump
> patch. The main changes are that SMP now works, and the dumping of
> cpu registers for kdump has been moved into the hypervisor so as to
> allow all CPUs to be captured, not just dom0's VCPUs.

Just looking at the generic patch:
  * Define KEXEC_CMD_* in your public kexec.h header, not xen.h.
  * Don't pack all the different arg structs into a union -- the union 
will change in size if you ever add a bigger argument substructure, 
plus it's ugly. Split them out and put a comment by each KEXEC_CMD_* 
definition explaining what its argument parameter points at (see other 
header files like vcpu.h for an example).
  * Can you explain the need for all the changes in your kexec.patch? I 
guess there are some virt_to_phys address translations that need fixing 
up, but you also scatter a few hypercalls around in there (e.g., in 
base/cpu.c) -- can they not be handled more cleanly, or is kexec-on-xen 
somehow more special than kexec on any native architecture?

  -- Keir

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Re: [PATCH]: kexec: framework and i386 (Take IX)
  2006-05-17 10:10                                               ` Keir Fraser
@ 2006-05-18  3:37                                                 ` Horms
  2006-05-25  7:20                                                   ` [PATCH] kexec: framework and i386 (Take X) Horms
  0 siblings, 1 reply; 68+ messages in thread
From: Horms @ 2006-05-18  3:37 UTC (permalink / raw)
  To: Keir Fraser
  Cc: Ian Pratt, Kazuo Moriwaka, xen-devel, Akio Takebe, Isaku Yamahata,
	Magnus Damm, Mark Williamson

On Wed, May 17, 2006 at 11:10:45AM +0100, Keir Fraser wrote:
> 
> On 17 May 2006, at 10:52, Horms wrote:
> 
> >as promised earlier in the day, here is an update on the kexec/kdump
> >patch. The main changes are that SMP now works, and the dumping of
> >cpu registers for kdump has been moved into the hypervisor so as to
> >allow all CPUs to be captured, not just dom0's VCPUs.
> 
> Just looking at the generic patch:
>  * Define KEXEC_CMD_* in your public kexec.h header, not xen.h.
>  * Don't pack all the different arg structs into a union -- the union 
> will change in size if you ever add a bigger argument substructure, 
> plus it's ugly. Split them out and put a comment by each KEXEC_CMD_* 
> definition explaining what its argument parameter points at (see other 
> header files like vcpu.h for an example).
>  * Can you explain the need for all the changes in your kexec.patch? I 
> guess there are some virt_to_phys address translations that need fixing 
> up, but you also scatter a few hypercalls around in there (e.g., in 
> base/cpu.c) -- can they not be handled more cleanly, or is kexec-on-xen 
> somehow more special than kexec on any native architecture?

Hi Keir,

thanks for your suggestions, I'll address these and send a more
detailed reply a little later.

-- 
Horms                                           http://www.vergenet.net/~horms/

^ permalink raw reply	[flat|nested] 68+ messages in thread

* [PATCH] kexec: framework and i386 (Take X)
  2006-05-18  3:37                                                 ` Horms
@ 2006-05-25  7:20                                                   ` Horms
  2006-06-05  2:53                                                     ` Akio Takebe
  2006-06-15  7:29                                                     ` [PATCH] kexec: framework and i386 (Take XI) Horms
  0 siblings, 2 replies; 68+ messages in thread
From: Horms @ 2006-05-25  7:20 UTC (permalink / raw)
  To: Keir Fraser
  Cc: Ian Pratt, Kazuo Moriwaka, xen-devel, Akio Takebe, Isaku Yamahata,
	Magnus Damm, Mark Williamson

[-- Attachment #1: Type: text/plain, Size: 5187 bytes --]

Hi,

sorry for the somewhat long delay between sending updates.
I'm happy to announce the tenth take of the kexec/kdump patch.
I'll address Keir's questions from the 9th release below,
but first I would like to quickly summarise the patches.

Kexec/kdump is implemented by moving the privileged portions
(and related plumbing where needed) from linux into the hypervisor.
This is primarily done by implementing kexec's architecture
independent hooks as hypercalls.

Kexec is working on both x86_32 and x86_64, for SMP and UP.
Kdump is also working for SMP and UP on x86_32. x86_64 may work,
but still needs more attention. In particular the register
saving code has not been implemented.

These patches also include some reworking of kexec's internals in
order that the page table is not mangled on kdump. These changes
also make x86_64 kexec/kdump somewhat easier to implement.
Collectively this is the pagetable_a approach developed by my colleague
Magnus Damm, and he is working with the linux kexec maintainers to
get it merged there.

The code is broken out into four patches.
They should apply cleanly to xen-unstable.hg 10151.

   1. 51.1-kexec-generic-upstream.patch
      * Common code for all architectures,

        the basic plumbing for kexec/kdump
   2. 51.2.1-kexec-x86-upstream.patch
      * Glue between 1, and 3 and 4.
        This would not be needed for ppc or ia64, but
	neither have been written yet.
	We are planning to commence work on ia64 soon.
      * Depends on 1

   3. 51.2.1.1-kexec-x86_32-upstream.patch
      * Kexec/kdump for x86_32
      * Depends on 2 (and 1)

   4. 51.2.31.2-kexec-x86_64-upstream.patch
      * Kexec/kdump for x86_64
      * Depends on 2 (and 1)

On Thu, May 18, 2006 at 12:37:54PM +0900, Horms wrote:
> On Wed, May 17, 2006 at 11:10:45AM +0100, Keir Fraser wrote:
> > 
> > On 17 May 2006, at 10:52, Horms wrote:
> > 
> > >as promised earlier in the day, here is an update on the kexec/kdump
> > >patch. The main changes are that SMP now works, and the dumping of
> > >cpu registers for kdump has been moved into the hypervisor so as to
> > >allow all CPUs to be captured, not just dom0's VCPUs.
> > 
> > Just looking at the generic patch:
> >  * Define KEXEC_CMD_* in your public kexec.h header, not xen.h.
> >  * Don't pack all the different arg structs into a union -- the union 
> > will change in size if you ever add a bigger argument substructure, 
> > plus it's ugly. Split them out and put a comment by each KEXEC_CMD_* 
> > definition explaining what its argument parameter points at (see other 
> > header files like vcpu.h for an example).

I have changed both of these things.

> >  * Can you explain the need for all the changesin your kexec.patch? I 
> > guess there are some virt_to_phys address translations that need fixing 
> > up, but you also scatter a few hypercalls around in there (e.g., in 
> > base/cpu.c) -- can they not be handled more cleanly, or is kexec-on-xen 
> > somehow more special than kexec on any native architecture?

Sure. There are several areas of change, I will address them one by one.
If I have missed any, please let me know

* pfn vs mfn

  Linux kexec works in pfns, but as kexec needs to work in real mode
  in Xen, mfns are needed. This change should be fairly obvious, though
  it is more invasive than I would have liked.

* get_crash_notes

  When a kernel is loaded for kexec or kdump part of the work
  is done in user-space. In particular the elf header is created in
  user-space and it needs to know the location of the elf notes
  where the registers are saved on crash dump. As only xen knows
  about all the CPUs, the notes are handled by the hypervisor, and
  a hypercall is used by get_crash_notes() to get the address of the
  notes, which is exposed to userspace as required by kexec-tool.

  It is worth noting that only dom0's vcpus are exposed to user space,
  however all CPUs' notes will be written by xen. In practice I strongly
  suspect that a customised tool will be needed to analyse crash dumps,
  well the xen specific parts anyway, and such a tool should
  be able to find the crash notes that are not in the elf header.

  Actually, I'm not really sure why the crash notes need to be in the
  elf header at all. In essence this code is really just there to keep
  kexec-tool happy and avoid having to modify it.  To that end I am
  happy to say that neither kexec-tool nor the target kernel (crash or
  kexec kernel) need to be modified in order to kexec or kdump from xen.

* xen_machine_kexec_load and xen_machine_kexec_unload

  It was originally hoped that the machine_kexec_prepare and
  machine_kexec_cleanup hooks could be used, however it turns out
  that the place that they are called in is not very useful for xen.
  Well, on x86_32 and x86_64 at least. So instead xen_machine_kexec_load
  and xen_machine_kexec_unload were added.
  
  xen_machine_kexec_load loads the kernel into xen. It is at this time
  that all preparation work is done, leaving xen_machine_kexec as
  just a trigger.  xen_machine_kexec_unload reverses the work of
  xen_machine_kexec_load.
  
-- 
Horms                                           http://www.vergenet.net/~horms/


[-- Attachment #2: 51.1-kexec-generic-upstream.patch --]
[-- Type: text/plain, Size: 34251 bytes --]

kexec: framework

This is an implementation of kexec for dom0/xen, that allows
kexecing of the physical machine from xen. The approach taken is
to move the architecture-dependent kexec code into a new hypercall.

This patch only includes the framework; it can't be used without
architecture-dependent hooks, however the code should compile as is.

Signed-Off-By: Horms <horms@verge.net.au>
Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>

 linux-2.6-xen-sparse/drivers/xen/core/Makefile                        |    1 
 linux-2.6-xen-sparse/drivers/xen/core/crash.c                         |   48 +
 linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c                 |   84 ++
 linux-2.6-xen-sparse/drivers/xen/core/reboot.c                        |    4 
 patches/linux-2.6.16.13/0-linux-2.6.16-kexec_page_table_a_stubs.patch |   85 ++
 patches/linux-2.6.16.13/kexec-generic.patch                           |  294 ++++++++++
 xen/arch/x86/Makefile                                                 |    2 
 xen/arch/x86/crash.c                                                  |   26 
 xen/arch/x86/machine_kexec.c                                          |   51 +
 xen/common/Makefile                                                   |    1 
 xen/common/kexec.c                                                    |  188 ++++++
 xen/common/page_alloc.c                                               |   33 -
 xen/drivers/char/console.c                                            |    3 
 xen/include/asm-x86/kexec.h                                           |   32 +
 xen/include/public/kexec.h                                            |   85 ++
 xen/include/public/xen.h                                              |    1 
 xen/include/xen/elfcore.h                                             |   73 ++
 xen/include/xen/kexec.h                                               |   33 +
 xen/include/xen/mm.h                                                  |    1 
 19 files changed, 1034 insertions(+), 11 deletions(-)

--- x/linux-2.6-xen-sparse/drivers/xen/core/Makefile
+++ x/linux-2.6-xen-sparse/drivers/xen/core/Makefile
@@ -10,3 +10,4 @@ obj-$(CONFIG_SMP)         += smpboot.o
 obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
 obj-$(CONFIG_SYSFS)       += hypervisor_sysfs.o
 obj-$(CONFIG_XEN_SYSFS)   += xen_sysfs.o
+obj-$(CONFIG_KEXEC)       += machine_kexec.o crash.o
--- /dev/null
+++ x/linux-2.6-xen-sparse/drivers/xen/core/crash.c
@@ -0,0 +1,48 @@
+/*
+ * Architecture independent functions for kexec based crash dumps in xen.
+ *
+ * Created by: Horms <horms@verge.net.au>
+ *
+ */
+
+#include <asm/ptrace.h>
+#include <linux/types.h>
+#include <asm/kexec-xen.h>
+#include <asm/hypervisor.h>
+#include <asm/system.h>
+#include <linux/preempt.h>
+#include <linux/smp.h>
+#include <asm/hw_irq.h>
+#include <xen/interface/kexec.h>
+
+/*
+ * This passes the registers down to the hypervisor and has it kexec().
+ * This is a bit different to the linux implementation, which
+ * has this call save registers and stop CPUs and then goes into
+ * machine_kexec() later. But for Xen it makes more sense to
+ * have the kexec hypercall do everything, and this call
+ * has the registers parameter that is needed. The registers are
+ * handed to the hypervisor, which does the kdump itself (as it
+ * also does on an internal panic).
+ */
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+	struct cpu_user_regs xen_regs;
+	printk("machine_crash_shutdown: %d\n", smp_processor_id());
+	local_irq_disable();
+#ifdef CONFIG_X86_IO_APIC
+	disable_IO_APIC();
+#endif
+	crash_translate_regs(regs, &xen_regs);
+	HYPERVISOR_kexec(KEXEC_CMD_kexec, KEXEC_TYPE_CRASH, &xen_regs);
+}
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- /dev/null
+++ x/linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c
@@ -0,0 +1,84 @@
+/*
+ * drivers/xen/core/machine_kexec.c 
+ * handle transition of Linux booting another kernel
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Loosely based on arch/i386/kernel/machine_kexec.c
+ */
+
+#include <linux/kexec.h>
+#include <xen/interface/kexec.h>
+#include <linux/mm.h>
+#include <asm/hypercall.h>
+#include <asm/kexec-xen.h>
+
+const extern unsigned char relocate_new_kernel[];
+extern unsigned int relocate_new_kernel_size;
+
+static inline unsigned long machine_address(struct page *page)
+{
+	return pfn_to_mfn(page_to_pfn(page)) << PAGE_SHIFT;
+}
+
+static void setup_hypercall_arg(xen_kexec_image_t *xki, struct kimage *image)
+{
+	memset(xki, 0, sizeof(*xki));
+
+	xki->indirection_page = image->head;
+	xki->reboot_code_buffer = 
+		machine_address(image->control_code_page);
+	xki->start_address = image->start;
+}
+
+/*
+ * Load the image into xen so xen can kdump itself
+ * This might have been done in prepare, but prepare
+ * is currently called too early. It might make sense
+ * to move prepare, but for now, just add an extra hook.
+ */
+int xen_machine_kexec_load(struct kimage *image)
+{
+	xen_kexec_image_t xki;
+
+	setup_hypercall_arg(&xki, image);
+	return HYPERVISOR_kexec(KEXEC_CMD_kexec_load, image->type, &xki);
+}
+
+/*
+ * Unload the image that was stored by xen_machine_kexec_load().
+ * This might have been done in machine_kexec_cleanup() but it
+ * is called too late, and it's possible xen could try and kdump
+ * using resources that have been freed.
+ */
+void xen_machine_kexec_unload(struct kimage *image)
+{
+	xen_kexec_image_t xki;
+
+	setup_hypercall_arg(&xki, image);
+	HYPERVISOR_kexec(KEXEC_CMD_kexec_unload, image->type, &xki);
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ *
+ * This has the hypervisor move to the preferred reboot CPU,
+ * stop all CPUs and kexec. That is, it combines machine_shutdown()
+ * and machine_kexec() in Linux kexec terms.
+ */
+NORET_TYPE void xen_machine_kexec(struct kimage *image)
+{
+	HYPERVISOR_kexec(KEXEC_CMD_kexec, image->type, NULL);
+	panic("KEXEC_CMD_kexec hypercall should not return\n");
+}
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- x/linux-2.6-xen-sparse/drivers/xen/core/reboot.c
+++ x/linux-2.6-xen-sparse/drivers/xen/core/reboot.c
@@ -67,6 +67,10 @@ void machine_power_off(void)
 	HYPERVISOR_shutdown(SHUTDOWN_poweroff);
 }
 
+#ifdef CONFIG_KEXEC
+void machine_shutdown(void) { }
+#endif
+
 int reboot_thru_bios = 0;	/* for dmi_scan.c */
 EXPORT_SYMBOL(machine_restart);
 EXPORT_SYMBOL(machine_halt);
--- x/xen/arch/x86/Makefile
+++ x/xen/arch/x86/Makefile
@@ -39,6 +39,8 @@ obj-y += trampoline.o
 obj-y += traps.o
 obj-y += usercopy.o
 obj-y += x86_emulate.o
+obj-y += machine_kexec.o
+obj-y += crash.o
 
 ifneq ($(pae),n)
 obj-$(x86_32) += shadow.o shadow_public.o shadow_guest32.o
--- /dev/null
+++ x/xen/arch/x86/crash.c
@@ -0,0 +1,26 @@
+/******************************************************************************
+ * arch/x86/crash.c
+ * 
+ * Created By: Horms
+ *
+ * Should be based heavily on arch/i386/kernel/crash.c from Linux 2.6.16
+ */
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/xen.h>
+
+void machine_crash_shutdown(struct cpu_user_regs *regs)
+{
+    printk("STUB: arch/x86/crash.c: machine_crash_shutdown: not implemented\n");
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/arch/x86/machine_kexec.c
@@ -0,0 +1,51 @@
+/******************************************************************************
+ * arch/x86/machine_kexec.c
+ * 
+ * Created By: Horms
+ *
+ */
+
+#include <xen/lib.h>       /* for printk() used in stubs */
+#include <xen/types.h>
+#include <public/kexec.h>
+
+int machine_kexec_load(int type, xen_kexec_image_t *image)
+{
+    printk("STUB: arch/x86/machine_kexec.c: machine_kexec_load: "
+        "not implemented\n");
+    return -1;
+}
+
+void machine_kexec_unload(int type, xen_kexec_image_t *image)
+{
+    printk("STUB: arch/x86/machine_kexec.c: machine_kexec_unload: "
+        "not implemented\n");
+}
+
+void machine_kexec_reserved(xen_kexec_reserve_t *reservation)
+{
+    printk("STUB: arch/x86/machine_kexec.c: machine_kexec_reserved: "
+        "not implemented\n");
+}
+
+void machine_kexec(xen_kexec_image_t *image)
+{
+    printk("STUB: arch/x86/machine_kexec.c: machine_kexec: "
+        "not implemented\n");
+}
+
+void machine_shutdown(xen_kexec_image_t *image)
+{
+    printk("STUB: arch/x86/machine_shutdown.c: machine_shutdown: "
+       "not implemented\n");
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/common/Makefile
+++ x/xen/common/Makefile
@@ -7,6 +7,7 @@ obj-y += event_channel.o
 obj-y += grant_table.o
 obj-y += kernel.o
 obj-y += keyhandler.o
+obj-y += kexec.o
 obj-y += lib.o
 obj-y += memory.o
 obj-y += multicall.o
--- /dev/null
+++ x/xen/common/kexec.c
@@ -0,0 +1,188 @@
+/*
+ * common/kexec.c - Architecture independent kexec code for Xen
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Based in part on Linux 2.6.16's kernel/kexec.c
+ */
+
+#include <asm/kexec.h>
+#include <xen/lib.h>
+#include <xen/errno.h>
+#include <xen/guest_access.h>
+#include <xen/sched.h>
+#include <xen/types.h>
+#include <xen/kexec.h>
+#include <public/kexec.h>
+
+DEFINE_PER_CPU (note_buf_t, crash_notes);
+
+static xen_kexec_image_t kexec_image;
+static int kexec_image_set = 0;
+static xen_kexec_image_t kexec_crash_image;
+static int kexec_crash_image_set = 0;
+static int kexec_crash_lock = 0;
+
+/* Must call with kexec_crash_lock held */
+void __crash_kexec(struct cpu_user_regs *regs)
+{
+    struct cpu_user_regs fixed_regs;
+
+    if (!kexec_crash_image_set)
+	    return;
+    crash_setup_regs(&fixed_regs, regs);
+    machine_crash_shutdown(&fixed_regs);
+    machine_kexec(&kexec_crash_image); /* Does not return */
+}
+
+void crash_kexec(struct cpu_user_regs *regs)
+{
+    int locked;
+
+    locked = xchg(&kexec_crash_lock, 1);
+    if (locked)
+        return;
+    __crash_kexec(regs);
+    xchg(&kexec_crash_lock, 0);
+}
+
+/* Copy to uarg the physical address of the crash note buffer of the CPU
+ * that VCPU vcpuid of the calling domain is on. 0, -EINVAL or -EFAULT. */
+static int get_crash_note(int vcpuid, XEN_GUEST_HANDLE(void) uarg)
+{
+    struct domain *domain = current->domain;
+    unsigned long crash_note;
+    struct vcpu *vcpu;
+
+    /* vcpuid indexes domain->vcpu[]: MAX_VIRT_CPUS itself is out of range */
+    if (vcpuid < 0 || vcpuid >= MAX_VIRT_CPUS)
+        return -EINVAL;
+
+    if ( ! (vcpu = domain->vcpu[vcpuid]) )
+        return -EINVAL;
+
+    if ( xchg(&kexec_crash_lock, 1) )
+    {
+       printk("do_kexec: (CMD_kexec_crash_note): dump is locked\n");
+       return -EFAULT;
+    }
+    crash_note = __pa((unsigned long)per_cpu(crash_notes, vcpu->processor));
+    xchg(&kexec_crash_lock, 0);
+
+    if ( unlikely(copy_to_guest(uarg, &crash_note, 1) != 0) )
+    {
+        printk("do_kexec: (CMD_kexec_crash_note): copy_to_guest failed\n");
+        return -EFAULT;
+    }
+    return 0;
+}
+
+static int get_reserve(XEN_GUEST_HANDLE(void) uarg)
+{
+    xen_kexec_reserve_t reservation;
+
+    machine_kexec_reserved(&reservation);
+    if ( unlikely(copy_to_guest(uarg, &reservation, 1) != 0) )
+    {
+        printk("do_kexec (CMD_kexec_reserve): copy_to_guest failed\n");
+        return -EFAULT;
+    }
+    
+    return 0;
+}
+
+static int __do_kexec(unsigned long type, XEN_GUEST_HANDLE(void) uarg,
+		      xen_kexec_image_t *image)
+{
+    cpu_user_regs_t regs;
+
+    if (type == KEXEC_TYPE_DEFAULT)
+        machine_shutdown(image); /* Does not return */
+    else
+    {
+        if ( unlikely(copy_from_guest(&regs, uarg, 1) != 0) )
+        {
+            printk("do_kexec (CMD_kexec): copy_from_guest failed\n");
+            return -EFAULT;
+        }
+        __crash_kexec(&regs); /* Does not return */
+    }
+
+    return -EINVAL;
+}
+
+/* Entry point of the kexec hypercall: dispatch on op (KEXEC_CMD_*).
+ * Returns -EPERM to unprivileged domains.  An unknown op, or a load/unload/
+ * kexec that does not match the loaded state, gets -EINVAL, not a BUG(). */
+int do_kexec(unsigned long op, int arg1, XEN_GUEST_HANDLE(void) uarg)
+{
+    xen_kexec_image_t *image;
+    int *image_set;
+    int status = -EINVAL;
+
+    if ( !IS_PRIV(current->domain) )
+        return -EPERM;
+
+    if ( op == KEXEC_CMD_kexec_crash_note )
+        return get_crash_note(arg1, uarg);
+    if ( op == KEXEC_CMD_kexec_reserve )
+        return get_reserve(uarg);
+
+    /* For all other ops, arg1 is the type of kexec, that is
+     * KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH */
+    if (arg1 == KEXEC_TYPE_CRASH)
+    {
+        image = &kexec_crash_image;
+        image_set = &kexec_crash_image_set;
+        if ( xchg(&kexec_crash_lock, 1) )
+        {
+           printk("do_kexec: dump is locked\n");
+           return -EFAULT;
+        }
+    }
+    else
+    {
+        image = &kexec_image;
+        image_set = &kexec_image_set;
+    }
+
+    switch(op) {
+    case KEXEC_CMD_kexec:
+        if ( *image_set )
+            status = __do_kexec(arg1, uarg, image);
+        break;
+    case KEXEC_CMD_kexec_load:
+        if ( *image_set )
+            break;
+        if ( unlikely(copy_from_guest(image, uarg, 1) != 0) )
+        {
+            printk("do_kexec (CMD_kexec_load): copy_from_guest failed\n");
+            status = -EFAULT;
+            break;
+        }
+        *image_set = 1;
+        status = machine_kexec_load(arg1, image);
+        break;
+    case KEXEC_CMD_kexec_unload:
+        if ( !*image_set )
+            break;
+        *image_set = 0;
+        machine_kexec_unload(arg1, image);
+        status = 0;
+        break;
+    }
+
+    if (arg1 == KEXEC_TYPE_CRASH)
+        xchg(&kexec_crash_lock, 0);
+    return status;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/common/page_alloc.c
+++ x/xen/common/page_alloc.c
@@ -212,24 +212,35 @@ void init_boot_pages(paddr_t ps, paddr_t
     }
 }
 
+unsigned long alloc_boot_pages_at(unsigned long nr_pfns, unsigned long pfn_at)
+{
+    unsigned long i;
+
+    for ( i = 0; i < nr_pfns; i++ )
+        if ( allocated_in_map(pfn_at + i) )
+             break;
+
+    if ( i == nr_pfns )
+    {
+        map_alloc(pfn_at, nr_pfns);
+        return pfn_at;
+    }
+
+    return 0;
+}
+
 unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align)
 {
-    unsigned long pg, i;
+    unsigned long pg, i = 0;
 
     for ( pg = 0; (pg + nr_pfns) < max_page; pg += pfn_align )
     {
-        for ( i = 0; i < nr_pfns; i++ )
-            if ( allocated_in_map(pg + i) )
-                 break;
-
-        if ( i == nr_pfns )
-        {
-            map_alloc(pg, nr_pfns);
-            return pg;
-        }
+        i = alloc_boot_pages_at(nr_pfns, pg);
+        if (i != 0)
+            break;
     }
 
-    return 0;
+    return i;
 }
 
 
--- x/xen/drivers/char/console.c
+++ x/xen/drivers/char/console.c
@@ -677,6 +677,7 @@ void panic(const char *fmt, ...)
     unsigned long flags;
     static spinlock_t lock = SPIN_LOCK_UNLOCKED;
     extern void machine_restart(char *);
+    extern void crash_kexec(struct cpu_user_regs *regs);
     
     debugtrace_dump();
 
@@ -696,6 +697,8 @@ void panic(const char *fmt, ...)
 
     debugger_trap_immediate();
 
+    crash_kexec(NULL);
+
     watchdog_disable();
     mdelay(5000);
     machine_restart(0);
--- /dev/null
+++ x/xen/include/asm-x86/kexec.h
@@ -0,0 +1,32 @@
+/******************************************************************************
+ * include/asm-x86/kexec.h
+ * 
+ * Created By: Horms
+ *
+ */
+
+#ifndef __X86_KEXEC_H__
+#define __X86_KEXEC_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/xen.h>
+
+static void crash_setup_regs(struct cpu_user_regs *newregs,
+			     struct cpu_user_regs *oldregs)
+{
+    printk("STUB: include/asm-x86/kexec.h: crash_setup_regs: "
+       "not implemented\n");
+}
+
+#endif /* __X86_KEXEC_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/public/kexec.h
@@ -0,0 +1,85 @@
+/*
+ * kexec.h - Public portion
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Types based on those in ./vcpu.h on request from Keir Fraser
+ */
+
+#ifndef _XEN_PUBLIC_KEXEC_H
+#define _XEN_PUBLIC_KEXEC_H
+
+#include "xen.h"
+
+/*
+ * Prototype for this hypercall is:
+ *  int kexec_op(int cmd, int arg1, void *extra_args)
+ * @cmd        == KEXEC_CMD_...
+ *                KEXEC operation to perform
+ * @arg1       == Operation-specific integer argument
+ *                This could be in extra_args, but by putting it here
+ *                copy_from_user can be avoided, in particular in
+ *                KEXEC_CMD_kexec during a crash dump, which is a fairly
+ *                critical section of code. If this turns out not to be
+ *                important then it can be collapsed into extra_args.
+ * @extra_args == Operation-specific extra arguments (NULL if none).
+ */
+
+#define KEXEC_TYPE_DEFAULT 0
+#define KEXEC_TYPE_CRASH   1
+
+/*
+ * Perform kexec having previously loaded a kexec or kdump kernel
+ * as appropriate.
+ * @arg1      == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH
+ * @extra_arg == pointer to cpu_user_regs_t structure.
+ */
+#define KEXEC_CMD_kexec                 0
+
+/*
+ * Load kernel image in preparation for kexec or kdump.
+ * @arg1      == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH
+ * @extra_arg == pointer to xen_kexec_image_t structure.
+ */
+#define KEXEC_CMD_kexec_load            1
+typedef struct xen_kexec_image {
+    unsigned long indirection_page;
+    unsigned long reboot_code_buffer;
+    unsigned long start_address;
+} xen_kexec_image_t;
+
+/*
+ * Clean up image loaded by KEXEC_CMD_kexec_load
+ * @arg1      == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH
+ */
+#define KEXEC_CMD_kexec_unload          2
+
+/*
+ * Find the base pointer and size of the area that xen has 
+ * reserved for use by the crash kernel.
+ * @extra_arg == pointer to xen_kexec_reserve_t structure.
+ */
+#define KEXEC_CMD_kexec_reserve         3
+typedef struct xen_kexec_reserve {
+    unsigned long size;
+    unsigned long start;
+} xen_kexec_reserve_t;
+
+/*
+ * Find the base pointer of the area that xen has reserved
+ * for use by a crash note for a given VCPU.
+ * @arg1 == VCPU id; @extra_arg == pointer to unsigned long.
+ */
+#define KEXEC_CMD_kexec_crash_note      4
+
+#endif /* _XEN_PUBLIC_KEXEC_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/include/public/xen.h
+++ x/xen/include/public/xen.h
@@ -64,6 +64,7 @@
 #define __HYPERVISOR_xenoprof_op          31
 #define __HYPERVISOR_event_channel_op     32
 #define __HYPERVISOR_physdev_op           33
+#define __HYPERVISOR_kexec_op             34
 
 /* Architecture-specific hypercall definitions. */
 #define __HYPERVISOR_arch_0               48
--- /dev/null
+++ x/xen/include/xen/elfcore.h
@@ -0,0 +1,73 @@
+/******************************************************************************
+ * include/xen/elfcore.h
+ * 
+ * Created By: Horms
+ *
+ * Based heavily on include/linux/elfcore.h from Linux 2.6.16
+ * Naming scheme based on include/xen/elf.h (not include/linux/elfcore.h)
+ *
+ */
+
+#ifndef __ELFCOREC_H__
+#define __ELFCOREC_H__
+
+#include <xen/types.h>
+#include <xen/elf.h>
+#include <public/xen.h>
+
+#define NT_PRSTATUS     1
+
+typedef struct
+{
+    int signo;                       /* signal number */
+    int code;                        /* extra code */
+    int errno;                       /* errno */
+} ELF_Signifo;
+
+/* These seem to be the same length on all architectures on Linux */
+typedef int ELF_Pid;
+typedef struct {
+	long tv_sec;
+	long tv_usec;
+} ELF_Timeval;
+typedef unsigned long ELF_Greg;
+#define ELF_NGREG (sizeof (struct cpu_user_regs) / sizeof(ELF_Greg))
+typedef ELF_Greg ELF_Gregset[ELF_NGREG];
+
+/*
+ * Definitions to generate Intel SVR4-like core files.
+ * These mostly have the same names as the SVR4 types with "elf_"
+ * tacked on the front to prevent clashes with linux definitions,
+ * and the typedef forms have been avoided.  This is mostly like
+ * the SVR4 structure, but more Linuxy, with things that Linux does
+ * not support and which gdb doesn't really use excluded.
+ */
+typedef struct
+{
+    ELF_Signifo pr_info;         /* Info associated with signal */
+    short pr_cursig;             /* Current signal */
+    unsigned long pr_sigpend;    /* Set of pending signals */
+    unsigned long pr_sighold;    /* Set of held signals */
+    ELF_Pid pr_pid;
+    ELF_Pid pr_ppid;
+    ELF_Pid pr_pgrp;
+    ELF_Pid pr_sid;
+    ELF_Timeval pr_utime;        /* User time */
+    ELF_Timeval pr_stime;        /* System time */
+    ELF_Timeval pr_cutime;       /* Cumulative user time */
+    ELF_Timeval pr_cstime;       /* Cumulative system time */
+    ELF_Gregset pr_reg;          /* GP registers */
+    int pr_fpvalid;              /* True if math co-processor being used.  */
+} ELF_Prstatus;
+
+#endif /* __ELFCOREC_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/xen/kexec.h
@@ -0,0 +1,33 @@
+/*
+ * include/xen/kexec.h - Internal architecture independent portion
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ */
+
+#include <public/kexec.h>
+
+#define MAX_NOTE_BYTES 1024
+
+typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
+DECLARE_PER_CPU (note_buf_t, crash_notes);
+
+int machine_kexec_load(int type, xen_kexec_image_t *image);
+void machine_kexec_unload(int type, xen_kexec_image_t *image);
+void machine_kexec_reserved(xen_kexec_reserve_t *reservation);
+void machine_kexec(xen_kexec_image_t *image);
+void machine_shutdown(xen_kexec_image_t *image);
+void machine_crash_shutdown(cpu_user_regs_t *regs);
+
+extern unsigned int opt_kdump_megabytes;
+extern unsigned int opt_kdump_megabytes_base;
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/include/xen/mm.h
+++ x/xen/include/xen/mm.h
@@ -40,6 +40,7 @@ struct page_info;
 paddr_t init_boot_allocator(paddr_t bitmap_start);
 void init_boot_pages(paddr_t ps, paddr_t pe);
 unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align);
+unsigned long alloc_boot_pages_at(unsigned long nr_pfns, unsigned long pfn_at);
 void end_boot_allocator(void);
 
 /* Generic allocator. These functions are *not* interrupt-safe. */
--- /dev/null
+++ x/patches/linux-2.6.16.13/kexec-generic.patch
@@ -0,0 +1,294 @@
+ drivers/base/cpu.c    |   20 ++++++++++++++
+ include/linux/kexec.h |    5 +++
+ kernel/kexec.c        |   68 ++++++++++++++++++++++++++++++++++++++++---------
+ kernel/sys.c          |    4 ++
+ 4 files changed, 85 insertions(+), 12 deletions(-)
+
+--- x/drivers/base/cpu.c
++++ x/drivers/base/cpu.c
+@@ -11,6 +11,10 @@
+ 
+ #include "base.h"
+ 
++#ifdef CONFIG_XEN
++#include <xen/interface/kexec.h>
++#endif
++
+ struct sysdev_class cpu_sysdev_class = {
+ 	set_kset_name("cpu"),
+ };
+@@ -86,6 +90,18 @@ static inline void register_cpu_control(
+ #ifdef CONFIG_KEXEC
+ #include <linux/kexec.h>
+ 
++#ifdef CONFIG_XEN
++static unsigned long get_crash_notes(int cpu)
++{
++	unsigned long crash_note;
++
++	if (HYPERVISOR_kexec(KEXEC_CMD_kexec_crash_note, cpu, &crash_note) < 0)
++		return 0UL;
++	return crash_note;
++}
++#endif
++
++/* XXX: This only finds dom0's CPUs */
+ static ssize_t show_crash_notes(struct sys_device *dev, char *buf)
+ {
+ 	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
+@@ -101,7 +117,11 @@ static ssize_t show_crash_notes(struct s
+ 	 * boot up and this data does not change there after. Hence this
+ 	 * operation should be safe. No locking required.
+ 	 */
++#ifndef CONFIG_XEN
+ 	addr = __pa(per_cpu_ptr(crash_notes, cpunum));
++#else
++	addr = (unsigned long long)get_crash_notes(cpunum);
++#endif
+ 	rc = sprintf(buf, "%Lx\n", addr);
+ 	return rc;
+ }
+--- x/include/linux/kexec.h
++++ x/include/linux/kexec.h
+@@ -91,6 +91,11 @@ struct kimage {
+ extern NORET_TYPE void machine_kexec(struct kimage *image) ATTRIB_NORET;
+ extern int machine_kexec_prepare(struct kimage *image);
+ extern void machine_kexec_cleanup(struct kimage *image);
++#ifdef CONFIG_XEN
++extern int xen_machine_kexec_load(struct kimage *image);
++extern void xen_machine_kexec_unload(struct kimage *image);
++extern NORET_TYPE void xen_machine_kexec(struct kimage *image) ATTRIB_NORET;
++#endif
+ extern asmlinkage long sys_kexec_load(unsigned long entry,
+ 					unsigned long nr_segments,
+ 					struct kexec_segment __user *segments,
+--- x/kernel/kexec.c
++++ x/kernel/kexec.c
+@@ -38,6 +38,20 @@ struct resource crashk_res = {
+ 	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
+ };
+ 
++/* Kexec needs to know about the actual physical address.
++ * But in xen, a physical address is a pseudo-physical address. */
++#ifndef CONFIG_XEN
++#define kexec_page_to_pfn(page)  page_to_pfn(page)
++#define kexec_pfn_to_page(pfn)   pfn_to_page(pfn)
++#define kexec_virt_to_phys(addr) virt_to_phys(addr)
++#define kexec_phys_to_virt(addr) phys_to_virt(addr)
++#else
++#define kexec_page_to_pfn(page)  pfn_to_mfn(page_to_pfn(page))
++#define kexec_pfn_to_page(pfn)   pfn_to_page(mfn_to_pfn(pfn))
++#define kexec_virt_to_phys(addr) virt_to_machine(addr)
++#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
++#endif
++
+ int kexec_should_crash(struct task_struct *p)
+ {
+ 	if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
+@@ -403,7 +417,7 @@ static struct page *kimage_alloc_normal_
+ 		pages = kimage_alloc_pages(GFP_KERNEL, order);
+ 		if (!pages)
+ 			break;
+-		pfn   = page_to_pfn(pages);
++		pfn   = kexec_page_to_pfn(pages);
+ 		epfn  = pfn + count;
+ 		addr  = pfn << PAGE_SHIFT;
+ 		eaddr = epfn << PAGE_SHIFT;
+@@ -437,6 +451,7 @@ static struct page *kimage_alloc_normal_
+ 	return pages;
+ }
+ 
++#ifndef CONFIG_XEN
+ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
+ 						      unsigned int order)
+ {
+@@ -490,7 +505,7 @@ static struct page *kimage_alloc_crash_c
+ 		}
+ 		/* If I don't overlap any segments I have found my hole! */
+ 		if (i == image->nr_segments) {
+-			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
++			pages = kexec_pfn_to_page(hole_start >> PAGE_SHIFT);
+ 			break;
+ 		}
+ 	}
+@@ -517,6 +532,13 @@ struct page *kimage_alloc_control_pages(
+ 
+ 	return pages;
+ }
++#else /* CONFIG_XEN */
++struct page *kimage_alloc_control_pages(struct kimage *image,
++					 unsigned int order)
++{
++	return kimage_alloc_normal_control_pages(image, order);
++}
++#endif
+ 
+ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+ {
+@@ -532,7 +554,7 @@ static int kimage_add_entry(struct kimag
+ 			return -ENOMEM;
+ 
+ 		ind_page = page_address(page);
+-		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
++		*image->entry = kexec_virt_to_phys(ind_page) | IND_INDIRECTION;
+ 		image->entry = ind_page;
+ 		image->last_entry = ind_page +
+ 				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+@@ -593,13 +615,13 @@ static int kimage_terminate(struct kimag
+ #define for_each_kimage_entry(image, ptr, entry) \
+ 	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+ 		ptr = (entry & IND_INDIRECTION)? \
+-			phys_to_virt((entry & PAGE_MASK)): ptr +1)
++			kexec_phys_to_virt((entry & PAGE_MASK)): ptr +1)
+ 
+ static void kimage_free_entry(kimage_entry_t entry)
+ {
+ 	struct page *page;
+ 
+-	page = pfn_to_page(entry >> PAGE_SHIFT);
++	page = kexec_pfn_to_page(entry >> PAGE_SHIFT);
+ 	kimage_free_pages(page);
+ }
+ 
+@@ -611,6 +633,10 @@ static void kimage_free(struct kimage *i
+ 	if (!image)
+ 		return;
+ 
++#ifdef CONFIG_XEN
++	xen_machine_kexec_unload(image);
++#endif
++
+ 	kimage_free_extra_pages(image);
+ 	for_each_kimage_entry(image, ptr, entry) {
+ 		if (entry & IND_INDIRECTION) {
+@@ -686,7 +712,7 @@ static struct page *kimage_alloc_page(st
+ 	 * have a match.
+ 	 */
+ 	list_for_each_entry(page, &image->dest_pages, lru) {
+-		addr = page_to_pfn(page) << PAGE_SHIFT;
++		addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
+ 		if (addr == destination) {
+ 			list_del(&page->lru);
+ 			return page;
+@@ -701,12 +727,12 @@ static struct page *kimage_alloc_page(st
+ 		if (!page)
+ 			return NULL;
+ 		/* If the page cannot be used file it away */
+-		if (page_to_pfn(page) >
++		if (kexec_page_to_pfn(page) >
+ 				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+ 			list_add(&page->lru, &image->unuseable_pages);
+ 			continue;
+ 		}
+-		addr = page_to_pfn(page) << PAGE_SHIFT;
++		addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
+ 
+ 		/* If it is the destination page we want use it */
+ 		if (addr == destination)
+@@ -729,7 +755,7 @@ static struct page *kimage_alloc_page(st
+ 			struct page *old_page;
+ 
+ 			old_addr = *old & PAGE_MASK;
+-			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
++			old_page = kexec_pfn_to_page(old_addr >> PAGE_SHIFT);
+ 			copy_highpage(page, old_page);
+ 			*old = addr | (*old & ~PAGE_MASK);
+ 
+@@ -779,7 +805,7 @@ static int kimage_load_normal_segment(st
+ 			result  = -ENOMEM;
+ 			goto out;
+ 		}
+-		result = kimage_add_page(image, page_to_pfn(page)
++		result = kimage_add_page(image, kexec_page_to_pfn(page)
+ 								<< PAGE_SHIFT);
+ 		if (result < 0)
+ 			goto out;
+@@ -811,6 +837,7 @@ out:
+ 	return result;
+ }
+ 
++#ifndef CONFIG_XEN
+ static int kimage_load_crash_segment(struct kimage *image,
+ 					struct kexec_segment *segment)
+ {
+@@ -833,7 +860,7 @@ static int kimage_load_crash_segment(str
+ 		char *ptr;
+ 		size_t uchunk, mchunk;
+ 
+-		page = pfn_to_page(maddr >> PAGE_SHIFT);
++		page = kexec_pfn_to_page(maddr >> PAGE_SHIFT);
+ 		if (page == 0) {
+ 			result  = -ENOMEM;
+ 			goto out;
+@@ -881,6 +908,13 @@ static int kimage_load_segment(struct ki
+ 
+ 	return result;
+ }
++#else /* CONFIG_XEN */
++static int kimage_load_segment(struct kimage *image,
++				struct kexec_segment *segment)
++{
++	return kimage_load_normal_segment(image, segment);
++}
++#endif
+ 
+ /*
+  * Exec Kernel system call: for obvious reasons only root may call it.
+@@ -991,6 +1025,11 @@ asmlinkage long sys_kexec_load(unsigned 
+ 		if (result)
+ 			goto out;
+ 	}
++#ifdef CONFIG_XEN
++	result = xen_machine_kexec_load(image);
++	if (result)
++		goto out;
++#endif
+ 	/* Install the new kernel, and  Uninstall the old */
+ 	image = xchg(dest_image, image);
+ 
+@@ -1045,7 +1084,6 @@ void crash_kexec(struct pt_regs *regs)
+ 	struct kimage *image;
+ 	int locked;
+ 
+-
+ 	/* Take the kexec_lock here to prevent sys_kexec_load
+ 	 * running on one cpu from replacing the crash kernel
+ 	 * we are using after a panic on a different cpu.
+@@ -1061,12 +1099,17 @@ void crash_kexec(struct pt_regs *regs)
+ 			struct pt_regs fixed_regs;
+ 			crash_setup_regs(&fixed_regs, regs);
+ 			machine_crash_shutdown(&fixed_regs);
++#ifdef CONFIG_XEN
++			xen_machine_kexec(image);
++#else
+ 			machine_kexec(image);
++#endif
+ 		}
+ 		xchg(&kexec_lock, 0);
+ 	}
+ }
+ 
++#ifndef CONFIG_XEN
+ static int __init crash_notes_memory_init(void)
+ {
+ 	/* Allocate memory for saving cpu registers. */
+@@ -1079,3 +1122,4 @@ static int __init crash_notes_memory_ini
+ 	return 0;
+ }
+ module_init(crash_notes_memory_init)
++#endif
+--- x/kernel/sys.c
++++ x/kernel/sys.c
+@@ -435,8 +435,12 @@ void kernel_kexec(void)
+ 	kernel_restart_prepare(NULL);
+ 	printk(KERN_EMERG "Starting new kernel\n");
+ 	machine_shutdown();
++#ifdef CONFIG_XEN
++	xen_machine_kexec(image);
++#else
+ 	machine_kexec(image);
+ #endif
++#endif
+ }
+ EXPORT_SYMBOL_GPL(kernel_kexec);
+ 
--- /dev/null
+++ x/patches/linux-2.6.16.13/0-linux-2.6.16-kexec_page_table_a_stubs.patch
@@ -0,0 +1,85 @@
+kexec: Avoid overwriting the current pgd (V2, stubs)
+
+This patch adds an architecture-specific structure "struct kimage_arch" to
+struct kimage. This structure is filled in with members by the
+architecture-specific patches that follow this one.
+
+Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
+---
+
+ Applies on top of 2.6.16 and 2.6.17-rc4.
+
+ include/asm-i386/kexec.h    |    2 ++
+ include/asm-powerpc/kexec.h |    2 ++
+ include/asm-s390/kexec.h    |    2 ++
+ include/asm-sh/kexec.h      |    2 ++
+ include/asm-x86_64/kexec.h  |    2 ++
+ include/linux/kexec.h       |    2 ++
+ 6 files changed, 12 insertions(+)
+
+--- x/include/asm-i386/kexec.h
++++ x/include/asm-i386/kexec.h
+@@ -29,6 +29,8 @@
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
++struct kimage_arch {};
++
+ /* CPU does not save ss and esp on stack if execution is already
+  * running in kernel mode at the time of NMI occurrence. This code
+  * fixes it.
+--- x/include/asm-powerpc/kexec.h
++++ x/include/asm-powerpc/kexec.h
+@@ -108,6 +108,8 @@ static inline void crash_setup_regs(stru
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
++struct kimage_arch {};
++
+ #ifdef __powerpc64__
+ extern void kexec_smp_wait(void);	/* get and clear naca physid, wait for
+ 					  master to copy new code to 0 */
+--- x/include/asm-s390/kexec.h
++++ x/include/asm-s390/kexec.h
+@@ -36,6 +36,8 @@
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
++struct kimage_arch {};
++
+ /* Provide a dummy definition to avoid build failures. */
+ static inline void crash_setup_regs(struct pt_regs *newregs,
+ 					struct pt_regs *oldregs) { }
+--- x/include/asm-sh/kexec.h
++++ x/include/asm-sh/kexec.h
+@@ -25,6 +25,8 @@
+ 
+ #ifndef __ASSEMBLY__
+ 
++struct kimage_arch {};
++
+ extern void machine_shutdown(void);
+ extern void *crash_notes;
+ 
+--- x/include/asm-x86_64/kexec.h
++++ x/include/asm-x86_64/kexec.h
+@@ -29,6 +29,8 @@
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
++struct kimage_arch {};
++
+ /*
+  * Saving the registers of the cpu on which panic occured in
+  * crash_kexec to save a valid sp. The registers of other cpus
+--- x/include/linux/kexec.h
++++ x/include/linux/kexec.h
+@@ -69,6 +69,8 @@ struct kimage {
+ 	unsigned long start;
+ 	struct page *control_code_page;
+ 
++	struct kimage_arch arch_data;
++
+ 	unsigned long nr_segments;
+ 	struct kexec_segment segment[KEXEC_SEGMENT_MAX];
+ 

[-- Attachment #3: 51.2.1-kexec-x86-upstream.patch --]
[-- Type: text/plain, Size: 25241 bytes --]

kexec: x86

This is the x86 component of kexec for xen.
The generic component is a prerequisite for this patch.
The x86_64 or x86_32 (i386) patch is also needed
in order to use this code; however, the code should compile as is.

Signed-Off-By: Horms <horms@verge.net.au>
Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>

 linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c |   14 +
 xen/arch/x86/crash.c                                  |  171 ++++++++++++++++-
 xen/arch/x86/dom0_ops.c                               |    3 
 xen/arch/x86/machine_kexec.c                          |  150 +++++++++++++-
 xen/arch/x86/setup.c                                  |   75 ++++++-
 xen/arch/x86/x86_32/Makefile                          |    1 
 xen/arch/x86/x86_32/machine_kexec.c                   |   27 ++
 xen/arch/x86/x86_64/Makefile                          |    1 
 xen/arch/x86/x86_64/machine_kexec.c                   |   28 ++
 xen/include/asm-x86/elf.h                             |   27 ++
 xen/include/asm-x86/fixmap.h                          |    1 
 xen/include/asm-x86/hypercall.h                       |    5 
 xen/include/asm-x86/kexec.h                           |   14 -
 xen/include/asm-x86/x86_32/elf.h                      |   30 ++
 xen/include/asm-x86/x86_32/kexec.h                    |   51 +++++
 xen/include/asm-x86/x86_64/elf.h                      |   30 ++
 xen/include/asm-x86/x86_64/kexec.h                    |   50 ++++
 xen/include/public/kexec.h                            |    2 
 18 files changed, 649 insertions(+), 31 deletions(-)

--- x/linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c
+++ x/linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c
@@ -23,7 +23,21 @@ static inline unsigned long machine_addr
 
 static void setup_hypercall_arg(xen_kexec_image_t *xki, struct kimage *image)
 {
+#if defined(CONFIG_X86_32) || defined(CONFIG_X86_64)
+	struct kimage_arch *arch = &image->arch_data;
+	int k, n = sizeof(arch->page_table_a) / sizeof(arch->page_table_a[0]);
+
+	memset(xki, 0, sizeof(*xki));
+
+	for (k = 0; k < n; k++)
+		xki->page_table_a[k] = machine_address(arch->page_table_a[k]);
+
+#ifdef CONFIG_X86_64
+	xki->page_table_b = machine_address(arch->page_table_b);
+#endif
+#else
 	memset(xki, 0, sizeof(*xki));
+#endif
 
 	xki->indirection_page = image->head;
 	xki->reboot_code_buffer = 
--- x/xen/arch/x86/crash.c
+++ x/xen/arch/x86/crash.c
@@ -3,16 +3,181 @@
  * 
  * Created By: Horms
  *
- * Should be based heavily on arch/i386/kernel/crash.c from Linux 2.6.16
+ * Based heavily on arch/i386/kernel/crash.c from Linux 2.6.16
  */
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#include <asm/atomic.h>
+#include <asm/elf.h>
+#include <asm/percpu.h>
+#include <asm/kexec.h>
 #include <xen/types.h>
+#include <xen/irq.h>
+#include <asm/ipi.h>
+#include <asm/nmi.h>
+#include <xen/string.h>
+#include <xen/elf.h>
+#include <xen/elfcore.h>
+#include <xen/smp.h>
+#include <xen/delay.h>
+#include <xen/perfc.h>
+#include <xen/kexec.h>
 #include <public/xen.h>
 
+static int crashing_cpu;
+
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+							       size_t data_len)
+{
+	Elf_Note note;
+
+	note.namesz = strlen(name) + 1;
+	note.descsz = data_len;
+	note.type   = type;
+	memcpy(buf, &note, sizeof(note));
+	buf += (sizeof(note) +3)/4;
+	memcpy(buf, name, note.namesz);
+	buf += (note.namesz + 3)/4;
+	memcpy(buf, data, note.descsz);
+	buf += (note.descsz + 3)/4;
+
+	return buf;
+}
+
+static void final_note(u32 *buf)
+{
+	Elf_Note note;
+
+	note.namesz = 0;
+	note.descsz = 0;
+	note.type   = 0;
+	memcpy(buf, &note, sizeof(note));
+}
+
+static void crash_save_this_cpu(struct cpu_user_regs *regs, int cpu)
+{
+	ELF_Prstatus prstatus;
+	uint32_t *buf;
+
+	printk("crash_save_this_cpu: %d\n",  cpu);
+
+	if ((cpu < 0) || (cpu >= NR_CPUS))
+		return;
+
+	/* Using ELF notes here is opportunistic.
+	 * A well-defined structure format with tags is needed.
+	 * ELF notes happen to provide this, and there is infrastructure
+	 * in the Linux kernel to support them. So that crash dumps
+	 * produced by Xen use the same format, the same
+	 * technique is used here.
+	 */
+
+	/* It should be safe to use per_cpu() here instead of per_cpu_ptr()
+	 * (which does not exist in xen) as kexecing_lock must be held in
+	 * order to get anywhere near here */
+	buf = (uint32_t *)per_cpu(crash_notes, cpu);
+	if (!buf) /* XXX: Can this ever occur? */
+		return;
+	memset(&prstatus, 0, sizeof(prstatus));
+	/* XXX: Xen does not have processes. For the crashing CPU on a dom0
+	 * crash this could be passed down from dom0, but is this
+	 * necessary?
+	 * prstatus.pr_pid = current->pid; */
+	ELF_CORE_COPY_REGS(prstatus.pr_reg, regs);
+	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
+				sizeof(prstatus));
+	final_note(buf);
+}
+
+static void crash_save_self(struct cpu_user_regs *regs)
+{
+	crash_save_this_cpu(regs, smp_processor_id());
+}
+
+#ifdef CONFIG_SMP
+static atomic_t waiting_for_crash_ipi;
+
+static int crash_nmi_callback(struct cpu_user_regs *regs, int cpu)
+{
+	struct cpu_user_regs fixed_regs;
+
+	/* Don't do anything if this handler is invoked on crashing cpu.
+	 * Otherwise, system will completely hang. Crashing cpu can get
+	 * an NMI if system was initially booted with nmi_watchdog parameter.
+	 */
+	if (cpu == crashing_cpu)
+		return 1;
+	local_irq_disable();
+
+	if (!user_mode(regs)) {
+		crash_fixup_ss_esp(&fixed_regs, regs);
+		regs = &fixed_regs;
+	}
+	crash_save_this_cpu(regs, cpu);
+	disable_local_APIC();
+	atomic_dec(&waiting_for_crash_ipi);
+	/* Assume hlt works */
+	__asm__ __volatile__ ( "hlt" );
+	for(;;);
+
+	return 1;
+
+	/* Need to use this somewhere as Xen builds with -Werror */
+	crash_setup_regs(&fixed_regs, regs);
+}
+
+/*
+ * By using the NMI code instead of a vector we just sneak thru the
+ * word generator coming out with just what we want.  AND it does
+ * not matter if clustered_apic_mode is set or not.
+ */
+static void smp_send_nmi_allbutself(void)
+{
+	cpumask_t allbutself = cpu_online_map;
+    	cpu_clear(smp_processor_id(), allbutself);
+	send_IPI_mask(allbutself, APIC_DM_NMI);
+}
+
+static void nmi_shootdown_cpus(void)
+{
+	unsigned long msecs;
+
+	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
+	/* Would it be better to replace the trap vector here? */
+	set_nmi_callback(crash_nmi_callback);
+	/* Ensure the new callback function is set before sending
+	 * out the NMI
+	 */
+	wmb();
+
+	smp_send_nmi_allbutself();
+
+	msecs = 1000; /* Wait at most a second for the other cpus to stop */
+	while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
+		mdelay(1);
+		msecs--;
+	}
+
+	/* Leave the nmi callback set */
+	disable_local_APIC();
+}
+#else
+static void nmi_shootdown_cpus(void)
+{
+	/* There are no cpus to shootdown */
+}
+#endif
+
 void machine_crash_shutdown(struct cpu_user_regs *regs)
 {
-    printk("STUB: arch/x86/crash.c: machine_crash_shutdown: not implemented\n");
+	printk("machine_crash_shutdown: %d\n", smp_processor_id());
+	local_irq_disable();
+
+	crashing_cpu = smp_processor_id();
+	nmi_shootdown_cpus();
+#ifdef CONFIG_X86_IO_APIC
+        disable_IO_APIC();
+#endif
+	crash_save_self(regs);
 }
 
 /*
--- x/xen/arch/x86/dom0_ops.c
+++ x/xen/arch/x86/dom0_ops.c
@@ -29,6 +29,9 @@
 #include <asm/mtrr.h>
 #include "cpu/mtrr/mtrr.h"
 
+extern unsigned int opt_kdump_megabytes;
+extern unsigned int opt_kdump_megabytes_base;
+
 #define TRC_DOM0OP_ENTER_BASE  0x00020000
 #define TRC_DOM0OP_LEAVE_BASE  0x00030000
 
--- x/xen/arch/x86/machine_kexec.c
+++ x/xen/arch/x86/machine_kexec.c
@@ -6,38 +6,164 @@
  */
 
 #include <xen/lib.h>       /* for printk() used in stubs */
+#include <asm/irq.h>
+#include <asm/page.h>
+#include <asm/flushtlb.h>
+#include <xen/lib.h>
+#include <xen/smp.h>
+#include <xen/nmi.h>
 #include <xen/types.h>
+#include <xen/console.h>
+#include <xen/kexec.h>
 #include <public/kexec.h>
+#include <xen/domain_page.h>
+#include <asm/fixmap.h>
+ 
+#define create_level_mapping(lvl, next, pages, nopages, k, va)               \
+{                                                                            \
+    lvl##_pgentry_t *table;                                                  \
+    void *old = next;                                                        \
+                                                                             \
+    table = (lvl##_pgentry_t *)next + lvl##_table_offset(va);                \
+    if (!(lvl##e_get_flags(*table) & _PAGE_PRESENT)) {                       \
+        if (k >= nopages || pages[k] == 0)                                   \
+            return -1;                                                       \
+        *table = lvl##e_from_pfn(pages[k++]>>PAGE_SHIFT, __PAGE_HYPERVISOR); \
+    }                                                                        \
+    next = map_domain_page(lvl##e_get_pfn(*table));                          \
+    unmap_domain_page(old);                                                  \
+}                                                                            
+
+#define create_level_1_mapping(next, nopages, va, pa)               \
+{                                                                   \
+    l1_pgentry_t *table;                                            \
+                                                                    \
+    table = (l1_pgentry_t *)next + l1_table_offset(va);             \
+    if (!(l1e_get_flags(*table) & _PAGE_PRESENT)) {                 \
+        *table = l1e_from_pfn(pa >> PAGE_SHIFT, __PAGE_HYPERVISOR); \
+    }                                                               \
+    unmap_domain_page(next);                                        \
+}
+
+static int create_mapping(unsigned long root,
+                          unsigned long *pages, int nopages,
+                          unsigned long va, unsigned long pa)
+{
+    void *next = map_domain_page(root >> PAGE_SHIFT);
+    int k = 0;
+
+#if CONFIG_PAGING_LEVELS >= 4
+    create_level_mapping(l4, next, pages, nopages, k, va);
+#endif
+#if CONFIG_PAGING_LEVELS >= 3
+    create_level_mapping(l3, next, pages, nopages, k, va);
+#endif
+    create_level_mapping(l2, next, pages, nopages, k, va);
+
+    create_level_1_mapping(next, nopages, va, pa);
+
+    return k;
+}
+
+static int setup_page_table_a(xen_kexec_image_t *image)
+{
+    void *page;
+    int k, n = sizeof(image->page_table_a) / sizeof(image->page_table_a[0]);
+
+    /* clear page_table_a pages */
+
+    for (k = 0; k < n; k++) {
+        if (!image->page_table_a[k])
+            break;
+
+        page = map_domain_page(image->page_table_a[k] >> PAGE_SHIFT);
+        clear_page(page);
+        unmap_domain_page(page);
+    }
+
+    /* check that the first page (root page) is actually non-zero */
+
+    if (k == 0)
+        return -1;
+
+    /* setup fixmap to point to our control page */
+
+    set_fixmap(FIX_KEXEC_PAGE, image->reboot_code_buffer);
+
+    /* fill in page_table_a: create mapping at fixmap address */
+
+    k = create_mapping(image->page_table_a[0],
+                       &image->page_table_a[1],
+                       n - 1, fix_to_virt(FIX_KEXEC_PAGE),
+                       image->reboot_code_buffer);
+    if (k < 0)
+        return -1;
+
+    /* fill in page_table_a: create identity mapping */
+
+    k = create_mapping(image->page_table_a[0],
+                       &image->page_table_a[1 + k],
+                       n - (1 + k), image->reboot_code_buffer,
+                       image->reboot_code_buffer);
+    if (k < 0)
+        return -1;
+
+	return 0;
+}
 
 int machine_kexec_load(int type, xen_kexec_image_t *image)
 {
-    printk("STUB: arch/x86/machine_kexec.c: machine_kexec_load: "
-        "not implemented\n");
-    return -1;
+    return setup_page_table_a(image);
 }
 
 void machine_kexec_unload(int type, xen_kexec_image_t *image)
 {
-    printk("STUB: arch/x86/machine_kexec.c: machine_kexec_unload: "
-        "not implemented\n");
 }
 
 void machine_kexec_reserved(xen_kexec_reserve_t *reservation)
 {
-    printk("STUB: arch/x86/machine_kexec.c: machine_kexec_reserved: "
-        "not implemented\n");
+    reservation->size = opt_kdump_megabytes << 20;
+    reservation->start = opt_kdump_megabytes_base << 20;
 }
 
-void machine_kexec(xen_kexec_image_t *image)
+static void __machine_shutdown(void *data)
 {
-    printk("STUB: arch/x86/machine_kexec.c: machine_kexec: "
-        "not implemented\n");
+    xen_kexec_image_t *image = (xen_kexec_image_t *)data;
+
+    printk("__machine_shutdown: cpu=%u\n", smp_processor_id());
+
+    watchdog_disable();
+    console_start_sync();
+
+    smp_send_stop();
+
+#ifdef CONFIG_X86_IO_APIC
+    disable_IO_APIC();
+#endif   
+
+    machine_kexec(image);
 }
 
 void machine_shutdown(xen_kexec_image_t *image)
 {
-    printk("STUB: arch/x86/machine_shutdown.c: machine_shutdown: "
-       "not implemented\n");
+    int reboot_cpu_id;
+    cpumask_t reboot_cpu;
+
+    reboot_cpu_id = 0;
+
+    if (!cpu_isset(reboot_cpu_id, cpu_online_map))
+        reboot_cpu_id = smp_processor_id();
+    
+    if (reboot_cpu_id != smp_processor_id()) {
+        cpus_clear(reboot_cpu);
+        cpu_set(reboot_cpu_id, reboot_cpu);
+        on_selected_cpus(reboot_cpu, __machine_shutdown, image, 1, 0);
+	for (;;)
+		; /* nothing */
+    }
+    else
+        __machine_shutdown(image);
+    BUG();
 }
 
 /*
--- x/xen/arch/x86/setup.c
+++ x/xen/arch/x86/setup.c
@@ -38,6 +38,11 @@ static unsigned int opt_xenheap_megabyte
 integer_param("xenheap_megabytes", opt_xenheap_megabytes);
 #endif
 
+unsigned int opt_kdump_megabytes = 0;
+integer_param("kdump_megabytes", opt_kdump_megabytes);
+unsigned int opt_kdump_megabytes_base = 0;
+integer_param("kdump_megabytes_base", opt_kdump_megabytes_base);
+
 /* opt_nosmp: If true, secondary processors are ignored. */
 static int opt_nosmp = 0;
 boolean_param("nosmp", opt_nosmp);
@@ -192,6 +197,20 @@ static void percpu_free_unused_areas(voi
                        __pa(__per_cpu_end));
 }
 
+void __init move_memory(unsigned long dst, 
+                          unsigned long src_start, unsigned long src_end)
+{
+#if defined(CONFIG_X86_32)
+    memmove((void *)dst,  /* use low mapping */
+            (void *)src_start,      /* use low mapping */
+            src_end - src_start);
+#elif defined(CONFIG_X86_64)
+    memmove(__va(dst),
+            __va(src_start),
+            src_end - src_start);
+#endif
+}
+
 void __init __start_xen(multiboot_info_t *mbi)
 {
     char __cmdline[] = "", *cmdline = __cmdline;
@@ -327,15 +346,8 @@ void __init __start_xen(multiboot_info_t
         initial_images_start = xenheap_phys_end;
     initial_images_end = initial_images_start + modules_length;
 
-#if defined(CONFIG_X86_32)
-    memmove((void *)initial_images_start,  /* use low mapping */
-            (void *)mod[0].mod_start,      /* use low mapping */
-            mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
-#elif defined(CONFIG_X86_64)
-    memmove(__va(initial_images_start),
-            __va(mod[0].mod_start),
-            mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
-#endif
+    move_memory(initial_images_start, 
+                mod[0].mod_start, mod[mbi->mods_count-1].mod_end);
 
     /* Initialise boot-time allocator with all RAM situated after modules. */
     xenheap_phys_start = init_boot_allocator(__pa(&_end));
@@ -383,6 +395,51 @@ void __init __start_xen(multiboot_info_t
 #endif
     }
 
+    if (opt_kdump_megabytes) {
+        unsigned long kdump_start, kdump_size, k;
+
+        /* mark images pages as free for now */
+
+        init_boot_pages(initial_images_start, initial_images_end);
+
+        kdump_start = opt_kdump_megabytes_base << 20;
+        kdump_size = opt_kdump_megabytes << 20;
+
+        printk("Kdump: %luMB (%lukB) at 0x%lx\n", 
+               kdump_size >> 20,
+               kdump_size >> 10,
+               kdump_start);
+
+        if ((kdump_start & ~PAGE_MASK) || (kdump_size & ~PAGE_MASK))
+            panic("Kdump parameters not page aligned\n");
+
+        kdump_start >>= PAGE_SHIFT;
+        kdump_size >>= PAGE_SHIFT;
+
+        /* allocate pages for Kdump memory area */
+
+        k = alloc_boot_pages_at(kdump_size, kdump_start);
+
+        if (k != kdump_start)
+            panic("Unable to reserve Kdump memory\n");
+
+        /* allocate pages for relocated initial images */
+
+        k = ((initial_images_end - initial_images_start) & ~PAGE_MASK) ? 1 : 0;
+        k += (initial_images_end - initial_images_start) >> PAGE_SHIFT;
+
+        k = alloc_boot_pages(k, 1);
+
+        if (!k)
+            panic("Unable to allocate initial images memory\n");
+
+        move_memory(k << PAGE_SHIFT, initial_images_start, initial_images_end);
+
+        initial_images_end -= initial_images_start;
+        initial_images_start = k << PAGE_SHIFT;
+        initial_images_end += initial_images_start;
+    }        
+
     memguard_init();
 
     printk("System RAM: %luMB (%lukB)\n", 
--- x/xen/arch/x86/x86_32/Makefile
+++ x/xen/arch/x86/x86_32/Makefile
@@ -3,5 +3,6 @@ obj-y += entry.o
 obj-y += mm.o
 obj-y += seg_fixup.o
 obj-y += traps.o
+obj-y += machine_kexec.o
 
 obj-$(supervisor_mode_kernel) += supervisor_mode_kernel.o
--- /dev/null
+++ x/xen/arch/x86/x86_32/machine_kexec.c
@@ -0,0 +1,27 @@
+/*
+ * arch/x86/x86_32/machine_kexec.c
+ * Handle transition of Linux booting another kernel
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Should be loosely based on arch/i386/kernel/machine_kexec.c
+ */
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <public/kexec.h>
+
+void machine_kexec(xen_kexec_image_t *image)
+{
+    printk("STUB: arch/x86/x86_32/machine_kexec.c: machine_kexec: "
+        "not implemented\n");
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/arch/x86/x86_64/Makefile
+++ x/xen/arch/x86/x86_64/Makefile
@@ -1,3 +1,4 @@
 obj-y += entry.o
 obj-y += mm.o
 obj-y += traps.o
+obj-y += machine_kexec.o
--- /dev/null
+++ x/xen/arch/x86/x86_64/machine_kexec.c
@@ -0,0 +1,28 @@
+/*
+ * arch/x86/x86_64/machine_kexec.c
+ * Handle transition of Linux booting another kernel
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Should be loosely based on arch/x86_64/kernel/machine_kexec.c
+ */
+
+#include <xen/lib.h>       /* for printk() used in stubs */
+#include <xen/types.h>
+#include <public/kexec.h>
+
+void machine_kexec(xen_kexec_image_t *image)
+{
+    printk("STUB: arch/x86/x86_64/machine_kexec.c: machine_kexec: "
+        "not implemented\n");
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/asm-x86/elf.h
@@ -0,0 +1,27 @@
+/******************************************************************************
+ * include/asm-x86/elf.h
+ * 
+ * Created By: Horms
+ *
+ */
+
+#ifndef __X86_ELF_H__
+#define __X86_ELF_H__
+
+#ifdef __x86_64__
+#include <asm/x86_64/elf.h>
+#else
+#include <asm/x86_32/elf.h>
+#endif
+
+#endif /* __X86_ELF_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/include/asm-x86/fixmap.h
+++ x/xen/include/asm-x86/fixmap.h
@@ -32,6 +32,7 @@ enum fixed_addresses {
     FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
     FIX_HPET_BASE,
     FIX_CYCLONE_TIMER,
+    FIX_KEXEC_PAGE,
     __end_of_fixed_addresses
 };
 
--- x/xen/include/asm-x86/hypercall.h
+++ x/xen/include/asm-x86/hypercall.h
@@ -6,6 +6,7 @@
 #define __ASM_X86_HYPERCALL_H__
 
 #include <public/physdev.h>
+#include <xen/types.h>
 
 extern long
 do_event_channel_op_compat(
@@ -87,6 +88,10 @@ extern long
 arch_do_vcpu_op(
     int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg);
 
+extern int
+do_kexec(
+    unsigned long op, unsigned arg1, XEN_GUEST_HANDLE(void) uarg);
+
 #ifdef __x86_64__
 
 extern long
--- x/xen/include/asm-x86/kexec.h
+++ x/xen/include/asm-x86/kexec.h
@@ -8,16 +8,16 @@
 #ifndef __X86_KEXEC_H__
 #define __X86_KEXEC_H__
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#include <asm/processor.h>
 #include <xen/types.h>
+#include <xen/string.h>
 #include <public/xen.h>
 
-static void crash_setup_regs(struct cpu_user_regs *newregs,
-			     struct cpu_user_regs *oldregs)
-{
-    printk("STUB: include/asm-x86/kexec.h: crash_setup_regs: "
-       "not implemented\n");
-}
+#ifdef __x86_64__
+#include <asm/x86_64/kexec.h>
+#else
+#include <asm/x86_32/kexec.h>
+#endif
 
 #endif /* __X86_KEXEC_H__ */
 
--- /dev/null
+++ x/xen/include/asm-x86/x86_32/elf.h
@@ -0,0 +1,30 @@
+/******************************************************************************
+ * include/asm-x86/x86_32/elf.h
+ * 
+ * Created By: Horms
+ *
+ * Should pull be based on include/asm-i386/elf.h:ELF_CORE_COPY_REGS
+ * from Linux 2.6.16
+ */
+
+#ifndef __X86_ELF_X86_32_H__
+#define __X86_ELF_X86_32_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+
+#define ELF_CORE_COPY_REGS(pr_reg, regs)                                \
+    printk("STUB: include/asm-x86/x86_32/kexec.h: ELF_CORE_COPY_REGS: " \
+       "not implemented\n")
+  
+
+#endif /* __X86_ELF_X86_32_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/asm-x86/x86_32/kexec.h
@@ -0,0 +1,51 @@
+/******************************************************************************
+ * include/asm-x86/x86_32/kexec.h
+ * 
+ * Created By: Horms
+ *
+ * Should be based heavily on include/asm-i386/kexec.h from Linux 2.6.16
+ *
+ */
+
+#ifndef __X86_32_KEXEC_H__
+#define __X86_32_KEXEC_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/xen.h>
+
+static void crash_fixup_ss_esp(struct cpu_user_regs *newregs,
+		    struct cpu_user_regs *oldregs)
+{
+    printk("STUB: include/asm-x86/x86_32/kexec.h: crash_fixup_ss_esp: "
+       "not implemented\n");
+    return;
+    crash_fixup_ss_esp(newregs, oldregs);
+}
+
+static void crash_setup_regs(struct cpu_user_regs *newregs,
+			     struct cpu_user_regs *oldregs)
+{
+    printk("STUB: include/asm-x86/x86_32/kexec.h: crash_setup_regs: "
+       "not implemented\n");
+}
+
+static inline int user_mode(struct cpu_user_regs *regs)
+{
+    printk("STUB: include/asm-x86/x86_32/kexec.h: user_mode: "
+       "not implemented\n");
+    return -1;
+}
+
+
+#endif /* __X86_32_KEXEC_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/asm-x86/x86_64/elf.h
@@ -0,0 +1,30 @@
+/******************************************************************************
+ * include/asm-x86/x86_64/elf.h
+ * 
+ * Created By: Horms
+ *
+ * Should pull be based on include/asm-x86_64/elf.h:ELF_CORE_COPY_REGS
+ * from Linux 2.6.16
+ */
+
+#ifndef __X86_ELF_X86_64_H__
+#define __X86_ELF_X86_64_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+
+#define ELF_CORE_COPY_REGS(pr_reg, regs)                                \
+    printk("STUB: include/asm-x86/x86_64/kexec.h: ELF_CORE_COPY_REGS: " \
+       "not implemented\n")
+  
+
+#endif /* __X86_ELF_X86_64_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/asm-x86/x86_64/kexec.h
@@ -0,0 +1,50 @@
+/******************************************************************************
+ * include/asm-x86/x86_64/kexec.h
+ * 
+ * Created By: Horms
+ *
+ * Should be based heavily on include/asm-x86_64/kexec.h from Linux 2.6.16
+ *
+ */
+
+#ifndef __X86_64_KEXEC_H__
+#define __X86_64_KEXEC_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/xen.h>
+
+static void crash_fixup_ss_esp(struct cpu_user_regs *newregs,
+		    struct cpu_user_regs *oldregs)
+{
+    printk("STUB: include/asm-x86/x86_64/kexec.h: crash_fixup_ss_esp: "
+       "not implemented\n");
+}
+
+static void crash_setup_regs(struct cpu_user_regs *newregs,
+			     struct cpu_user_regs *oldregs)
+{
+    printk("STUB: include/asm-x86/x86_64/kexec.h: crash_setup_regs: "
+       "not implemented\n");
+    return;
+    crash_fixup_ss_esp(newregs, oldregs);
+}
+
+static inline int user_mode(struct cpu_user_regs *regs)
+{
+    printk("STUB: include/asm-x86/x86_64/kexec.h: user_mode: "
+       "not implemented\n");
+    return -1;
+}
+
+#endif /* __X86_64_KEXEC_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/include/public/kexec.h
+++ x/xen/include/public/kexec.h
@@ -43,6 +43,8 @@
  */
 #define KEXEC_CMD_kexec_load            1
 typedef struct xen_kexec_image {
+    unsigned long page_table_a[7];
+    unsigned long page_table_b;
     unsigned long indirection_page;
     unsigned long reboot_code_buffer;
     unsigned long start_address;

[-- Attachment #4: 51.2.1.1-kexec-x86_32-upstream.patch --]
[-- Type: text/plain, Size: 27851 bytes --]

kexec: x86_32

This is the x86_32 component of kexec for xen.
The x86 component is a prerequisite for this patch.

Signed-Off-By: Horms <horms@verge.net.au>
Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>

 buildconfigs/linux-defconfig_xen_x86_32                              |    4 
 linux-2.6-xen-sparse/arch/i386/Kconfig                               |    2 
 linux-2.6-xen-sparse/arch/i386/kernel/Makefile                       |    2 
 linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c                    |   28 
 linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h                    |   42 
 linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h       |    8 
 patches/linux-2.6.16.13/1-linux-2.6.16-kexec_page_table_a_i386.patch |  457 ++++++++++
 xen/arch/x86/x86_32/entry.S                                          |    2 
 xen/arch/x86/x86_32/machine_kexec.c                                  |   28 
 xen/include/asm-x86/x86_32/elf.h                                     |   34 
 xen/include/asm-x86/x86_32/kexec.h                                   |   68 +
 11 files changed, 635 insertions(+), 40 deletions(-)

--- x/buildconfigs/linux-defconfig_xen_x86_32
+++ x/buildconfigs/linux-defconfig_xen_x86_32
@@ -184,10 +184,11 @@ CONFIG_MTRR=y
 CONFIG_REGPARM=y
 CONFIG_SECCOMP=y
 CONFIG_HZ_100=y
+CONFIG_KEXEC=y
 # CONFIG_HZ_250 is not set
 # CONFIG_HZ_1000 is not set
 CONFIG_HZ=100
-# CONFIG_CRASH_DUMP is not set
+CONFIG_CRASH_DUMP=y
 CONFIG_PHYSICAL_START=0x100000
 CONFIG_HOTPLUG_CPU=y
 
@@ -2774,6 +2775,7 @@ CONFIG_NTFS_FS=m
 #
 CONFIG_PROC_FS=y
 CONFIG_PROC_KCORE=y
+# CONFIG_PROC_VMCORE is not set
 CONFIG_SYSFS=y
 CONFIG_TMPFS=y
 # CONFIG_HUGETLB_PAGE is not set
--- x/linux-2.6-xen-sparse/arch/i386/Kconfig
+++ x/linux-2.6-xen-sparse/arch/i386/Kconfig
@@ -726,7 +726,7 @@ source kernel/Kconfig.hz
 
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && !X86_XEN
+	depends on EXPERIMENTAL
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
--- x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
+++ x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
@@ -89,7 +89,7 @@ include $(srctree)/scripts/Makefile.xen
 
 obj-y += fixup.o
 microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o
-n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o
+n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o crash.o
 
 obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
 obj-y := $(call cherrypickxen, $(obj-y))
--- x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
+++ x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
@@ -68,6 +68,10 @@
 #include "setup_arch_pre.h"
 #include <bios_ebda.h>
 
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
 /* Forward Declaration. */
 void __init find_max_pfn(void);
 
@@ -941,6 +945,7 @@ static void __init parse_cmdline_early (
 		 * after a kernel panic.
 		 */
 		else if (!memcmp(from, "crashkernel=", 12)) {
+#ifndef CONFIG_XEN
 			unsigned long size, base;
 			size = memparse(from+12, &from);
 			if (*from == '@') {
@@ -951,6 +956,10 @@ static void __init parse_cmdline_early (
 				crashk_res.start = base;
 				crashk_res.end   = base + size - 1;
 			}
+#else
+			printk("Ignoring crashkernel command line, "
+			       "parameter will be supplied by xen\n");
+#endif
 		}
 #endif
 #ifdef CONFIG_PROC_VMCORE
@@ -1320,9 +1329,22 @@ void __init setup_bootmem_allocator(void
 	}
 #endif
 #ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
 	if (crashk_res.start != crashk_res.end)
 		reserve_bootmem(crashk_res.start,
 			crashk_res.end - crashk_res.start + 1);
+#else
+	{
+		xen_kexec_reserve_t reservation;
+		BUG_ON(HYPERVISOR_kexec(KEXEC_CMD_kexec_reserve, 0,
+					&reservation));
+		if (reservation.size) {
+			crashk_res.start = reservation.start;
+			crashk_res.end = reservation.start + 
+				reservation.size - 1;
+		}
+	}
+#endif
 #endif
 
 	if (!xen_feature(XENFEAT_auto_translated_physmap))
@@ -1378,6 +1400,9 @@ legacy_init_iomem_resources(struct e820e
 		res->end = res->start + e820[i].size - 1;
 		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 		request_resource(&iomem_resource, res);
+#ifdef CONFIG_KEXEC
+		request_resource(res, &crashk_res);
+#endif
 #ifndef CONFIG_XEN
 		if (e820[i].type == E820_RAM) {
 			/*
@@ -1387,9 +1412,6 @@ legacy_init_iomem_resources(struct e820e
 			 */
 			request_resource(res, code_resource);
 			request_resource(res, data_resource);
-#ifdef CONFIG_KEXEC
-			request_resource(res, &crashk_res);
-#endif
 		}
 #endif
 	}
--- /dev/null
+++ x/linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h
@@ -0,0 +1,42 @@
+/*
+ * include/asm-i386/kexec-xen.h
+ *
+ * Created By: Horms <horms@verge.net.au>
+ */
+
+#ifndef _I386_KEXEC_XEN_H
+#define _I386_KEXEC_XEN_H
+
+#include <asm/ptrace.h>
+#include <asm/types.h>
+#include <xen/interface/arch-x86_32.h>
+
+static inline void crash_translate_regs(struct pt_regs *linux_regs,
+					struct cpu_user_regs *xen_regs)
+{
+	xen_regs->ebx    = linux_regs->ebx;
+	xen_regs->ecx    = linux_regs->ecx;
+	xen_regs->edx    = linux_regs->edx;
+	xen_regs->esi    = linux_regs->esi;
+	xen_regs->edi    = linux_regs->edi;
+	xen_regs->ebp    = linux_regs->ebp;
+	xen_regs->eax    = linux_regs->eax;
+	xen_regs->esp    = linux_regs->esp;
+	xen_regs->ss     = linux_regs->xss;
+	xen_regs->cs     = linux_regs->xcs;
+	xen_regs->ds     = linux_regs->xds;
+	xen_regs->es     = linux_regs->xes;
+	xen_regs->eflags = linux_regs->eflags;
+}
+
+#endif /* _I386_KEXEC_XEN_H */
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
+++ x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
@@ -359,5 +359,13 @@ HYPERVISOR_xenoprof_op(
 	return _hypercall2(int, xenoprof_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_kexec(
+	unsigned long op, unsigned int arg1, void * extra_args)
+{
+	return _hypercall3(int, kexec_op, op, arg1, extra_args);
+}
+
+
 
 #endif /* __HYPERCALL_H__ */
--- x/xen/arch/x86/x86_32/entry.S
+++ x/xen/arch/x86/x86_32/entry.S
@@ -648,6 +648,7 @@ ENTRY(hypercall_table)
         .long do_xenoprof_op
         .long do_event_channel_op
         .long do_physdev_op
+        .long do_kexec
         .rept NR_hypercalls-((.-hypercall_table)/4)
         .long do_ni_hypercall
         .endr
@@ -687,6 +688,7 @@ ENTRY(hypercall_args_table)
         .byte 2 /* do_xenoprof_op       */
         .byte 2 /* do_event_channel_op  */
         .byte 2 /* do_physdev_op        */
+        .byte 3 /* do_kexec             */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
--- x/xen/arch/x86/x86_32/machine_kexec.c
+++ x/xen/arch/x86/x86_32/machine_kexec.c
@@ -1,19 +1,31 @@
-/*
+/******************************************************************************
  * arch/x86/x86_32/machine_kexec.c
- * Handle transition of Linux booting another kernel
- *
- * Created By: Horms <horms@verge.net.au>
+ * 
+ * Created By: Horms
  *
- * Should be losely based on arch/i386/kernel/machine_kexec.c
+ * Based heavily on arch/i386/machine_kexec.c from Linux 2.6.16
  */
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
 #include <public/kexec.h>
+#include <asm/fixmap.h>
+#include <asm/processor.h>
+
+typedef asmlinkage void (*relocate_new_kernel_t)(
+                    unsigned long indirection_page,
+                    unsigned long reboot_code_buffer,
+                    unsigned long start_address,
+                    unsigned long page_table_a,
+                    unsigned long has_pae);
 
 void machine_kexec(xen_kexec_image_t *image)
 {
-    printk("STUB: arch/x86/x86_32/machine_kexec.c: machine_kexec: "
-        "not implemented\n");
+    relocate_new_kernel_t rnk;
+
+    rnk = (relocate_new_kernel_t) fix_to_virt(FIX_KEXEC_PAGE);
+    (*rnk)(image->indirection_page, image->reboot_code_buffer, 
+           image->start_address, image->page_table_a[0],
+           (unsigned long)cpu_has_pae);
 }
 
 /*
--- x/xen/include/asm-x86/x86_32/elf.h
+++ x/xen/include/asm-x86/x86_32/elf.h
@@ -3,19 +3,39 @@
  * 
  * Created By: Horms
  *
- * Should pull be based on include/asm-i386/elf.h:ELF_CORE_COPY_REGS
- * from Linux 2.6.16
+ * Based heavily on include/asm-i386/elf.h and 
+ * include/asm-i386/system.h from Linux 2.6.16
  */
 
 #ifndef __X86_ELF_X86_32_H__
 #define __X86_ELF_X86_32_H__
 
-#include <xen/lib.h>       /* for printk() used in stub */
+/* XXX: Xen doesn't have orig_eax.  For kdump, on a dom0 crash, the values
+ * for the crashing CPU could be passed down from dom0, but is that
+ * necessary?
+ * Also, I'm not sure why fs and gs are derived from the CPU
+ * rather than regs */
 
-#define ELF_CORE_COPY_REGS(pr_reg, regs)                                \
-    printk("STUB: include/asm-x86/x86_32/kexec.h: ELF_CORE_COPY_REGS: " \
-       "not implemented\n")
-  
+#define ELF_CORE_COPY_REGS(pr_reg, regs) do {                           \
+    unsigned i;                                                         \
+    pr_reg[0] = regs->ebx;                                              \
+    pr_reg[1] = regs->ecx;                                              \
+    pr_reg[2] = regs->edx;                                              \
+    pr_reg[3] = regs->esi;                                              \
+    pr_reg[4] = regs->edi;                                              \
+    pr_reg[5] = regs->ebp;                                              \
+    pr_reg[6] = regs->eax;                                              \
+    pr_reg[7] = regs->ds;                                               \
+    pr_reg[8] = regs->es;                                               \
+    asm volatile("mov %%fs,%0":"=rm" (i)); pr_reg[9]  = i;              \
+    asm volatile("mov %%gs,%0":"=rm" (i)); pr_reg[10] = i;              \
+    pr_reg[11] = 0; /* regs->orig_eax; */                               \
+    pr_reg[12] = regs->eip;                                             \
+    pr_reg[13] = regs->cs;                                              \
+    pr_reg[14] = regs->eflags;                                          \
+    pr_reg[15] = regs->esp;                                             \
+    pr_reg[16] = regs->ss;                                              \
+} while(0);
 
 #endif /* __X86_ELF_X86_32_H__ */
 
--- x/xen/include/asm-x86/x86_32/kexec.h
+++ x/xen/include/asm-x86/x86_32/kexec.h
@@ -3,42 +3,72 @@
  * 
  * Created By: Horms
  *
- * Should be based heavily on include/asm-i386/kexec.h from Linux 2.6.16
- *
+ * Based heavily on include/asm-i386/kexec.h from Linux 2.6.16
  */
 
-#ifndef __X86_32_KEXEC_H__
-#define __X86_32_KEXEC_H__
-
-#include <xen/lib.h>       /* for printk() used in stub */
-#include <xen/types.h>
-#include <public/xen.h>
+#ifndef __X86_KEXEC_X86_32_H__
+#define __X86_KEXEC_X86_32_H__
 
+/* CPU does not save ss and esp on stack if execution is already
+ * running in kernel mode at the time of NMI occurrence. This code
+ * fixes it.
+ */
 static void crash_fixup_ss_esp(struct cpu_user_regs *newregs,
-		    struct cpu_user_regs *oldregs)
+                    struct cpu_user_regs *oldregs)
 {
-    printk("STUB: include/asm-x86/x86_32/kexec.h: crash_fixup_ss_esp: "
-       "not implemented\n");
-    return;
-    crash_fixup_ss_esp(newregs, oldregs);
+    memcpy(newregs, oldregs, sizeof(*newregs));
+    newregs->esp = (unsigned long)&(oldregs->esp);
+    __asm__ __volatile__(
+            "xorl %%eax, %%eax\n\t"
+            "movw %%ss, %%ax\n\t"
+            :"=a"(newregs->ss));
 }
 
+/*
+ * This function is responsible for capturing register states if coming
+ * via panic otherwise just fix up the ss and esp if coming via kernel
+ * mode exception.
+ */
 static void crash_setup_regs(struct cpu_user_regs *newregs,
 			     struct cpu_user_regs *oldregs)
 {
-    printk("STUB: include/asm-x86/x86_32/kexec.h: crash_setup_regs: "
-       "not implemented\n");
+    if (oldregs)
+        crash_fixup_ss_esp(newregs, oldregs);
+    else {
+        __asm__ __volatile__("movl %%ebx,%0" : "=m"(newregs->ebx));
+        __asm__ __volatile__("movl %%ecx,%0" : "=m"(newregs->ecx));
+        __asm__ __volatile__("movl %%edx,%0" : "=m"(newregs->edx));
+        __asm__ __volatile__("movl %%esi,%0" : "=m"(newregs->esi));
+        __asm__ __volatile__("movl %%edi,%0" : "=m"(newregs->edi));
+        __asm__ __volatile__("movl %%ebp,%0" : "=m"(newregs->ebp));
+        __asm__ __volatile__("movl %%eax,%0" : "=m"(newregs->eax));
+        __asm__ __volatile__("movl %%esp,%0" : "=m"(newregs->esp));
+        __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(newregs->ss));
+        __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(newregs->cs));
+        __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(newregs->ds));
+        __asm__ __volatile__("movw %%es, %%ax;" :"=a"(newregs->es));
+        __asm__ __volatile__("pushfl; popl %0" :"=m"(newregs->eflags));
+
+        newregs->eip = (unsigned long)current_text_addr();
+    }
 }
 
+/*              
+ * From Linux 2.6.16's include/asm-i386/mach-xen/asm/ptrace.h
+ *
+ * user_mode_vm(regs) determines whether a register set came from user mode.
+ * This is true if V8086 mode was enabled OR if the register set was from
+ * protected mode with RPL-3 CS value.  This tricky test checks that with
+ * one comparison.  Many places in the kernel can bypass this full check
+ * if they have already ruled out V8086 mode, so user_mode(regs) can be used.
+ */
 static inline int user_mode(struct cpu_user_regs *regs)
 {
-    printk("STUB: include/asm-x86/x86_32/kexec.h: user_mode: "
-       "not implemented\n");
-    return -1;
+    return (regs->cs & 2) != 0;
 }
 
 
-#endif /* __X86_32_KEXEC_H__ */
+#endif /* __X86_KEXEC_X86_32_H__ */
 
 /*
  * Local variables:
--- /dev/null
+++ x/patches/linux-2.6.16.13/1-linux-2.6.16-kexec_page_table_a_i386.patch
@@ -0,0 +1,457 @@
+kexec: Avoid overwriting the current pgd (V2, i386)
+
+This patch upgrades the i386-specific kexec code to avoid overwriting the
+current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used
+to start a secondary kernel that dumps the memory of the previous kernel.
+
+The code introduces a new set of page tables called "page_table_a". These
+tables are used to provide an executable identity mapping without overwriting
+the current pgd. This updated version of the patch fixes a PAE bug and moves
+the segment handling code into relocate_kernel.S.
+
+Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
+---
+
+ The patch has been tested with regular kexec and CONFIG_CRASH_DUMP.
+ Both PAE and non-PAE configurations work well.
+ Applies on top of 2.6.16 and 2.6.17-rc4.
+
+ arch/i386/kernel/machine_kexec.c   |  230 ++++++++++++++----------------------
+ arch/i386/kernel/relocate_kernel.S |   92 ++++++++++++++
+ include/asm-i386/kexec.h           |   12 +
+ 3 files changed, 192 insertions(+), 142 deletions(-)
+
+--- x/arch/i386/kernel/machine_kexec.c
++++ x/arch/i386/kernel/machine_kexec.c
+@@ -2,6 +2,10 @@
+  * machine_kexec.c - handle transition of Linux booting another kernel
+  * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
+  *
++ * 2006-05-19 Magnus Damm <damm@opensource.se>:
++ * - rewrote identity map code to avoid overwriting current pgd
++ * - moved segment handling code into relocate_kernel.S
++ *
+  * This source code is licensed under the GNU General Public License,
+  * Version 2.  See the file COPYING for more details.
+  */
+@@ -19,123 +23,73 @@
+ #include <asm/desc.h>
+ #include <asm/system.h>
+ 
+-#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+-
+-#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+-#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+-#define L2_ATTR (_PAGE_PRESENT)
+-
+-#define LEVEL0_SIZE (1UL << 12UL)
++typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)(
++					unsigned long indirection_page,
++					unsigned long reboot_code_buffer,
++					unsigned long start_address,
++					unsigned long page_table_a,
++					unsigned long has_pae) ATTRIB_NORET;
+ 
+-#ifndef CONFIG_X86_PAE
+-#define LEVEL1_SIZE (1UL << 22UL)
+-static u32 pgtable_level1[1024] PAGE_ALIGNED;
++const extern unsigned char relocate_new_kernel[];
++extern void relocate_new_kernel_end(void);
++const extern unsigned int relocate_new_kernel_size;
+ 
+-static void identity_map_page(unsigned long address)
++static int allocate_page_table_a(struct kimage *image)
+ {
+-	unsigned long level1_index, level2_index;
+-	u32 *pgtable_level2;
+-
+-	/* Find the current page table */
+-	pgtable_level2 = __va(read_cr3());
++	struct kimage_arch *arch = &image->arch_data;
++	struct page *page;
++	int k = sizeof(arch->page_table_a) / sizeof(arch->page_table_a[0]);
++
++	for (; k > 0; k--) {
++		page = kimage_alloc_control_pages(image, 0);
++		if (!page)
++			return -ENOMEM;
++
++		clear_page(page_address(page));
++		arch->page_table_a[k - 1] = page;
++	}
+ 
+-	/* Find the indexes of the physical address to identity map */
+-	level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
+-	level2_index = address / LEVEL1_SIZE;
+-
+-	/* Identity map the page table entry */
+-	pgtable_level1[level1_index] = address | L0_ATTR;
+-	pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
+-
+-	/* Flush the tlb so the new mapping takes effect.
+-	 * Global tlb entries are not flushed but that is not an issue.
+-	 */
+-	load_cr3(pgtable_level2);
++	return 0;
+ }
+ 
+-#else
+-#define LEVEL1_SIZE (1UL << 21UL)
+-#define LEVEL2_SIZE (1UL << 30UL)
+-static u64 pgtable_level1[512] PAGE_ALIGNED;
+-static u64 pgtable_level2[512] PAGE_ALIGNED;
+-
+-static void identity_map_page(unsigned long address)
+-{
+-	unsigned long level1_index, level2_index, level3_index;
+-	u64 *pgtable_level3;
++/* workaround for include/asm-i386/pgtable-3level.h */
+ 
+-	/* Find the current page table */
+-	pgtable_level3 = __va(read_cr3());
+-
+-	/* Find the indexes of the physical address to identity map */
+-	level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
+-	level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE;
+-	level3_index = address / LEVEL2_SIZE;
+-
+-	/* Identity map the page table entry */
+-	pgtable_level1[level1_index] = address | L0_ATTR;
+-	pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
+-	set_64bit(&pgtable_level3[level3_index],
+-					       __pa(pgtable_level2) | L2_ATTR);
+-
+-	/* Flush the tlb so the new mapping takes effect.
+-	 * Global tlb entries are not flushed but that is not an issue.
+-	 */
+-	load_cr3(pgtable_level3);
+-}
++#ifdef CONFIG_X86_PAE
++#undef pgd_present
++#define pgd_present(pgd) (pgd_val(pgd) & _PAGE_PRESENT)
++#define _PGD_ATTR _PAGE_PRESENT
++#else
++#define _PGD_ATTR _KERNPG_TABLE
+ #endif
+ 
+-static void set_idt(void *newidt, __u16 limit)
+-{
+-	struct Xgt_desc_struct curidt;
+-
+-	/* ia32 supports unaliged loads & stores */
+-	curidt.size    = limit;
+-	curidt.address = (unsigned long)newidt;
+-
+-	load_idt(&curidt);
+-};
++#define pa_page(page) __pa(page_address(page))
+ 
+-
+-static void set_gdt(void *newgdt, __u16 limit)
++static int create_mapping(struct page *root, struct page **pages, 
++			  unsigned long va, unsigned long pa)
+ {
+-	struct Xgt_desc_struct curgdt;
+-
+-	/* ia32 supports unaligned loads & stores */
+-	curgdt.size    = limit;
+-	curgdt.address = (unsigned long)newgdt;
++	pgd_t *pgd;
++	pud_t *pud;
++	pmd_t *pmd;
++	pte_t *pte;
++	int k = 0;
+ 
+-	load_gdt(&curgdt);
+-};
++	pgd = (pgd_t *)page_address(root) + pgd_index(va);
++	if (!pgd_present(*pgd))
++		set_pgd(pgd, __pgd(pa_page(pages[k++]) | _PGD_ATTR));
+ 
+-static void load_segments(void)
+-{
+-#define __STR(X) #X
+-#define STR(X) __STR(X)
++	pud = pud_offset(pgd, va);
++	if (!pud_present(*pud))
++		set_pud(pud, __pud(pa_page(pages[k++]) | _KERNPG_TABLE));
+ 
+-	__asm__ __volatile__ (
+-		"\tljmp $"STR(__KERNEL_CS)",$1f\n"
+-		"\t1:\n"
+-		"\tmovl $"STR(__KERNEL_DS)",%%eax\n"
+-		"\tmovl %%eax,%%ds\n"
+-		"\tmovl %%eax,%%es\n"
+-		"\tmovl %%eax,%%fs\n"
+-		"\tmovl %%eax,%%gs\n"
+-		"\tmovl %%eax,%%ss\n"
+-		::: "eax", "memory");
+-#undef STR
+-#undef __STR
+-}
++	pmd = pmd_offset(pud, va);
++	if (!pmd_present(*pmd))
++		set_pmd(pmd, __pmd(pa_page(pages[k++]) | _KERNPG_TABLE));
+ 
+-typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)(
+-					unsigned long indirection_page,
+-					unsigned long reboot_code_buffer,
+-					unsigned long start_address,
+-					unsigned int has_pae) ATTRIB_NORET;
++	pte = (pte_t *)page_address(pmd_page(*pmd)) + pte_index(va);
++	set_pte(pte, __pte(pa | _PAGE_KERNEL_EXEC));
+ 
+-const extern unsigned char relocate_new_kernel[];
+-extern void relocate_new_kernel_end(void);
+-const extern unsigned int relocate_new_kernel_size;
++	return k;
++}
+ 
+ /*
+  * A architecture hook called to validate the
+@@ -147,11 +101,38 @@ const extern unsigned int relocate_new_k
+  * Do what every setup is needed on image and the
+  * reboot code buffer to allow us to avoid allocations
+  * later.
+- *
+- * Currently nothing.
+  */
+ int machine_kexec_prepare(struct kimage *image)
+ {
++	void *control_page;
++	unsigned long pa;
++	int k;
++
++	memset(&image->arch_data, 0, sizeof(image->arch_data));
++
++	k = allocate_page_table_a(image);
++	if (k)
++		return k;
++
++	/* fill in control_page with assembly code */
++
++	control_page = page_address(image->control_code_page);
++	memcpy(control_page, relocate_new_kernel, relocate_new_kernel_size);
++
++	/* map the control_page at the virtual address of relocate_kernel.S */
++
++	pa = __pa(control_page);
++
++	k = create_mapping(image->arch_data.page_table_a[0], 
++			   &image->arch_data.page_table_a[1],
++			   (unsigned long)relocate_new_kernel, pa);
++
++	/* identity map the control_page */
++
++	create_mapping(image->arch_data.page_table_a[0], 
++		       &image->arch_data.page_table_a[k + 1],
++		       pa, pa);
++
+ 	return 0;
+ }
+ 
+@@ -170,45 +151,16 @@ void machine_kexec_cleanup(struct kimage
+ NORET_TYPE void machine_kexec(struct kimage *image)
+ {
+ 	unsigned long page_list;
+-	unsigned long reboot_code_buffer;
+-
++	unsigned long control_code;
++	unsigned long page_table_a;
+ 	relocate_new_kernel_t rnk;
+ 
+-	/* Interrupts aren't acceptable while we reboot */
+-	local_irq_disable();
+-
+-	/* Compute some offsets */
+-	reboot_code_buffer = page_to_pfn(image->control_code_page)
+-								<< PAGE_SHIFT;
+ 	page_list = image->head;
+-
+-	/* Set up an identity mapping for the reboot_code_buffer */
+-	identity_map_page(reboot_code_buffer);
+-
+-	/* copy it out */
+-	memcpy((void *)reboot_code_buffer, relocate_new_kernel,
+-						relocate_new_kernel_size);
+-
+-	/* The segment registers are funny things, they are
+-	 * automatically loaded from a table, in memory wherever you
+-	 * set them to a specific selector, but this table is never
+-	 * accessed again you set the segment to a different selector.
+-	 *
+-	 * The more common model is are caches where the behide
+-	 * the scenes work is done, but is also dropped at arbitrary
+-	 * times.
+-	 *
+-	 * I take advantage of this here by force loading the
+-	 * segments, before I zap the gdt with an invalid value.
+-	 */
+-	load_segments();
+-	/* The gdt & idt are now invalid.
+-	 * If you want to load them you must set up your own idt & gdt.
+-	 */
+-	set_gdt(phys_to_virt(0),0);
+-	set_idt(phys_to_virt(0),0);
++	control_code = __pa(page_address(image->control_code_page));
++	page_table_a = __pa(page_address(image->arch_data.page_table_a[0]));
+ 
+ 	/* now call it */
+-	rnk = (relocate_new_kernel_t) reboot_code_buffer;
+-	(*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae);
++	rnk = (relocate_new_kernel_t) relocate_new_kernel;
++	(*rnk)(page_list, control_code, image->start, 
++	       page_table_a, (unsigned long)cpu_has_pae);
+ }
+--- x/arch/i386/kernel/relocate_kernel.S
++++ x/arch/i386/kernel/relocate_kernel.S
+@@ -2,12 +2,20 @@
+  * relocate_kernel.S - put the kernel image in place to boot
+  * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
+  *
++ * 2006-05-19 Magnus Damm <damm@opensource.se>:
++ * - moved segment handling code from machine_kexec.c
++ * - gdt tables stolen from arch/i386/boot/setup.S
++ *
+  * This source code is licensed under the GNU General Public License,
+  * Version 2.  See the file COPYING for more details.
+  */
+ 
+ #include <linux/linkage.h>
++#include <asm/page.h>
+ 
++.text
++.align (1 << PAGE_SHIFT)
++	
+ 	/*
+ 	 * Must be relocatable PIC code callable as a C function, that once
+ 	 * it starts can not use the previous processes stack.
+@@ -18,18 +26,68 @@ relocate_new_kernel:
+ 	movl  4(%esp), %ebx /* page_list */
+ 	movl  8(%esp), %ebp /* reboot_code_buffer */
+ 	movl  12(%esp), %edx /* start address */
+-	movl  16(%esp), %ecx /* cpu_has_pae */
++	movl  16(%esp), %edi /* page_table_a */
++	movl  20(%esp), %ecx /* cpu_has_pae */
+ 
+ 	/* zero out flags, and disable interrupts */
+ 	pushl $0
+ 	popfl
+ 
++	/* switch to page_table_a */
++	movl	%edi, %eax
++	movl	%eax, %cr3
++
++	/* setup idt */
++
++	movl	%ebp, %eax
++	addl	$(idt_48 - relocate_new_kernel), %eax
++	lidtl	(%eax)
++
++	/* setup gdt */
++
++	movl	%ebp, %eax
++	addl	$(gdt - relocate_new_kernel), %eax
++	movl	%ebp, %esi
++	addl	$((gdt_48 - relocate_new_kernel) + 2), %esi
++	movl	%eax, (%esi)
++	
++	movl	%ebp, %eax
++	addl	$(gdt_48 - relocate_new_kernel), %eax
++	lgdtl	(%eax)
++
++	/* setup data segment registers */
++	
++	mov	$(gdt_ds - gdt), %eax
++	mov	%eax, %ds
++	mov	%eax, %es
++	mov	%eax, %fs
++	mov	%eax, %gs
++	mov	%eax, %ss
++
+ 	/* set a new stack at the bottom of our page... */
+ 	lea   4096(%ebp), %esp
+ 
++	/* load new code segment */
++
++	movl	%ebp, %esi
++	xorl	%eax, %eax
++	pushl	%eax
++	pushl	%esi
++	pushl	%eax
++	
++	movl	$(gdt_cs - gdt), %eax
++	pushl	%eax
++	
++	movl	%ebp, %eax
++	addl	$(identity_mapped - relocate_new_kernel),%eax
++	pushl	%eax
++	iretl
++
++identity_mapped:	
++
+ 	/* store the parameters back on the stack */
+ 	pushl   %edx /* store the start address */
+-
++	
+ 	/* Set cr0 to a known state:
+ 	 * 31 0 == Paging disabled
+ 	 * 18 0 == Alignment check disabled
+@@ -113,6 +171,36 @@ relocate_new_kernel:
+ 	xorl    %edi, %edi
+ 	xorl    %ebp, %ebp
+ 	ret
++
++	.align	16
++gdt:
++	.fill	1,8,0
++
++gdt_cs:	
++	.word	0xFFFF				# 4Gb - (0x100000*0x1000 = 4Gb)
++	.word	0				# base address = 0
++	.word	0x9A00				# code read/exec
++	.word	0x00CF				# granularity = 4096, 386
++						#  (+5th nibble of limit)
++gdt_ds:
++	.word	0xFFFF				# 4Gb - (0x100000*0x1000 = 4Gb)
++	.word	0				# base address = 0
++	.word	0x9200				# data read/write
++	.word	0x00CF				# granularity = 4096, 386
++						#  (+5th nibble of limit)
++gdt_end:
++	.align	4
++	
++	.word	0				# alignment byte
++idt_48:
++	.word	0				# idt limit = 0
++	.word	0, 0				# idt base = 0L
++
++	.word	0				# alignment byte
++gdt_48:
++	.word	gdt_end - gdt - 1		# gdt limit
++	.word	0, 0				# gdt base (filled in later)
++	
+ relocate_new_kernel_end:
+ 
+ 	.globl relocate_new_kernel_size
+--- x/include/asm-i386/kexec.h
++++ x/include/asm-i386/kexec.h
+@@ -29,7 +29,17 @@
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
+-struct kimage_arch {};
++struct kimage_arch {
++       /* page_table_a[] holds enough pages to create a new page table
++        * that maps the control page twice..
++        */
++
++#if defined(CONFIG_X86_PAE)
++       struct page *page_table_a[5]; /* (2 * pte) + (2 * pmd) + pgd */
++#else
++       struct page *page_table_a[3]; /* (2 * pte) + pgd */
++#endif
++};
+ 
+ /* CPU does not save ss and esp on stack if execution is already
+  * running in kernel mode at the time of NMI occurrence. This code

[-- Attachment #5: 51.2.1.2-kexec-x86_64-upstream.patch --]
[-- Type: text/plain, Size: 23387 bytes --]

kexec: x86_64

This is the x86_64 component of kexec for xen.
The x86 component is a prerequsite for this patch.

Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>
Signed-Off-By: Horms <horms@verge.net.au>

 buildconfigs/linux-defconfig_xen_x86_64                                    |    2 
 linux-2.6-xen-sparse/arch/x86_64/Kconfig                                   |    2 
 linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile                           |    2 
 linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c                        |   26 
 linux-2.6-xen-sparse/include/asm-x86_64/kexec-xen.h                        |   30 
 linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h           |    7 
 linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h              |    2 
 patches/linux-2.6.16.13/2-linux-2.6.16-kexec_page_table_a_x86_64.patch     |  421 ++++++++++
 patches/linux-2.6.16.13/3-linux-2.6.16-kexec_page_table_a_x86_64-xen.patch |  116 ++
 xen/arch/x86/x86_64/entry.S                                                |    2 
 xen/arch/x86/x86_64/machine_kexec.c                                        |   16 
 11 files changed, 621 insertions(+), 5 deletions(-)

--- x/buildconfigs/linux-defconfig_xen_x86_64
+++ x/buildconfigs/linux-defconfig_xen_x86_64
@@ -139,6 +139,8 @@ CONFIG_SWIOTLB=y
 CONFIG_PHYSICAL_START=0x100000
 CONFIG_SECCOMP=y
 CONFIG_HZ_100=y
+CONFIG_KEXEC=y
+CONFIG_CRASH_DUMP=y
 # CONFIG_HZ_250 is not set
 # CONFIG_HZ_1000 is not set
 CONFIG_HZ=100
--- x/linux-2.6-xen-sparse/arch/x86_64/Kconfig
+++ x/linux-2.6-xen-sparse/arch/x86_64/Kconfig
@@ -433,7 +433,7 @@ config X86_MCE_AMD
 
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && !X86_64_XEN
+	depends on EXPERIMENTAL
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
--- x/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile
+++ x/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile
@@ -59,7 +59,7 @@ pci-dma-y			+= ../../i386/kernel/pci-dma
 microcode-$(subst m,y,$(CONFIG_MICROCODE))  := ../../i386/kernel/microcode-xen.o
 quirks-y			:= ../../i386/kernel/quirks-xen.o
 
-n-obj-xen := i8259.o reboot.o i8237.o smpboot.o trampoline.o
+n-obj-xen := i8259.o reboot.o i8237.o smpboot.o trampoline.o crash.o
 
 include $(srctree)/scripts/Makefile.xen
 
--- x/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c
+++ x/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c
@@ -79,6 +79,10 @@
 #include <asm/mach-xen/setup_arch_post.h>
 #include <xen/interface/memory.h>
 
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
 extern unsigned long start_pfn;
 extern struct edid_info edid_info;
 
@@ -446,6 +450,7 @@ static __init void parse_cmdline_early (
 		 * after a kernel panic.
 		 */
 		else if (!memcmp(from, "crashkernel=", 12)) {
+#ifndef CONFIG_XEN
 			unsigned long size, base;
 			size = memparse(from+12, &from);
 			if (*from == '@') {
@@ -456,6 +461,10 @@ static __init void parse_cmdline_early (
 				crashk_res.start = base;
 				crashk_res.end   = base + size - 1;
 			}
+#else
+			printk("Ignoring crashkernel command line, "
+			       "parameter will be supplied by xen\n");
+#endif
 		}
 #endif
 
@@ -810,10 +819,23 @@ void __init setup_arch(char **cmdline_p)
 #endif
 #endif	/* !CONFIG_XEN */
 #ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
 	if (crashk_res.start != crashk_res.end) {
 		reserve_bootmem(crashk_res.start,
 			crashk_res.end - crashk_res.start + 1);
 	}
+#else
+	{
+		xen_kexec_reserve_t reservation;
+		BUG_ON(HYPERVISOR_kexec(KEXEC_CMD_kexec_reserve, 0,
+					&reservation));
+		if (reservation.size) {
+			crashk_res.start = reservation.start;
+			crashk_res.end = reservation.start + 
+				reservation.size - 1;
+		}
+	}
+#endif
 #endif
 
 	paging_init();
@@ -972,6 +994,10 @@ void __init setup_arch(char **cmdline_p)
 	iommu_hole_init();
 #endif
 
+#ifdef CONFIG_KEXEC
+	request_resource(&ioport_resource, &crashk_res);
+#endif
+
 #ifdef CONFIG_XEN
 	{
 		struct physdev_set_iopl set_iopl;
--- /dev/null
+++ x/linux-2.6-xen-sparse/include/asm-x86_64/kexec-xen.h
@@ -0,0 +1,30 @@
+/*
+ * include/asm-x86_64/kexec-xen.h
+ *
+ * Created By: Horms <horms@verge.net.au>
+ */
+
+#ifndef _X86_64_KEXEC_XEN_H
+#define _X86_64_KEXEC_XEN_H
+
+#include <asm/ptrace.h>
+#include <asm/types.h>
+#include <xen/interface/arch-x86_64.h>
+
+static inline void crash_translate_regs(struct pt_regs *linux_regs,
+					struct cpu_user_regs *xen_regs)
+{
+#warning Implement me!
+}
+
+#endif /* _X86_64_KEXEC_XEN_H */
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- x/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h
+++ x/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h
@@ -360,4 +360,11 @@ HYPERVISOR_xenoprof_op(
 	return _hypercall2(int, xenoprof_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_kexec(
+	unsigned long op, unsigned int arg1, void * extra_args)
+{
+	return _hypercall3(int, kexec_op, op, arg1, extra_args);
+}
+
 #endif /* __HYPERCALL_H__ */
--- x/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h
+++ x/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h
@@ -90,6 +90,8 @@ extern unsigned long profile_pc(struct p
 #define profile_pc(regs) instruction_pointer(regs)
 #endif
 
+#include <linux/compiler.h>
+
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
 
 struct task_struct;
--- x/xen/arch/x86/x86_64/entry.S
+++ x/xen/arch/x86/x86_64/entry.S
@@ -556,6 +556,7 @@ ENTRY(hypercall_table)
         .quad do_xenoprof_op
         .quad do_event_channel_op
         .quad do_physdev_op
+        .quad do_kexec
         .rept NR_hypercalls-((.-hypercall_table)/8)
         .quad do_ni_hypercall
         .endr
@@ -595,6 +596,7 @@ ENTRY(hypercall_args_table)
         .byte 2 /* do_xenoprof_op       */
         .byte 2 /* do_event_channel_op  */
         .byte 2 /* do_physdev_op        */
+        .byte 3 /* do_kexec             */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
--- x/xen/arch/x86/x86_64/machine_kexec.c
+++ x/xen/arch/x86/x86_64/machine_kexec.c
@@ -7,14 +7,24 @@
  * Should be losely based on arch/x86_64/kernel/machine_kexec.c
  */
 
-#include <xen/lib.h>       /* for printk() used in stubs */
 #include <xen/types.h>
 #include <public/kexec.h>
+#include <asm/fixmap.h>
+
+typedef void (*relocate_new_kernel_t)(unsigned long indirection_page,
+                                      unsigned long control_code_buffer,
+                                      unsigned long start_address,
+                                      unsigned long page_table_a,
+                                      unsigned long page_table_b);
 
 void machine_kexec(xen_kexec_image_t *image)
 {
-    printk("STUB: arch/x86/x86_64/machine_kexec.c: machine_kexec: "
-        "not implemented\n");
+    relocate_new_kernel_t rnk;
+
+    rnk = (relocate_new_kernel_t) fix_to_virt(FIX_KEXEC_PAGE);
+    (*rnk)(image->indirection_page, image->reboot_code_buffer, 
+           image->start_address, image->page_table_a[0],
+           image->page_table_b);
 }
 
 /*
--- /dev/null
+++ x/patches/linux-2.6.16.13/2-linux-2.6.16-kexec_page_table_a_x86_64.patch
@@ -0,0 +1,421 @@
+kexec: Avoid overwriting the current pgd (V2, x86_64)
+
+This patch upgrades the x86_64-specific kexec code to avoid overwriting the
+current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used
+to start a secondary kernel that dumps the memory of the previous kernel.
+
+The code introduces a new set of page tables called "page_table_a". These
+tables are used to provide an executable identity mapping without overwriting
+the current pgd. The already existing page table is renamed to "page_table_b".
+
+KEXEC_CONTROL_CODE_SIZE is changed into a single page. This updated version of
+the patch also moves the segment handling code into relocate_kernel.S.
+
+Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
+---
+
+ The patch has been tested with regular kexec and CONFIG_CRASH_DUMP.
+ Applies on top of 2.6.16 and 2.6.17-rc4.
+
+ arch/x86_64/kernel/machine_kexec.c   |  193 +++++++++++++++++-----------------
+ arch/x86_64/kernel/relocate_kernel.S |   84 +++++++++++++-
+ include/asm-x86_64/kexec.h           |   15 ++
+ 3 files changed, 189 insertions(+), 103 deletions(-)
+
+--- x/arch/x86_64/kernel/machine_kexec.c
++++ x/arch/x86_64/kernel/machine_kexec.c
+@@ -2,6 +2,10 @@
+  * machine_kexec.c - handle transition of Linux booting another kernel
+  * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
+  *
++ * 2006-05-19 Magnus Damm <damm@opensource.se>:
++ * - rewrote identity map code to avoid overwriting current pgd
++ * - moved segment handling code into relocate_kernel.S
++ *
+  * This source code is licensed under the GNU General Public License,
+  * Version 2.  See the file COPYING for more details.
+  */
+@@ -96,81 +100,110 @@ out:
+ }
+ 
+ 
+-static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
++static int create_page_table_b(struct kimage *image)
+ {
+-	pgd_t *level4p;
+-	level4p = (pgd_t *)__va(start_pgtable);
+- 	return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
+-}
++	struct kimage_arch *arch = &image->arch_data;
+ 
+-static void set_idt(void *newidt, u16 limit)
+-{
+-	struct desc_ptr curidt;
++	arch->page_table_b = kimage_alloc_control_pages(image, 0);
+ 
+-	/* x86-64 supports unaliged loads & stores */
+-	curidt.size    = limit;
+-	curidt.address = (unsigned long)newidt;
++	if (!arch->page_table_b)
++		return -ENOMEM;
+ 
+-	__asm__ __volatile__ (
+-		"lidtq %0\n"
+-		: : "m" (curidt)
+-		);
+-};
++ 	return init_level4_page(image, page_address(arch->page_table_b),
++				0, end_pfn << PAGE_SHIFT);
++}
+ 
++typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
++					unsigned long control_code_buffer,
++					unsigned long start_address,
++					unsigned long page_table_a,
++					unsigned long page_table_b) ATTRIB_NORET;
++
++const extern unsigned char relocate_new_kernel[];
++const extern unsigned long relocate_new_kernel_size;
+ 
+-static void set_gdt(void *newgdt, u16 limit)
++static int allocate_page_table_a(struct kimage *image)
+ {
+-	struct desc_ptr curgdt;
++	struct kimage_arch *arch = &image->arch_data;
++	struct page *page;
++	int k = sizeof(arch->page_table_a) / sizeof(arch->page_table_a[0]);
+ 
+-	/* x86-64 supports unaligned loads & stores */
+-	curgdt.size    = limit;
+-	curgdt.address = (unsigned long)newgdt;
++	for (; k > 0; k--) {
++		page = kimage_alloc_control_pages(image, 0);
++		if (!page)
++			return -ENOMEM;
+ 
+-	__asm__ __volatile__ (
+-		"lgdtq %0\n"
+-		: : "m" (curgdt)
+-		);
+-};
++		clear_page(page_address(page));
++		arch->page_table_a[k - 1] = page;
++	}
+ 
+-static void load_segments(void)
+-{
+-	__asm__ __volatile__ (
+-		"\tmovl %0,%%ds\n"
+-		"\tmovl %0,%%es\n"
+-		"\tmovl %0,%%ss\n"
+-		"\tmovl %0,%%fs\n"
+-		"\tmovl %0,%%gs\n"
+-		: : "a" (__KERNEL_DS) : "memory"
+-		);
++	return 0;
+ }
+ 
+-typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
+-					unsigned long control_code_buffer,
+-					unsigned long start_address,
+-					unsigned long pgtable) ATTRIB_NORET;
++#define _PAGE_KERNEL_EXEC __PAGE_KERNEL_EXEC
++#define pa_page(page) __pa_symbol(page_address(page)) /* __pa() miscompiles */
+ 
+-const extern unsigned char relocate_new_kernel[];
+-const extern unsigned long relocate_new_kernel_size;
++static int create_mapping(struct page *root, struct page **pages, 
++			  unsigned long va, unsigned long pa)
++{
++	pgd_t *pgd;
++	pud_t *pud;
++	pmd_t *pmd;
++	pte_t *pte;
++	int k = 0;
++
++	pgd = (pgd_t *)page_address(root) + pgd_index(va);
++	if (!pgd_present(*pgd))
++		set_pgd(pgd, __pgd(pa_page(pages[k++]) | _KERNPG_TABLE));
++
++	pud = pud_offset(pgd, va);
++	if (!pud_present(*pud))
++		set_pud(pud, __pud(pa_page(pages[k++]) | _KERNPG_TABLE));
++
++	pmd = pmd_offset(pud, va);
++	if (!pmd_present(*pmd))
++		set_pmd(pmd, __pmd(pa_page(pages[k++]) | _KERNPG_TABLE));
++
++	pte = (pte_t *)page_address(pmd_page(*pmd)) + pte_index(va);
++	set_pte(pte, __pte(pa | _PAGE_KERNEL_EXEC));
++
++	return k;
++}
+ 
+ int machine_kexec_prepare(struct kimage *image)
+ {
+-	unsigned long start_pgtable, control_code_buffer;
+-	int result;
++	void *control_page;
++	unsigned long pa;
++	int k;
+ 
+-	/* Calculate the offsets */
+-	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+-	control_code_buffer = start_pgtable + PAGE_SIZE;
+-
+-	/* Setup the identity mapped 64bit page table */
+-	result = init_pgtable(image, start_pgtable);
+-	if (result)
+-		return result;
+-
+-	/* Place the code in the reboot code buffer */
+-	memcpy(__va(control_code_buffer), relocate_new_kernel,
+-						relocate_new_kernel_size);
++	memset(&image->arch_data, 0, sizeof(image->arch_data));
+ 
+-	return 0;
++	k = allocate_page_table_a(image);
++	if (k)
++		return k;
++
++	/* fill in control_page with assembly code */
++
++	control_page = page_address(image->control_code_page);
++	memcpy(control_page, relocate_new_kernel, relocate_new_kernel_size);
++
++	/* map the control_page at the virtual address of relocate_kernel.S */
++
++	pa = __pa(control_page);
++
++	k = create_mapping(image->arch_data.page_table_a[0], 
++			   &image->arch_data.page_table_a[1],
++			   (unsigned long)relocate_new_kernel, pa);
++
++	/* identity map the control_page */
++
++	create_mapping(image->arch_data.page_table_a[0], 
++		       &image->arch_data.page_table_a[k + 1],
++		       pa, pa);
++
++	/* create identity mapped page table aka page_table_b */
++
++	return create_page_table_b(image);
+ }
+ 
+ void machine_kexec_cleanup(struct kimage *image)
+@@ -185,47 +218,17 @@ void machine_kexec_cleanup(struct kimage
+ NORET_TYPE void machine_kexec(struct kimage *image)
+ {
+ 	unsigned long page_list;
+-	unsigned long control_code_buffer;
+-	unsigned long start_pgtable;
++	unsigned long control_code;
++	unsigned long page_table_a;
++	unsigned long page_table_b;
+ 	relocate_new_kernel_t rnk;
+ 
+-	/* Interrupts aren't acceptable while we reboot */
+-	local_irq_disable();
+-
+-	/* Calculate the offsets */
+ 	page_list = image->head;
+-	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+-	control_code_buffer = start_pgtable + PAGE_SIZE;
++	control_code = __pa(page_address(image->control_code_page));
++	page_table_a = __pa(page_address(image->arch_data.page_table_a[0]));
++	page_table_b = __pa(page_address(image->arch_data.page_table_b));
+ 
+-	/* Set the low half of the page table to my identity mapped
+-	 * page table for kexec.  Leave the high half pointing at the
+-	 * kernel pages.   Don't bother to flush the global pages
+-	 * as that will happen when I fully switch to my identity mapped
+-	 * page table anyway.
+-	 */
+-	memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2);
+-	__flush_tlb();
+-
+-
+-	/* The segment registers are funny things, they are
+-	 * automatically loaded from a table, in memory wherever you
+-	 * set them to a specific selector, but this table is never
+-	 * accessed again unless you set the segment to a different selector.
+-	 *
+-	 * The more common model are caches where the behide
+-	 * the scenes work is done, but is also dropped at arbitrary
+-	 * times.
+-	 *
+-	 * I take advantage of this here by force loading the
+-	 * segments, before I zap the gdt with an invalid value.
+-	 */
+-	load_segments();
+-	/* The gdt & idt are now invalid.
+-	 * If you want to load them you must set up your own idt & gdt.
+-	 */
+-	set_gdt(phys_to_virt(0),0);
+-	set_idt(phys_to_virt(0),0);
+ 	/* now call it */
+-	rnk = (relocate_new_kernel_t) control_code_buffer;
+-	(*rnk)(page_list, control_code_buffer, image->start, start_pgtable);
++	rnk = (relocate_new_kernel_t) relocate_new_kernel;
++	(*rnk)(page_list, control_code, image->start, page_table_a, page_table_b);
+ }
+--- x/arch/x86_64/kernel/relocate_kernel.S
++++ x/arch/x86_64/kernel/relocate_kernel.S
+@@ -2,11 +2,18 @@
+  * relocate_kernel.S - put the kernel image in place to boot
+  * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
+  *
++ * 2006-05-19 Magnus Damm <damm@opensource.se>:
++ * - moved segment handling code from machine_kexec.c
++ *
+  * This source code is licensed under the GNU General Public License,
+  * Version 2.  See the file COPYING for more details.
+  */
+ 
+ #include <linux/linkage.h>
++#include <asm/page.h>
++
++.text
++.align (1 << PAGE_SHIFT)
+ 
+ 	/*
+ 	 * Must be relocatable PIC code callable as a C function, that once
+@@ -18,21 +25,69 @@ relocate_new_kernel:
+ 	/* %rdi page_list
+ 	 * %rsi reboot_code_buffer
+ 	 * %rdx start address
+-	 * %rcx page_table
+-	 * %r8  arg5
++	 * %rcx page_table_a
++	 * %r8  page_table_b
+ 	 * %r9  arg6
+ 	 */
+-
++	
+ 	/* zero out flags, and disable interrupts */
+ 	pushq $0
+ 	popfq
+ 
++	/* switch to page_table_a */
++	movq    %rcx, %cr3
++
++	/* setup idt */
++
++	movq	%rsi, %rax
++	addq	$(idt_48 - relocate_new_kernel), %rax
++	lidtq	(%rax)
++
++	/* setup gdt */
++
++	movq	%rsi, %rax
++	addq	$(gdt - relocate_new_kernel), %rax
++	movq	%rsi, %r9
++	addq	$((gdt_48 - relocate_new_kernel) + 2), %r9
++	movq	%rax, (%r9)
++	
++	movq	%rsi, %rax
++	addq	$(gdt_48 - relocate_new_kernel), %rax
++	lgdtq	(%rax)
++
++	/* setup data segment registers */
++
++	xorl	%eax,%eax
++	movl	%eax, %ds
++	movl	%eax, %es
++	movl	%eax, %fs
++	movl	%eax, %gs
++	movl	%eax, %ss
++
+ 	/* set a new stack at the bottom of our page... */
+ 	lea   4096(%rsi), %rsp
+ 
++	/* load new code segment */
++
++	movq	%rsp, %rcx
++	xorq	%rax, %rax
++	pushq	%rax                                              /* SS */
++	pushq	%rcx                                              /* ESP */
++	pushq	%rax                                              /* RFLAGS */
++
++	movq	$(gdt_code - gdt), %rax
++	pushq	%rax                                              /* CS */
++
++	movq	%rsi, %rax
++	addq	$(identity_mapped - relocate_new_kernel), %rax
++	pushq	%rax                                              /* RIP */
++
++	iretq
++	
++identity_mapped:
+ 	/* store the parameters back on the stack */
+ 	pushq	%rdx /* store the start address */
+-
++	
+ 	/* Set cr0 to a known state:
+ 	 * 31 1 == Paging enabled
+ 	 * 18 0 == Alignment check disabled
+@@ -69,7 +124,7 @@ relocate_new_kernel:
+ 	/* Switch to the identity mapped page tables,
+ 	 * and flush the TLB.
+ 	*/
+-	movq	%rcx, %cr3
++	movq	%r8, %cr3
+ 
+ 	/* Do the copies */
+ 	movq	%rdi, %rcx 	/* Put the page_list in %rcx */
+@@ -136,6 +191,25 @@ relocate_new_kernel:
+ 	xorq	%r15, %r15
+ 
+ 	ret
++	.align	16
++gdt:
++	.long   0x00000000  /* NULL descriptor */
++	.long   0x00000000
++gdt_code:
++	.long   0x00000000  /* code descriptor */
++	.long   0x00209800
++
++gdt_end:
++	.align	4
++	
++idt_48:
++	.word	0				# idt limit = 0
++	.quad	0, 0				# idt base = 0L
++
++gdt_48:
++	.word	gdt_end - gdt - 1		# gdt limit
++	.quad	0, 0				# gdt base (filled in later)
++
+ relocate_new_kernel_end:
+ 
+ 	.globl relocate_new_kernel_size
+--- x/include/asm-x86_64/kexec.h
++++ x/include/asm-x86_64/kexec.h
+@@ -21,15 +21,24 @@
+ /* Maximum address we can use for the control pages */
+ #define KEXEC_CONTROL_MEMORY_LIMIT     (0xFFFFFFFFFFUL)
+ 
+-/* Allocate one page for the pdp and the second for the code */
+-#define KEXEC_CONTROL_CODE_SIZE  (4096UL + 4096UL)
++#define KEXEC_CONTROL_CODE_SIZE  4096
+ 
+ /* The native architecture */
+ #define KEXEC_ARCH KEXEC_ARCH_X86_64
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
+-struct kimage_arch {};
++struct kimage_arch {
++	/* page_table_a[] holds enough pages to create a new page table
++	 * that maps the control page twice..
++	 *
++	 * page_table_b points to the root page of a page table which is used
++	 * to provide identity mapping of all ram.
++	 */
++
++	struct page *page_table_a[7]; /* 2 * (pte + pud + pmd) + pgd */
++	struct page *page_table_b;
++};
+ 
+ /*
+  * Saving the registers of the cpu on which panic occured in
--- /dev/null
+++ x/patches/linux-2.6.16.13/3-linux-2.6.16-kexec_page_table_a_x86_64-xen.patch
@@ -0,0 +1,116 @@
+ arch/x86_64/kernel/machine_kexec.c |   61 ++++++++++++++++++++++++++++++++----
+ 1 file changed, 55 insertions(+), 6 deletions(-)
+
+--- x/arch/x86_64/kernel/machine_kexec.c
++++ x/arch/x86_64/kernel/machine_kexec.c
+@@ -19,6 +19,48 @@
+ #include <asm/mmu_context.h>
+ #include <asm/io.h>
+ 
++#ifdef CONFIG_XEN
++#include <xen/interface/memory.h>
++#define x__pmd(x) ((pmd_t) { (x) } )
++#define x__pud(x) ((pud_t) { (x) } )
++#define x__pgd(x) ((pgd_t) { (x) } )
++
++#define x_pmd_val(x)	((x).pmd)
++#define x_pud_val(x)	((x).pud)
++#define x_pgd_val(x)	((x).pgd)
++
++static inline void x_set_pmd(pmd_t *dst, pmd_t val)
++{
++        x_pmd_val(*dst) = x_pmd_val(val); 
++} 
++
++static inline void x_set_pud(pud_t *dst, pud_t val)
++{
++	x_pud_val(*dst) = phys_to_machine(x_pud_val(val));
++}
++
++static inline void x_pud_clear (pud_t *pud)
++{
++	x_pud_val(*pud) = 0;
++}
++
++static inline void x_set_pgd(pgd_t *dst, pgd_t val)
++{
++	x_pgd_val(*dst) = phys_to_machine(x_pgd_val(val)); 
++} 
++
++static inline void x_pgd_clear (pgd_t * pgd)
++{
++	x_pgd_val(*pgd) = 0; 
++}
++
++#define MY_LARGE_EXEC _PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_PSE
++#define MY_TABLE _PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY
++#else
++#define MY_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
++#define MY_TABLE _KERNPG_TABLE
++#endif
++
+ static void init_level2_page(pmd_t *level2p, unsigned long addr)
+ {
+ 	unsigned long end_addr;
+@@ -26,7 +68,7 @@
+ 	addr &= PAGE_MASK;
+ 	end_addr = addr + PUD_SIZE;
+ 	while (addr < end_addr) {
+-		set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
++		x_set_pmd(level2p++, x__pmd(addr | MY_LARGE_EXEC));
+ 		addr += PMD_SIZE;
+ 	}
+ }
+@@ -51,12 +93,12 @@
+ 		}
+ 		level2p = (pmd_t *)page_address(page);
+ 		init_level2_page(level2p, addr);
+-		set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
++		x_set_pud(level3p++, x__pud(__pa(level2p) | MY_TABLE));
+ 		addr += PUD_SIZE;
+ 	}
+ 	/* clear the unused entries */
+ 	while (addr < end_addr) {
+-		pud_clear(level3p++);
++		x_pud_clear(level3p++);
+ 		addr += PUD_SIZE;
+ 	}
+ out:
+@@ -87,12 +129,12 @@
+ 		if (result) {
+ 			goto out;
+ 		}
+-		set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
++		x_set_pgd(level4p++, x__pgd(__pa(level3p) | MY_TABLE));
+ 		addr += PGDIR_SIZE;
+ 	}
+ 	/* clear the unused entries */
+ 	while (addr < end_addr) {
+-		pgd_clear(level4p++);
++		x_pgd_clear(level4p++);
+ 		addr += PGDIR_SIZE;
+ 	}
+ out:
+@@ -103,14 +145,21 @@
+ static int create_page_table_b(struct kimage *image)
+ {
+ 	struct kimage_arch *arch = &image->arch_data;
++	unsigned long last_page;
+ 
+ 	arch->page_table_b = kimage_alloc_control_pages(image, 0);
+ 
+ 	if (!arch->page_table_b)
+ 		return -ENOMEM;
+ 
++#ifdef CONFIG_XEN
++	last_page = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
++#else
++	last_page = end_pfn;
++#endif
++
+  	return init_level4_page(image, page_address(arch->page_table_b),
+-				0, end_pfn << PAGE_SHIFT);
++				0, last_page << PAGE_SHIFT);
+ }
+ 
+ typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,

[-- Attachment #6: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] kexec: framework and i386 (Take X)
  2006-05-25  7:20                                                   ` [PATCH] kexec: framework and i386 (Take X) Horms
@ 2006-06-05  2:53                                                     ` Akio Takebe
  2006-06-15  7:29                                                     ` [PATCH] kexec: framework and i386 (Take XI) Horms
  1 sibling, 0 replies; 68+ messages in thread
From: Akio Takebe @ 2006-06-05  2:53 UTC (permalink / raw)
  To: Horms, Keir Fraser
  Cc: Ian Pratt, Kazuo Moriwaka, xen-devel, Akio Takebe, Isaku Yamahata,
	Magnus Damm, Mark Williamson

Hi, Horms

When I tried your patch, I had compile errors.
If you use xchg(), you should use the return value of xchg().
The compile errors occur at the lines marked below.

+void crash_kexec(struct cpu_user_regs *regs)
+{
+    int locked;
+
+    locked = xchg(&kexec_crash_lock, 1);
+    if (locked)
+        return;
+    __crash_kexec(regs);
+    xchg(&kexec_crash_lock, 0); <-------------this one
+}
+
+static int get_crash_note(int vcpuid, XEN_GUEST_HANDLE(void) uarg)
+{
+    struct domain *domain = current->domain;
+    unsigned long crash_note;
+    struct vcpu *vcpu;
+    int locked;
+
+    if (vcpuid < 0 || vcpuid > MAX_VIRT_CPUS)
+	return -EINVAL;
+
+    if ( ! (vcpu = domain->vcpu[vcpuid]) )
+	return -EINVAL;
+
+    locked = xchg(&kexec_crash_lock, 1);
+    if (locked)
+    {
+       printk("do_kexec: (CMD_kexec_crash_note): dump is locked\n");
+       return -EFAULT;
+    }
+    crash_note = __pa((unsigned long)per_cpu(crash_notes, vcpu->processor));
+    xchg(&kexec_crash_lock, 0); <-------------this one
+
+    if ( unlikely(copy_to_guest(uarg, &crash_note, 1) != 0) )
+    {
+        printk("do_kexec: (CMD_kexec_crash_note): copy_to_guest failed\n");
+        return -EFAULT;
+    }
+    
+    return 0;
+}
+
+int do_kexec(unsigned long op, int arg1, XEN_GUEST_HANDLE(void) uarg)
+{
+    xen_kexec_image_t *image;
+    int locked;
+    int *image_set;
+    int status = -EINVAL;
+
+    if ( !IS_PRIV(current->domain) )  
+        return -EPERM;
+
+    switch (op)
+    {
+    case KEXEC_CMD_kexec_crash_note:
+        return get_crash_note(arg1, uarg);
+    case KEXEC_CMD_kexec_reserve:
+	return get_reserve(uarg);
+    }
+
+    /* For all other ops, arg1 is the type of kexec, that is
+     * KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH */
+    if (arg1 == KEXEC_TYPE_CRASH)
+    {
+        image = &kexec_crash_image;
+        image_set = &kexec_crash_image_set;
+        locked = xchg(&kexec_crash_lock, 1);
+        if (locked)
+        {
+           printk("do_kexec: dump is locked\n");
+           return -EFAULT;
+        }
+    }
+    else
+    {
+        image = &kexec_image;
+        image_set = &kexec_image_set;
+    }
+
+    switch(op) {
+    case KEXEC_CMD_kexec:
+        BUG_ON(!*image_set);
+	status = __do_kexec(arg1, uarg, image);
+        break;
+    case KEXEC_CMD_kexec_load:
+        BUG_ON(*image_set);
+        if ( unlikely(copy_from_guest(image, uarg, 1) != 0) )
+        {
+            printk("do_kexec (CMD_kexec_load): copy_from_guest failed\n");
+            status = -EFAULT;
+	    break;
+        }
+        *image_set = 1;
+        status = machine_kexec_load(arg1, image);
+        break;
+    case KEXEC_CMD_kexec_unload:
+        BUG_ON(!*image_set);
+        *image_set = 0;
+        machine_kexec_unload(arg1, image);
+        status = 0;
+        break;
+    }
+
+    if (arg1 == KEXEC_TYPE_CRASH)
+        xchg(&kexec_crash_lock, 0);  <-------------this one
+    return status;
+}
+


Best Regards,

Akio Takebe

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] kexec: framework and i386 (Take XI)
  2006-05-25  7:20                                                   ` [PATCH] kexec: framework and i386 (Take X) Horms
  2006-06-05  2:53                                                     ` Akio Takebe
@ 2006-06-15  7:29                                                     ` Horms
  2006-07-11  3:39                                                       ` [PATCH] kexec: framework and i386 (Take XII) Horms
  1 sibling, 1 reply; 68+ messages in thread
From: Horms @ 2006-06-15  7:29 UTC (permalink / raw)
  To: Keir Fraser
  Cc: Ian Pratt, Kazuo Moriwaka, xen-devel, Akio Takebe, Isaku Yamahata,
	Magnus Damm, Mark Williamson

[-- Attachment #1: Type: text/plain, Size: 2805 bytes --]

On Thu, May 25, 2006 at 04:20:19PM +0900, Horms wrote:
> Hi,
> 
> sorry for the somewhat long delay between sending updates.
> I'm happy to announce tenth take of the kexec/kdump patch.
> I'll address Keir's questions from the 9th release below,
> but first I would like to quickly summarise the patches.
> 
> Kexec/kdump is implemented by moving the privileged portions
> (and related plumbing where needed) from linux into the hypervisor.
> This is primarily done by implementing kexec's architecture
> independent hooks as hypercalls.
> 
> Kexec is working on both x86_32 and x86_64, for SMP and UP.
> Kdump is also working for SMP and UP on x86_32. x86_64 may work,
> but still needs more attention. In particular the register
> saving code has not been implemented.
> 
> These patches also include some reworking of kexec's internals in
> order that the page table is not mangled on kdump. These changes
> also make x86_64 kexec/kdump somewhat easier to implement.
> Collectively this is the pagetable_a approach developed by my colleague
> Magnus Damm, and he is working with the linux kexec maintainers to
> get it merged there.
> 
> The code is broken out into four patches.
> They should apply cleanly to xen-unstable.hg 10151.
> 
>    1. 51.1-kexec-generic-upstream.patch
>       * Common code for all architectures,
> 
>         the basic plumbing for kexec/kdump
>    2. 51.2.1-kexec-x86-upstream.patch
>       * Glue between 1, and 3 and 4.
>         This would not be needed for ppc or ia64, but
> 	neither have been written yet.
> 	We are planning to commence work on ia64 soon.
>       * Depends on 1
> 
>    3. 51.2.1.1-kexec-x86_32-upstream.patch
>       * Kexec/kdump for x86_32
>       * Depends on 2 (and 1)
> 
>    4. 51.2.1.2-kexec-x86_64-upstream.patch
>       * Kexec/kdump for x86_64
>       * Depends on 2 (and 1)

Hi,

here is a modest update to the kexec patches, broken out as per the
description above. The changes are:

* Kconfig: don't allow kexec to be built for a non-privileged domain as
           this makes no sense at this time.
* fix a gcc compilation error that became apparent in 
  gcc (GCC) 4.1.2 20060604 (prerelease) (Debian 4.1.1-2).
  There is a warning produced that causes the build to fail because of the
  use of -Werror when compiling the hypervisor. The warning relates
  to kexec's use of xchg as a simple locking mechanism and not always
  using the return value as it isn't of any value.
* Record dom0's cr3 in vmcore for analysis by crash
  https://www.redhat.com/archives/crash-utility/2006-June/msg00015.html
* Upport from xen-unstable.hg 10151 to 10352, which is the current tree
  at present. This involved fixing two minor diffing issues, nothing more.

-- 
Horms                                           http://www.vergenet.net/~horms/

[-- Attachment #2: 51.1-kexec-generic-upstream.patch --]
[-- Type: text/plain, Size: 35283 bytes --]

kexec: framework

This is an implementation of kexec for dom0/xen, that allows
kexecing of the physical machine from xen. The approach taken is
to move the architecture-dependant kexec code into a new hypercall.

This patch only includes the framework; it cannot be used without
architecture-dependent hooks, however the code should compile as is.

Signed-Off-By: Horms <horms@verge.net.au>
Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>

 linux-2.6-xen-sparse/drivers/xen/core/Makefile                        |    1 
 linux-2.6-xen-sparse/drivers/xen/core/crash.c                         |   48 +
 linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c                 |   84 ++
 linux-2.6-xen-sparse/drivers/xen/core/reboot.c                        |    4 
 patches/linux-2.6.16.13/0-linux-2.6.16-kexec_page_table_a_stubs.patch |   85 ++
 patches/linux-2.6.16.13/kexec-generic.patch                           |  294 ++++++++++
 xen/arch/x86/Makefile                                                 |    2 
 xen/arch/x86/crash.c                                                  |   26 
 xen/arch/x86/machine_kexec.c                                          |   51 +
 xen/common/Makefile                                                   |    1 
 xen/common/kexec.c                                                    |  211 +++++++
 xen/common/page_alloc.c                                               |   33 -
 xen/drivers/char/console.c                                            |    3 
 xen/include/asm-x86/kexec.h                                           |   32 +
 xen/include/public/kexec.h                                            |   85 ++
 xen/include/public/xen.h                                              |    1 
 xen/include/xen/elfcore.h                                             |   73 ++
 xen/include/xen/kexec.h                                               |   33 +
 xen/include/xen/mm.h                                                  |    1 
 19 files changed, 1057 insertions(+), 11 deletions(-)

--- x/linux-2.6-xen-sparse/drivers/xen/core/Makefile
+++ x/linux-2.6-xen-sparse/drivers/xen/core/Makefile
@@ -12,3 +12,4 @@ obj-$(CONFIG_IA64)		+= xenia64_init.o
 obj-$(CONFIG_XEN_SKBUFF)	+= skbuff.o
 obj-$(CONFIG_XEN_REBOOT)	+= reboot.o
 obj-$(CONFIG_XEN_SMPBOOT)	+= smpboot.o
+obj-$(CONFIG_KEXEC)		+= machine_kexec.o crash.o
--- /dev/null
+++ x/linux-2.6-xen-sparse/drivers/xen/core/crash.c
@@ -0,0 +1,48 @@
+/*
+ * Architecture independent functions for kexec based crash dumps in xen.
+ *
+ * Created by: Horms <horms@verge.net.au>
+ *
+ */
+
+#include <asm/ptrace.h>
+#include <linux/types.h>
+#include <asm/kexec-xen.h>
+#include <asm/hypervisor.h>
+#include <asm/system.h>
+#include <linux/preempt.h>
+#include <linux/smp.h>
+#include <asm/hw_irq.h>
+#include <xen/interface/kexec.h>
+
+/* 
+ * This passes the registers down to the hypervisor and has it kexec().
+ * This is a bit different to the linux implementation which
+ * has this call save registers and stop CPUs and then goes into
+ * machine_kexec() later. But for Xen it makes more sense to
+ * have the kexec hypercall do everything, and this call
+ * has the registers parameter that is needed.
+ * to the hypervisor to allow the hypervisor to kdump itself
+ * on an internal panic 
+ */
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+	struct cpu_user_regs xen_regs;
+	printk("machine_crash_shutdown: %d\n", smp_processor_id());
+	local_irq_disable();
+#ifdef CONFIG_X86_IO_APIC
+	disable_IO_APIC();
+#endif
+	crash_translate_regs(regs, &xen_regs);
+	HYPERVISOR_kexec(KEXEC_CMD_kexec, KEXEC_TYPE_CRASH, &xen_regs);
+}
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- /dev/null
+++ x/linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c
@@ -0,0 +1,84 @@
+/*
+ * drivers/xen/core/machine_kexec.c 
+ * handle transition of Linux booting another kernel
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Loosely based on arch/i386/kernel/machine_kexec.c
+ */
+
+#include <linux/kexec.h>
+#include <xen/interface/kexec.h>
+#include <linux/mm.h>
+#include <asm/hypercall.h>
+#include <asm/kexec-xen.h>
+
+const extern unsigned char relocate_new_kernel[];
+extern unsigned int relocate_new_kernel_size;
+
+static inline unsigned long machine_address(struct page *page)
+{
+	return pfn_to_mfn(page_to_pfn(page)) << PAGE_SHIFT;
+}
+
+static void setup_hypercall_arg(xen_kexec_image_t *xki, struct kimage *image)
+{
+	memset(xki, 0, sizeof(*xki));
+
+	xki->indirection_page = image->head;
+	xki->reboot_code_buffer = 
+		machine_address(image->control_code_page);
+	xki->start_address = image->start;
+}
+
+/*
+ * Load the image into xen so xen can kdump itself
+ * This might have been done in prepare, but prepare
+ * is currently called too early. It might make sense
+ * to move prepare, but for now, just add an extra hook.
+ */
+int xen_machine_kexec_load(struct kimage *image)
+{
+	xen_kexec_image_t xki;
+
+	setup_hypercall_arg(&xki, image);
+	return HYPERVISOR_kexec(KEXEC_CMD_kexec_load, image->type, &xki);
+}
+
+/*
+ * Unload the image that was stored by machine_kexec_load()
+ * This might have been done in machine_kexec_cleanup() but it
+ * is called too late, and it's possible xen could try and kdump
+ * using resources that have been freed.
+ */
+void xen_machine_kexec_unload(struct kimage *image)
+{
+	xen_kexec_image_t xki;
+
+	setup_hypercall_arg(&xki, image);
+	HYPERVISOR_kexec(KEXEC_CMD_kexec_unload, image->type, &xki);
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ *
+ * This has the hypervisor move to the preferred reboot CPU,
+ * stop all CPUs and kexec. That is it combines machine_shutdown()
+ * and machine_kexec() in Linux kexec terms.
+ */
+NORET_TYPE void xen_machine_kexec(struct kimage *image)
+{
+	HYPERVISOR_kexec(KEXEC_CMD_kexec, image->type, NULL);
+	panic("KEXEC_CMD_kexec hypercall should not return\n");
+}
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- x/linux-2.6-xen-sparse/drivers/xen/core/reboot.c
+++ x/linux-2.6-xen-sparse/drivers/xen/core/reboot.c
@@ -67,6 +67,10 @@ void machine_power_off(void)
 	HYPERVISOR_shutdown(SHUTDOWN_poweroff);
 }
 
+#ifdef CONFIG_KEXEC
+void machine_shutdown(void) { }
+#endif
+
 int reboot_thru_bios = 0;	/* for dmi_scan.c */
 EXPORT_SYMBOL(machine_restart);
 EXPORT_SYMBOL(machine_halt);
--- x/xen/arch/x86/Makefile
+++ x/xen/arch/x86/Makefile
@@ -39,6 +39,8 @@ obj-y += trampoline.o
 obj-y += traps.o
 obj-y += usercopy.o
 obj-y += x86_emulate.o
+obj-y += machine_kexec.o
+obj-y += crash.o
 
 ifneq ($(pae),n)
 obj-$(x86_32) += shadow.o shadow_public.o shadow_guest32.o
--- /dev/null
+++ x/xen/arch/x86/crash.c
@@ -0,0 +1,26 @@
+/******************************************************************************
+ * arch/x86/crash.c
+ * 
+ * Created By: Horms
+ *
+ * Should be based heavily on arch/i386/kernel/crash.c from Linux 2.6.16
+ */
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/xen.h>
+
+void machine_crash_shutdown(struct cpu_user_regs *regs)
+{
+    printk("STUB: arch/x86/crash.c: machine_crash_shutdown: not implemented\n");
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/arch/x86/machine_kexec.c
@@ -0,0 +1,51 @@
+/******************************************************************************
+ * arch/x86/machine_kexec.c
+ * 
+ * Created By: Horms
+ *
+ */
+
+#include <xen/lib.h>       /* for printk() used in stubs */
+#include <xen/types.h>
+#include <public/kexec.h>
+
+int machine_kexec_load(int type, xen_kexec_image_t *image)
+{
+    printk("STUB: arch/x86/machine_kexec.c: machine_kexec_load: "
+        "not implemented\n");
+    return -1;
+}
+
+void machine_kexec_unload(int type, xen_kexec_image_t *image)
+{
+    printk("STUB: arch/x86/machine_kexec.c: machine_kexec_unload: "
+        "not implemented\n");
+}
+
+void machine_kexec_reserved(xen_kexec_reserve_t *reservation)
+{
+    printk("STUB: arch/x86/machine_kexec.c: machine_kexec_reserved: "
+        "not implemented\n");
+}
+
+void machine_kexec(xen_kexec_image_t *image)
+{
+    printk("STUB: arch/x86/machine_kexec.c: machine_kexec: "
+        "not implemented\n");
+}
+
+/* Stub: does nothing except log that it is not implemented. */
+void machine_shutdown(xen_kexec_image_t *image)
+{
+    printk("STUB: arch/x86/machine_kexec.c: machine_shutdown: not implemented\n");
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/common/Makefile
+++ x/xen/common/Makefile
@@ -7,6 +7,7 @@ obj-y += event_channel.o
 obj-y += grant_table.o
 obj-y += kernel.o
 obj-y += keyhandler.o
+obj-y += kexec.o
 obj-y += lib.o
 obj-y += memory.o
 obj-y += multicall.o
--- /dev/null
+++ x/xen/common/kexec.c
@@ -0,0 +1,211 @@
+/*
+ * common/kexec.c - Architecture independent kexec code for Xen
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Based in part on Linux 2.6.16's kernel/kexec.c
+ */
+
+#include <asm/kexec.h>
+#include <xen/lib.h>
+#include <xen/errno.h>
+#include <xen/guest_access.h>
+#include <xen/sched.h>
+#include <xen/types.h>
+#include <xen/kexec.h>
+#include <public/kexec.h>
+
+DEFINE_PER_CPU (note_buf_t, crash_notes);
+
+static xen_kexec_image_t kexec_image;
+static int kexec_image_set = 0;
+static xen_kexec_image_t kexec_crash_image;
+static int kexec_crash_image_set = 0;
+static int kexec_crash_lock = 0;
+
+/* Must call with kexec_crash_lock held */
+void __crash_kexec(struct cpu_user_regs *regs)
+{
+    struct cpu_user_regs fixed_regs;
+
+    if (!kexec_crash_image_set)
+	    return;
+    crash_setup_regs(&fixed_regs, regs);
+    machine_crash_shutdown(&fixed_regs);
+    machine_kexec(&kexec_crash_image); /* Does not return */
+}
+
+void crash_kexec(struct cpu_user_regs *regs)
+{
+    int locked;
+
+    locked = xchg(&kexec_crash_lock, 1);
+    if (locked)
+        return;
+    __crash_kexec(regs);
+
+    /* The if() here is bogus, but gcc throws a warning that the
+     * computed value is unused and xen compiles with -Werror.
+     * This seems like a viable work around.
+     * This did not seem to happen with slightly older gcc.
+     * Observed with: 
+     * gcc version 4.1.2 20060604 (prerelease) (Debian * 4.1.1-2) */
+    if (xchg(&kexec_crash_lock, 0)) ;
+
+    return;
+}
+
+/* Copy to uarg the physical address of the per-CPU crash_notes buffer of
+ * vcpu->processor, for vcpu 'vcpuid' of the calling domain.
+ * domain->vcpu[] is assumed to hold MAX_VIRT_CPUS entries: that index is bad.
+ * Returns 0, -EINVAL (bad vcpuid) or -EFAULT (dump locked, copy failed). */
+static int get_crash_note(int vcpuid, XEN_GUEST_HANDLE(void) uarg)
+{
+    struct domain *domain = current->domain;
+    unsigned long crash_note;
+    struct vcpu *vcpu;
+    int locked;
+
+    if (vcpuid < 0 || vcpuid >= MAX_VIRT_CPUS)
+        return -EINVAL;
+
+    if ( ! (vcpu = domain->vcpu[vcpuid]) )
+        return -EINVAL;
+
+    locked = xchg(&kexec_crash_lock, 1);
+    if (locked)
+    {
+       printk("do_kexec: (CMD_kexec_crash_note): dump is locked\n");
+       return -EFAULT;
+    }
+    crash_note = __pa((unsigned long)per_cpu(crash_notes, vcpu->processor));
+
+    /* The bogus if() keeps gcc 4.1 from warning (fatal under -Werror)
+     * that the value computed by xchg() is unused. */
+    if (xchg(&kexec_crash_lock, 0)) ;
+
+    if ( unlikely(copy_to_guest(uarg, &crash_note, 1) != 0) )
+    {
+        printk("do_kexec: (CMD_kexec_crash_note): copy_to_guest failed\n");
+        return -EFAULT;
+    }
+
+    return 0;
+}
+
+static int get_reserve(XEN_GUEST_HANDLE(void) uarg)
+{
+    xen_kexec_reserve_t reservation;
+
+    machine_kexec_reserved(&reservation);
+    if ( unlikely(copy_to_guest(uarg, &reservation, 1) != 0) )
+    {
+        printk("do_kexec (CMD_kexec_reserve): copy_to_guest failed\n");
+        return -EFAULT;
+    }
+    
+    return 0;
+}
+
+static int __do_kexec(unsigned long type, XEN_GUEST_HANDLE(void) uarg,
+		      xen_kexec_image_t *image)
+{
+    cpu_user_regs_t regs;
+
+    if (type == KEXEC_TYPE_DEFAULT)
+        machine_shutdown(image); /* Does not return */
+    else
+    {
+        if ( unlikely(copy_from_guest(&regs, uarg, 1) != 0) )
+        {
+            printk("do_kexec (CMD_kexec): copy_from_guest failed\n");
+            return -EFAULT;
+        }
+        __crash_kexec(&regs); /* Does not return */
+    }
+
+    return -EINVAL;
+}
+
+/* kexec hypercall entry point, usable by privileged domains only.
+ * Returns 0 or a negative errno value. A load over an existing image
+ * gives -EBUSY and a kexec/unload without one -EINVAL, instead of BUG(). */
+int do_kexec(unsigned long op, int arg1, XEN_GUEST_HANDLE(void) uarg)
+{
+    xen_kexec_image_t *image;
+    int locked;
+    int *image_set;
+    int status = -EINVAL;
+
+    if ( !IS_PRIV(current->domain) )
+        return -EPERM;
+
+    switch (op)
+    {
+    case KEXEC_CMD_kexec_crash_note:
+        return get_crash_note(arg1, uarg);
+    case KEXEC_CMD_kexec_reserve:
+        return get_reserve(uarg);
+    }
+
+    /* For all other ops, arg1 is the type of kexec, that is
+     * KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH */
+    if (arg1 == KEXEC_TYPE_CRASH)
+    {
+        image = &kexec_crash_image;
+        image_set = &kexec_crash_image_set;
+        locked = xchg(&kexec_crash_lock, 1);
+        if (locked)
+        {
+           printk("do_kexec: dump is locked\n");
+           return -EFAULT;
+        }
+    }
+    else
+    {
+        image = &kexec_image;
+        image_set = &kexec_image_set;
+    }
+
+    switch(op) {
+    case KEXEC_CMD_kexec:
+        if (*image_set)
+            status = __do_kexec(arg1, uarg, image);
+        break;
+    case KEXEC_CMD_kexec_load:
+        if (*image_set)
+            status = -EBUSY;
+        else if ( unlikely(copy_from_guest(image, uarg, 1) != 0) )
+        {
+            printk("do_kexec (CMD_kexec_load): copy_from_guest failed\n");
+            status = -EFAULT;
+        }
+        else if ( (status = machine_kexec_load(arg1, image)) == 0 )
+            *image_set = 1;
+        break;
+    case KEXEC_CMD_kexec_unload:
+        if (!*image_set)
+            break;
+        *image_set = 0;
+        machine_kexec_unload(arg1, image);
+        status = 0;
+        break;
+    }
+
+    /* The bogus if() keeps gcc 4.1 from warning (fatal under -Werror)
+     * that the value computed by xchg() is unused. */
+    if (arg1 == KEXEC_TYPE_CRASH)
+        if (xchg(&kexec_crash_lock, 0)) ;
+
+    return status;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/common/page_alloc.c
+++ x/xen/common/page_alloc.c
@@ -212,24 +212,35 @@ void init_boot_pages(paddr_t ps, paddr_t
     }
 }
 
+unsigned long alloc_boot_pages_at(unsigned long nr_pfns, unsigned long pfn_at)
+{
+    unsigned long i;
+
+    for ( i = 0; i < nr_pfns; i++ )
+        if ( allocated_in_map(pfn_at + i) )
+             break;
+
+    if ( i == nr_pfns )
+    {
+        map_alloc(pfn_at, nr_pfns);
+        return pfn_at;
+    }
+
+    return 0;
+}
+
 unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align)
 {
-    unsigned long pg, i;
+    unsigned long pg, i = 0;
 
     for ( pg = 0; (pg + nr_pfns) < max_page; pg += pfn_align )
     {
-        for ( i = 0; i < nr_pfns; i++ )
-            if ( allocated_in_map(pg + i) )
-                 break;
-
-        if ( i == nr_pfns )
-        {
-            map_alloc(pg, nr_pfns);
-            return pg;
-        }
+        i = alloc_boot_pages_at(nr_pfns, pg);
+        if (i != 0)
+            break;
     }
 
-    return 0;
+    return i;
 }
 
 
--- x/xen/drivers/char/console.c
+++ x/xen/drivers/char/console.c
@@ -680,6 +680,7 @@ void panic(const char *fmt, ...)
     unsigned long flags;
     static DEFINE_SPINLOCK(lock);
     extern void machine_restart(char *);
+    extern void crash_kexec(struct cpu_user_regs *regs);
     
     debugtrace_dump();
 
@@ -699,6 +700,8 @@ void panic(const char *fmt, ...)
 
     debugger_trap_immediate();
 
+    crash_kexec(NULL);
+
     watchdog_disable();
     mdelay(5000);
     machine_restart(0);
--- /dev/null
+++ x/xen/include/asm-x86/kexec.h
@@ -0,0 +1,32 @@
+/******************************************************************************
+ * include/asm-x86/kexec.h
+ * 
+ * Created By: Horms
+ *
+ */
+
+#ifndef __X86_KEXEC_H__
+#define __X86_KEXEC_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/xen.h>
+
+/* Stub, logs only. inline: unused copies must not trip -Wunused-function. */
+static inline void crash_setup_regs(struct cpu_user_regs *newregs,
+                                    struct cpu_user_regs *oldregs)
+{
+    printk("STUB: include/asm-x86/kexec.h: crash_setup_regs: not implemented\n");
+}
+
+#endif /* __X86_KEXEC_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/public/kexec.h
@@ -0,0 +1,85 @@
+/*
+ * kexec.h - Public portion
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Types based on those in ./vcpu.h on request from Keir Fraser
+ */
+
+#ifndef _XEN_PUBLIC_KEXEC_H
+#define _XEN_PUBLIC_KEXEC_H
+
+#include "xen.h"
+
+/*
+ * Prototype for this hypercall is:
+ *  int kexec_op(int cmd, int arg1, void *extra_args)
+ * @cmd        == KEXEC_CMD_...
+ *                KEXEC operation to perform
+ * @arg1       == Operation-specific int argument
+ *                This could be in extra_args, but by putting it here
+ *                copy_from_user can be avoided, in particular in
+ *                KEXEC_CMD_kexec during a crash dump, which is a fairly
+ *                critical section of code. If this turns out not to be
+ *                important then it can be collapsed into extra_args.
+ * @extra_args == Operation-specific extra arguments (NULL if none).
+ */
+
+#define KEXEC_TYPE_DEFAULT 0
+#define KEXEC_TYPE_CRASH   1
+
+/*
+ * Perform kexec having previously loaded a kexec or kdump kernel
+ * as appropriate.
+ * @arg1      == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH
+ * @extra_arg == pointer to cpu_user_regs_t structure.
+ */
+#define KEXEC_CMD_kexec                 0
+
+/*
+ * Load kernel image in preparation for kexec or kdump.
+ * @arg1      == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH
+ * @extra_arg == pointer to xen_kexec_image_t structure.
+ */
+#define KEXEC_CMD_kexec_load            1
+typedef struct xen_kexec_image {
+    unsigned long indirection_page;
+    unsigned long reboot_code_buffer;
+    unsigned long start_address;
+} xen_kexec_image_t;
+
+/*
+ * Clean up image loaded by KEXEC_CMD_kexec_load
+ * @arg1      == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH
+ */
+#define KEXEC_CMD_kexec_unload          2
+
+/*
+ * Find the base pointer and size of the area that xen has 
+ * reserved for use by the crash kernel.
+ * @extra_arg == pointer to xen_kexec_reserve_t structure.
+ */
+#define KEXEC_CMD_kexec_reserve         3
+typedef struct xen_kexec_reserve {
+    unsigned long size;
+    unsigned long start;
+} xen_kexec_reserve_t;
+
+/*
+ * Find the base pointer of the area that xen has 
+ * reserved for use by a crash note for a given VCPU
+ * @extra_arg == pointer to unsigned long.
+ */
+#define KEXEC_CMD_kexec_crash_note      4
+
+#endif /* _XEN_PUBLIC_KEXEC_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/include/public/xen.h
+++ x/xen/include/public/xen.h
@@ -64,6 +64,7 @@
 #define __HYPERVISOR_xenoprof_op          31
 #define __HYPERVISOR_event_channel_op     32
 #define __HYPERVISOR_physdev_op           33
+#define __HYPERVISOR_kexec_op             34
 
 /* Architecture-specific hypercall definitions. */
 #define __HYPERVISOR_arch_0               48
--- /dev/null
+++ x/xen/include/xen/elfcore.h
@@ -0,0 +1,73 @@
+/******************************************************************************
+ * include/xen/elfcore.h
+ * 
+ * Created By: Horms
+ *
+ * Based heavily on include/linux/elfcore.h from Linux 2.6.16
+ * Naming scheme based on include/xen/elf.h (not include/linux/elfcore.h)
+ *
+ */
+
+#ifndef __ELFCOREC_H__
+#define __ELFCOREC_H__
+
+#include <xen/types.h>
+#include <xen/elf.h>
+#include <public/xen.h>
+
+#define NT_PRSTATUS     1
+
+typedef struct
+{
+    int signo;                       /* signal number */
+    int code;                        /* extra code */
+    int errno;                       /* errno */
+} ELF_Signifo;
+
+/* These seem to be the same length on all architectures on Linux */
+typedef int ELF_Pid;
+typedef struct {
+	long tv_sec;
+	long tv_usec;
+} ELF_Timeval;
+typedef unsigned long ELF_Greg;
+#define ELF_NGREG (sizeof (struct cpu_user_regs) / sizeof(ELF_Greg))
+typedef ELF_Greg ELF_Gregset[ELF_NGREG];
+
+/*
+ * Definitions to generate Intel SVR4-like core files.
+ * These mostly have the same names as the SVR4 types with "elf_"
+ * tacked on the front to prevent clashes with linux definitions,
+ * and the typedef forms have been avoided.  This is mostly like
+ * the SVR4 structure, but more Linuxy, with things that Linux does
+ * not support and which gdb doesn't really use excluded.
+ */
+typedef struct
+{
+    ELF_Signifo pr_info;         /* Info associated with signal */
+    short pr_cursig;             /* Current signal */
+    unsigned long pr_sigpend;    /* Set of pending signals */
+    unsigned long pr_sighold;    /* Set of held signals */
+    ELF_Pid pr_pid;
+    ELF_Pid pr_ppid;
+    ELF_Pid pr_pgrp;
+    ELF_Pid pr_sid;
+    ELF_Timeval pr_utime;        /* User time */
+    ELF_Timeval pr_stime;        /* System time */
+    ELF_Timeval pr_cutime;       /* Cumulative user time */
+    ELF_Timeval pr_cstime;       /* Cumulative system time */
+    ELF_Gregset pr_reg;          /* GP registers */
+    int pr_fpvalid;              /* True if math co-processor being used.  */
+} ELF_Prstatus;
+
+#endif /* __ELFCOREC_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/xen/kexec.h
@@ -0,0 +1,33 @@
+/*
+ * include/xen/kexec.h - Internal architecture independent portion
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ */
+
+#include <public/kexec.h>
+
+#define MAX_NOTE_BYTES 1024
+
+typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
+DECLARE_PER_CPU (note_buf_t, crash_notes);
+
+int machine_kexec_load(int type, xen_kexec_image_t *image);
+void machine_kexec_unload(int type, xen_kexec_image_t *image);
+void machine_kexec_reserved(xen_kexec_reserve_t *reservation);
+void machine_kexec(xen_kexec_image_t *image);
+void machine_shutdown(xen_kexec_image_t *image);
+void machine_crash_shutdown(cpu_user_regs_t *regs);
+
+extern unsigned int opt_kdump_megabytes;
+extern unsigned int opt_kdump_megabytes_base;
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/include/xen/mm.h
+++ x/xen/include/xen/mm.h
@@ -40,6 +40,7 @@ struct page_info;
 paddr_t init_boot_allocator(paddr_t bitmap_start);
 void init_boot_pages(paddr_t ps, paddr_t pe);
 unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align);
+unsigned long alloc_boot_pages_at(unsigned long nr_pfns, unsigned long pfn_at);
 void end_boot_allocator(void);
 
 /* Generic allocator. These functions are *not* interrupt-safe. */
--- /dev/null
+++ x/patches/linux-2.6.16.13/kexec-generic.patch
@@ -0,0 +1,294 @@
+ drivers/base/cpu.c    |   20 ++++++++++++++
+ include/linux/kexec.h |    5 +++
+ kernel/kexec.c        |   68 ++++++++++++++++++++++++++++++++++++++++---------
+ kernel/sys.c          |    4 ++
+ 4 files changed, 85 insertions(+), 12 deletions(-)
+
+--- x/drivers/base/cpu.c
++++ x/drivers/base/cpu.c
+@@ -11,6 +11,10 @@
+ 
+ #include "base.h"
+ 
++#ifdef CONFIG_XEN
++#include <xen/interface/kexec.h>
++#endif
++
+ struct sysdev_class cpu_sysdev_class = {
+ 	set_kset_name("cpu"),
+ };
+@@ -86,6 +90,18 @@ static inline void register_cpu_control(
+ #ifdef CONFIG_KEXEC
+ #include <linux/kexec.h>
+ 
++#ifdef CONFIG_XEN
++static unsigned long get_crash_notes(int cpu)
++{
++	unsigned long crash_note;
++
++	if (HYPERVISOR_kexec(KEXEC_CMD_kexec_crash_note, cpu, &crash_note) < 0)
++		return 0UL;
++	return crash_note;
++}
++#endif
++
++/* XXX: This only finds dom0's CPUs */
+ static ssize_t show_crash_notes(struct sys_device *dev, char *buf)
+ {
+ 	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
+@@ -101,7 +117,11 @@ static ssize_t show_crash_notes(struct s
+ 	 * boot up and this data does not change there after. Hence this
+ 	 * operation should be safe. No locking required.
+ 	 */
++#ifndef CONFIG_XEN
+ 	addr = __pa(per_cpu_ptr(crash_notes, cpunum));
++#else
++	addr = (unsigned long long)get_crash_notes(cpunum);
++#endif
+ 	rc = sprintf(buf, "%Lx\n", addr);
+ 	return rc;
+ }
+--- x/include/linux/kexec.h
++++ x/include/linux/kexec.h
+@@ -91,6 +91,11 @@ struct kimage {
+ extern NORET_TYPE void machine_kexec(struct kimage *image) ATTRIB_NORET;
+ extern int machine_kexec_prepare(struct kimage *image);
+ extern void machine_kexec_cleanup(struct kimage *image);
++#ifdef CONFIG_XEN
++extern int xen_machine_kexec_load(struct kimage *image);
++extern void xen_machine_kexec_unload(struct kimage *image);
++extern NORET_TYPE void xen_machine_kexec(struct kimage *image) ATTRIB_NORET;
++#endif
+ extern asmlinkage long sys_kexec_load(unsigned long entry,
+ 					unsigned long nr_segments,
+ 					struct kexec_segment __user *segments,
+--- x/kernel/kexec.c
++++ x/kernel/kexec.c
+@@ -38,6 +38,20 @@ struct resource crashk_res = {
+ 	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
+ };
+ 
++/* Kexec needs to know about the actual physical address.
++ * But in xen, a physical address is a pseudo-physical address. */
++#ifndef CONFIG_XEN
++#define kexec_page_to_pfn(page)  page_to_pfn(page)
++#define kexec_pfn_to_page(pfn)   pfn_to_page(pfn)
++#define kexec_virt_to_phys(addr) virt_to_phys(addr)
++#define kexec_phys_to_virt(addr) phys_to_virt(addr)
++#else
++#define kexec_page_to_pfn(page)  pfn_to_mfn(page_to_pfn(page))
++#define kexec_pfn_to_page(pfn)   pfn_to_page(mfn_to_pfn(pfn))
++#define kexec_virt_to_phys(addr) virt_to_machine(addr)
++#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
++#endif
++
+ int kexec_should_crash(struct task_struct *p)
+ {
+ 	if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
+@@ -403,7 +417,7 @@ static struct page *kimage_alloc_normal_
+ 		pages = kimage_alloc_pages(GFP_KERNEL, order);
+ 		if (!pages)
+ 			break;
+-		pfn   = page_to_pfn(pages);
++		pfn   = kexec_page_to_pfn(pages);
+ 		epfn  = pfn + count;
+ 		addr  = pfn << PAGE_SHIFT;
+ 		eaddr = epfn << PAGE_SHIFT;
+@@ -437,6 +451,7 @@ static struct page *kimage_alloc_normal_
+ 	return pages;
+ }
+ 
++#ifndef CONFIG_XEN
+ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
+ 						      unsigned int order)
+ {
+@@ -490,7 +505,7 @@ static struct page *kimage_alloc_crash_c
+ 		}
+ 		/* If I don't overlap any segments I have found my hole! */
+ 		if (i == image->nr_segments) {
+-			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
++			pages = kexec_pfn_to_page(hole_start >> PAGE_SHIFT);
+ 			break;
+ 		}
+ 	}
+@@ -517,6 +532,13 @@ struct page *kimage_alloc_control_pages(
+ 
+ 	return pages;
+ }
++#else /* CONFIG_XEN */
++struct page *kimage_alloc_control_pages(struct kimage *image,
++					 unsigned int order)
++{
++	return kimage_alloc_normal_control_pages(image, order);
++}
++#endif
+ 
+ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+ {
+@@ -532,7 +554,7 @@ static int kimage_add_entry(struct kimag
+ 			return -ENOMEM;
+ 
+ 		ind_page = page_address(page);
+-		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
++		*image->entry = kexec_virt_to_phys(ind_page) | IND_INDIRECTION;
+ 		image->entry = ind_page;
+ 		image->last_entry = ind_page +
+ 				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+@@ -593,13 +615,13 @@ static int kimage_terminate(struct kimag
+ #define for_each_kimage_entry(image, ptr, entry) \
+ 	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+ 		ptr = (entry & IND_INDIRECTION)? \
+-			phys_to_virt((entry & PAGE_MASK)): ptr +1)
++			kexec_phys_to_virt((entry & PAGE_MASK)): ptr +1)
+ 
+ static void kimage_free_entry(kimage_entry_t entry)
+ {
+ 	struct page *page;
+ 
+-	page = pfn_to_page(entry >> PAGE_SHIFT);
++	page = kexec_pfn_to_page(entry >> PAGE_SHIFT);
+ 	kimage_free_pages(page);
+ }
+ 
+@@ -611,6 +633,10 @@ static void kimage_free(struct kimage *i
+ 	if (!image)
+ 		return;
+ 
++#ifdef CONFIG_XEN
++	xen_machine_kexec_unload(image);
++#endif
++
+ 	kimage_free_extra_pages(image);
+ 	for_each_kimage_entry(image, ptr, entry) {
+ 		if (entry & IND_INDIRECTION) {
+@@ -686,7 +712,7 @@ static struct page *kimage_alloc_page(st
+ 	 * have a match.
+ 	 */
+ 	list_for_each_entry(page, &image->dest_pages, lru) {
+-		addr = page_to_pfn(page) << PAGE_SHIFT;
++		addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
+ 		if (addr == destination) {
+ 			list_del(&page->lru);
+ 			return page;
+@@ -701,12 +727,12 @@ static struct page *kimage_alloc_page(st
+ 		if (!page)
+ 			return NULL;
+ 		/* If the page cannot be used file it away */
+-		if (page_to_pfn(page) >
++		if (kexec_page_to_pfn(page) >
+ 				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+ 			list_add(&page->lru, &image->unuseable_pages);
+ 			continue;
+ 		}
+-		addr = page_to_pfn(page) << PAGE_SHIFT;
++		addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
+ 
+ 		/* If it is the destination page we want use it */
+ 		if (addr == destination)
+@@ -729,7 +755,7 @@ static struct page *kimage_alloc_page(st
+ 			struct page *old_page;
+ 
+ 			old_addr = *old & PAGE_MASK;
+-			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
++			old_page = kexec_pfn_to_page(old_addr >> PAGE_SHIFT);
+ 			copy_highpage(page, old_page);
+ 			*old = addr | (*old & ~PAGE_MASK);
+ 
+@@ -779,7 +805,7 @@ static int kimage_load_normal_segment(st
+ 			result  = -ENOMEM;
+ 			goto out;
+ 		}
+-		result = kimage_add_page(image, page_to_pfn(page)
++		result = kimage_add_page(image, kexec_page_to_pfn(page)
+ 								<< PAGE_SHIFT);
+ 		if (result < 0)
+ 			goto out;
+@@ -811,6 +837,7 @@ out:
+ 	return result;
+ }
+ 
++#ifndef CONFIG_XEN
+ static int kimage_load_crash_segment(struct kimage *image,
+ 					struct kexec_segment *segment)
+ {
+@@ -833,7 +860,7 @@ static int kimage_load_crash_segment(str
+ 		char *ptr;
+ 		size_t uchunk, mchunk;
+ 
+-		page = pfn_to_page(maddr >> PAGE_SHIFT);
++		page = kexec_pfn_to_page(maddr >> PAGE_SHIFT);
+ 		if (page == 0) {
+ 			result  = -ENOMEM;
+ 			goto out;
+@@ -881,6 +908,13 @@ static int kimage_load_segment(struct ki
+ 
+ 	return result;
+ }
++#else /* CONFIG_XEN */
++static int kimage_load_segment(struct kimage *image,
++				struct kexec_segment *segment)
++{
++	return kimage_load_normal_segment(image, segment);
++}
++#endif
+ 
+ /*
+  * Exec Kernel system call: for obvious reasons only root may call it.
+@@ -991,6 +1025,11 @@ asmlinkage long sys_kexec_load(unsigned 
+ 		if (result)
+ 			goto out;
+ 	}
++#ifdef CONFIG_XEN
++	result = xen_machine_kexec_load(image);
++	if (result)
++		goto out;
++#endif
+ 	/* Install the new kernel, and  Uninstall the old */
+ 	image = xchg(dest_image, image);
+ 
+@@ -1045,7 +1084,6 @@ void crash_kexec(struct pt_regs *regs)
+ 	struct kimage *image;
+ 	int locked;
+ 
+-
+ 	/* Take the kexec_lock here to prevent sys_kexec_load
+ 	 * running on one cpu from replacing the crash kernel
+ 	 * we are using after a panic on a different cpu.
+@@ -1061,12 +1099,17 @@ void crash_kexec(struct pt_regs *regs)
+ 			struct pt_regs fixed_regs;
+ 			crash_setup_regs(&fixed_regs, regs);
+ 			machine_crash_shutdown(&fixed_regs);
++#ifdef CONFIG_XEN
++			xen_machine_kexec(image);
++#else
+ 			machine_kexec(image);
++#endif
+ 		}
+ 		xchg(&kexec_lock, 0);
+ 	}
+ }
+ 
++#ifndef CONFIG_XEN
+ static int __init crash_notes_memory_init(void)
+ {
+ 	/* Allocate memory for saving cpu registers. */
+@@ -1079,3 +1122,4 @@ static int __init crash_notes_memory_ini
+ 	return 0;
+ }
+ module_init(crash_notes_memory_init)
++#endif
+--- x/kernel/sys.c
++++ x/kernel/sys.c
+@@ -435,8 +435,12 @@ void kernel_kexec(void)
+ 	kernel_restart_prepare(NULL);
+ 	printk(KERN_EMERG "Starting new kernel\n");
+ 	machine_shutdown();
++#ifdef CONFIG_XEN
++	xen_machine_kexec(image);
++#else
+ 	machine_kexec(image);
+ #endif
++#endif
+ }
+ EXPORT_SYMBOL_GPL(kernel_kexec);
+ 
--- /dev/null
+++ x/patches/linux-2.6.16.13/0-linux-2.6.16-kexec_page_table_a_stubs.patch
@@ -0,0 +1,85 @@
+kexec: Avoid overwriting the current pgd (V2, stubs)
+
+This patch adds an architecture-specific structure, "struct kimage_arch", to
+struct kimage. The structure is empty here; its members are added by the
+architecture-specific patches that follow this one.
+
+Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
+---
+
+ Applies on top of 2.6.16 and 2.6.17-rc4.
+
+ include/asm-i386/kexec.h    |    2 ++
+ include/asm-powerpc/kexec.h |    2 ++
+ include/asm-s390/kexec.h    |    2 ++
+ include/asm-sh/kexec.h      |    2 ++
+ include/asm-x86_64/kexec.h  |    2 ++
+ include/linux/kexec.h       |    2 ++
+ 6 files changed, 12 insertions(+)
+
+--- x/include/asm-i386/kexec.h
++++ x/include/asm-i386/kexec.h
+@@ -29,6 +29,8 @@
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
++struct kimage_arch {};
++
+ /* CPU does not save ss and esp on stack if execution is already
+  * running in kernel mode at the time of NMI occurrence. This code
+  * fixes it.
+--- x/include/asm-powerpc/kexec.h
++++ x/include/asm-powerpc/kexec.h
+@@ -108,6 +108,8 @@ static inline void crash_setup_regs(stru
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
++struct kimage_arch {};
++
+ #ifdef __powerpc64__
+ extern void kexec_smp_wait(void);	/* get and clear naca physid, wait for
+ 					  master to copy new code to 0 */
+--- x/include/asm-s390/kexec.h
++++ x/include/asm-s390/kexec.h
+@@ -36,6 +36,8 @@
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
++struct kimage_arch {};
++
+ /* Provide a dummy definition to avoid build failures. */
+ static inline void crash_setup_regs(struct pt_regs *newregs,
+ 					struct pt_regs *oldregs) { }
+--- x/include/asm-sh/kexec.h
++++ x/include/asm-sh/kexec.h
+@@ -25,6 +25,8 @@
+ 
+ #ifndef __ASSEMBLY__
+ 
++struct kimage_arch {};
++
+ extern void machine_shutdown(void);
+ extern void *crash_notes;
+ 
+--- x/include/asm-x86_64/kexec.h
++++ x/include/asm-x86_64/kexec.h
+@@ -29,6 +29,8 @@
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
++struct kimage_arch {};
++
+ /*
+  * Saving the registers of the cpu on which panic occured in
+  * crash_kexec to save a valid sp. The registers of other cpus
+--- x/include/linux/kexec.h
++++ x/include/linux/kexec.h
+@@ -69,6 +69,8 @@ struct kimage {
+ 	unsigned long start;
+ 	struct page *control_code_page;
+ 
++	struct kimage_arch arch_data;
++
+ 	unsigned long nr_segments;
+ 	struct kexec_segment segment[KEXEC_SEGMENT_MAX];
+ 

[-- Attachment #3: 51.2.1-kexec-x86-upstream.patch --]
[-- Type: text/plain, Size: 25601 bytes --]

kexec: x86

This is the x86 component of kexec for xen.
The generic component is a prerequisite for this patch.
The x86_64 or x86_32 (i386) patch is also needed
in order to use this code; however, the code should compile as is.

Signed-Off-By: Horms <horms@verge.net.au>
Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>

 linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c |   14 +
 xen/arch/x86/crash.c                                  |  171 ++++++++++++++++-
 xen/arch/x86/dom0_ops.c                               |    3 
 xen/arch/x86/machine_kexec.c                          |  150 +++++++++++++-
 xen/arch/x86/setup.c                                  |   75 ++++++-
 xen/arch/x86/x86_32/Makefile                          |    1 
 xen/arch/x86/x86_32/machine_kexec.c                   |   27 ++
 xen/arch/x86/x86_64/Makefile                          |    1 
 xen/arch/x86/x86_64/machine_kexec.c                   |   28 ++
 xen/include/asm-x86/elf.h                             |   27 ++
 xen/include/asm-x86/fixmap.h                          |    1 
 xen/include/asm-x86/hypercall.h                       |    5 
 xen/include/asm-x86/kexec.h                           |   14 -
 xen/include/asm-x86/x86_32/elf.h                      |   30 ++
 xen/include/asm-x86/x86_32/kexec.h                    |   51 +++++
 xen/include/asm-x86/x86_64/elf.h                      |   30 ++
 xen/include/asm-x86/x86_64/kexec.h                    |   50 ++++
 xen/include/public/kexec.h                            |    2 
 xen/include/xen/elfcore.h                             |    3 
 19 files changed, 652 insertions(+), 31 deletions(-)

--- x/linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c
+++ x/linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c
@@ -23,7 +23,21 @@ static inline unsigned long machine_addr
 
 static void setup_hypercall_arg(xen_kexec_image_t *xki, struct kimage *image)
 {
+#if defined(CONFIG_X86_32) || defined(CONFIG_X86_64)
+	struct kimage_arch *arch = &image->arch_data;
+	int k, n = sizeof(arch->page_table_a) / sizeof(arch->page_table_a[0]);
+
+	memset(xki, 0, sizeof(*xki));
+
+	for (k = 0; k < n; k++)
+		xki->page_table_a[k] = machine_address(arch->page_table_a[k]);
+
+#ifdef CONFIG_X86_64
+	xki->page_table_b = machine_address(arch->page_table_b);
+#endif
+#else
 	memset(xki, 0, sizeof(*xki));
+#endif
 
 	xki->indirection_page = image->head;
 	xki->reboot_code_buffer = 
--- x/xen/arch/x86/crash.c
+++ x/xen/arch/x86/crash.c
@@ -3,16 +3,181 @@
  * 
  * Created By: Horms
  *
- * Should be based heavily on arch/i386/kernel/crash.c from Linux 2.6.16
+ * Based heavily on arch/i386/kernel/crash.c from Linux 2.6.16
  */
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#include <asm/atomic.h>
+#include <asm/elf.h>
+#include <asm/percpu.h>
+#include <asm/kexec.h>
 #include <xen/types.h>
+#include <xen/irq.h>
+#include <asm/ipi.h>
+#include <asm/nmi.h>
+#include <xen/string.h>
+#include <xen/elf.h>
+#include <xen/elfcore.h>
+#include <xen/smp.h>
+#include <xen/delay.h>
+#include <xen/perfc.h>
+#include <xen/kexec.h>
 #include <public/xen.h>
 
+static int crashing_cpu;
+
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+							       size_t data_len)
+{
+	Elf_Note note;
+
+	note.namesz = strlen(name) + 1;
+	note.descsz = data_len;
+	note.type   = type;
+	memcpy(buf, &note, sizeof(note));
+	buf += (sizeof(note) +3)/4;
+	memcpy(buf, name, note.namesz);
+	buf += (note.namesz + 3)/4;
+	memcpy(buf, data, note.descsz);
+	buf += (note.descsz + 3)/4;
+
+	return buf;
+}
+
+static void final_note(u32 *buf)
+{
+	Elf_Note note;
+
+	note.namesz = 0;
+	note.descsz = 0;
+	note.type   = 0;
+	memcpy(buf, &note, sizeof(note));
+}
+
+static void crash_save_this_cpu(struct cpu_user_regs *regs, int cpu)
+{
+	ELF_Prstatus prstatus;
+	uint32_t *buf;
+
+	printk("crash_save_this_cpu: %d\n",  cpu);
+
+	if ((cpu < 0) || (cpu >= NR_CPUS))
+		return;
+
+	/* Save this CPU's registers as an ELF NT_PRSTATUS note in its
+	 * per-cpu crash_notes buffer, followed by an empty terminating note.
+	 * Using ELF notes is opportunistic: a well-defined, tagged structure
+	 * format is needed, ELF notes happen to provide one, and the Linux
+	 * kernel already has infrastructure to support them. The same
+	 * technique is used here so that crash dumps produced by xen
+	 * have the same layout.
+	 */
+	/* It should be safe to use per_cpu() here instead of per_cpu_ptr()
+	 * (which does not exist in xen) as kexecing_lock must be held in
+	 * order to get anywhere near here */
+	buf = (uint32_t *)per_cpu(crash_notes, cpu);
+	if (!buf) /* XXX: Can this ever occur? */
+		return;
+	memset(&prstatus, 0, sizeof(prstatus));
+	/* XXX: Xen does not have processes. For the crashing CPU on a dom0
+	 * crash this could be passed down from dom0, but is this
+	 * necessary?
+	 * prstatus.pr_pid = current->pid; */
+	ELF_CORE_COPY_REGS(prstatus.pr_reg, regs);
+	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
+				sizeof(prstatus));
+	final_note(buf);
+}
+
+static void crash_save_self(struct cpu_user_regs *regs)
+{
+	crash_save_this_cpu(regs, smp_processor_id());
+}
+
+#ifdef CONFIG_SMP
+static atomic_t waiting_for_crash_ipi;
+
+static int crash_nmi_callback(struct cpu_user_regs *regs, int cpu)
+{
+	struct cpu_user_regs fixed_regs;
+
+	/* Do nothing if this handler is invoked on the crashing cpu;
+	 * otherwise the system will hang completely. The crashing cpu can
+	 * get an NMI if the system was booted with the nmi_watchdog parameter.
+	 */
+	if (cpu == crashing_cpu)
+		return 1;
+	local_irq_disable();
+
+	if (!user_mode(regs)) {
+		crash_fixup_ss_esp(&fixed_regs, regs);
+		regs = &fixed_regs;
+	}
+	crash_save_this_cpu(regs, cpu);
+	disable_local_APIC();
+	atomic_dec(&waiting_for_crash_ipi);
+	/* Assume hlt works; spin forever below if execution ever resumes */
+	__asm__ __volatile__ ( "hlt" );
+	for(;;);
+
+	return 1;
+
+	/* Unreachable; keeps crash_setup_regs() used, as Xen builds with -Werror */
+	crash_setup_regs(&fixed_regs, regs);
+}
+
+/*
+ * By using the NMI code instead of a vector we just sneak thru the
+ * word generator coming out with just what we want.  AND it does
+ * not matter if clustered_apic_mode is set or not.
+ */
+static void smp_send_nmi_allbutself(void)
+{
+	cpumask_t allbutself = cpu_online_map;
+    	cpu_clear(smp_processor_id(), allbutself);
+	send_IPI_mask(allbutself, APIC_DM_NMI);
+}
+
+static void nmi_shootdown_cpus(void)
+{
+	unsigned long msecs;
+
+	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
+	/* Would it be better to replace the trap vector here? */
+	set_nmi_callback(crash_nmi_callback);
+	/* Ensure the new callback function is set before sending
+	 * out the NMI
+	 */
+	wmb();
+
+	smp_send_nmi_allbutself();
+
+	msecs = 1000; /* Wait at most a second for the other cpus to stop */
+	while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
+		mdelay(1);
+		msecs--;
+	}
+
+	/* Leave the nmi callback set */
+	disable_local_APIC();
+}
+#else
+static void nmi_shootdown_cpus(void)
+{
+	/* There are no cpus to shootdown */
+}
+#endif
+
 void machine_crash_shutdown(struct cpu_user_regs *regs)
 {
-    printk("STUB: arch/x86/crash.c: machine_crash_shutdown: not implemented\n");
+	printk("machine_crash_shutdown: %d\n", smp_processor_id());
+	local_irq_disable();
+
+	crashing_cpu = smp_processor_id();
+	nmi_shootdown_cpus();
+#ifdef CONFIG_X86_IO_APIC
+        disable_IO_APIC();
+#endif
+	crash_save_self(regs);
 }
 
 /*
--- x/xen/arch/x86/dom0_ops.c
+++ x/xen/arch/x86/dom0_ops.c
@@ -29,6 +29,9 @@
 #include <asm/mtrr.h>
 #include "cpu/mtrr/mtrr.h"
 
+extern unsigned int opt_kdump_megabytes;
+extern unsigned int opt_kdump_megabytes_base;
+
 #define TRC_DOM0OP_ENTER_BASE  0x00020000
 #define TRC_DOM0OP_LEAVE_BASE  0x00030000
 
--- x/xen/arch/x86/machine_kexec.c
+++ x/xen/arch/x86/machine_kexec.c
@@ -6,38 +6,164 @@
  */
 
 #include <xen/lib.h>       /* for printk() used in stubs */
+#include <asm/irq.h>
+#include <asm/page.h>
+#include <asm/flushtlb.h>
+#include <xen/lib.h>
+#include <xen/smp.h>
+#include <xen/nmi.h>
 #include <xen/types.h>
+#include <xen/console.h>
+#include <xen/kexec.h>
 #include <public/kexec.h>
+#include <xen/domain_page.h>
+#include <asm/fixmap.h>
+ 
+#define create_level_mapping(lvl, next, pages, nopages, k, va)               \
+{                                                                            \
+    void *old = next; /* table currently mapped; released before leaving */ \
+    lvl##_pgentry_t *table = (lvl##_pgentry_t *)old + lvl##_table_offset(va); \
+    if (!(lvl##e_get_flags(*table) & _PAGE_PRESENT)) {                       \
+        if (k >= nopages || pages[k] == 0) {                                 \
+            unmap_domain_page(old); /* do not leak the mapping on failure */ \
+            return -1;                                                       \
+        }                                                                    \
+        *table = lvl##e_from_pfn(pages[k++]>>PAGE_SHIFT, __PAGE_HYPERVISOR); \
+    }                                                                        \
+    next = map_domain_page(lvl##e_get_pfn(*table));                          \
+    unmap_domain_page(old);                                                  \
+}
+
+#define create_level_1_mapping(next, nopages, va, pa)               \
+{                                                                   \
+    l1_pgentry_t *table;                                            \
+                                                                    \
+    table = (l1_pgentry_t *)next + l1_table_offset(va);             \
+    if (!(l1e_get_flags(*table) & _PAGE_PRESENT)) {                 \
+        *table = l1e_from_pfn(pa >> PAGE_SHIFT, __PAGE_HYPERVISOR); \
+    }                                                               \
+    unmap_domain_page(next);                                        \
+}
+
+static int create_mapping(unsigned long root,
+                          unsigned long *pages, int nopages,
+                          unsigned long va, unsigned long pa)
+{
+    void *next = map_domain_page(root >> PAGE_SHIFT);
+    int k = 0;
+
+#if CONFIG_PAGING_LEVELS >= 4
+    create_level_mapping(l4, next, pages, nopages, k, va);
+#endif
+#if CONFIG_PAGING_LEVELS >= 3
+    create_level_mapping(l3, next, pages, nopages, k, va);
+#endif
+    create_level_mapping(l2, next, pages, nopages, k, va);
+
+    create_level_1_mapping(next, nopages, va, pa);
+
+    return k;
+}
+
+static int setup_page_table_a(xen_kexec_image_t *image)
+{
+    void *page;
+    int k, n = sizeof(image->page_table_a) / sizeof(image->page_table_a[0]);
+
+    /* clear page_table_a pages */
+
+    for (k = 0; k < n; k++) {
+        if (!image->page_table_a[k])
+            break;
+
+        page = map_domain_page(image->page_table_a[k] >> PAGE_SHIFT);
+        clear_page(page);
+        unmap_domain_page(page);
+    }
+
+    /* check that the first page (root page) is actually non-zero */
+
+    if (k == 0)
+        return -1;
+
+    /* setup fixmap to point to our control page */
+
+    set_fixmap(FIX_KEXEC_PAGE, image->reboot_code_buffer);
+
+    /* fill in page_table_a: create mapping at fixmap address */
+
+    k = create_mapping(image->page_table_a[0],
+                       &image->page_table_a[1],
+                       n - 1, fix_to_virt(FIX_KEXEC_PAGE),
+                       image->reboot_code_buffer);
+    if (k < 0)
+        return -1;
+
+    /* fill in page_table_a: create identity mapping */
+
+    k = create_mapping(image->page_table_a[0],
+                       &image->page_table_a[1 + k],
+                       n - (1 + k), image->reboot_code_buffer,
+                       image->reboot_code_buffer);
+    if (k < 0)
+        return -1;
+
+	return 0;
+}
 
 int machine_kexec_load(int type, xen_kexec_image_t *image)
 {
-    printk("STUB: arch/x86/machine_kexec.c: machine_kexec_load: "
-        "not implemented\n");
-    return -1;
+    return setup_page_table_a(image);
 }
 
 void machine_kexec_unload(int type, xen_kexec_image_t *image)
 {
-    printk("STUB: arch/x86/machine_kexec.c: machine_kexec_unload: "
-        "not implemented\n");
 }
 
 void machine_kexec_reserved(xen_kexec_reserve_t *reservation)
 {
-    printk("STUB: arch/x86/machine_kexec.c: machine_kexec_reserved: "
-        "not implemented\n");
+    reservation->size = (unsigned long)opt_kdump_megabytes << 20; /* MB->B */
+    reservation->start = (unsigned long)opt_kdump_megabytes_base << 20;
 }
 
-void machine_kexec(xen_kexec_image_t *image)
+static void __machine_shutdown(void *data)
 {
-    printk("STUB: arch/x86/machine_kexec.c: machine_kexec: "
-        "not implemented\n");
+    xen_kexec_image_t *image = (xen_kexec_image_t *)data;
+
+    printk("__machine_shutdown: cpu=%u\n", smp_processor_id());
+
+    watchdog_disable();
+    console_start_sync();
+
+    smp_send_stop();
+
+#ifdef CONFIG_X86_IO_APIC
+    disable_IO_APIC();
+#endif   
+
+    machine_kexec(image);
 }
 
 void machine_shutdown(xen_kexec_image_t *image)
 {
-    printk("STUB: arch/x86/machine_shutdown.c: machine_shutdown: "
-       "not implemented\n");
+    int reboot_cpu_id;
+    cpumask_t reboot_cpu;
+
+    reboot_cpu_id = 0;
+
+    if (!cpu_isset(reboot_cpu_id, cpu_online_map))
+        reboot_cpu_id = smp_processor_id();
+    
+    if (reboot_cpu_id != smp_processor_id()) {
+        cpus_clear(reboot_cpu);
+        cpu_set(reboot_cpu_id, reboot_cpu);
+        on_selected_cpus(reboot_cpu, __machine_shutdown, image, 1, 0);
+	for (;;)
+		; /* nothing */
+    }
+    else
+        __machine_shutdown(image);
+    BUG();
 }
 
 /*
--- x/xen/arch/x86/setup.c
+++ x/xen/arch/x86/setup.c
@@ -38,6 +38,11 @@ static unsigned int opt_xenheap_megabyte
 integer_param("xenheap_megabytes", opt_xenheap_megabytes);
 #endif
 
+unsigned int opt_kdump_megabytes = 0;
+integer_param("kdump_megabytes", opt_kdump_megabytes);
+unsigned int opt_kdump_megabytes_base = 0;
+integer_param("kdump_megabytes_base", opt_kdump_megabytes_base);
+
 /* opt_nosmp: If true, secondary processors are ignored. */
 static int opt_nosmp = 0;
 boolean_param("nosmp", opt_nosmp);
@@ -192,6 +197,20 @@ static void percpu_free_unused_areas(voi
                        __pa(__per_cpu_end));
 }
 
+void __init move_memory(unsigned long dst, 
+                          unsigned long src_start, unsigned long src_end)
+{
+#if defined(CONFIG_X86_32)
+    memmove((void *)dst,  /* use low mapping */
+            (void *)src_start,      /* use low mapping */
+            src_end - src_start);
+#elif defined(CONFIG_X86_64)
+    memmove(__va(dst),
+            __va(src_start),
+            src_end - src_start);
+#endif
+}
+
 void __init __start_xen(multiboot_info_t *mbi)
 {
     char __cmdline[] = "", *cmdline = __cmdline;
@@ -327,15 +346,8 @@ void __init __start_xen(multiboot_info_t
         initial_images_start = xenheap_phys_end;
     initial_images_end = initial_images_start + modules_length;
 
-#if defined(CONFIG_X86_32)
-    memmove((void *)initial_images_start,  /* use low mapping */
-            (void *)mod[0].mod_start,      /* use low mapping */
-            mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
-#elif defined(CONFIG_X86_64)
-    memmove(__va(initial_images_start),
-            __va(mod[0].mod_start),
-            mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
-#endif
+    move_memory(initial_images_start, 
+                mod[0].mod_start, mod[mbi->mods_count-1].mod_end);
 
     /* Initialise boot-time allocator with all RAM situated after modules. */
     xenheap_phys_start = init_boot_allocator(__pa(&_end));
@@ -383,6 +395,51 @@ void __init __start_xen(multiboot_info_t
 #endif
     }
 
+    if (opt_kdump_megabytes) {
+        unsigned long kdump_start, kdump_size, k;
+
+        /* mark images pages as free for now */
+
+        init_boot_pages(initial_images_start, initial_images_end);
+        /* widen first: the options are unsigned int, so << 20 could wrap */
+        kdump_start = (unsigned long)opt_kdump_megabytes_base << 20;
+        kdump_size = (unsigned long)opt_kdump_megabytes << 20;
+
+        printk("Kdump: %luMB (%lukB) at 0x%lx\n", 
+               kdump_size >> 20,
+               kdump_size >> 10,
+               kdump_start);
+
+        if ((kdump_start & ~PAGE_MASK) || (kdump_size & ~PAGE_MASK))
+            panic("Kdump parameters not page aligned\n");
+
+        kdump_start >>= PAGE_SHIFT;
+        kdump_size >>= PAGE_SHIFT;
+
+        /* allocate pages for Kdump memory area */
+
+        k = alloc_boot_pages_at(kdump_size, kdump_start);
+
+        if (k != kdump_start)
+            panic("Unable to reserve Kdump memory\n");
+
+        /* allocate pages for relocated initial images (size rounded up) */
+
+        k = ((initial_images_end - initial_images_start) & ~PAGE_MASK) ? 1 : 0;
+        k += (initial_images_end - initial_images_start) >> PAGE_SHIFT;
+
+        k = alloc_boot_pages(k, 1);
+
+        if (!k)
+            panic("Unable to allocate initial images memory\n");
+
+        move_memory(k << PAGE_SHIFT, initial_images_start, initial_images_end);
+
+        initial_images_end -= initial_images_start;
+        initial_images_start = k << PAGE_SHIFT;
+        initial_images_end += initial_images_start;
+    }        
+
     memguard_init();
 
     printk("System RAM: %luMB (%lukB)\n", 
--- x/xen/arch/x86/x86_32/Makefile
+++ x/xen/arch/x86/x86_32/Makefile
@@ -3,5 +3,6 @@ obj-y += entry.o
 obj-y += mm.o
 obj-y += seg_fixup.o
 obj-y += traps.o
+obj-y += machine_kexec.o
 
 obj-$(supervisor_mode_kernel) += supervisor_mode_kernel.o
--- /dev/null
+++ x/xen/arch/x86/x86_32/machine_kexec.c
@@ -0,0 +1,27 @@
+/*
+ * arch/x86/x86_32/machine_kexec.c
+ * Handle transition of Linux booting another kernel
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Should be loosely based on arch/i386/kernel/machine_kexec.c
+ */
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <public/kexec.h>
+
+void machine_kexec(xen_kexec_image_t *image)
+{
+    printk("STUB: arch/x86/x86_32/machine_kexec.c: machine_kexec: "
+        "not implemented\n");
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/arch/x86/x86_64/Makefile
+++ x/xen/arch/x86/x86_64/Makefile
@@ -1,3 +1,4 @@
 obj-y += entry.o
 obj-y += mm.o
 obj-y += traps.o
+obj-y += machine_kexec.o
--- /dev/null
+++ x/xen/arch/x86/x86_64/machine_kexec.c
@@ -0,0 +1,28 @@
+/*
+ * arch/x86/x86_64/machine_kexec.c
+ * Handle transition of Linux booting another kernel
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Should be loosely based on arch/x86_64/kernel/machine_kexec.c
+ */
+
+#include <xen/lib.h>       /* for printk() used in stubs */
+#include <xen/types.h>
+#include <public/kexec.h>
+
+void machine_kexec(xen_kexec_image_t *image)
+{
+    printk("STUB: arch/x86/x86_64/machine_kexec.c: machine_kexec: "
+        "not implemented\n");
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/asm-x86/elf.h
@@ -0,0 +1,27 @@
+/******************************************************************************
+ * include/asm-x86/elf.h
+ * 
+ * Created By: Horms
+ *
+ */
+
+#ifndef __X86_ELF_H__
+#define __X86_ELF_H__
+
+#ifdef __x86_64__
+#include <asm/x86_64/elf.h>
+#else
+#include <asm/x86_32/elf.h>
+#endif
+
+#endif /* __X86_ELF_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/include/asm-x86/fixmap.h
+++ x/xen/include/asm-x86/fixmap.h
@@ -32,6 +32,7 @@ enum fixed_addresses {
     FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
     FIX_HPET_BASE,
     FIX_CYCLONE_TIMER,
+    FIX_KEXEC_PAGE,
     __end_of_fixed_addresses
 };
 
--- x/xen/include/asm-x86/hypercall.h
+++ x/xen/include/asm-x86/hypercall.h
@@ -6,6 +6,7 @@
 #define __ASM_X86_HYPERCALL_H__
 
 #include <public/physdev.h>
+#include <xen/types.h>
 
 extern long
 do_event_channel_op_compat(
@@ -87,6 +88,10 @@ extern long
 arch_do_vcpu_op(
     int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg);
 
+extern int
+do_kexec(
+    unsigned long op, unsigned arg1, XEN_GUEST_HANDLE(void) uarg);
+
 #ifdef __x86_64__
 
 extern long
--- x/xen/include/asm-x86/kexec.h
+++ x/xen/include/asm-x86/kexec.h
@@ -8,16 +8,16 @@
 #ifndef __X86_KEXEC_H__
 #define __X86_KEXEC_H__
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#include <asm/processor.h>
 #include <xen/types.h>
+#include <xen/string.h>
 #include <public/xen.h>
 
-static void crash_setup_regs(struct cpu_user_regs *newregs,
-			     struct cpu_user_regs *oldregs)
-{
-    printk("STUB: include/asm-x86/kexec.h: crash_setup_regs: "
-       "not implemented\n");
-}
+#ifdef __x86_64__
+#include <asm/x86_64/kexec.h>
+#else
+#include <asm/x86_32/kexec.h>
+#endif
 
 #endif /* __X86_KEXEC_H__ */
 
--- /dev/null
+++ x/xen/include/asm-x86/x86_32/elf.h
@@ -0,0 +1,30 @@
+/******************************************************************************
+ * include/asm-x86/x86_32/elf.h
+ * 
+ * Created By: Horms
+ *
+ * Should be based on include/asm-i386/elf.h:ELF_CORE_COPY_REGS
+ * from Linux 2.6.16
+ */
+
+#ifndef __X86_ELF_X86_32_H__
+#define __X86_ELF_X86_32_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+
+/* Stub: only logs that ELF_CORE_COPY_REGS is not implemented; args unused */
+#define ELF_CORE_COPY_REGS(pr_reg, regs)                                \
+    printk("STUB: include/asm-x86/x86_32/elf.h: ELF_CORE_COPY_REGS: "   \
+       "not implemented\n")
+
+#endif /* __X86_ELF_X86_32_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/asm-x86/x86_32/kexec.h
@@ -0,0 +1,51 @@
+/******************************************************************************
+ * include/asm-x86/x86_32/kexec.h
+ * 
+ * Created By: Horms
+ *
+ * Should be based heavily on include/asm-i386/kexec.h from Linux 2.6.16
+ *
+ */
+
+#ifndef __X86_32_KEXEC_H__
+#define __X86_32_KEXEC_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/xen.h>
+
+static void crash_fixup_ss_esp(struct cpu_user_regs *newregs,
+		    struct cpu_user_regs *oldregs)
+{
+    printk("STUB: include/asm-x86/x86_32/kexec.h: crash_fixup_ss_esp: "
+       "not implemented\n");
+    return;
+    crash_fixup_ss_esp(newregs, oldregs);
+}
+
+static void crash_setup_regs(struct cpu_user_regs *newregs,
+			     struct cpu_user_regs *oldregs)
+{
+    printk("STUB: include/asm-x86/x86_32/kexec.h: crash_setup_regs: "
+       "not implemented\n");
+}
+
+static inline int user_mode(struct cpu_user_regs *regs)
+{
+    printk("STUB: include/asm-x86/x86_32/kexec.h: user_mode: "
+       "not implemented\n");
+    return -1;
+}
+
+
+#endif /* __X86_32_KEXEC_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/asm-x86/x86_64/elf.h
@@ -0,0 +1,30 @@
+/******************************************************************************
+ * include/asm-x86/x86_64/elf.h
+ * 
+ * Created By: Horms
+ *
+ * Should be based on include/asm-x86_64/elf.h:ELF_CORE_COPY_REGS
+ * from Linux 2.6.16
+ */
+
+#ifndef __X86_ELF_X86_64_H__
+#define __X86_ELF_X86_64_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+
+/* Stub: only logs that ELF_CORE_COPY_REGS is not implemented; args unused */
+#define ELF_CORE_COPY_REGS(pr_reg, regs)                                \
+    printk("STUB: include/asm-x86/x86_64/elf.h: ELF_CORE_COPY_REGS: "   \
+       "not implemented\n")
+
+#endif /* __X86_ELF_X86_64_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/asm-x86/x86_64/kexec.h
@@ -0,0 +1,50 @@
+/******************************************************************************
+ * include/asm-x86/x86_64/kexec.h
+ * 
+ * Created By: Horms
+ *
+ * Should be based heavily on include/asm-x86_64/kexec.h from Linux 2.6.16
+ *
+ */
+
+#ifndef __X86_64_KEXEC_H__
+#define __X86_64_KEXEC_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/xen.h>
+
+static void crash_fixup_ss_esp(struct cpu_user_regs *newregs,
+		    struct cpu_user_regs *oldregs)
+{
+    printk("STUB: include/asm-x86/x86_64/kexec.h: crash_fixup_ss_esp: "
+       "not implemented\n");
+}
+
+static void crash_setup_regs(struct cpu_user_regs *newregs,
+			     struct cpu_user_regs *oldregs)
+{
+    printk("STUB: include/asm-x86/x86_64/kexec.h: crash_setup_regs: "
+       "not implemented\n");
+    return;
+    crash_fixup_ss_esp(newregs, oldregs);
+}
+
+static inline int user_mode(struct cpu_user_regs *regs)
+{
+    printk("STUB: include/asm-x86/x86_64/kexec.h: user_mode: "
+       "not implemented\n");
+    return -1;
+}
+
+#endif /* __X86_64_KEXEC_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/include/public/kexec.h
+++ x/xen/include/public/kexec.h
@@ -43,6 +43,8 @@
  */
 #define KEXEC_CMD_kexec_load            1
 typedef struct xen_kexec_image {
+    unsigned long page_table_a[7];
+    unsigned long page_table_b;
     unsigned long indirection_page;
     unsigned long reboot_code_buffer;
     unsigned long start_address;
--- x/xen/include/xen/elfcore.h
+++ x/xen/include/xen/elfcore.h
@@ -16,6 +16,9 @@
 #include <public/xen.h>
 
 #define NT_PRSTATUS     1
+#define NT_XEN_DOM0_CR3 0x10000001 /* XXX: Hopefully this is unused,
+					   feel free to change to a 
+					   better/different value */
 
 typedef struct
 {

[-- Attachment #4: 51.2.1.1-kexec-x86_32-upstream.patch --]
[-- Type: text/plain, Size: 29971 bytes --]

kexec: x86_32

This is the x86_32 component of kexec for xen.
The x86 component is a prerequisite for this patch.

Signed-Off-By: Horms <horms@verge.net.au>
Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>

 buildconfigs/linux-defconfig_xen_x86_32                              |    4 
 linux-2.6-xen-sparse/arch/i386/Kconfig                               |    2 
 linux-2.6-xen-sparse/arch/i386/kernel/Makefile                       |    2 
 linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c                    |   29 
 linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h                    |   42 
 linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h       |    8 
 patches/linux-2.6.16.13/1-linux-2.6.16-kexec_page_table_a_i386.patch |  457 ++++++++++
 xen/arch/x86/crash.c                                                 |   47 +
 xen/arch/x86/x86_32/entry.S                                          |    2 
 xen/arch/x86/x86_32/machine_kexec.c                                  |   28 
 xen/include/asm-x86/x86_32/elf.h                                     |   34 
 xen/include/asm-x86/x86_32/kexec.h                                   |   68 +
 12 files changed, 683 insertions(+), 40 deletions(-)

--- x/buildconfigs/linux-defconfig_xen_x86_32
+++ x/buildconfigs/linux-defconfig_xen_x86_32
@@ -184,10 +184,11 @@ CONFIG_MTRR=y
 CONFIG_REGPARM=y
 CONFIG_SECCOMP=y
 CONFIG_HZ_100=y
+CONFIG_KEXEC=y
 # CONFIG_HZ_250 is not set
 # CONFIG_HZ_1000 is not set
 CONFIG_HZ=100
-# CONFIG_CRASH_DUMP is not set
+CONFIG_CRASH_DUMP=y
 CONFIG_PHYSICAL_START=0x100000
 CONFIG_HOTPLUG_CPU=y
 
@@ -2774,6 +2775,7 @@ CONFIG_NTFS_FS=m
 #
 CONFIG_PROC_FS=y
 CONFIG_PROC_KCORE=y
+# CONFIG_PROC_VMCORE is not set
 CONFIG_SYSFS=y
 CONFIG_TMPFS=y
 # CONFIG_HUGETLB_PAGE is not set
--- x/linux-2.6-xen-sparse/arch/i386/Kconfig
+++ x/linux-2.6-xen-sparse/arch/i386/Kconfig
@@ -726,7 +726,7 @@ source kernel/Kconfig.hz
 
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && !X86_XEN
+	depends on EXPERIMENTAL && !XEN_UNPRIVILEGED_GUEST
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
--- x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
+++ x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
@@ -89,7 +89,7 @@ include $(srctree)/scripts/Makefile.xen
 
 obj-y += fixup.o
 microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o
-n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o
+n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o crash.o
 
 obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
 obj-y := $(call cherrypickxen, $(obj-y))
--- x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
+++ x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
@@ -68,6 +68,10 @@
 #include "setup_arch_pre.h"
 #include <bios_ebda.h>
 
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
 /* Forward Declaration. */
 void __init find_max_pfn(void);
 
@@ -941,6 +945,7 @@ static void __init parse_cmdline_early (
 		 * after a kernel panic.
 		 */
 		else if (!memcmp(from, "crashkernel=", 12)) {
+#ifndef CONFIG_XEN
 			unsigned long size, base;
 			size = memparse(from+12, &from);
 			if (*from == '@') {
@@ -951,6 +956,10 @@ static void __init parse_cmdline_early (
 				crashk_res.start = base;
 				crashk_res.end   = base + size - 1;
 			}
+#else
+			printk("Ignoring crashkernel command line, "
+			       "parameter will be supplied by xen\n");
+#endif
 		}
 #endif
 #ifdef CONFIG_PROC_VMCORE
@@ -1320,9 +1329,22 @@ void __init setup_bootmem_allocator(void
 	}
 #endif
 #ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
 	if (crashk_res.start != crashk_res.end)
 		reserve_bootmem(crashk_res.start,
 			crashk_res.end - crashk_res.start + 1);
+#else
+	{
+		xen_kexec_reserve_t reservation;
+		BUG_ON(HYPERVISOR_kexec(KEXEC_CMD_kexec_reserve, 0,
+					&reservation));
+		if (reservation.size) {
+			crashk_res.start = reservation.start;
+			crashk_res.end = reservation.start + 
+				reservation.size - 1;
+		}
+	}
+#endif
 #endif
 
 	if (!xen_feature(XENFEAT_auto_translated_physmap))
@@ -1378,6 +1400,10 @@ legacy_init_iomem_resources(struct e820e
 		res->end = res->start + e820[i].size - 1;
 		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 		request_resource(&iomem_resource, res);
+#ifdef CONFIG_KEXEC
+		request_resource(res, &crashk_res);
+#endif
+
 		if (e820[i].type == E820_RAM) {
 			/*
 			 *  We don't know which RAM region contains kernel data,
@@ -1386,9 +1412,6 @@ legacy_init_iomem_resources(struct e820e
 			 */
 			request_resource(res, code_resource);
 			request_resource(res, data_resource);
-#ifdef CONFIG_KEXEC
-			request_resource(res, &crashk_res);
-#endif
 		}
 	}
 }
--- /dev/null
+++ x/linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h
@@ -0,0 +1,42 @@
+/*
+ * include/asm-i386/kexec-xen.h
+ *
+ * Created By: Horms <horms@verge.net.au>
+ */
+
+#ifndef _I386_KEXEC_XEN_H
+#define _I386_KEXEC_XEN_H
+
+#include <asm/ptrace.h>
+#include <asm/types.h>
+#include <xen/interface/arch-x86_32.h>
+
+static inline void crash_translate_regs(struct pt_regs *linux_regs,
+					struct cpu_user_regs *xen_regs)
+{
+	xen_regs->ebx    = linux_regs->ebx;
+	xen_regs->ecx    = linux_regs->ecx;
+	xen_regs->edx    = linux_regs->edx;
+	xen_regs->esi    = linux_regs->esi;
+	xen_regs->edi    = linux_regs->edi;
+	xen_regs->ebp    = linux_regs->ebp;
+	xen_regs->eax    = linux_regs->eax;
+	xen_regs->esp    = linux_regs->esp;
+	xen_regs->ss     = linux_regs->xss;
+	xen_regs->cs     = linux_regs->xcs;
+	xen_regs->ds     = linux_regs->xds;
+	xen_regs->es     = linux_regs->xes;
+	xen_regs->eflags = linux_regs->eflags;
+}
+
+#endif /* _I386_KEXEC_XEN_H */
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
+++ x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
@@ -368,5 +368,13 @@ HYPERVISOR_xenoprof_op(
 	return _hypercall2(int, xenoprof_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_kexec(
+	unsigned long op, unsigned int arg1, void * extra_args)
+{
+	return _hypercall3(int, kexec_op, op, arg1, extra_args);
+}
+
+
 
 #endif /* __HYPERCALL_H__ */
--- x/xen/arch/x86/crash.c
+++ x/xen/arch/x86/crash.c
@@ -21,6 +21,7 @@
 #include <xen/delay.h>
 #include <xen/perfc.h>
 #include <xen/kexec.h>
+#include <xen/sched.h>
 #include <public/xen.h>
 
 static int crashing_cpu;
@@ -167,6 +168,51 @@ static void nmi_shootdown_cpus(void)
 }
 #endif
 
+/* The cr3 for dom0 on each of its vcpus
+ * It is added as ELF_Prstatus prstatus.pr_reg[ELF_NGREG-1], where
+ * prstatus is the data of the elf note, and ELF_NGREG was extended
+ * by one to allow extra space.
+ * This code runs after all cpus except the crashing one have
+ * been shutdown so as to avoid having to hold domlist_lock,
+ * as locking after a crash is playing with fire */
+void find_dom0_cr3(void)
+{
+	struct domain *d;
+	struct vcpu   *v;
+	uint32_t *buf;
+	uint32_t cr3;
+	Elf_Note note;
+
+	/* Don't need to grab domlist_lock as we are the only thing running */
+
+	/* No need to traverse domain_list, as dom0 is always first */
+	d = domain_list;
+	BUG_ON(d->domain_id);
+
+	for_each_vcpu ( d, v ) {
+		if ( test_bit(_VCPUF_down, &v->vcpu_flags) )
+			continue;
+		buf = (uint32_t *)per_cpu(crash_notes, v->processor);
+		if (!buf) /* XXX: Can this ever occur? */
+			continue;
+
+		memcpy(&note, buf, sizeof(Elf_Note));
+		buf += (sizeof(Elf_Note) +3)/4 + (note.namesz + 3)/4 +
+			(note.descsz + 3)/4;
+
+		/* XXX: This probably doesn't take into account shadow mode,
+		 * but that might not be a problem */
+		cr3 = pagetable_get_pfn(v->arch.guest_table);
+
+		buf = append_elf_note(buf, "Xen Domanin-0 CR3",
+			NT_XEN_DOM0_CR3, &cr3, 4);
+		final_note(buf);
+
+		printk("domain:%i vcpu:%u processor:%u cr3:%08x\n", 
+		       d->domain_id, v->vcpu_id, v->processor, cr3);
+	}
+}
+
 void machine_crash_shutdown(struct cpu_user_regs *regs)
 {
 	printk("machine_crash_shutdown: %d\n", smp_processor_id());
@@ -178,6 +224,7 @@ void machine_crash_shutdown(struct cpu_u
         disable_IO_APIC();
 #endif
 	crash_save_self(regs);
+	find_dom0_cr3();
 }
 
 /*
--- x/xen/arch/x86/x86_32/entry.S
+++ x/xen/arch/x86/x86_32/entry.S
@@ -656,6 +656,7 @@ ENTRY(hypercall_table)
         .long do_xenoprof_op
         .long do_event_channel_op
         .long do_physdev_op
+        .long do_kexec
         .rept NR_hypercalls-((.-hypercall_table)/4)
         .long do_ni_hypercall
         .endr
@@ -695,6 +696,7 @@ ENTRY(hypercall_args_table)
         .byte 2 /* do_xenoprof_op       */
         .byte 2 /* do_event_channel_op  */
         .byte 2 /* do_physdev_op        */
+        .byte 3 /* do_kexec             */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
--- x/xen/arch/x86/x86_32/machine_kexec.c
+++ x/xen/arch/x86/x86_32/machine_kexec.c
@@ -1,19 +1,31 @@
-/*
+/******************************************************************************
  * arch/x86/x86_32/machine_kexec.c
- * Handle transition of Linux booting another kernel
- *
- * Created By: Horms <horms@verge.net.au>
+ * 
+ * Created By: Horms
  *
- * Should be losely based on arch/i386/kernel/machine_kexec.c
+ * Based heavily on arch/i386/machine_kexec.c from Linux 2.6.16
  */
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
 #include <public/kexec.h>
+#include <asm/fixmap.h>
+#include <asm/processor.h>
+
+typedef asmlinkage void (*relocate_new_kernel_t)(
+                    unsigned long indirection_page,
+                    unsigned long reboot_code_buffer,
+                    unsigned long start_address,
+                    unsigned long page_table_a,
+                    unsigned long has_pae);
 
 void machine_kexec(xen_kexec_image_t *image)
 {
-    printk("STUB: arch/x86/x86_32/machine_kexec.c: machine_kexec: "
-        "not implemented\n");
+    relocate_new_kernel_t rnk;
+
+    rnk = (relocate_new_kernel_t) fix_to_virt(FIX_KEXEC_PAGE);
+    (*rnk)(image->indirection_page, image->reboot_code_buffer, 
+           image->start_address, image->page_table_a[0],
+           (unsigned long)cpu_has_pae);
 }
 
 /*
--- x/xen/include/asm-x86/x86_32/elf.h
+++ x/xen/include/asm-x86/x86_32/elf.h
@@ -3,19 +3,39 @@
  * 
  * Created By: Horms
  *
- * Should pull be based on include/asm-i386/elf.h:ELF_CORE_COPY_REGS
- * from Linux 2.6.16
+ * Based heavily on include/asm-i386/elf.h and 
+ * include/asm-i386/system.h from Linux 2.6.16
  */
 
 #ifndef __X86_ELF_X86_32_H__
 #define __X86_ELF_X86_32_H__
 
-#include <xen/lib.h>       /* for printk() used in stub */
+/* XXX: Xen doesn't have orig_eax.  For kdump, on a dom0 crash, the values
+ * for the crashing CPU could be passed down from dom0, but is that
+ * necessary?
+ * Also, I'm not sure why fs and gs are derived from the CPU
+ * rather than regs */
 
-#define ELF_CORE_COPY_REGS(pr_reg, regs)                                \
-    printk("STUB: include/asm-x86/x86_32/kexec.h: ELF_CORE_COPY_REGS: " \
-       "not implemented\n")
-  
+#define ELF_CORE_COPY_REGS(pr_reg, regs) do {                           \
+    unsigned i;                                                         \
+    pr_reg[0] = regs->ebx;                                              \
+    pr_reg[1] = regs->ecx;                                              \
+    pr_reg[2] = regs->edx;                                              \
+    pr_reg[3] = regs->esi;                                              \
+    pr_reg[4] = regs->edi;                                              \
+    pr_reg[5] = regs->ebp;                                              \
+    pr_reg[6] = regs->eax;                                              \
+    pr_reg[7] = regs->ds;                                               \
+    pr_reg[8] = regs->es;                                               \
+    asm volatile("mov %%fs,%0":"=rm" (i)); pr_reg[9]  = i;              \
+    asm volatile("mov %%gs,%0":"=rm" (i)); pr_reg[10] = i;              \
+    pr_reg[11] = 0; /* regs->orig_eax; */                               \
+    pr_reg[12] = regs->eip;                                             \
+    pr_reg[13] = regs->cs;                                              \
+    pr_reg[14] = regs->eflags;                                          \
+    pr_reg[15] = regs->esp;                                             \
+    pr_reg[16] = regs->ss;                                              \
+} while(0);
 
 #endif /* __X86_ELF_X86_32_H__ */
 
--- x/xen/include/asm-x86/x86_32/kexec.h
+++ x/xen/include/asm-x86/x86_32/kexec.h
@@ -3,42 +3,72 @@
  * 
  * Created By: Horms
  *
- * Should be based heavily on include/asm-i386/kexec.h from Linux 2.6.16
- *
+ * Based heavily on include/asm-i386/kexec.h from Linux 2.6.16
  */
 
-#ifndef __X86_32_KEXEC_H__
-#define __X86_32_KEXEC_H__
-
-#include <xen/lib.h>       /* for printk() used in stub */
-#include <xen/types.h>
-#include <public/xen.h>
+#ifndef __X86_KEXEC_X86_32_H__
+#define __X86_KEXEC_X86_32_H__
 
+/* CPU does not save ss and esp on stack if execution is already
+ * running in kernel mode at the time of NMI occurrence. This code
+ * fixes it.
+ */
 static void crash_fixup_ss_esp(struct cpu_user_regs *newregs,
-		    struct cpu_user_regs *oldregs)
+                    struct cpu_user_regs *oldregs)
 {
-    printk("STUB: include/asm-x86/x86_32/kexec.h: crash_fixup_ss_esp: "
-       "not implemented\n");
-    return;
-    crash_fixup_ss_esp(newregs, oldregs);
+    memcpy(newregs, oldregs, sizeof(*newregs));
+    newregs->esp = (unsigned long)&(oldregs->esp);
+    __asm__ __volatile__(
+            "xorl %%eax, %%eax\n\t"
+            "movw %%ss, %%ax\n\t"
+            :"=a"(newregs->ss));
 }
 
+/*
+ * This function is responsible for capturing register states if coming
+ * via panic otherwise just fix up the ss and esp if coming via kernel
+ * mode exception.
+ */
 static void crash_setup_regs(struct cpu_user_regs *newregs,
 			     struct cpu_user_regs *oldregs)
 {
-    printk("STUB: include/asm-x86/x86_32/kexec.h: crash_setup_regs: "
-       "not implemented\n");
+    if (oldregs)
+        crash_fixup_ss_esp(newregs, oldregs);
+    else {
+        __asm__ __volatile__("movl %%ebx,%0" : "=m"(newregs->ebx));
+        __asm__ __volatile__("movl %%ecx,%0" : "=m"(newregs->ecx));
+        __asm__ __volatile__("movl %%edx,%0" : "=m"(newregs->edx));
+        __asm__ __volatile__("movl %%esi,%0" : "=m"(newregs->esi));
+        __asm__ __volatile__("movl %%edi,%0" : "=m"(newregs->edi));
+        __asm__ __volatile__("movl %%ebp,%0" : "=m"(newregs->ebp));
+        __asm__ __volatile__("movl %%eax,%0" : "=m"(newregs->eax));
+        __asm__ __volatile__("movl %%esp,%0" : "=m"(newregs->esp));
+        __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(newregs->ss));
+        __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(newregs->cs));
+        __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(newregs->ds));
+        __asm__ __volatile__("movw %%es, %%ax;" :"=a"(newregs->es));
+        __asm__ __volatile__("pushfl; popl %0" :"=m"(newregs->eflags));
+
+        newregs->eip = (unsigned long)current_text_addr();
+    }
 }
 
+/*              
+ * From Linux 2.6.16's include/asm-i386/mach-xen/asm/ptrace.h
+ *
+ * user_mode_vm(regs) determines whether a register set came from user mode.
+ * This is true if V8086 mode was enabled OR if the register set was from
+ * protected mode with RPL-3 CS value.  This tricky test checks that with
+ * one comparison.  Many places in the kernel can bypass this full check
+ * if they have already ruled out V8086 mode, so user_mode(regs) can be used.
+ */
 static inline int user_mode(struct cpu_user_regs *regs)
 {
-    printk("STUB: include/asm-x86/x86_32/kexec.h: user_mode: "
-       "not implemented\n");
-    return -1;
+    return (regs->cs & 2) != 0;
 }
 
 
-#endif /* __X86_32_KEXEC_H__ */
+#endif /* __X86_KEXEC_X86_32_H__ */
 
 /*
  * Local variables:
--- /dev/null
+++ x/patches/linux-2.6.16.13/1-linux-2.6.16-kexec_page_table_a_i386.patch
@@ -0,0 +1,457 @@
+kexec: Avoid overwriting the current pgd (V2, i386)
+
+This patch upgrades the i386-specific kexec code to avoid overwriting the
+current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used
+to start a secondary kernel that dumps the memory of the previous kernel.
+
+The code introduces a new set of page tables called "page_table_a". These
+tables are used to provide an executable identity mapping without overwriting
+the current pgd. This updated version of the patch fixes a PAE bug and moves
+the segment handling code into relocate_kernel.S.
+
+Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
+---
+
+ The patch has been tested with regular kexec and CONFIG_CRASH_DUMP.
+ Both PAE and non-PAE configurations work well.
+ Applies on top of 2.6.16 and 2.6.17-rc4.
+
+ arch/i386/kernel/machine_kexec.c   |  230 ++++++++++++++----------------------
+ arch/i386/kernel/relocate_kernel.S |   92 ++++++++++++++
+ include/asm-i386/kexec.h           |   12 +
+ 3 files changed, 192 insertions(+), 142 deletions(-)
+
+--- x/arch/i386/kernel/machine_kexec.c
++++ x/arch/i386/kernel/machine_kexec.c
+@@ -2,6 +2,10 @@
+  * machine_kexec.c - handle transition of Linux booting another kernel
+  * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
+  *
++ * 2006-05-19 Magnus Damm <damm@opensource.se>:
++ * - rewrote identity map code to avoid overwriting current pgd
++ * - moved segment handling code into relocate_kernel.S
++ *
+  * This source code is licensed under the GNU General Public License,
+  * Version 2.  See the file COPYING for more details.
+  */
+@@ -19,123 +23,73 @@
+ #include <asm/desc.h>
+ #include <asm/system.h>
+ 
+-#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+-
+-#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+-#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+-#define L2_ATTR (_PAGE_PRESENT)
+-
+-#define LEVEL0_SIZE (1UL << 12UL)
++typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)(
++					unsigned long indirection_page,
++					unsigned long reboot_code_buffer,
++					unsigned long start_address,
++					unsigned long page_table_a,
++					unsigned long has_pae) ATTRIB_NORET;
+ 
+-#ifndef CONFIG_X86_PAE
+-#define LEVEL1_SIZE (1UL << 22UL)
+-static u32 pgtable_level1[1024] PAGE_ALIGNED;
++const extern unsigned char relocate_new_kernel[];
++extern void relocate_new_kernel_end(void);
++const extern unsigned int relocate_new_kernel_size;
+ 
+-static void identity_map_page(unsigned long address)
++static int allocate_page_table_a(struct kimage *image)
+ {
+-	unsigned long level1_index, level2_index;
+-	u32 *pgtable_level2;
+-
+-	/* Find the current page table */
+-	pgtable_level2 = __va(read_cr3());
++	struct kimage_arch *arch = &image->arch_data;
++	struct page *page;
++	int k = sizeof(arch->page_table_a) / sizeof(arch->page_table_a[0]);
++
++	for (; k > 0; k--) {
++		page = kimage_alloc_control_pages(image, 0);
++		if (!page)
++			return -ENOMEM;
++
++		clear_page(page_address(page));
++		arch->page_table_a[k - 1] = page;
++	}
+ 
+-	/* Find the indexes of the physical address to identity map */
+-	level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
+-	level2_index = address / LEVEL1_SIZE;
+-
+-	/* Identity map the page table entry */
+-	pgtable_level1[level1_index] = address | L0_ATTR;
+-	pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
+-
+-	/* Flush the tlb so the new mapping takes effect.
+-	 * Global tlb entries are not flushed but that is not an issue.
+-	 */
+-	load_cr3(pgtable_level2);
++	return 0;
+ }
+ 
+-#else
+-#define LEVEL1_SIZE (1UL << 21UL)
+-#define LEVEL2_SIZE (1UL << 30UL)
+-static u64 pgtable_level1[512] PAGE_ALIGNED;
+-static u64 pgtable_level2[512] PAGE_ALIGNED;
+-
+-static void identity_map_page(unsigned long address)
+-{
+-	unsigned long level1_index, level2_index, level3_index;
+-	u64 *pgtable_level3;
++/* workaround for include/asm-i386/pgtable-3level.h */
+ 
+-	/* Find the current page table */
+-	pgtable_level3 = __va(read_cr3());
+-
+-	/* Find the indexes of the physical address to identity map */
+-	level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
+-	level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE;
+-	level3_index = address / LEVEL2_SIZE;
+-
+-	/* Identity map the page table entry */
+-	pgtable_level1[level1_index] = address | L0_ATTR;
+-	pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
+-	set_64bit(&pgtable_level3[level3_index],
+-					       __pa(pgtable_level2) | L2_ATTR);
+-
+-	/* Flush the tlb so the new mapping takes effect.
+-	 * Global tlb entries are not flushed but that is not an issue.
+-	 */
+-	load_cr3(pgtable_level3);
+-}
++#ifdef CONFIG_X86_PAE
++#undef pgd_present
++#define pgd_present(pgd) (pgd_val(pgd) & _PAGE_PRESENT)
++#define _PGD_ATTR _PAGE_PRESENT
++#else
++#define _PGD_ATTR _KERNPG_TABLE
+ #endif
+ 
+-static void set_idt(void *newidt, __u16 limit)
+-{
+-	struct Xgt_desc_struct curidt;
+-
+-	/* ia32 supports unaliged loads & stores */
+-	curidt.size    = limit;
+-	curidt.address = (unsigned long)newidt;
+-
+-	load_idt(&curidt);
+-};
++#define pa_page(page) __pa(page_address(page))
+ 
+-
+-static void set_gdt(void *newgdt, __u16 limit)
++static int create_mapping(struct page *root, struct page **pages, 
++			  unsigned long va, unsigned long pa)
+ {
+-	struct Xgt_desc_struct curgdt;
+-
+-	/* ia32 supports unaligned loads & stores */
+-	curgdt.size    = limit;
+-	curgdt.address = (unsigned long)newgdt;
++	pgd_t *pgd;
++	pud_t *pud;
++	pmd_t *pmd;
++	pte_t *pte;
++	int k = 0;
+ 
+-	load_gdt(&curgdt);
+-};
++	pgd = (pgd_t *)page_address(root) + pgd_index(va);
++	if (!pgd_present(*pgd))
++		set_pgd(pgd, __pgd(pa_page(pages[k++]) | _PGD_ATTR));
+ 
+-static void load_segments(void)
+-{
+-#define __STR(X) #X
+-#define STR(X) __STR(X)
++	pud = pud_offset(pgd, va);
++	if (!pud_present(*pud))
++		set_pud(pud, __pud(pa_page(pages[k++]) | _KERNPG_TABLE));
+ 
+-	__asm__ __volatile__ (
+-		"\tljmp $"STR(__KERNEL_CS)",$1f\n"
+-		"\t1:\n"
+-		"\tmovl $"STR(__KERNEL_DS)",%%eax\n"
+-		"\tmovl %%eax,%%ds\n"
+-		"\tmovl %%eax,%%es\n"
+-		"\tmovl %%eax,%%fs\n"
+-		"\tmovl %%eax,%%gs\n"
+-		"\tmovl %%eax,%%ss\n"
+-		::: "eax", "memory");
+-#undef STR
+-#undef __STR
+-}
++	pmd = pmd_offset(pud, va);
++	if (!pmd_present(*pmd))
++		set_pmd(pmd, __pmd(pa_page(pages[k++]) | _KERNPG_TABLE));
+ 
+-typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)(
+-					unsigned long indirection_page,
+-					unsigned long reboot_code_buffer,
+-					unsigned long start_address,
+-					unsigned int has_pae) ATTRIB_NORET;
++	pte = (pte_t *)page_address(pmd_page(*pmd)) + pte_index(va);
++	set_pte(pte, __pte(pa | _PAGE_KERNEL_EXEC));
+ 
+-const extern unsigned char relocate_new_kernel[];
+-extern void relocate_new_kernel_end(void);
+-const extern unsigned int relocate_new_kernel_size;
++	return k;
++}
+ 
+ /*
+  * A architecture hook called to validate the
+@@ -147,11 +101,38 @@ const extern unsigned int relocate_new_k
+  * Do what every setup is needed on image and the
+  * reboot code buffer to allow us to avoid allocations
+  * later.
+- *
+- * Currently nothing.
+  */
+ int machine_kexec_prepare(struct kimage *image)
+ {
++	void *control_page;
++	unsigned long pa;
++	int k;
++
++	memset(&image->arch_data, 0, sizeof(image->arch_data));
++
++	k = allocate_page_table_a(image);
++	if (k)
++		return k;
++
++	/* fill in control_page with assembly code */
++
++	control_page = page_address(image->control_code_page);
++	memcpy(control_page, relocate_new_kernel, relocate_new_kernel_size);
++
++	/* map the control_page at the virtual address of relocate_kernel.S */
++
++	pa = __pa(control_page);
++
++	k = create_mapping(image->arch_data.page_table_a[0], 
++			   &image->arch_data.page_table_a[1],
++			   (unsigned long)relocate_new_kernel, pa);
++
++	/* identity map the control_page */
++
++	create_mapping(image->arch_data.page_table_a[0], 
++		       &image->arch_data.page_table_a[k + 1],
++		       pa, pa);
++
+ 	return 0;
+ }
+ 
+@@ -170,45 +151,16 @@ void machine_kexec_cleanup(struct kimage
+ NORET_TYPE void machine_kexec(struct kimage *image)
+ {
+ 	unsigned long page_list;
+-	unsigned long reboot_code_buffer;
+-
++	unsigned long control_code;
++	unsigned long page_table_a;
+ 	relocate_new_kernel_t rnk;
+ 
+-	/* Interrupts aren't acceptable while we reboot */
+-	local_irq_disable();
+-
+-	/* Compute some offsets */
+-	reboot_code_buffer = page_to_pfn(image->control_code_page)
+-								<< PAGE_SHIFT;
+ 	page_list = image->head;
+-
+-	/* Set up an identity mapping for the reboot_code_buffer */
+-	identity_map_page(reboot_code_buffer);
+-
+-	/* copy it out */
+-	memcpy((void *)reboot_code_buffer, relocate_new_kernel,
+-						relocate_new_kernel_size);
+-
+-	/* The segment registers are funny things, they are
+-	 * automatically loaded from a table, in memory wherever you
+-	 * set them to a specific selector, but this table is never
+-	 * accessed again you set the segment to a different selector.
+-	 *
+-	 * The more common model is are caches where the behide
+-	 * the scenes work is done, but is also dropped at arbitrary
+-	 * times.
+-	 *
+-	 * I take advantage of this here by force loading the
+-	 * segments, before I zap the gdt with an invalid value.
+-	 */
+-	load_segments();
+-	/* The gdt & idt are now invalid.
+-	 * If you want to load them you must set up your own idt & gdt.
+-	 */
+-	set_gdt(phys_to_virt(0),0);
+-	set_idt(phys_to_virt(0),0);
++	control_code = __pa(page_address(image->control_code_page));
++	page_table_a = __pa(page_address(image->arch_data.page_table_a[0]));
+ 
+ 	/* now call it */
+-	rnk = (relocate_new_kernel_t) reboot_code_buffer;
+-	(*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae);
++	rnk = (relocate_new_kernel_t) relocate_new_kernel;
++	(*rnk)(page_list, control_code, image->start, 
++	       page_table_a, (unsigned long)cpu_has_pae);
+ }
+--- x/arch/i386/kernel/relocate_kernel.S
++++ x/arch/i386/kernel/relocate_kernel.S
+@@ -2,12 +2,20 @@
+  * relocate_kernel.S - put the kernel image in place to boot
+  * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
+  *
++ * 2006-05-19 Magnus Damm <damm@opensource.se>:
++ * - moved segment handling code from machine_kexec.c
++ * - gdt tables stolen from arch/i386/boot/setup.S
++ *
+  * This source code is licensed under the GNU General Public License,
+  * Version 2.  See the file COPYING for more details.
+  */
+ 
+ #include <linux/linkage.h>
++#include <asm/page.h>
+ 
++.text
++.align (1 << PAGE_SHIFT)
++	
+ 	/*
+ 	 * Must be relocatable PIC code callable as a C function, that once
+ 	 * it starts can not use the previous processes stack.
+@@ -18,18 +26,68 @@ relocate_new_kernel:
+ 	movl  4(%esp), %ebx /* page_list */
+ 	movl  8(%esp), %ebp /* reboot_code_buffer */
+ 	movl  12(%esp), %edx /* start address */
+-	movl  16(%esp), %ecx /* cpu_has_pae */
++	movl  16(%esp), %edi /* page_table_a */
++	movl  20(%esp), %ecx /* cpu_has_pae */
+ 
+ 	/* zero out flags, and disable interrupts */
+ 	pushl $0
+ 	popfl
+ 
++	/* switch to page_table_a */
++	movl	%edi, %eax
++	movl	%eax, %cr3
++
++	/* setup idt */
++
++	movl	%ebp, %eax
++	addl	$(idt_48 - relocate_new_kernel), %eax
++	lidtl	(%eax)
++
++	/* setup gdt */
++
++	movl	%ebp, %eax
++	addl	$(gdt - relocate_new_kernel), %eax
++	movl	%ebp, %esi
++	addl	$((gdt_48 - relocate_new_kernel) + 2), %esi
++	movl	%eax, (%esi)
++	
++	movl	%ebp, %eax
++	addl	$(gdt_48 - relocate_new_kernel), %eax
++	lgdtl	(%eax)
++
++	/* setup data segment registers */
++	
++	mov	$(gdt_ds - gdt), %eax
++	mov	%eax, %ds
++	mov	%eax, %es
++	mov	%eax, %fs
++	mov	%eax, %gs
++	mov	%eax, %ss
++
+ 	/* set a new stack at the bottom of our page... */
+ 	lea   4096(%ebp), %esp
+ 
++	/* load new code segment */
++
++	movl	%ebp, %esi
++	xorl	%eax, %eax
++	pushl	%eax
++	pushl	%esi
++	pushl	%eax
++	
++	movl	$(gdt_cs - gdt), %eax
++	pushl	%eax
++	
++	movl	%ebp, %eax
++	addl	$(identity_mapped - relocate_new_kernel),%eax
++	pushl	%eax
++	iretl
++
++identity_mapped:	
++
+ 	/* store the parameters back on the stack */
+ 	pushl   %edx /* store the start address */
+-
++	
+ 	/* Set cr0 to a known state:
+ 	 * 31 0 == Paging disabled
+ 	 * 18 0 == Alignment check disabled
+@@ -113,6 +171,36 @@ relocate_new_kernel:
+ 	xorl    %edi, %edi
+ 	xorl    %ebp, %ebp
+ 	ret
++
++	.align	16
++gdt:
++	.fill	1,8,0
++
++gdt_cs:	
++	.word	0xFFFF				# 4Gb - (0x100000*0x1000 = 4Gb)
++	.word	0				# base address = 0
++	.word	0x9A00				# code read/exec
++	.word	0x00CF				# granularity = 4096, 386
++						#  (+5th nibble of limit)
++gdt_ds:
++	.word	0xFFFF				# 4Gb - (0x100000*0x1000 = 4Gb)
++	.word	0				# base address = 0
++	.word	0x9200				# data read/write
++	.word	0x00CF				# granularity = 4096, 386
++						#  (+5th nibble of limit)
++gdt_end:
++	.align	4
++	
++	.word	0				# alignment byte
++idt_48:
++	.word	0				# idt limit = 0
++	.word	0, 0				# idt base = 0L
++
++	.word	0				# alignment byte
++gdt_48:
++	.word	gdt_end - gdt - 1		# gdt limit
++	.word	0, 0				# gdt base (filled in later)
++	
+ relocate_new_kernel_end:
+ 
+ 	.globl relocate_new_kernel_size
+--- x/include/asm-i386/kexec.h
++++ x/include/asm-i386/kexec.h
+@@ -29,7 +29,17 @@
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
+-struct kimage_arch {};
++struct kimage_arch {
++       /* page_table_a[] holds enough pages to create a new page table
+        * that maps the control page twice.
++        */
++
++#if defined(CONFIG_X86_PAE)
++       struct page *page_table_a[5]; /* (2 * pte) + (2 * pmd) + pgd */
++#else
++       struct page *page_table_a[3]; /* (2 * pte) + pgd */
++#endif
++};
+ 
+ /* CPU does not save ss and esp on stack if execution is already
+  * running in kernel mode at the time of NMI occurrence. This code

[-- Attachment #5: 51.2.1.2-kexec-x86_64-upstream.patch --]
[-- Type: text/plain, Size: 23414 bytes --]

kexec: x86_64

This is the x86_64 component of kexec for xen.
The x86 component is a prerequisite for this patch.

Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>
Signed-Off-By: Horms <horms@verge.net.au>

 buildconfigs/linux-defconfig_xen_x86_64                                    |    2 
 linux-2.6-xen-sparse/arch/x86_64/Kconfig                                   |    2 
 linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile                           |    2 
 linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c                        |   26 
 linux-2.6-xen-sparse/include/asm-x86_64/kexec-xen.h                        |   30 
 linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h           |    7 
 linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h              |    2 
 patches/linux-2.6.16.13/2-linux-2.6.16-kexec_page_table_a_x86_64.patch     |  421 ++++++++++
 patches/linux-2.6.16.13/3-linux-2.6.16-kexec_page_table_a_x86_64-xen.patch |  116 ++
 xen/arch/x86/x86_64/entry.S                                                |    2 
 xen/arch/x86/x86_64/machine_kexec.c                                        |   16 
 11 files changed, 621 insertions(+), 5 deletions(-)

--- x/buildconfigs/linux-defconfig_xen_x86_64
+++ x/buildconfigs/linux-defconfig_xen_x86_64
@@ -139,6 +139,8 @@ CONFIG_SWIOTLB=y
 CONFIG_PHYSICAL_START=0x100000
 CONFIG_SECCOMP=y
 CONFIG_HZ_100=y
+CONFIG_KEXEC=y
+CONFIG_CRASH_DUMP=y
 # CONFIG_HZ_250 is not set
 # CONFIG_HZ_1000 is not set
 CONFIG_HZ=100
--- x/linux-2.6-xen-sparse/arch/x86_64/Kconfig
+++ x/linux-2.6-xen-sparse/arch/x86_64/Kconfig
@@ -433,7 +433,7 @@ config X86_MCE_AMD
 
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && !X86_64_XEN
+	depends on EXPERIMENTAL && !XEN_UNPRIVILEGED_GUEST
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
--- x/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile
+++ x/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile
@@ -59,7 +59,7 @@ pci-dma-y			+= ../../i386/kernel/pci-dma
 microcode-$(subst m,y,$(CONFIG_MICROCODE))  := ../../i386/kernel/microcode-xen.o
 quirks-y			:= ../../i386/kernel/quirks-xen.o
 
-n-obj-xen := i8259.o reboot.o i8237.o smpboot.o trampoline.o
+n-obj-xen := i8259.o reboot.o i8237.o smpboot.o trampoline.o crash.o
 
 include $(srctree)/scripts/Makefile.xen
 
--- x/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c
+++ x/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c
@@ -79,6 +79,10 @@
 #include <asm/mach-xen/setup_arch_post.h>
 #include <xen/interface/memory.h>
 
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
 extern unsigned long start_pfn;
 extern struct edid_info edid_info;
 
@@ -446,6 +450,7 @@ static __init void parse_cmdline_early (
 		 * after a kernel panic.
 		 */
 		else if (!memcmp(from, "crashkernel=", 12)) {
+#ifndef CONFIG_XEN
 			unsigned long size, base;
 			size = memparse(from+12, &from);
 			if (*from == '@') {
@@ -456,6 +461,10 @@ static __init void parse_cmdline_early (
 				crashk_res.start = base;
 				crashk_res.end   = base + size - 1;
 			}
+#else
+			printk("Ignoring crashkernel command line, "
+			       "parameter will be supplied by xen\n");
+#endif
 		}
 #endif
 
@@ -810,10 +819,23 @@ void __init setup_arch(char **cmdline_p)
 #endif
 #endif	/* !CONFIG_XEN */
 #ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
 	if (crashk_res.start != crashk_res.end) {
 		reserve_bootmem(crashk_res.start,
 			crashk_res.end - crashk_res.start + 1);
 	}
+#else
+	{
+		xen_kexec_reserve_t reservation;
+		BUG_ON(HYPERVISOR_kexec(KEXEC_CMD_kexec_reserve, 0,
+					&reservation));
+		if (reservation.size) {
+			crashk_res.start = reservation.start;
+			crashk_res.end = reservation.start + 
+				reservation.size - 1;
+		}
+	}
+#endif
 #endif
 
 	paging_init();
@@ -972,6 +994,10 @@ void __init setup_arch(char **cmdline_p)
 	iommu_hole_init();
 #endif
 
+#ifdef CONFIG_KEXEC
+	request_resource(&ioport_resource, &crashk_res);
+#endif
+
 #ifdef CONFIG_XEN
 	{
 		struct physdev_set_iopl set_iopl;
--- /dev/null
+++ x/linux-2.6-xen-sparse/include/asm-x86_64/kexec-xen.h
@@ -0,0 +1,30 @@
+/*
+ * include/asm-x86_64/kexec-xen.h
+ *
+ * Created By: Horms <horms@verge.net.au>
+ */
+
+#ifndef _X86_64_KEXEC_XEN_H
+#define _X86_64_KEXEC_XEN_H
+
+#include <asm/ptrace.h>
+#include <asm/types.h>
+#include <xen/interface/arch-x86_64.h>
+
+static inline void crash_translate_regs(struct pt_regs *linux_regs,
+					struct cpu_user_regs *xen_regs)
+{
+#warning Implement me!
+}
+
+#endif /* _X86_64_KEXEC_XEN_H */
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- x/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h
+++ x/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h
@@ -360,4 +360,11 @@ HYPERVISOR_xenoprof_op(
 	return _hypercall2(int, xenoprof_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_kexec(
+	unsigned long op, unsigned int arg1, void * extra_args)
+{
+	return _hypercall3(int, kexec_op, op, arg1, extra_args);
+}
+
 #endif /* __HYPERCALL_H__ */
--- x/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h
+++ x/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h
@@ -90,6 +90,8 @@ extern unsigned long profile_pc(struct p
 #define profile_pc(regs) instruction_pointer(regs)
 #endif
 
+#include <linux/compiler.h>
+
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
 
 struct task_struct;
--- x/xen/arch/x86/x86_64/entry.S
+++ x/xen/arch/x86/x86_64/entry.S
@@ -556,6 +556,7 @@ ENTRY(hypercall_table)
         .quad do_xenoprof_op
         .quad do_event_channel_op
         .quad do_physdev_op
+        .quad do_kexec
         .rept NR_hypercalls-((.-hypercall_table)/8)
         .quad do_ni_hypercall
         .endr
@@ -595,6 +596,7 @@ ENTRY(hypercall_args_table)
         .byte 2 /* do_xenoprof_op       */
         .byte 2 /* do_event_channel_op  */
         .byte 2 /* do_physdev_op        */
+        .byte 3 /* do_kexec             */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
--- x/xen/arch/x86/x86_64/machine_kexec.c
+++ x/xen/arch/x86/x86_64/machine_kexec.c
@@ -7,14 +7,24 @@
  * Should be losely based on arch/x86_64/kernel/machine_kexec.c
  */
 
-#include <xen/lib.h>       /* for printk() used in stubs */
 #include <xen/types.h>
 #include <public/kexec.h>
+#include <asm/fixmap.h>
+
+typedef void (*relocate_new_kernel_t)(unsigned long indirection_page,
+                                      unsigned long control_code_buffer,
+                                      unsigned long start_address,
+                                      unsigned long page_table_a,
+                                      unsigned long page_table_b);
 
 void machine_kexec(xen_kexec_image_t *image)
 {
-    printk("STUB: arch/x86/x86_64/machine_kexec.c: machine_kexec: "
-        "not implemented\n");
+    relocate_new_kernel_t rnk;
+
+    rnk = (relocate_new_kernel_t) fix_to_virt(FIX_KEXEC_PAGE);
+    (*rnk)(image->indirection_page, image->reboot_code_buffer, 
+           image->start_address, image->page_table_a[0],
+           image->page_table_b);
 }
 
 /*
--- /dev/null
+++ x/patches/linux-2.6.16.13/2-linux-2.6.16-kexec_page_table_a_x86_64.patch
@@ -0,0 +1,421 @@
+kexec: Avoid overwriting the current pgd (V2, x86_64)
+
+This patch upgrades the x86_64-specific kexec code to avoid overwriting the
+current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used
+to start a secondary kernel that dumps the memory of the previous kernel.
+
+The code introduces a new set of page tables called "page_table_a". These
+tables are used to provide an executable identity mapping without overwriting
+the current pgd. The already existing page table is renamed to "page_table_b".
+
+KEXEC_CONTROL_CODE_SIZE is changed to a single page. This updated version of
+the patch also moves the segment handling code into relocate_kernel.S.
+
+Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
+---
+
+ The patch has been tested with regular kexec and CONFIG_CRASH_DUMP.
+ Applies on top of 2.6.16 and 2.6.17-rc4.
+
+ arch/x86_64/kernel/machine_kexec.c   |  193 +++++++++++++++++-----------------
+ arch/x86_64/kernel/relocate_kernel.S |   84 +++++++++++++-
+ include/asm-x86_64/kexec.h           |   15 ++
+ 3 files changed, 189 insertions(+), 103 deletions(-)
+
+--- x/arch/x86_64/kernel/machine_kexec.c
++++ x/arch/x86_64/kernel/machine_kexec.c
+@@ -2,6 +2,10 @@
+  * machine_kexec.c - handle transition of Linux booting another kernel
+  * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
+  *
++ * 2006-05-19 Magnus Damm <damm@opensource.se>:
++ * - rewrote identity map code to avoid overwriting current pgd
++ * - moved segment handling code into relocate_kernel.S
++ *
+  * This source code is licensed under the GNU General Public License,
+  * Version 2.  See the file COPYING for more details.
+  */
+@@ -96,81 +100,110 @@ out:
+ }
+ 
+ 
+-static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
++static int create_page_table_b(struct kimage *image)
+ {
+-	pgd_t *level4p;
+-	level4p = (pgd_t *)__va(start_pgtable);
+- 	return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
+-}
++	struct kimage_arch *arch = &image->arch_data;
+ 
+-static void set_idt(void *newidt, u16 limit)
+-{
+-	struct desc_ptr curidt;
++	arch->page_table_b = kimage_alloc_control_pages(image, 0);
+ 
+-	/* x86-64 supports unaliged loads & stores */
+-	curidt.size    = limit;
+-	curidt.address = (unsigned long)newidt;
++	if (!arch->page_table_b)
++		return -ENOMEM;
+ 
+-	__asm__ __volatile__ (
+-		"lidtq %0\n"
+-		: : "m" (curidt)
+-		);
+-};
++ 	return init_level4_page(image, page_address(arch->page_table_b),
++				0, end_pfn << PAGE_SHIFT);
++}
+ 
++typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
++					unsigned long control_code_buffer,
++					unsigned long start_address,
++					unsigned long page_table_a,
++					unsigned long page_table_b) ATTRIB_NORET;
++
++const extern unsigned char relocate_new_kernel[];
++const extern unsigned long relocate_new_kernel_size;
+ 
+-static void set_gdt(void *newgdt, u16 limit)
++static int allocate_page_table_a(struct kimage *image)
+ {
+-	struct desc_ptr curgdt;
++	struct kimage_arch *arch = &image->arch_data;
++	struct page *page;
++	int k = sizeof(arch->page_table_a) / sizeof(arch->page_table_a[0]);
+ 
+-	/* x86-64 supports unaligned loads & stores */
+-	curgdt.size    = limit;
+-	curgdt.address = (unsigned long)newgdt;
++	for (; k > 0; k--) {
++		page = kimage_alloc_control_pages(image, 0);
++		if (!page)
++			return -ENOMEM;
+ 
+-	__asm__ __volatile__ (
+-		"lgdtq %0\n"
+-		: : "m" (curgdt)
+-		);
+-};
++		clear_page(page_address(page));
++		arch->page_table_a[k - 1] = page;
++	}
+ 
+-static void load_segments(void)
+-{
+-	__asm__ __volatile__ (
+-		"\tmovl %0,%%ds\n"
+-		"\tmovl %0,%%es\n"
+-		"\tmovl %0,%%ss\n"
+-		"\tmovl %0,%%fs\n"
+-		"\tmovl %0,%%gs\n"
+-		: : "a" (__KERNEL_DS) : "memory"
+-		);
++	return 0;
+ }
+ 
+-typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
+-					unsigned long control_code_buffer,
+-					unsigned long start_address,
+-					unsigned long pgtable) ATTRIB_NORET;
++#define _PAGE_KERNEL_EXEC __PAGE_KERNEL_EXEC
++#define pa_page(page) __pa_symbol(page_address(page)) /* __pa() miscompiles */
+ 
+-const extern unsigned char relocate_new_kernel[];
+-const extern unsigned long relocate_new_kernel_size;
++static int create_mapping(struct page *root, struct page **pages, 
++			  unsigned long va, unsigned long pa)
++{
++	pgd_t *pgd;
++	pud_t *pud;
++	pmd_t *pmd;
++	pte_t *pte;
++	int k = 0;
++
++	pgd = (pgd_t *)page_address(root) + pgd_index(va);
++	if (!pgd_present(*pgd))
++		set_pgd(pgd, __pgd(pa_page(pages[k++]) | _KERNPG_TABLE));
++
++	pud = pud_offset(pgd, va);
++	if (!pud_present(*pud))
++		set_pud(pud, __pud(pa_page(pages[k++]) | _KERNPG_TABLE));
++
++	pmd = pmd_offset(pud, va);
++	if (!pmd_present(*pmd))
++		set_pmd(pmd, __pmd(pa_page(pages[k++]) | _KERNPG_TABLE));
++
++	pte = (pte_t *)page_address(pmd_page(*pmd)) + pte_index(va);
++	set_pte(pte, __pte(pa | _PAGE_KERNEL_EXEC));
++
++	return k;
++}
+ 
+ int machine_kexec_prepare(struct kimage *image)
+ {
+-	unsigned long start_pgtable, control_code_buffer;
+-	int result;
++	void *control_page;
++	unsigned long pa;
++	int k;
+ 
+-	/* Calculate the offsets */
+-	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+-	control_code_buffer = start_pgtable + PAGE_SIZE;
+-
+-	/* Setup the identity mapped 64bit page table */
+-	result = init_pgtable(image, start_pgtable);
+-	if (result)
+-		return result;
+-
+-	/* Place the code in the reboot code buffer */
+-	memcpy(__va(control_code_buffer), relocate_new_kernel,
+-						relocate_new_kernel_size);
++	memset(&image->arch_data, 0, sizeof(image->arch_data));
+ 
+-	return 0;
++	k = allocate_page_table_a(image);
++	if (k)
++		return k;
++
++	/* fill in control_page with assembly code */
++
++	control_page = page_address(image->control_code_page);
++	memcpy(control_page, relocate_new_kernel, relocate_new_kernel_size);
++
++	/* map the control_page at the virtual address of relocate_kernel.S */
++
++	pa = __pa(control_page);
++
++	k = create_mapping(image->arch_data.page_table_a[0], 
++			   &image->arch_data.page_table_a[1],
++			   (unsigned long)relocate_new_kernel, pa);
++
++	/* identity map the control_page */
++
++	create_mapping(image->arch_data.page_table_a[0], 
++		       &image->arch_data.page_table_a[k + 1],
++		       pa, pa);
++
++	/* create identity mapped page table aka page_table_b */
++
++	return create_page_table_b(image);
+ }
+ 
+ void machine_kexec_cleanup(struct kimage *image)
+@@ -185,47 +218,17 @@ void machine_kexec_cleanup(struct kimage
+ NORET_TYPE void machine_kexec(struct kimage *image)
+ {
+ 	unsigned long page_list;
+-	unsigned long control_code_buffer;
+-	unsigned long start_pgtable;
++	unsigned long control_code;
++	unsigned long page_table_a;
++	unsigned long page_table_b;
+ 	relocate_new_kernel_t rnk;
+ 
+-	/* Interrupts aren't acceptable while we reboot */
+-	local_irq_disable();
+-
+-	/* Calculate the offsets */
+ 	page_list = image->head;
+-	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+-	control_code_buffer = start_pgtable + PAGE_SIZE;
++	control_code = __pa(page_address(image->control_code_page));
++	page_table_a = __pa(page_address(image->arch_data.page_table_a[0]));
++	page_table_b = __pa(page_address(image->arch_data.page_table_b));
+ 
+-	/* Set the low half of the page table to my identity mapped
+-	 * page table for kexec.  Leave the high half pointing at the
+-	 * kernel pages.   Don't bother to flush the global pages
+-	 * as that will happen when I fully switch to my identity mapped
+-	 * page table anyway.
+-	 */
+-	memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2);
+-	__flush_tlb();
+-
+-
+-	/* The segment registers are funny things, they are
+-	 * automatically loaded from a table, in memory wherever you
+-	 * set them to a specific selector, but this table is never
+-	 * accessed again unless you set the segment to a different selector.
+-	 *
+-	 * The more common model are caches where the behide
+-	 * the scenes work is done, but is also dropped at arbitrary
+-	 * times.
+-	 *
+-	 * I take advantage of this here by force loading the
+-	 * segments, before I zap the gdt with an invalid value.
+-	 */
+-	load_segments();
+-	/* The gdt & idt are now invalid.
+-	 * If you want to load them you must set up your own idt & gdt.
+-	 */
+-	set_gdt(phys_to_virt(0),0);
+-	set_idt(phys_to_virt(0),0);
+ 	/* now call it */
+-	rnk = (relocate_new_kernel_t) control_code_buffer;
+-	(*rnk)(page_list, control_code_buffer, image->start, start_pgtable);
++	rnk = (relocate_new_kernel_t) relocate_new_kernel;
++	(*rnk)(page_list, control_code, image->start, page_table_a, page_table_b);
+ }
+--- x/arch/x86_64/kernel/relocate_kernel.S
++++ x/arch/x86_64/kernel/relocate_kernel.S
+@@ -2,11 +2,18 @@
+  * relocate_kernel.S - put the kernel image in place to boot
+  * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
+  *
++ * 2006-05-19 Magnus Damm <damm@opensource.se>:
++ * - moved segment handling code from machine_kexec.c
++ *
+  * This source code is licensed under the GNU General Public License,
+  * Version 2.  See the file COPYING for more details.
+  */
+ 
+ #include <linux/linkage.h>
++#include <asm/page.h>
++
++.text
++.align (1 << PAGE_SHIFT)
+ 
+ 	/*
+ 	 * Must be relocatable PIC code callable as a C function, that once
+@@ -18,21 +25,69 @@ relocate_new_kernel:
+ 	/* %rdi page_list
+ 	 * %rsi reboot_code_buffer
+ 	 * %rdx start address
+-	 * %rcx page_table
+-	 * %r8  arg5
++	 * %rcx page_table_a
++	 * %r8  page_table_b
+ 	 * %r9  arg6
+ 	 */
+-
++	
+ 	/* zero out flags, and disable interrupts */
+ 	pushq $0
+ 	popfq
+ 
++	/* switch to page_table_a */
++	movq    %rcx, %cr3
++
++	/* setup idt */
++
++	movq	%rsi, %rax
++	addq	$(idt_48 - relocate_new_kernel), %rax
++	lidtq	(%rax)
++
++	/* setup gdt */
++
++	movq	%rsi, %rax
++	addq	$(gdt - relocate_new_kernel), %rax
++	movq	%rsi, %r9
++	addq	$((gdt_48 - relocate_new_kernel) + 2), %r9
++	movq	%rax, (%r9)
++	
++	movq	%rsi, %rax
++	addq	$(gdt_48 - relocate_new_kernel), %rax
++	lgdtq	(%rax)
++
++	/* setup data segment registers */
++
++	xorl	%eax,%eax
++	movl	%eax, %ds
++	movl	%eax, %es
++	movl	%eax, %fs
++	movl	%eax, %gs
++	movl	%eax, %ss
++
+ 	/* set a new stack at the bottom of our page... */
+ 	lea   4096(%rsi), %rsp
+ 
++	/* load new code segment */
++
++	movq	%rsp, %rcx
++	xorq	%rax, %rax
++	pushq	%rax                                              /* SS */
++	pushq	%rcx                                              /* ESP */
++	pushq	%rax                                              /* RFLAGS */
++
++	movq	$(gdt_code - gdt), %rax
++	pushq	%rax                                              /* CS */
++
++	movq	%rsi, %rax
++	addq	$(identity_mapped - relocate_new_kernel), %rax
++	pushq	%rax                                              /* RIP */
++
++	iretq
++	
++identity_mapped:
+ 	/* store the parameters back on the stack */
+ 	pushq	%rdx /* store the start address */
+-
++	
+ 	/* Set cr0 to a known state:
+ 	 * 31 1 == Paging enabled
+ 	 * 18 0 == Alignment check disabled
+@@ -69,7 +124,7 @@ relocate_new_kernel:
+ 	/* Switch to the identity mapped page tables,
+ 	 * and flush the TLB.
+ 	*/
+-	movq	%rcx, %cr3
++	movq	%r8, %cr3
+ 
+ 	/* Do the copies */
+ 	movq	%rdi, %rcx 	/* Put the page_list in %rcx */
+@@ -136,6 +191,25 @@ relocate_new_kernel:
+ 	xorq	%r15, %r15
+ 
+ 	ret
++	.align	16
++gdt:
++	.long   0x00000000  /* NULL descriptor */
++	.long   0x00000000
++gdt_code:
++	.long   0x00000000  /* code descriptor */
++	.long   0x00209800
++
++gdt_end:
++	.align	4
++	
++idt_48:
++	.word	0				# idt limit = 0
++	.quad	0, 0				# idt base = 0L
++
++gdt_48:
++	.word	gdt_end - gdt - 1		# gdt limit
++	.quad	0, 0				# gdt base (filled in later)
++
+ relocate_new_kernel_end:
+ 
+ 	.globl relocate_new_kernel_size
+--- x/include/asm-x86_64/kexec.h
++++ x/include/asm-x86_64/kexec.h
+@@ -21,15 +21,24 @@
+ /* Maximum address we can use for the control pages */
+ #define KEXEC_CONTROL_MEMORY_LIMIT     (0xFFFFFFFFFFUL)
+ 
+-/* Allocate one page for the pdp and the second for the code */
+-#define KEXEC_CONTROL_CODE_SIZE  (4096UL + 4096UL)
++#define KEXEC_CONTROL_CODE_SIZE  4096
+ 
+ /* The native architecture */
+ #define KEXEC_ARCH KEXEC_ARCH_X86_64
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
+-struct kimage_arch {};
++struct kimage_arch {
++	/* page_table_a[] holds enough pages to create a new page table
++	 * that maps the control page twice..
++	 *
++	 * page_table_b points to the root page of a page table which is used
++	 * to provide identity mapping of all ram.
++	 */
++
++	struct page *page_table_a[7]; /* 2 * (pte + pud + pmd) + pgd */
++	struct page *page_table_b;
++};
+ 
+ /*
+  * Saving the registers of the cpu on which panic occured in
--- /dev/null
+++ x/patches/linux-2.6.16.13/3-linux-2.6.16-kexec_page_table_a_x86_64-xen.patch
@@ -0,0 +1,116 @@
+ arch/x86_64/kernel/machine_kexec.c |   61 ++++++++++++++++++++++++++++++++----
+ 1 file changed, 55 insertions(+), 6 deletions(-)
+
+--- x/arch/x86_64/kernel/machine_kexec.c
++++ x/arch/x86_64/kernel/machine_kexec.c
+@@ -19,6 +19,48 @@
+ #include <asm/mmu_context.h>
+ #include <asm/io.h>
+ 
++#ifdef CONFIG_XEN
++#include <xen/interface/memory.h>
++#define x__pmd(x) ((pmd_t) { (x) } )
++#define x__pud(x) ((pud_t) { (x) } )
++#define x__pgd(x) ((pgd_t) { (x) } )
++
++#define x_pmd_val(x)	((x).pmd)
++#define x_pud_val(x)	((x).pud)
++#define x_pgd_val(x)	((x).pgd)
++
++static inline void x_set_pmd(pmd_t *dst, pmd_t val)
++{
++        x_pmd_val(*dst) = x_pmd_val(val); 
++} 
++
++static inline void x_set_pud(pud_t *dst, pud_t val)
++{
++	x_pud_val(*dst) = phys_to_machine(x_pud_val(val));
++}
++
++static inline void x_pud_clear (pud_t *pud)
++{
++	x_pud_val(*pud) = 0;
++}
++
++static inline void x_set_pgd(pgd_t *dst, pgd_t val)
++{
++	x_pgd_val(*dst) = phys_to_machine(x_pgd_val(val)); 
++} 
++
++static inline void x_pgd_clear (pgd_t * pgd)
++{
++	x_pgd_val(*pgd) = 0; 
++}
++
++#define MY_LARGE_EXEC (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_PSE)
++#define MY_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
++#else
++#define MY_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
++#define MY_TABLE _KERNPG_TABLE
++#endif
++
+ static void init_level2_page(pmd_t *level2p, unsigned long addr)
+ {
+ 	unsigned long end_addr;
+@@ -26,7 +68,7 @@
+ 	addr &= PAGE_MASK;
+ 	end_addr = addr + PUD_SIZE;
+ 	while (addr < end_addr) {
+-		set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
++		x_set_pmd(level2p++, x__pmd(addr | MY_LARGE_EXEC));
+ 		addr += PMD_SIZE;
+ 	}
+ }
+@@ -51,12 +93,12 @@
+ 		}
+ 		level2p = (pmd_t *)page_address(page);
+ 		init_level2_page(level2p, addr);
+-		set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
++		x_set_pud(level3p++, x__pud(__pa(level2p) | MY_TABLE));
+ 		addr += PUD_SIZE;
+ 	}
+ 	/* clear the unused entries */
+ 	while (addr < end_addr) {
+-		pud_clear(level3p++);
++		x_pud_clear(level3p++);
+ 		addr += PUD_SIZE;
+ 	}
+ out:
+@@ -87,12 +129,12 @@
+ 		if (result) {
+ 			goto out;
+ 		}
+-		set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
++		x_set_pgd(level4p++, x__pgd(__pa(level3p) | MY_TABLE));
+ 		addr += PGDIR_SIZE;
+ 	}
+ 	/* clear the unused entries */
+ 	while (addr < end_addr) {
+-		pgd_clear(level4p++);
++		x_pgd_clear(level4p++);
+ 		addr += PGDIR_SIZE;
+ 	}
+ out:
+@@ -103,14 +145,21 @@
+ static int create_page_table_b(struct kimage *image)
+ {
+ 	struct kimage_arch *arch = &image->arch_data;
++	unsigned long last_page;
+ 
+ 	arch->page_table_b = kimage_alloc_control_pages(image, 0);
+ 
+ 	if (!arch->page_table_b)
+ 		return -ENOMEM;
+ 
++#ifdef CONFIG_XEN
++	last_page = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
++#else
++	last_page = end_pfn;
++#endif
++
+  	return init_level4_page(image, page_address(arch->page_table_b),
+-				0, end_pfn << PAGE_SHIFT);
++				0, last_page << PAGE_SHIFT);
+ }
+ 
+ typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,

[-- Attachment #6: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 68+ messages in thread

* [PATCH] kexec: framework and i386 (Take XII)
  2006-06-15  7:29                                                     ` [PATCH] kexec: framework and i386 (Take XI) Horms
@ 2006-07-11  3:39                                                       ` Horms
  2006-08-11  7:48                                                         ` [PATCH] kexec: framework and i386 (Take XIII) Horms
  0 siblings, 1 reply; 68+ messages in thread
From: Horms @ 2006-07-11  3:39 UTC (permalink / raw)
  To: Keir Fraser
  Cc: Ian Pratt, Kazuo Moriwaka, xen-devel, Akio Takebe, Isaku Yamahata,
	Magnus Damm, Mark Williamson

[-- Attachment #1: Type: text/plain, Size: 1455 bytes --]

Hi,

here is another modest update of the kexec patchset for kdump.
A brief summary of changes (all fairly minor):

* Forward port to xen-unstable-10650
* Move hypercall argument setup into machine-specific code; otherwise it is
  going to get messy as more architectures are added
* Don't pass kimage to the kexec_unload sub-hypercall, as it's not needed
* Add ia64 stubs
* Use __FILE__ and __FUNCTION__ in stubs to make them less prone to error
* Add xen-console trigger crash_dump

The patches are currently:

   1. 51.1-kexec-generic-upstream.patch
      * Common code for all architectures,
        the basic plumbing for kexec/kdump

   2. 51.1.1-kexec-trigger_crash_dump.patch
      * xen-console trigger crash_dump
      * Depends on 1

   3. 51.2.1-kexec-x86-upstream.patch
      * Glue between 1, and 4 and 5.
        This would not be needed for ppc or ia64, but
	neither has been written yet.
	We are planning to commence work on ia64 soon.
      * Depends on 1

   4. 51.2.1.1-kexec-x86_32-upstream.patch
      * Kexec/kdump for x86_32
      * Depends on 3 (and 1)

   5. 51.2.31.2-kexec-x86_64-upstream.patch
      * Kexec/kdump for x86_64
      * Depends on 3 (and 1)

I also have some ia64 patches, but they are still not working or
complete, so I'll hold onto them for a bit longer. If anyone wants them,
let me know.

-- 
Horms                                           
  H: http://www.vergenet.net/~horms/
  W: http://www.valinux.co.jp/en/


[-- Attachment #2: 51.1-kexec-generic-upstream.patch --]
[-- Type: text/plain, Size: 38342 bytes --]

kexec: framework

This is an implementation of kexec for dom0/xen that allows
kexecing of the physical machine from xen. The approach taken is
to move the architecture-dependent kexec code into a new hypercall.

This patch only includes the framework; it can't be used without
architecture-dependent hooks. However, the code should compile as is.

Signed-Off-By: Horms <horms@verge.net.au>
Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>

 linux-2.6-xen-sparse/drivers/xen/core/Makefile                        |    1 
 linux-2.6-xen-sparse/drivers/xen/core/crash.c                         |   49 +
 linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c                 |   78 ++
 linux-2.6-xen-sparse/drivers/xen/core/reboot.c                        |    4 
 patches/linux-2.6.16.13/0-linux-2.6.16-kexec_page_table_a_stubs.patch |   85 ++
 patches/linux-2.6.16.13/kexec-generic.patch                           |  294 ++++++++++
 xen/arch/ia64/xen/Makefile                                            |    2 
 xen/arch/ia64/xen/crash.c                                             |   26 
 xen/arch/ia64/xen/machine_kexec.c                                     |   46 +
 xen/arch/x86/Makefile                                                 |    2 
 xen/arch/x86/crash.c                                                  |   26 
 xen/arch/x86/machine_kexec.c                                          |   46 +
 xen/common/Makefile                                                   |    1 
 xen/common/kexec.c                                                    |  211 +++++++
 xen/common/page_alloc.c                                               |   33 -
 xen/drivers/char/console.c                                            |    3 
 xen/include/asm-ia64/kexec.h                                          |   32 +
 xen/include/asm-x86/kexec.h                                           |   31 +
 xen/include/public/kexec.h                                            |   85 ++
 xen/include/public/xen.h                                              |    1 
 xen/include/xen/elfcore.h                                             |   73 ++
 xen/include/xen/kexec.h                                               |   33 +
 xen/include/xen/mm.h                                                  |    1 
 23 files changed, 1152 insertions(+), 11 deletions(-)

--- x/linux-2.6-xen-sparse/drivers/xen/core/Makefile
+++ x/linux-2.6-xen-sparse/drivers/xen/core/Makefile
@@ -11,3 +11,4 @@ obj-$(CONFIG_XEN_SYSFS)		+= xen_sysfs.o
 obj-$(CONFIG_XEN_SKBUFF)	+= skbuff.o
 obj-$(CONFIG_XEN_REBOOT)	+= reboot.o
 obj-$(CONFIG_XEN_SMPBOOT)	+= smpboot.o
+obj-$(CONFIG_KEXEC)		+= machine_kexec.o crash.o
--- /dev/null
+++ x/linux-2.6-xen-sparse/drivers/xen/core/crash.c
@@ -0,0 +1,49 @@
+/*
+ * drivers/xen/core/crash.c
+ * Architecture independent functions for kexec based crash dumps in xen.
+ *
+ * Created by: Horms <horms@verge.net.au>
+ *
+ */
+
+#include <asm/ptrace.h>
+#include <linux/types.h>
+#include <asm/kexec-xen.h>
+#include <asm/hypervisor.h>
+#include <asm/system.h>
+#include <linux/preempt.h>
+#include <linux/smp.h>
+#include <asm/hw_irq.h>
+#include <xen/interface/kexec.h>
+
+/* 
+ * This passes the registers down to the hypervisor and has it kexec().
+ * This is a bit different to the linux implementation which
+ * has this call save registers and stop CPUs and then goes into
+ * machine_kexec() later. But for Xen it makes more sense to
+ * have the kexec hypercall do everything, and this call
+ * has the registers parameter that is needed.
+ * to the hypervisor to allow the hypervisor to kdump itself
+ * on an internal panic 
+ */
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+	struct cpu_user_regs xen_regs;
+	printk("machine_crash_shutdown: %d\n", smp_processor_id());
+	local_irq_disable();
+#ifdef CONFIG_X86_IO_APIC
+	disable_IO_APIC();
+#endif
+	crash_translate_regs(regs, &xen_regs);
+	HYPERVISOR_kexec(KEXEC_CMD_kexec, KEXEC_TYPE_CRASH, &xen_regs);
+}
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- /dev/null
+++ x/linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c
@@ -0,0 +1,78 @@
+/*
+ * drivers/xen/core/machine_kexec.c 
+ * handle transition of Linux booting another kernel
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Loosely based on arch/i386/kernel/machine_kexec.c
+ */
+
+#include <linux/kexec.h>
+#include <xen/interface/kexec.h>
+#include <linux/mm.h>
+#include <asm/hypercall.h>
+#include <asm/kexec-xen.h>
+
+const extern unsigned char relocate_new_kernel[];
+extern unsigned int relocate_new_kernel_size;
+
+static void setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
+{
+	memset(xki, 0, sizeof(*xki));
+
+	machine_kexec_setup_load_arg(xki, image);
+
+	xki->indirection_page = image->head;
+	xki->reboot_code_buffer = 
+		pfn_to_mfn(page_to_pfn(image->control_code_page)) << PAGE_SHIFT;
+	xki->start_address = image->start;
+}
+
+/*
+ * Load the image into xen so xen can kdump itself
+ * This might have been done in prepare, but prepare
+ * is currently called too early. It might make sense
+ * to move prepare, but for now, just add an extra hook.
+ */
+int xen_machine_kexec_load(struct kimage *image)
+{
+	xen_kexec_image_t xki;
+
+	setup_load_arg(&xki, image);
+	return HYPERVISOR_kexec(KEXEC_CMD_kexec_load, image->type, &xki);
+}
+
+/*
+ * Unload the image that was stored by xen_machine_kexec_load().
+ * This might have been done in machine_kexec_cleanup() but it
+ * is called too late, and it's possible xen could try and kdump
+ * using resources that have been freed.
+ */
+void xen_machine_kexec_unload(struct kimage *image)
+{
+	HYPERVISOR_kexec(KEXEC_CMD_kexec_unload, image->type, NULL);
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ *
+ * This has the hypervisor move to the preferred reboot CPU,
+ * stop all CPUs and kexec. That is, it combines machine_shutdown()
+ * and machine_kexec() in Linux kexec terms.
+ */
+NORET_TYPE void xen_machine_kexec(struct kimage *image)
+{
+	HYPERVISOR_kexec(KEXEC_CMD_kexec, image->type, NULL);
+	panic("KEXEC_CMD_kexec hypercall should not return\n");
+}
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- x/linux-2.6-xen-sparse/drivers/xen/core/reboot.c
+++ x/linux-2.6-xen-sparse/drivers/xen/core/reboot.c
@@ -67,6 +67,10 @@ void machine_power_off(void)
 	HYPERVISOR_shutdown(SHUTDOWN_poweroff);
 }
 
+#ifdef CONFIG_KEXEC
+void machine_shutdown(void) { }
+#endif
+
 int reboot_thru_bios = 0;	/* for dmi_scan.c */
 EXPORT_SYMBOL(machine_restart);
 EXPORT_SYMBOL(machine_halt);
--- x/xen/arch/ia64/xen/Makefile
+++ x/xen/arch/ia64/xen/Makefile
@@ -24,5 +24,7 @@ obj-y += xenmisc.o
 obj-y += xensetup.o
 obj-y += xentime.o
 obj-y += flushd.o
+obj-y += machine_kexec.o
+obj-y += crash.o
 
 obj-$(crash_debug) += gdbstub.o
--- /dev/null
+++ x/xen/arch/ia64/xen/crash.c
@@ -0,0 +1,26 @@
+/**********************************************************************
+ * arch/ia64/xen/crash.c
+ *
+ * Created By: Horms
+ * 
+ */
+
+#include <xen/lib.h>       /* for printk() used in stubs */
+#include <xen/types.h>
+#include <public/kexec.h>
+
+void machine_crash_shutdown(struct cpu_user_regs *regs)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
--- /dev/null
+++ x/xen/arch/ia64/xen/machine_kexec.c
@@ -0,0 +1,46 @@
+/**********************************************************************
+ * arch/ia64/xen/machine_kexec.c
+ *
+ * Created By: Horms
+ * 
+ */
+
+#include <xen/lib.h>       /* for printk() used in stubs */
+#include <xen/types.h>
+#include <public/kexec.h>
+
+int machine_kexec_load(int type, xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    return -1;
+}
+
+void machine_kexec_unload(int type, xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+void machine_kexec_reserved(xen_kexec_reserve_t *reservation)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+void machine_kexec(xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+void machine_shutdown(xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/arch/x86/Makefile
+++ x/xen/arch/x86/Makefile
@@ -39,6 +39,8 @@ obj-y += trampoline.o
 obj-y += traps.o
 obj-y += usercopy.o
 obj-y += x86_emulate.o
+obj-y += machine_kexec.o
+obj-y += crash.o
 
 ifneq ($(pae),n)
 obj-$(x86_32) += shadow.o shadow_public.o shadow_guest32.o shadow_guest32pae.o
--- /dev/null
+++ x/xen/arch/x86/crash.c
@@ -0,0 +1,26 @@
+/******************************************************************************
+ * arch/x86/crash.c
+ * 
+ * Created By: Horms
+ *
+ * Should be based heavily on arch/i386/kernel/crash.c from Linux 2.6.16
+ */
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/xen.h>
+
+void machine_crash_shutdown(struct cpu_user_regs *regs)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/arch/x86/machine_kexec.c
@@ -0,0 +1,46 @@
+/******************************************************************************
+ * arch/x86/machine_kexec.c
+ * 
+ * Created By: Horms
+ *
+ */
+
+#include <xen/lib.h>       /* for printk() used in stubs */
+#include <xen/types.h>
+#include <public/kexec.h>
+
+int machine_kexec_load(int type, xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    return -1;
+}
+
+void machine_kexec_unload(int type, xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+void machine_kexec_reserved(xen_kexec_reserve_t *reservation)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+void machine_kexec(xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+void machine_shutdown(xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/common/Makefile
+++ x/xen/common/Makefile
@@ -7,6 +7,7 @@ obj-y += event_channel.o
 obj-y += grant_table.o
 obj-y += kernel.o
 obj-y += keyhandler.o
+obj-y += kexec.o
 obj-y += lib.o
 obj-y += memory.o
 obj-y += multicall.o
--- /dev/null
+++ x/xen/common/kexec.c
@@ -0,0 +1,211 @@
+/******************************************************************************
+ * common/kexec.c - Architecture independent kexec code for Xen
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Based in part on Linux 2.6.16's kernel/kexec.c
+ */
+
+#include <asm/kexec.h>
+#include <xen/lib.h>
+#include <xen/errno.h>
+#include <xen/guest_access.h>
+#include <xen/sched.h>
+#include <xen/types.h>
+#include <xen/kexec.h>
+#include <public/kexec.h>
+
+DEFINE_PER_CPU (note_buf_t, crash_notes);
+
+static xen_kexec_image_t kexec_image;
+static int kexec_image_set = 0;
+static xen_kexec_image_t kexec_crash_image;
+static int kexec_crash_image_set = 0;
+static int kexec_crash_lock = 0;
+
+/* Must call with kexec_crash_lock held */
+void __crash_kexec(struct cpu_user_regs *regs)
+{
+    struct cpu_user_regs fixed_regs;
+
+    if (!kexec_crash_image_set)
+	    return;
+    crash_setup_regs(&fixed_regs, regs);
+    machine_crash_shutdown(&fixed_regs);
+    machine_kexec(&kexec_crash_image); /* Does not return */
+}
+
+void crash_kexec(struct cpu_user_regs *regs)
+{
+    int locked;
+
+    /* Take kexec_crash_lock; if it is already held, do nothing. */
+    locked = xchg(&kexec_crash_lock, 1);
+    if (locked)
+        return;
+    /* Comes back only if no crash image is set or machine_kexec() returns */
+    __crash_kexec(regs);
+
+    /* Release the lock.  The if() here is bogus, but gcc throws a warning
+     * that the computed value is unused and xen compiles with -Werror.
+     * Observed with:
+     * gcc version 4.1.2 20060604 (prerelease) (Debian * 4.1.1-2) */
+    if (xchg(&kexec_crash_lock, 0)) ;
+
+    return;
+}
+
+static int get_crash_note(int vcpuid, XEN_GUEST_HANDLE(void) uarg)
+{
+    struct domain *domain = current->domain;
+    unsigned long crash_note;
+    struct vcpu *vcpu;
+    int locked;
+
+    /* Report to the guest the physical address of the crash note buffer
+     * for the CPU given by vcpu->processor of the caller's VCPU `vcpuid'.
+     * Returns 0, -EINVAL (bad VCPU) or -EFAULT (dump locked/copy failed) */
+
+    /* vcpuid indexes domain->vcpu[], so MAX_VIRT_CPUS itself is too big */
+    if (vcpuid < 0 || vcpuid >= MAX_VIRT_CPUS)
+        return -EINVAL;
+
+    if ( ! (vcpu = domain->vcpu[vcpuid]) )
+        return -EINVAL;
+
+    locked = xchg(&kexec_crash_lock, 1);
+    if (locked)
+    {
+       printk("do_kexec: (CMD_kexec_crash_note): dump is locked\n");
+       return -EFAULT;
+    }
+    crash_note = __pa((unsigned long)per_cpu(crash_notes, vcpu->processor));
+
+    /* Release the lock; the bogus if() silences gcc 4.1's -Werror warning */
+    if (xchg(&kexec_crash_lock, 0)) ;
+
+    if ( unlikely(copy_to_guest(uarg, &crash_note, 1) != 0) )
+    {
+        printk("do_kexec: (CMD_kexec_crash_note): copy_to_guest failed\n");
+        return -EFAULT;
+    }
+
+    return 0;
+}
+
+static int get_reserve(XEN_GUEST_HANDLE(void) uarg)
+{
+    /* Copy the crash kernel reservation to the guest.  Zero-initialised so
+     * no stale stack data leaks if the arch code leaves a field unset. */
+    xen_kexec_reserve_t reservation = { 0 };
+    machine_kexec_reserved(&reservation);
+    if ( unlikely(copy_to_guest(uarg, &reservation, 1) != 0) )
+    {
+        printk("do_kexec (CMD_kexec_reserve): copy_to_guest failed\n");
+        return -EFAULT;
+    }
+
+    return 0;
+}
+
+static int __do_kexec(unsigned long type, XEN_GUEST_HANDLE(void) uarg,
+		      xen_kexec_image_t *image)
+{
+    cpu_user_regs_t regs;
+
+    if (type == KEXEC_TYPE_DEFAULT)
+        machine_shutdown(image); /* Does not return */
+    else
+    {
+        if ( unlikely(copy_from_guest(&regs, uarg, 1) != 0) )
+        {
+            printk("do_kexec (CMD_kexec): copy_from_guest failed\n");
+            return -EFAULT;
+        }
+        __crash_kexec(&regs); /* Does not return */
+    }
+
+    return -EINVAL;
+}
+
+/* kexec hypercall handler; only privileged domains may use it (-EPERM).
+ * arg1 is the VCPU id for KEXEC_CMD_kexec_crash_note and the kexec type
+ * for kexec, kexec_load and kexec_unload.  Other ops leave -EINVAL. */
+int do_kexec(unsigned long op, int arg1, XEN_GUEST_HANDLE(void) uarg)
+{
+    xen_kexec_image_t *image;
+    int locked;
+    int *image_set;
+    int status = -EINVAL;
+
+    if ( !IS_PRIV(current->domain) )  
+        return -EPERM;
+
+    switch (op)
+    {
+    case KEXEC_CMD_kexec_crash_note:
+        return get_crash_note(arg1, uarg);
+    case KEXEC_CMD_kexec_reserve:
+	return get_reserve(uarg);
+    }
+
+    /* For all other ops, arg1 is the type of kexec, that is
+     * KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH */
+    if (arg1 == KEXEC_TYPE_CRASH)
+    {
+        image = &kexec_crash_image;
+        image_set = &kexec_crash_image_set;
+        locked = xchg(&kexec_crash_lock, 1);
+        if (locked)
+        {
+           printk("do_kexec: dump is locked\n");
+           return -EFAULT;
+        }
+    }
+    else
+    {
+        image = &kexec_image;
+        image_set = &kexec_image_set;
+    }
+
+    switch(op) {
+    case KEXEC_CMD_kexec:
+        BUG_ON(!*image_set);
+	status = __do_kexec(arg1, uarg, image);
+        break;
+    case KEXEC_CMD_kexec_load:
+        BUG_ON(*image_set);
+        if ( unlikely(copy_from_guest(image, uarg, 1) != 0) )
+        {
+            printk("do_kexec (CMD_kexec_load): copy_from_guest failed\n");
+            status = -EFAULT;
+	    break;
+        }
+        /* NOTE(review): stays set even if machine_kexec_load() fails */
+        *image_set = 1;
+        status = machine_kexec_load(arg1, image);
+        break;
+    case KEXEC_CMD_kexec_unload:
+        BUG_ON(!*image_set);
+        *image_set = 0;
+        machine_kexec_unload(arg1, image);
+        status = 0;
+        break;
+    }
+
+    if (arg1 == KEXEC_TYPE_CRASH)
+        /* Drop the lock; the bogus if() avoids gcc 4.1's unused-value
+         * warning, which is fatal as xen compiles with -Werror. */
+        if (xchg(&kexec_crash_lock, 0)) ;
+
+    return status;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/common/page_alloc.c
+++ x/xen/common/page_alloc.c
@@ -212,24 +212,35 @@ void init_boot_pages(paddr_t ps, paddr_t
     }
 }
 
+unsigned long alloc_boot_pages_at(unsigned long nr_pfns, unsigned long pfn_at)
+{
+    unsigned long i;
+
+    for ( i = 0; i < nr_pfns; i++ )
+        if ( allocated_in_map(pfn_at + i) )
+             break;
+
+    if ( i == nr_pfns )
+    {
+        map_alloc(pfn_at, nr_pfns);
+        return pfn_at;
+    }
+
+    return 0;
+}
+
 unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align)
 {
-    unsigned long pg, i;
+    unsigned long pg, i = 0;
 
     for ( pg = 0; (pg + nr_pfns) < max_page; pg += pfn_align )
     {
-        for ( i = 0; i < nr_pfns; i++ )
-            if ( allocated_in_map(pg + i) )
-                 break;
-
-        if ( i == nr_pfns )
-        {
-            map_alloc(pg, nr_pfns);
-            return pg;
-        }
+        i = alloc_boot_pages_at(nr_pfns, pg);
+        if (i != 0)
+            break;
     }
 
-    return 0;
+    return i;
 }
 
 
--- x/xen/drivers/char/console.c
+++ x/xen/drivers/char/console.c
@@ -717,6 +717,7 @@ void panic(const char *fmt, ...)
     unsigned long flags;
     static DEFINE_SPINLOCK(lock);
     extern void machine_restart(char *);
+    extern void crash_kexec(struct cpu_user_regs *regs);
     
     debugtrace_dump();
 
@@ -736,6 +737,8 @@ void panic(const char *fmt, ...)
 
     debugger_trap_immediate();
 
+    crash_kexec(NULL);
+
     watchdog_disable();
     mdelay(5000);
     machine_restart(0);
--- /dev/null
+++ x/xen/include/asm-ia64/kexec.h
@@ -0,0 +1,32 @@
+/******************************************************************************
+ * include/asm-ia64/kexec.h
+ * 
+ * Created By: Horms
+ *
+ */
+
+#ifndef __IA64_KEXEC_H__
+#define __IA64_KEXEC_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/xen.h>
+
+static void crash_setup_regs(struct cpu_user_regs *newregs,
+                            struct cpu_user_regs *oldregs)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+#endif /* __IA64_KEXEC_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
--- /dev/null
+++ x/xen/include/asm-x86/kexec.h
@@ -0,0 +1,31 @@
+/******************************************************************************
+ * include/asm-x86/kexec.h
+ * 
+ * Created By: Horms
+ *
+ */
+
+#ifndef __X86_KEXEC_H__
+#define __X86_KEXEC_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/xen.h>
+
+static void crash_setup_regs(struct cpu_user_regs *newregs,
+			     struct cpu_user_regs *oldregs)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+#endif /* __X86_KEXEC_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/public/kexec.h
@@ -0,0 +1,85 @@
+/******************************************************************************
+ * kexec.h - Public portion
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Types based on those in ./vcpu.h on request from Keir Fraser
+ */
+
+#ifndef _XEN_PUBLIC_KEXEC_H
+#define _XEN_PUBLIC_KEXEC_H
+
+#include "xen.h"
+
+/*
+ * Prototype for this hypercall is:
+ *  int kexec_op(int cmd, int type, void *extra_args)
+ * @cmd        == KEXEC_CMD_... 
+ *                KEXEC operation to perform
+ * @arg1       == Operation-specific unsigned long argument
+ *                This could be in extra_args, but by putting it here
+ *                copy_from_user can be avoided, in particular in
+ *                KEXEC_CMD_kexec during a crash dump, which is a fairly
+ *                critical section of code. If this turns out not to be
+ *                important then it can be collapsed into extra_args.
+ * @extra_args == Operation-specific extra arguments (NULL if none).
+ */
+
+#define KEXEC_TYPE_DEFAULT 0
+#define KEXEC_TYPE_CRASH   1
+
+/*
+ * Perform kexec having previously loaded a kexec or kdump kernel
+ * as appropriate.
+ * @arg1      == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH
+ * @extra_arg == pointer to cpu_user_regs_t structure.
+ */
+#define KEXEC_CMD_kexec                 0
+
+/*
+ * Load kernel image in preparation for kexec or kdump.
+ * @arg1      == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH
+ * @extra_arg == pointer to xen_kexec_image_t structure.
+ */
+#define KEXEC_CMD_kexec_load            1
+typedef struct xen_kexec_image {
+    unsigned long indirection_page;
+    unsigned long reboot_code_buffer;
+    unsigned long start_address;
+} xen_kexec_image_t;
+
+/*
+ * Clean up image loaded by KEXEC_CMD_kexec_load
+ * @arg1      == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH
+ */
+#define KEXEC_CMD_kexec_unload          2
+
+/*
+ * Find the base pointer and size of the area that xen has 
+ * reserved for use by the crash kernel.
+ * @extra_arg == pointer to xen_kexec_reserve_t structure.
+ */
+#define KEXEC_CMD_kexec_reserve         3
+typedef struct xen_kexec_reserve {
+    unsigned long size;
+    unsigned long start;
+} xen_kexec_reserve_t;
+
+/*
+ * Find the base pointer of the area that xen has 
+ * reserved for use by a crash note for a given VCPU
+ * @extra_arg == pointer to unsigned long.
+ */
+#define KEXEC_CMD_kexec_crash_note      4
+
+#endif /* _XEN_PUBLIC_KEXEC_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/include/public/xen.h
+++ x/xen/include/public/xen.h
@@ -64,6 +64,7 @@
 #define __HYPERVISOR_xenoprof_op          31
 #define __HYPERVISOR_event_channel_op     32
 #define __HYPERVISOR_physdev_op           33
+#define __HYPERVISOR_kexec_op             34
 
 /* Architecture-specific hypercall definitions. */
 #define __HYPERVISOR_arch_0               48
--- /dev/null
+++ x/xen/include/xen/elfcore.h
@@ -0,0 +1,73 @@
+/******************************************************************************
+ * include/xen/elfcore.h
+ * 
+ * Created By: Horms
+ *
+ * Based heavily on include/linux/elfcore.h from Linux 2.6.16
+ * Naming scheme based on include/xen/elf.h (not include/linux/elfcore.h)
+ *
+ */
+
+#ifndef __ELFCOREC_H__
+#define __ELFCOREC_H__
+
+#include <xen/types.h>
+#include <xen/elf.h>
+#include <public/xen.h>
+
+#define NT_PRSTATUS     1
+
+typedef struct
+{
+    int signo;                       /* signal number */
+    int code;                        /* extra code */
+    int errno;                       /* errno */
+} ELF_Signifo;
+
+/* These seem to be the same length on all architectures on Linux */
+typedef int ELF_Pid;
+typedef struct {
+	long tv_sec;
+	long tv_usec;
+} ELF_Timeval;
+typedef unsigned long ELF_Greg;
+#define ELF_NGREG (sizeof (struct cpu_user_regs) / sizeof(ELF_Greg))
+typedef ELF_Greg ELF_Gregset[ELF_NGREG];
+
+/*
+ * Definitions to generate Intel SVR4-like core files.
+ * These mostly have the same names as the SVR4 types with "elf_"
+ * tacked on the front to prevent clashes with linux definitions,
+ * and the typedef forms have been avoided.  This is mostly like
+ * the SVR4 structure, but more Linuxy, with things that Linux does
+ * not support and which gdb doesn't really use excluded.
+ */
+typedef struct
+{
+    ELF_Signifo pr_info;         /* Info associated with signal */
+    short pr_cursig;             /* Current signal */
+    unsigned long pr_sigpend;    /* Set of pending signals */
+    unsigned long pr_sighold;    /* Set of held signals */
+    ELF_Pid pr_pid;
+    ELF_Pid pr_ppid;
+    ELF_Pid pr_pgrp;
+    ELF_Pid pr_sid;
+    ELF_Timeval pr_utime;        /* User time */
+    ELF_Timeval pr_stime;        /* System time */
+    ELF_Timeval pr_cutime;       /* Cumulative user time */
+    ELF_Timeval pr_cstime;       /* Cumulative system time */
+    ELF_Gregset pr_reg;          /* GP registers */
+    int pr_fpvalid;              /* True if math co-processor being used.  */
+} ELF_Prstatus;
+
+#endif /* __ELFCOREC_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/xen/kexec.h
@@ -0,0 +1,33 @@
+/******************************************************************************
+ * include/xen/kexec.h - Internal architecture independent portion
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ */
+
+#include <public/kexec.h>
+
+#define MAX_NOTE_BYTES 1024
+
+typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
+DECLARE_PER_CPU (note_buf_t, crash_notes);
+
+int machine_kexec_load(int type, xen_kexec_image_t *image);
+void machine_kexec_unload(int type, xen_kexec_image_t *image);
+void machine_kexec_reserved(xen_kexec_reserve_t *reservation);
+void machine_kexec(xen_kexec_image_t *image);
+void machine_shutdown(xen_kexec_image_t *image);
+void machine_crash_shutdown(cpu_user_regs_t *regs);
+
+extern unsigned int opt_kdump_megabytes;
+extern unsigned int opt_kdump_megabytes_base;
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/include/xen/mm.h
+++ x/xen/include/xen/mm.h
@@ -40,6 +40,7 @@ struct page_info;
 paddr_t init_boot_allocator(paddr_t bitmap_start);
 void init_boot_pages(paddr_t ps, paddr_t pe);
 unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align);
+unsigned long alloc_boot_pages_at(unsigned long nr_pfns, unsigned long pfn_at);
 void end_boot_allocator(void);
 
 /* Generic allocator. These functions are *not* interrupt-safe. */
--- /dev/null
+++ x/patches/linux-2.6.16.13/kexec-generic.patch
@@ -0,0 +1,294 @@
+ drivers/base/cpu.c    |   20 ++++++++++++++
+ include/linux/kexec.h |    5 +++
+ kernel/kexec.c        |   68 ++++++++++++++++++++++++++++++++++++++++---------
+ kernel/sys.c          |    4 ++
+ 4 files changed, 85 insertions(+), 12 deletions(-)
+
+--- x/drivers/base/cpu.c
++++ x/drivers/base/cpu.c
+@@ -11,6 +11,10 @@
+ 
+ #include "base.h"
+ 
++#ifdef CONFIG_XEN
++#include <xen/interface/kexec.h>
++#endif
++
+ struct sysdev_class cpu_sysdev_class = {
+ 	set_kset_name("cpu"),
+ };
+@@ -86,6 +90,18 @@ static inline void register_cpu_control(
+ #ifdef CONFIG_KEXEC
+ #include <linux/kexec.h>
+ 
++#ifdef CONFIG_XEN
++static unsigned long get_crash_notes(int cpu)
++{
++	unsigned long crash_note;
++
++	if (HYPERVISOR_kexec(KEXEC_CMD_kexec_crash_note, cpu, &crash_note) < 0)
++		return 0UL;
++	return crash_note;
++}
++#endif
++
++/* XXX: This only finds dom0's CPU's */
+ static ssize_t show_crash_notes(struct sys_device *dev, char *buf)
+ {
+ 	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
+@@ -101,7 +117,11 @@ static ssize_t show_crash_notes(struct s
+ 	 * boot up and this data does not change there after. Hence this
+ 	 * operation should be safe. No locking required.
+ 	 */
++#ifndef CONFIG_XEN
+ 	addr = __pa(per_cpu_ptr(crash_notes, cpunum));
++#else
++	addr = (unsigned long long)get_crash_notes(cpunum);
++#endif
+ 	rc = sprintf(buf, "%Lx\n", addr);
+ 	return rc;
+ }
+--- x/include/linux/kexec.h
++++ x/include/linux/kexec.h
+@@ -91,6 +91,11 @@ struct kimage {
+ extern NORET_TYPE void machine_kexec(struct kimage *image) ATTRIB_NORET;
+ extern int machine_kexec_prepare(struct kimage *image);
+ extern void machine_kexec_cleanup(struct kimage *image);
++#ifdef CONFIG_XEN
++extern int xen_machine_kexec_load(struct kimage *image);
++extern void xen_machine_kexec_unload(struct kimage *image);
++extern NORET_TYPE void xen_machine_kexec(struct kimage *image) ATTRIB_NORET;
++#endif
+ extern asmlinkage long sys_kexec_load(unsigned long entry,
+ 					unsigned long nr_segments,
+ 					struct kexec_segment __user *segments,
+--- x/kernel/kexec.c
++++ x/kernel/kexec.c
+@@ -38,6 +38,20 @@ struct resource crashk_res = {
+ 	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
+ };
+ 
++/* Kexec needs to know about the actual physical address.
++ * But in xen, a physical address is a pseudo-physical address. */
++#ifndef CONFIG_XEN
++#define kexec_page_to_pfn(page)  page_to_pfn(page)
++#define kexec_pfn_to_page(pfn)   pfn_to_page(pfn)
++#define kexec_virt_to_phys(addr) virt_to_phys(addr)
++#define kexec_phys_to_virt(addr) phys_to_virt(addr)
++#else
++#define kexec_page_to_pfn(page)  pfn_to_mfn(page_to_pfn(page))
++#define kexec_pfn_to_page(pfn)   pfn_to_page(mfn_to_pfn(pfn))
++#define kexec_virt_to_phys(addr) virt_to_machine(addr)
++#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
++#endif
++
+ int kexec_should_crash(struct task_struct *p)
+ {
+ 	if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
+@@ -403,7 +417,7 @@ static struct page *kimage_alloc_normal_
+ 		pages = kimage_alloc_pages(GFP_KERNEL, order);
+ 		if (!pages)
+ 			break;
+-		pfn   = page_to_pfn(pages);
++		pfn   = kexec_page_to_pfn(pages);
+ 		epfn  = pfn + count;
+ 		addr  = pfn << PAGE_SHIFT;
+ 		eaddr = epfn << PAGE_SHIFT;
+@@ -437,6 +451,7 @@ static struct page *kimage_alloc_normal_
+ 	return pages;
+ }
+ 
++#ifndef CONFIG_XEN
+ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
+ 						      unsigned int order)
+ {
+@@ -490,7 +505,7 @@ static struct page *kimage_alloc_crash_c
+ 		}
+ 		/* If I don't overlap any segments I have found my hole! */
+ 		if (i == image->nr_segments) {
+-			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
++			pages = kexec_pfn_to_page(hole_start >> PAGE_SHIFT);
+ 			break;
+ 		}
+ 	}
+@@ -517,6 +532,13 @@ struct page *kimage_alloc_control_pages(
+ 
+ 	return pages;
+ }
++#else /* !CONFIG_XEN */
++struct page *kimage_alloc_control_pages(struct kimage *image,
++					 unsigned int order)
++{
++	return kimage_alloc_normal_control_pages(image, order);
++}
++#endif
+ 
+ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+ {
+@@ -532,7 +554,7 @@ static int kimage_add_entry(struct kimag
+ 			return -ENOMEM;
+ 
+ 		ind_page = page_address(page);
+-		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
++		*image->entry = kexec_virt_to_phys(ind_page) | IND_INDIRECTION;
+ 		image->entry = ind_page;
+ 		image->last_entry = ind_page +
+ 				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+@@ -593,13 +615,13 @@ static int kimage_terminate(struct kimag
+ #define for_each_kimage_entry(image, ptr, entry) \
+ 	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+ 		ptr = (entry & IND_INDIRECTION)? \
+-			phys_to_virt((entry & PAGE_MASK)): ptr +1)
++			kexec_phys_to_virt((entry & PAGE_MASK)): ptr +1)
+ 
+ static void kimage_free_entry(kimage_entry_t entry)
+ {
+ 	struct page *page;
+ 
+-	page = pfn_to_page(entry >> PAGE_SHIFT);
++	page = kexec_pfn_to_page(entry >> PAGE_SHIFT);
+ 	kimage_free_pages(page);
+ }
+ 
+@@ -611,6 +633,10 @@ static void kimage_free(struct kimage *i
+ 	if (!image)
+ 		return;
+ 
++#ifdef CONFIG_XEN
++	xen_machine_kexec_unload(image);
++#endif
++
+ 	kimage_free_extra_pages(image);
+ 	for_each_kimage_entry(image, ptr, entry) {
+ 		if (entry & IND_INDIRECTION) {
+@@ -686,7 +712,7 @@ static struct page *kimage_alloc_page(st
+ 	 * have a match.
+ 	 */
+ 	list_for_each_entry(page, &image->dest_pages, lru) {
+-		addr = page_to_pfn(page) << PAGE_SHIFT;
++		addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
+ 		if (addr == destination) {
+ 			list_del(&page->lru);
+ 			return page;
+@@ -701,12 +727,12 @@ static struct page *kimage_alloc_page(st
+ 		if (!page)
+ 			return NULL;
+ 		/* If the page cannot be used file it away */
+-		if (page_to_pfn(page) >
++		if (kexec_page_to_pfn(page) >
+ 				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+ 			list_add(&page->lru, &image->unuseable_pages);
+ 			continue;
+ 		}
+-		addr = page_to_pfn(page) << PAGE_SHIFT;
++		addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
+ 
+ 		/* If it is the destination page we want use it */
+ 		if (addr == destination)
+@@ -729,7 +755,7 @@ static struct page *kimage_alloc_page(st
+ 			struct page *old_page;
+ 
+ 			old_addr = *old & PAGE_MASK;
+-			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
++			old_page = kexec_pfn_to_page(old_addr >> PAGE_SHIFT);
+ 			copy_highpage(page, old_page);
+ 			*old = addr | (*old & ~PAGE_MASK);
+ 
+@@ -779,7 +805,7 @@ static int kimage_load_normal_segment(st
+ 			result  = -ENOMEM;
+ 			goto out;
+ 		}
+-		result = kimage_add_page(image, page_to_pfn(page)
++		result = kimage_add_page(image, kexec_page_to_pfn(page)
+ 								<< PAGE_SHIFT);
+ 		if (result < 0)
+ 			goto out;
+@@ -811,6 +837,7 @@ out:
+ 	return result;
+ }
+ 
++#ifndef CONFIG_XEN
+ static int kimage_load_crash_segment(struct kimage *image,
+ 					struct kexec_segment *segment)
+ {
+@@ -833,7 +860,7 @@ static int kimage_load_crash_segment(str
+ 		char *ptr;
+ 		size_t uchunk, mchunk;
+ 
+-		page = pfn_to_page(maddr >> PAGE_SHIFT);
++		page = kexec_pfn_to_page(maddr >> PAGE_SHIFT);
+ 		if (page == 0) {
+ 			result  = -ENOMEM;
+ 			goto out;
+@@ -881,6 +908,13 @@ static int kimage_load_segment(struct ki
+ 
+ 	return result;
+ }
++#else /* CONFIG_XEN */
++static int kimage_load_segment(struct kimage *image,
++				struct kexec_segment *segment)
++{
++	return kimage_load_normal_segment(image, segment);
++}
++#endif
+ 
+ /*
+  * Exec Kernel system call: for obvious reasons only root may call it.
+@@ -991,6 +1025,11 @@ asmlinkage long sys_kexec_load(unsigned 
+ 		if (result)
+ 			goto out;
+ 	}
++#ifdef CONFIG_XEN
++	result = xen_machine_kexec_load(image);
++	if (result)
++		goto out;
++#endif
+ 	/* Install the new kernel, and  Uninstall the old */
+ 	image = xchg(dest_image, image);
+ 
+@@ -1045,7 +1084,6 @@ void crash_kexec(struct pt_regs *regs)
+ 	struct kimage *image;
+ 	int locked;
+ 
+-
+ 	/* Take the kexec_lock here to prevent sys_kexec_load
+ 	 * running on one cpu from replacing the crash kernel
+ 	 * we are using after a panic on a different cpu.
+@@ -1061,12 +1099,17 @@ void crash_kexec(struct pt_regs *regs)
+ 			struct pt_regs fixed_regs;
+ 			crash_setup_regs(&fixed_regs, regs);
+ 			machine_crash_shutdown(&fixed_regs);
++#ifdef CONFIG_XEN
++			xen_machine_kexec(image);
++#else
+ 			machine_kexec(image);
++#endif
+ 		}
+ 		xchg(&kexec_lock, 0);
+ 	}
+ }
+ 
++#ifndef CONFIG_XEN
+ static int __init crash_notes_memory_init(void)
+ {
+ 	/* Allocate memory for saving cpu registers. */
+@@ -1079,3 +1122,4 @@ static int __init crash_notes_memory_ini
+ 	return 0;
+ }
+ module_init(crash_notes_memory_init)
++#endif
+--- x/kernel/sys.c
++++ x/kernel/sys.c
+@@ -435,8 +435,12 @@ void kernel_kexec(void)
+ 	kernel_restart_prepare(NULL);
+ 	printk(KERN_EMERG "Starting new kernel\n");
+ 	machine_shutdown();
++#ifdef CONFIG_XEN
++	xen_machine_kexec(image);
++#else
+ 	machine_kexec(image);
+ #endif
++#endif
+ }
+ EXPORT_SYMBOL_GPL(kernel_kexec);
+ 
--- /dev/null
+++ x/patches/linux-2.6.16.13/0-linux-2.6.16-kexec_page_table_a_stubs.patch
@@ -0,0 +1,85 @@
+kexec: Avoid overwriting the current pgd (V2, stubs)
+
+This patch adds an architecture specific structure "struct kimage_arch" to
+struct kimage. This structure is filled in with members by the architecture
+specific patches followed by this one.
+
+Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
+---
+
+ Applies on top of 2.6.16 and 2.6.17-rc4.
+
+ include/asm-i386/kexec.h    |    2 ++
+ include/asm-powerpc/kexec.h |    2 ++
+ include/asm-s390/kexec.h    |    2 ++
+ include/asm-sh/kexec.h      |    2 ++
+ include/asm-x86_64/kexec.h  |    2 ++
+ include/linux/kexec.h       |    2 ++
+ 6 files changed, 12 insertions(+)
+
+--- x/include/asm-i386/kexec.h
++++ x/include/asm-i386/kexec.h
+@@ -29,6 +29,8 @@
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
++struct kimage_arch {};
++
+ /* CPU does not save ss and esp on stack if execution is already
+  * running in kernel mode at the time of NMI occurrence. This code
+  * fixes it.
+--- x/include/asm-powerpc/kexec.h
++++ x/include/asm-powerpc/kexec.h
+@@ -108,6 +108,8 @@ static inline void crash_setup_regs(stru
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
++struct kimage_arch {};
++
+ #ifdef __powerpc64__
+ extern void kexec_smp_wait(void);	/* get and clear naca physid, wait for
+ 					  master to copy new code to 0 */
+--- x/include/asm-s390/kexec.h
++++ x/include/asm-s390/kexec.h
+@@ -36,6 +36,8 @@
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
++struct kimage_arch {};
++
+ /* Provide a dummy definition to avoid build failures. */
+ static inline void crash_setup_regs(struct pt_regs *newregs,
+ 					struct pt_regs *oldregs) { }
+--- x/include/asm-sh/kexec.h
++++ x/include/asm-sh/kexec.h
+@@ -25,6 +25,8 @@
+ 
+ #ifndef __ASSEMBLY__
+ 
++struct kimage_arch {};
++
+ extern void machine_shutdown(void);
+ extern void *crash_notes;
+ 
+--- x/include/asm-x86_64/kexec.h
++++ x/include/asm-x86_64/kexec.h
+@@ -29,6 +29,8 @@
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
++struct kimage_arch {};
++
+ /*
+  * Saving the registers of the cpu on which panic occured in
+  * crash_kexec to save a valid sp. The registers of other cpus
+--- x/include/linux/kexec.h
++++ x/include/linux/kexec.h
+@@ -69,6 +69,8 @@ struct kimage {
+ 	unsigned long start;
+ 	struct page *control_code_page;
+ 
++	struct kimage_arch arch_data;
++
+ 	unsigned long nr_segments;
+ 	struct kexec_segment segment[KEXEC_SEGMENT_MAX];
+ 

[-- Attachment #3: 51.1.1-kexec-trigger_crash_dump.patch --]
[-- Type: text/plain, Size: 1624 bytes --]

console: allow a crash dump to be triggered from the xen console

This feature is needed to test crash dump. It is essential for development
(though developers could easily add the patch). It may also be of
use for testing of the roll-out of production systems (people who don't
want to add patches).

The original version of this patch triggered a panic, Keir Fraser
suggested changing it to trigger a crash dump in line with a
similar feature in Linux's sysrq.

Christian Limpach suggested changing the original trigger 'p' (for panic)
to 'D' for dump, as p is already used by the performance counters.
This patch uses 'c' for crashdump, again in line with the similar
feature in Linux's sysrq. On inspection of the code, 'c' does not
seem to be already taken.

Signed-Off-By: Horms <horms@verge.net.au>

 xen/common/kexec.c |   14 ++++++++++++++
 1 file changed, 14 insertions(+)

--- x/xen/common/kexec.c
+++ x/xen/common/kexec.c
@@ -13,6 +13,7 @@
 #include <xen/sched.h>
 #include <xen/types.h>
 #include <xen/kexec.h>
+#include <xen/keyhandler.h>
 #include <public/kexec.h>
 
 DEFINE_PER_CPU (note_buf_t, crash_notes);
@@ -55,6 +56,19 @@ void crash_kexec(struct cpu_user_regs *r
     return;
 }
 
+static void do_crashdump_trigger(unsigned char key)
+{
+	printk("triggering crashdump\n");
+	crash_kexec(NULL);
+}
+
+static __init int register_crashdump_trigger(void)
+{
+	register_keyhandler('c', do_crashdump_trigger, "trigger a crashdump");
+	return 0;
+}
+__initcall(register_crashdump_trigger);
+
 static int get_crash_note(int vcpuid, XEN_GUEST_HANDLE(void) uarg)
 {
     struct domain *domain = current->domain;

[-- Attachment #4: 51.2.1-kexec-x86-upstream.patch --]
[-- Type: text/plain, Size: 23789 bytes --]

kexec: x86

This is the x86 component of kexec for xen.
The generic component is a prerequisite for this patch.
The x86_64 or x86_32 (i386) patch is also needed
in order to use this code, however the code should compile as is.

Signed-Off-By: Horms <horms@verge.net.au>
Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>

 xen/arch/x86/crash.c                |  173 ++++++++++++++++++++++++++++++++++-
 xen/arch/x86/dom0_ops.c             |    3 
 xen/arch/x86/machine_kexec.c        |  147 ++++++++++++++++++++++++++++-
 xen/arch/x86/setup.c                |   75 +++++++++++++--
 xen/arch/x86/x86_32/Makefile        |    1 
 xen/arch/x86/x86_32/machine_kexec.c |   26 +++++
 xen/arch/x86/x86_64/Makefile        |    1 
 xen/arch/x86/x86_64/machine_kexec.c |   27 +++++
 xen/include/asm-x86/elf.h           |   27 +++++
 xen/include/asm-x86/fixmap.h        |    1 
 xen/include/asm-x86/hypercall.h     |    5 +
 xen/include/asm-x86/kexec.h         |   13 +-
 xen/include/asm-x86/x86_32/elf.h    |   28 +++++
 xen/include/asm-x86/x86_32/kexec.h  |   48 +++++++++
 xen/include/asm-x86/x86_64/elf.h    |   28 +++++
 xen/include/asm-x86/x86_64/kexec.h  |   33 ++++++
 xen/include/public/kexec.h          |    2 
 xen/include/xen/elfcore.h           |    3 
 18 files changed, 615 insertions(+), 26 deletions(-)

--- x/xen/arch/x86/crash.c
+++ x/xen/arch/x86/crash.c
@@ -3,16 +3,183 @@
  * 
  * Created By: Horms
  *
- * Should be based heavily on arch/i386/kernel/crash.c from Linux 2.6.16
+ * Based heavily on arch/i386/kernel/crash.c from Linux 2.6.16
  */
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#include <asm/atomic.h>
+#include <asm/elf.h>
+#include <asm/percpu.h>
+#include <asm/kexec.h>
 #include <xen/types.h>
+#include <xen/irq.h>
+#include <asm/ipi.h>
+#include <asm/nmi.h>
+#include <xen/string.h>
+#include <xen/elf.h>
+#include <xen/elfcore.h>
+#include <xen/smp.h>
+#include <xen/delay.h>
+#include <xen/perfc.h>
+#include <xen/kexec.h>
 #include <public/xen.h>
 
+static int crashing_cpu;
+
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+							       size_t data_len)
+{
+	Elf_Note note;
+
+	note.namesz = strlen(name) + 1;
+	note.descsz = data_len;
+	note.type   = type;
+	memcpy(buf, &note, sizeof(note));
+	buf += (sizeof(note) +3)/4;
+	memcpy(buf, name, note.namesz);
+	buf += (note.namesz + 3)/4;
+	memcpy(buf, data, note.descsz);
+	buf += (note.descsz + 3)/4;
+
+	return buf;
+}
+
+static void final_note(u32 *buf)
+{
+	Elf_Note note;
+
+	note.namesz = 0;
+	note.descsz = 0;
+	note.type   = 0;
+	memcpy(buf, &note, sizeof(note));
+}
+
+static void crash_save_this_cpu(struct cpu_user_regs *regs, int cpu)
+{
+	ELF_Prstatus prstatus;
+	uint32_t *buf;
+
+	printk("crash_save_this_cpu: %d\n",  cpu);
+
+	if ((cpu < 0) || (cpu >= NR_CPUS))
+		return;
+
+	/* Using ELF notes here is opportunistic.
+	 * A well defined structure format with tags is needed
+	 * ELF notes happen to provide this and there is infastructure
+	 * in the Linux kernel to supprot them. In order to make
+	 * crash dumps produced by xen the same, the same
+	 * technique is used here.
+	 */
+
+	/* It should be safe to use per_cpu() here instead of per_cpu_ptr()
+	 * (which does not exist in xen) as kexecing_lock must be held in
+	 * order to get anywhere near here */
+	buf = (uint32_t *)per_cpu(crash_notes, cpu);
+	if (!buf) /* XXX: Can this ever occur? */
+		return;
+	memset(&prstatus, 0, sizeof(prstatus));
+	/* XXX: Xen does not have processes. For the crashing CPU on a dom0
+	 * crash this could be pased down from dom0, but is this
+	 * neccessary?
+	 * prstatus.pr_pid = current->pid; */
+	ELF_CORE_COPY_REGS(prstatus.pr_reg, regs);
+	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
+				sizeof(prstatus));
+	final_note(buf);
+}
+
+static void crash_save_self(struct cpu_user_regs *regs)
+{
+	crash_save_this_cpu(regs, smp_processor_id());
+}
+
+#ifdef CONFIG_SMP
+static atomic_t waiting_for_crash_ipi;
+
+static int crash_nmi_callback(struct cpu_user_regs *regs, int cpu)
+{
+	struct cpu_user_regs fixed_regs;
+
+	/* Don't do anything if this handler is invoked on crashing cpu.
+	 * Otherwise, system will completely hang. Crashing cpu can get
+	 * an NMI if system was initially booted with nmi_watchdog parameter.
+	 */
+	if (cpu == crashing_cpu)
+		return 1;
+	local_irq_disable();
+
+#ifdef CONFIG_X86_32
+	if (!user_mode(regs)) {
+		crash_fixup_ss_esp(&fixed_regs, regs);
+		regs = &fixed_regs;
+	}
+#endif
+	crash_save_this_cpu(regs, cpu);
+	disable_local_APIC();
+	atomic_dec(&waiting_for_crash_ipi);
+	/* Assume hlt works */
+	__asm__ __volatile__ ( "hlt" );
+	for(;;);
+
+	return 1;
+
+	/* Need to use this somewhere as Xen builds with -Werror */
+	crash_setup_regs(&fixed_regs, regs);
+}
+
+/*
+ * By using the NMI code instead of a vector we just sneak thru the
+ * word generator coming out with just what we want.  AND it does
+ * not matter if clustered_apic_mode is set or not.
+ */
+static void smp_send_nmi_allbutself(void)
+{
+	cpumask_t allbutself = cpu_online_map;
+    	cpu_clear(smp_processor_id(), allbutself);
+	send_IPI_mask(allbutself, APIC_DM_NMI);
+}
+
+static void nmi_shootdown_cpus(void)
+{
+	unsigned long msecs;
+
+	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
+	/* Would it be better to replace the trap vector here? */
+	set_nmi_callback(crash_nmi_callback);
+	/* Ensure the new callback function is set before sending
+	 * out the NMI
+	 */
+	wmb();
+
+	smp_send_nmi_allbutself();
+
+	msecs = 1000; /* Wait at most a second for the other cpus to stop */
+	while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
+		mdelay(1);
+		msecs--;
+	}
+
+	/* Leave the nmi callback set */
+	disable_local_APIC();
+}
+#else
+static void nmi_shootdown_cpus(void)
+{
+	/* There are no cpus to shootdown */
+}
+#endif
+
 void machine_crash_shutdown(struct cpu_user_regs *regs)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+	printk("machine_crash_shutdown: %d\n", smp_processor_id());
+	local_irq_disable();
+
+	crashing_cpu = smp_processor_id();
+	nmi_shootdown_cpus();
+#ifdef CONFIG_X86_IO_APIC
+        disable_IO_APIC();
+#endif
+	crash_save_self(regs);
 }
 
 /*
--- x/xen/arch/x86/dom0_ops.c
+++ x/xen/arch/x86/dom0_ops.c
@@ -29,6 +29,9 @@
 #include <asm/mtrr.h>
 #include "cpu/mtrr/mtrr.h"
 
+extern unsigned int opt_kdump_megabytes;
+extern unsigned int opt_kdump_megabytes_base;
+
 #define TRC_DOM0OP_ENTER_BASE  0x00020000
 #define TRC_DOM0OP_LEAVE_BASE  0x00030000
 
--- x/xen/arch/x86/machine_kexec.c
+++ x/xen/arch/x86/machine_kexec.c
@@ -5,34 +5,165 @@
  *
  */
 
-#include <xen/lib.h>       /* for printk() used in stubs */
+#include <xen/lib.h>
+#include <asm/irq.h>
+#include <asm/page.h>
+#include <asm/flushtlb.h>
+#include <xen/lib.h>
+#include <xen/smp.h>
+#include <xen/nmi.h>
 #include <xen/types.h>
+#include <xen/console.h>
+#include <xen/kexec.h>
 #include <public/kexec.h>
+#include <xen/domain_page.h>
+#include <asm/fixmap.h>
+ 
+#define create_level_mapping(lvl, next, pages, nopages, k, va)               \
+{                                                                            \
+    lvl##_pgentry_t *table;                                                  \
+    void *old = next;                                                        \
+                                                                             \
+    table = (lvl##_pgentry_t *)next + lvl##_table_offset(va);                \
+    if (!(lvl##e_get_flags(*table) & _PAGE_PRESENT)) {                       \
+        if (k >= nopages || pages[k] == 0)                                   \
+            return -1;                                                       \
+        *table = lvl##e_from_pfn(pages[k++]>>PAGE_SHIFT, __PAGE_HYPERVISOR); \
+    }                                                                        \
+    next = map_domain_page(lvl##e_get_pfn(*table));                          \
+    unmap_domain_page(old);                                                  \
+}                                                                            
+
+#define create_level_1_mapping(next, nopages, va, pa)               \
+{                                                                   \
+    l1_pgentry_t *table;                                            \
+                                                                    \
+    table = (l1_pgentry_t *)next + l1_table_offset(va);             \
+    if (!(l1e_get_flags(*table) & _PAGE_PRESENT)) {                 \
+        *table = l1e_from_pfn(pa >> PAGE_SHIFT, __PAGE_HYPERVISOR); \
+    }                                                               \
+    unmap_domain_page(next);                                        \
+}
+
+static int create_mapping(unsigned long root,
+                          unsigned long *pages, int nopages,
+                          unsigned long va, unsigned long pa)
+{
+    void *next = map_domain_page(root >> PAGE_SHIFT);
+    int k = 0;
+
+#if CONFIG_PAGING_LEVELS >= 4
+    create_level_mapping(l4, next, pages, nopages, k, va);
+#endif
+#if CONFIG_PAGING_LEVELS >= 3
+    create_level_mapping(l3, next, pages, nopages, k, va);
+#endif
+    create_level_mapping(l2, next, pages, nopages, k, va);
+
+    create_level_1_mapping(next, nopages, va, pa);
+
+    return k;
+}
+
+static int setup_page_table_a(xen_kexec_image_t *image)
+{
+    void *page;
+    int k, n = sizeof(image->page_table_a) / sizeof(image->page_table_a[0]);
+
+    /* clear page_table_a pages */
+
+    for (k = 0; k < n; k++) {
+        if (!image->page_table_a[k])
+            break;
+
+        page = map_domain_page(image->page_table_a[k] >> PAGE_SHIFT);
+        clear_page(page);
+        unmap_domain_page(page);
+    }
+
+    /* check that the first page (root page) is actually non-zero */
+
+    if (k == 0)
+        return -1;
+
+    /* setup fixmap to point to our control page */
+
+    set_fixmap(FIX_KEXEC_PAGE, image->reboot_code_buffer);
+
+    /* fill in page_table_a: create mapping at fixmap address */
+
+    k = create_mapping(image->page_table_a[0],
+                       &image->page_table_a[1],
+                       n - 1, fix_to_virt(FIX_KEXEC_PAGE),
+                       image->reboot_code_buffer);
+    if (k < 0)
+        return -1;
+
+    /* fill in page_table_a: create identity mapping */
+
+    k = create_mapping(image->page_table_a[0],
+                       &image->page_table_a[1 + k],
+                       n - (1 + k), image->reboot_code_buffer,
+                       image->reboot_code_buffer);
+    if (k < 0)
+        return -1;
+
+	return 0;
+}
 
 int machine_kexec_load(int type, xen_kexec_image_t *image)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
-    return -1;
+    return setup_page_table_a(image);
 }
 
 void machine_kexec_unload(int type, xen_kexec_image_t *image)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
 }
 
 void machine_kexec_reserved(xen_kexec_reserve_t *reservation)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    reservation->size = opt_kdump_megabytes << 20;
+    reservation->start = opt_kdump_megabytes_base << 20;
 }
 
-void machine_kexec(xen_kexec_image_t *image)
+static void __machine_shutdown(void *data)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    xen_kexec_image_t *image = (xen_kexec_image_t *)data;
+
+    printk("__machine_shutdown: cpu=%u\n", smp_processor_id());
+
+    watchdog_disable();
+    console_start_sync();
+
+    smp_send_stop();
+
+#ifdef CONFIG_X86_IO_APIC
+    disable_IO_APIC();
+#endif   
+
+    machine_kexec(image);
 }
 
 void machine_shutdown(xen_kexec_image_t *image)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    int reboot_cpu_id;
+    cpumask_t reboot_cpu;
+
+    reboot_cpu_id = 0;
+
+    if (!cpu_isset(reboot_cpu_id, cpu_online_map))
+        reboot_cpu_id = smp_processor_id();
+    
+    if (reboot_cpu_id != smp_processor_id()) {
+        cpus_clear(reboot_cpu);
+        cpu_set(reboot_cpu_id, reboot_cpu);
+        on_selected_cpus(reboot_cpu, __machine_shutdown, image, 1, 0);
+	for (;;)
+		; /* nothing */
+    }
+    else
+        __machine_shutdown(image);
+    BUG();
 }
 
 /*
--- x/xen/arch/x86/setup.c
+++ x/xen/arch/x86/setup.c
@@ -38,6 +38,11 @@ static unsigned int opt_xenheap_megabyte
 integer_param("xenheap_megabytes", opt_xenheap_megabytes);
 #endif
 
+unsigned int opt_kdump_megabytes = 0;
+integer_param("kdump_megabytes", opt_kdump_megabytes);
+unsigned int opt_kdump_megabytes_base = 0;
+integer_param("kdump_megabytes_base", opt_kdump_megabytes_base);
+
 /* opt_nosmp: If true, secondary processors are ignored. */
 static int opt_nosmp = 0;
 boolean_param("nosmp", opt_nosmp);
@@ -190,6 +195,20 @@ static void percpu_free_unused_areas(voi
                        __pa(__per_cpu_end));
 }
 
+void __init move_memory(unsigned long dst, 
+                          unsigned long src_start, unsigned long src_end)
+{
+#if defined(CONFIG_X86_32)
+    memmove((void *)dst,  /* use low mapping */
+            (void *)src_start,      /* use low mapping */
+            src_end - src_start);
+#elif defined(CONFIG_X86_64)
+    memmove(__va(dst),
+            __va(src_start),
+            src_end - src_start);
+#endif
+}
+
 void __init __start_xen(multiboot_info_t *mbi)
 {
     char __cmdline[] = "", *cmdline = __cmdline;
@@ -325,15 +344,8 @@ void __init __start_xen(multiboot_info_t
         initial_images_start = xenheap_phys_end;
     initial_images_end = initial_images_start + modules_length;
 
-#if defined(CONFIG_X86_32)
-    memmove((void *)initial_images_start,  /* use low mapping */
-            (void *)mod[0].mod_start,      /* use low mapping */
-            mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
-#elif defined(CONFIG_X86_64)
-    memmove(__va(initial_images_start),
-            __va(mod[0].mod_start),
-            mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
-#endif
+    move_memory(initial_images_start, 
+                mod[0].mod_start, mod[mbi->mods_count-1].mod_end);
 
     /* Initialise boot-time allocator with all RAM situated after modules. */
     xenheap_phys_start = init_boot_allocator(__pa(&_end));
@@ -381,6 +393,51 @@ void __init __start_xen(multiboot_info_t
 #endif
     }
 
+    if (opt_kdump_megabytes) {
+        unsigned long kdump_start, kdump_size, k;
+
+        /* mark images pages as free for now */
+
+        init_boot_pages(initial_images_start, initial_images_end);
+
+        kdump_start = opt_kdump_megabytes_base << 20;
+        kdump_size = opt_kdump_megabytes << 20;
+
+        printk("Kdump: %luMB (%lukB) at 0x%lx\n", 
+               kdump_size >> 20,
+               kdump_size >> 10,
+               kdump_start);
+
+        if ((kdump_start & ~PAGE_MASK) || (kdump_size & ~PAGE_MASK))
+            panic("Kdump parameters not page aligned\n");
+
+        kdump_start >>= PAGE_SHIFT;
+        kdump_size >>= PAGE_SHIFT;
+
+        /* allocate pages for Kdump memory area */
+
+        k = alloc_boot_pages_at(kdump_size, kdump_start);
+
+        if (k != kdump_start)
+            panic("Unable to reserve Kdump memory\n");
+
+        /* allocate pages for relocated initial images */
+
+        k = ((initial_images_end - initial_images_start) & ~PAGE_MASK) ? 1 : 0;
+        k += (initial_images_end - initial_images_start) >> PAGE_SHIFT;
+
+        k = alloc_boot_pages(k, 1);
+
+        if (!k)
+            panic("Unable to allocate initial images memory\n");
+
+        move_memory(k << PAGE_SHIFT, initial_images_start, initial_images_end);
+
+        initial_images_end -= initial_images_start;
+        initial_images_start = k << PAGE_SHIFT;
+        initial_images_end += initial_images_start;
+    }        
+
     memguard_init();
 
     printk("System RAM: %luMB (%lukB)\n", 
--- x/xen/arch/x86/x86_32/Makefile
+++ x/xen/arch/x86/x86_32/Makefile
@@ -3,5 +3,6 @@ obj-y += entry.o
 obj-y += mm.o
 obj-y += seg_fixup.o
 obj-y += traps.o
+obj-y += machine_kexec.o
 
 obj-$(supervisor_mode_kernel) += supervisor_mode_kernel.o
--- /dev/null
+++ x/xen/arch/x86/x86_32/machine_kexec.c
@@ -0,0 +1,26 @@
+/*
+ * arch/x86/x86_32/machine_kexec.c
+ * Handle transition of Linux booting another kernel
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Should be losely based on arch/i386/kernel/machine_kexec.c
+ */
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <public/kexec.h>
+
+void machine_kexec(xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/arch/x86/x86_64/Makefile
+++ x/xen/arch/x86/x86_64/Makefile
@@ -1,3 +1,4 @@
 obj-y += entry.o
 obj-y += mm.o
 obj-y += traps.o
+obj-y += machine_kexec.o
--- /dev/null
+++ x/xen/arch/x86/x86_64/machine_kexec.c
@@ -0,0 +1,27 @@
+/******************************************************************************
+ * arch/x86/x86_64/machine_kexec.c
+ * Handle transition of Linux booting another kernel
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Should be losely based on arch/x86_64/kernel/machine_kexec.c
+ */
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/kexec.h>
+
+void machine_kexec(xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/asm-x86/elf.h
@@ -0,0 +1,27 @@
+/******************************************************************************
+ * include/asm-x86/elf.h
+ * 
+ * Created By: Horms
+ *
+ */
+
+#ifndef __X86_ELF_H__
+#define __X86_ELF_H__
+
+#ifdef __x86_64__
+#include <asm/x86_64/elf.h>
+#else
+#include <asm/x86_32/elf.h>
+#endif
+
+#endif /* __X86_ELF_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/include/asm-x86/fixmap.h
+++ x/xen/include/asm-x86/fixmap.h
@@ -36,6 +36,7 @@ enum fixed_addresses {
     FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
     FIX_HPET_BASE,
     FIX_CYCLONE_TIMER,
+    FIX_KEXEC_PAGE,
     __end_of_fixed_addresses
 };
 
--- x/xen/include/asm-x86/hypercall.h
+++ x/xen/include/asm-x86/hypercall.h
@@ -6,6 +6,7 @@
 #define __ASM_X86_HYPERCALL_H__
 
 #include <public/physdev.h>
+#include <xen/types.h>
 
 extern long
 do_event_channel_op_compat(
@@ -87,6 +88,10 @@ extern long
 arch_do_vcpu_op(
     int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg);
 
+extern int
+do_kexec(
+    unsigned long op, unsigned arg1, XEN_GUEST_HANDLE(void) uarg);
+
 #ifdef __x86_64__
 
 extern long
--- x/xen/include/asm-x86/kexec.h
+++ x/xen/include/asm-x86/kexec.h
@@ -8,15 +8,16 @@
 #ifndef __X86_KEXEC_H__
 #define __X86_KEXEC_H__
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#include <asm/processor.h>
 #include <xen/types.h>
+#include <xen/string.h>
 #include <public/xen.h>
 
-static void crash_setup_regs(struct cpu_user_regs *newregs,
-			     struct cpu_user_regs *oldregs)
-{
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
-}
+#ifdef __x86_64__
+#include <asm/x86_64/kexec.h>
+#else
+#include <asm/x86_32/kexec.h>
+#endif
 
 #endif /* __X86_KEXEC_H__ */
 
--- /dev/null
+++ x/xen/include/asm-x86/x86_32/elf.h
@@ -0,0 +1,28 @@
+/******************************************************************************
+ * include/asm-x86/x86_32/elf.h
+ * 
+ * Created By: Horms
+ *
+ * Should pull be based on include/asm-i386/elf.h:ELF_CORE_COPY_REGS
+ * from Linux 2.6.16
+ */
+
+#ifndef __X86_ELF_X86_32_H__
+#define __X86_ELF_X86_32_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+
+#define ELF_CORE_COPY_REGS(pr_reg, regs)                                \
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+
+#endif /* __X86_ELF_X86_32_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/asm-x86/x86_32/kexec.h
@@ -0,0 +1,48 @@
+/******************************************************************************
+ * include/asm-x86/x86_32/kexec.h
+ * 
+ * Created By: Horms
+ *
+ * Should be based heavily on include/asm-i386/kexec.h from Linux 2.6.16
+ *
+ */
+
+#ifndef __X86_32_KEXEC_H__
+#define __X86_32_KEXEC_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/xen.h>
+
+static void crash_fixup_ss_esp(struct cpu_user_regs *newregs,
+		    struct cpu_user_regs *oldregs)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    return;
+    crash_fixup_ss_esp(newregs, oldregs);
+}
+
+static void crash_setup_regs(struct cpu_user_regs *newregs,
+			     struct cpu_user_regs *oldregs)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+static inline int user_mode(struct cpu_user_regs *regs)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    return -1;
+}
+
+
+#endif /* __X86_32_KEXEC_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/asm-x86/x86_64/elf.h
@@ -0,0 +1,28 @@
+/******************************************************************************
+ * include/asm-x86/x86_64/elf.h
+ * 
+ * Created By: Horms
+ *
+ * Should pull be based on include/asm-x86_64/elf.h:ELF_CORE_COPY_REGS
+ * from Linux 2.6.16
+ */
+
+#ifndef __X86_ELF_X86_64_H__
+#define __X86_ELF_X86_64_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+
+#define ELF_CORE_COPY_REGS(pr_reg, regs)                                \
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+
+#endif /* __X86_ELF_X86_64_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/asm-x86/x86_64/kexec.h
@@ -0,0 +1,33 @@
+/******************************************************************************
+ * include/asm-x86/x86_64/kexec.h
+ * 
+ * Created By: Horms
+ *
+ * Should be based heavily on include/asm-x86_64/kexec.h from Linux 2.6.16
+ *
+ */
+
+#ifndef __X86_64_KEXEC_H__
+#define __X86_64_KEXEC_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/xen.h>
+
+static void crash_setup_regs(struct cpu_user_regs *newregs,
+			     struct cpu_user_regs *oldregs)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+#endif /* __X86_64_KEXEC_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/include/public/kexec.h
+++ x/xen/include/public/kexec.h
@@ -43,6 +43,8 @@
  */
 #define KEXEC_CMD_kexec_load            1
 typedef struct xen_kexec_image {
+    unsigned long page_table_a[7];
+    unsigned long page_table_b;
     unsigned long indirection_page;
     unsigned long reboot_code_buffer;
     unsigned long start_address;
--- x/xen/include/xen/elfcore.h
+++ x/xen/include/xen/elfcore.h
@@ -16,6 +16,9 @@
 #include <public/xen.h>
 
 #define NT_PRSTATUS     1
+#define NT_XEN_DOM0_CR3 0x10000001 /* XXX: Hopefully this is unused,
+					   feel free to change to a 
+					   better/different value */
 
 typedef struct
 {

[-- Attachment #5: 51.2.1.1-kexec-x86_32-upstream.patch --]
[-- Type: text/plain, Size: 30946 bytes --]

kexec: x86_32

This is the x86_32 component of kexec for xen.
The x86 component is a prerequisite for this patch.

Signed-Off-By: Horms <horms@verge.net.au>
Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>

 buildconfigs/linux-defconfig_xen_x86_32                              |    2 
 linux-2.6-xen-sparse/arch/i386/Kconfig                               |    2 
 linux-2.6-xen-sparse/arch/i386/kernel/Makefile                       |    6 
 linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c                    |   29 
 linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h                    |   42 
 linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h       |    8 
 patches/linux-2.6.16.13/1-linux-2.6.16-kexec_page_table_a_i386.patch |  485 ++++++++++
 xen/arch/x86/crash.c                                                 |   47 
 xen/arch/x86/x86_32/entry.S                                          |    2 
 xen/arch/x86/x86_32/machine_kexec.c                                  |   27 
 xen/include/asm-x86/x86_32/elf.h                                     |   32 
 xen/include/asm-x86/x86_32/kexec.h                                   |   65 +
 12 files changed, 714 insertions(+), 33 deletions(-)

--- x/buildconfigs/linux-defconfig_xen_x86_32
+++ x/buildconfigs/linux-defconfig_xen_x86_32
@@ -184,6 +184,7 @@ CONFIG_MTRR=y
 CONFIG_REGPARM=y
 CONFIG_SECCOMP=y
 CONFIG_HZ_100=y
+CONFIG_KEXEC=y
 # CONFIG_HZ_250 is not set
 # CONFIG_HZ_1000 is not set
 CONFIG_HZ=100
@@ -2774,6 +2775,7 @@ CONFIG_NTFS_FS=m
 #
 CONFIG_PROC_FS=y
 CONFIG_PROC_KCORE=y
+# CONFIG_PROC_VMCORE is not set
 CONFIG_SYSFS=y
 CONFIG_TMPFS=y
 # CONFIG_HUGETLB_PAGE is not set
--- x/linux-2.6-xen-sparse/arch/i386/Kconfig
+++ x/linux-2.6-xen-sparse/arch/i386/Kconfig
@@ -726,7 +726,7 @@ source kernel/Kconfig.hz
 
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && !X86_XEN
+	depends on EXPERIMENTAL && !XEN_UNPRIVILEGED_GUEST
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
--- x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
+++ x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
@@ -24,7 +24,11 @@ obj-$(CONFIG_X86_MPPARSE)	+= mpparse.o
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o nmi.o
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o
 obj-$(CONFIG_X86_REBOOTFIXUPS)	+= reboot_fixups.o
+ifdef CONFIG_XEN
+obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o
+else
 obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o crash.o
+endif
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
 obj-$(CONFIG_X86_NUMAQ)		+= numaq.o
 obj-$(CONFIG_X86_SUMMIT_NUMA)	+= summit.o
@@ -89,7 +93,7 @@ include $(srctree)/scripts/Makefile.xen
 
 obj-y += fixup.o
 microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o
-n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o
+n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o crash.o
 
 obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
 obj-y := $(call cherrypickxen, $(obj-y))
--- x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
+++ x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
@@ -68,6 +68,10 @@
 #include "setup_arch_pre.h"
 #include <bios_ebda.h>
 
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
 /* Forward Declaration. */
 void __init find_max_pfn(void);
 
@@ -941,6 +945,7 @@ static void __init parse_cmdline_early (
 		 * after a kernel panic.
 		 */
 		else if (!memcmp(from, "crashkernel=", 12)) {
+#ifndef CONFIG_XEN
 			unsigned long size, base;
 			size = memparse(from+12, &from);
 			if (*from == '@') {
@@ -951,6 +956,10 @@ static void __init parse_cmdline_early (
 				crashk_res.start = base;
 				crashk_res.end   = base + size - 1;
 			}
+#else
+			printk("Ignoring crashkernel command line, "
+			       "parameter will be supplied by xen\n");
+#endif
 		}
 #endif
 #ifdef CONFIG_PROC_VMCORE
@@ -1320,9 +1329,22 @@ void __init setup_bootmem_allocator(void
 	}
 #endif
 #ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
 	if (crashk_res.start != crashk_res.end)
 		reserve_bootmem(crashk_res.start,
 			crashk_res.end - crashk_res.start + 1);
+#else
+	{
+		xen_kexec_reserve_t reservation;
+		BUG_ON(HYPERVISOR_kexec(KEXEC_CMD_kexec_reserve, 0,
+					&reservation));
+		if (reservation.size) {
+			crashk_res.start = reservation.start;
+			crashk_res.end = reservation.start + 
+				reservation.size - 1;
+		}
+	}
+#endif
 #endif
 
 	if (!xen_feature(XENFEAT_auto_translated_physmap))
@@ -1378,6 +1400,10 @@ legacy_init_iomem_resources(struct e820e
 		res->end = res->start + e820[i].size - 1;
 		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 		request_resource(&iomem_resource, res);
+#ifdef CONFIG_KEXEC
+		request_resource(res, &crashk_res);
+#endif
+
 		if (e820[i].type == E820_RAM) {
 			/*
 			 *  We don't know which RAM region contains kernel data,
@@ -1386,9 +1412,6 @@ legacy_init_iomem_resources(struct e820e
 			 */
 			request_resource(res, code_resource);
 			request_resource(res, data_resource);
-#ifdef CONFIG_KEXEC
-			request_resource(res, &crashk_res);
-#endif
 		}
 	}
 }
--- /dev/null
+++ x/linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h
@@ -0,0 +1,42 @@
+/*
+ * include/asm-i386/kexec-xen.h
+ *
+ * Created By: Horms <horms@verge.net.au>
+ */
+
+#ifndef _I386_KEXEC_XEN_H
+#define _I386_KEXEC_XEN_H
+
+#include <asm/ptrace.h>
+#include <asm/types.h>
+#include <xen/interface/arch-x86_32.h>
+
+static inline void crash_translate_regs(struct pt_regs *linux_regs,
+					struct cpu_user_regs *xen_regs)
+{
+	xen_regs->ebx    = linux_regs->ebx;
+	xen_regs->ecx    = linux_regs->ecx;
+	xen_regs->edx    = linux_regs->edx;
+	xen_regs->esi    = linux_regs->esi;
+	xen_regs->edi    = linux_regs->edi;
+	xen_regs->ebp    = linux_regs->ebp;
+	xen_regs->eax    = linux_regs->eax;
+	xen_regs->esp    = linux_regs->esp;
+	xen_regs->ss     = linux_regs->xss;
+	xen_regs->cs     = linux_regs->xcs;
+	xen_regs->ds     = linux_regs->xds;
+	xen_regs->es     = linux_regs->xes;
+	xen_regs->eflags = linux_regs->eflags;
+}
+
+#endif /* _I386_KEXEC_XEN_H */
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
+++ x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
@@ -368,5 +368,13 @@ HYPERVISOR_xenoprof_op(
 	return _hypercall2(int, xenoprof_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_kexec(
+	unsigned long op, unsigned int arg1, void * extra_args)
+{
+	return _hypercall3(int, kexec_op, op, arg1, extra_args);
+}
+
+
 
 #endif /* __HYPERCALL_H__ */
--- x/xen/arch/x86/crash.c
+++ x/xen/arch/x86/crash.c
@@ -21,6 +21,7 @@
 #include <xen/delay.h>
 #include <xen/perfc.h>
 #include <xen/kexec.h>
+#include <xen/sched.h>
 #include <public/xen.h>
 
 static int crashing_cpu;
@@ -169,6 +170,51 @@ static void nmi_shootdown_cpus(void)
 }
 #endif
 
+/* The cr3 for dom0 on each of its vcpus
+ * It is added as ELF_Prstatus prstatus.pr_reg[ELF_NGREG-1)], where
+ * prstatus is the data of the elf note, and ELF_NGREG was extended
+ * by one to allow extra space.
+ * This code runs after all cpus except the crashing one have
+ * been shutdown so as to avoid having to hold domlist_lock,
+ * as locking after a crash is playing with fire */
+void find_dom0_cr3(void)
+{
+	struct domain *d;
+	struct vcpu   *v;
+	uint32_t *buf;
+	uint32_t cr3;
+	Elf_Note note;
+
+	/* Don't need to grab domlist_lock as we are the only thing running */
+
+	/* No need to traverse domain_list, as dom0 is always first */
+	d = domain_list;
+	BUG_ON(d->domain_id);
+
+	for_each_vcpu ( d, v ) {
+		if ( test_bit(_VCPUF_down, &v->vcpu_flags) )
+			continue;
+		buf = (uint32_t *)per_cpu(crash_notes, v->processor);
+		if (!buf) /* XXX: Can this ever occur? */
+			continue;
+
+		memcpy(&note, buf, sizeof(Elf_Note));
+		buf += (sizeof(Elf_Note) +3)/4 + (note.namesz + 3)/4 +
+			(note.descsz + 3)/4;
+
+		/* XXX: This probably doesn't take into account shadow mode,
+		 * but that might not be a problem */
+		cr3 = pagetable_get_pfn(v->arch.guest_table);
+
+		buf = append_elf_note(buf, "Xen Domanin-0 CR3",
+			NT_XEN_DOM0_CR3, &cr3, 4);
+		final_note(buf);
+
+		printk("domain:%i vcpu:%u processor:%u cr3:%08x\n", 
+		       d->domain_id, v->vcpu_id, v->processor, cr3);
+	}
+}
+
 void machine_crash_shutdown(struct cpu_user_regs *regs)
 {
 	printk("machine_crash_shutdown: %d\n", smp_processor_id());
@@ -180,6 +226,7 @@ void machine_crash_shutdown(struct cpu_u
         disable_IO_APIC();
 #endif
 	crash_save_self(regs);
+	find_dom0_cr3();
 }
 
 /*
--- x/xen/arch/x86/x86_32/entry.S
+++ x/xen/arch/x86/x86_32/entry.S
@@ -656,6 +656,7 @@ ENTRY(hypercall_table)
         .long do_xenoprof_op
         .long do_event_channel_op
         .long do_physdev_op
+        .long do_kexec
         .rept NR_hypercalls-((.-hypercall_table)/4)
         .long do_ni_hypercall
         .endr
@@ -695,6 +696,7 @@ ENTRY(hypercall_args_table)
         .byte 2 /* do_xenoprof_op       */
         .byte 2 /* do_event_channel_op  */
         .byte 2 /* do_physdev_op        */
+        .byte 3 /* do_kexec             */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
--- x/xen/arch/x86/x86_32/machine_kexec.c
+++ x/xen/arch/x86/x86_32/machine_kexec.c
@@ -1,18 +1,31 @@
-/*
+/******************************************************************************
  * arch/x86/x86_32/machine_kexec.c
- * Handle transition of Linux booting another kernel
- *
- * Created By: Horms <horms@verge.net.au>
+ * 
+ * Created By: Horms
  *
- * Should be losely based on arch/i386/kernel/machine_kexec.c
+ * Based heavily on arch/i386/machine_kexec.c from Linux 2.6.16
  */
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
 #include <public/kexec.h>
+#include <asm/fixmap.h>
+#include <asm/processor.h>
+
+typedef asmlinkage void (*relocate_new_kernel_t)(
+                    unsigned long indirection_page,
+                    unsigned long reboot_code_buffer,
+                    unsigned long start_address,
+                    unsigned long page_table_a,
+                    unsigned long has_pae);
 
 void machine_kexec(xen_kexec_image_t *image)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    relocate_new_kernel_t rnk;
+
+    rnk = (relocate_new_kernel_t) fix_to_virt(FIX_KEXEC_PAGE);
+    (*rnk)(image->indirection_page, image->reboot_code_buffer, 
+           image->start_address, image->page_table_a[0],
+           (unsigned long)cpu_has_pae);
 }
 
 /*
--- x/xen/include/asm-x86/x86_32/elf.h
+++ x/xen/include/asm-x86/x86_32/elf.h
@@ -3,17 +3,39 @@
  * 
  * Created By: Horms
  *
- * Should pull be based on include/asm-i386/elf.h:ELF_CORE_COPY_REGS
- * from Linux 2.6.16
+ * Based heavily on include/asm-i386/elf.h and 
+ * include/asm-i386/system.h from Linux 2.6.16
  */
 
 #ifndef __X86_ELF_X86_32_H__
 #define __X86_ELF_X86_32_H__
 
-#include <xen/lib.h>       /* for printk() used in stub */
+/* XXX: Xen doesn't have orig_eax.  For kdump, on a dom0 crash, the values
+ * for the crashing CPU could be passed down from dom0, but is that
+ * necessary?
+ * Also, I'm not sure why fs and gs are derived from the CPU
+ * rather than regs */
 
-#define ELF_CORE_COPY_REGS(pr_reg, regs)                                \
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+#define ELF_CORE_COPY_REGS(pr_reg, regs) do {                           \
+    unsigned i; /* scratch for live %fs/%gs selector reads */          \
+    (pr_reg)[0] = (regs)->ebx;                                          \
+    (pr_reg)[1] = (regs)->ecx;                                          \
+    (pr_reg)[2] = (regs)->edx;                                          \
+    (pr_reg)[3] = (regs)->esi;                                          \
+    (pr_reg)[4] = (regs)->edi;                                          \
+    (pr_reg)[5] = (regs)->ebp;                                          \
+    (pr_reg)[6] = (regs)->eax;                                          \
+    (pr_reg)[7] = (regs)->ds;                                           \
+    (pr_reg)[8] = (regs)->es;                                           \
+    asm volatile("mov %%fs,%0":"=rm" (i)); (pr_reg)[9]  = i;            \
+    asm volatile("mov %%gs,%0":"=rm" (i)); (pr_reg)[10] = i;            \
+    (pr_reg)[11] = 0; /* Xen regs have no orig_eax */                   \
+    (pr_reg)[12] = (regs)->eip;                                         \
+    (pr_reg)[13] = (regs)->cs;                                          \
+    (pr_reg)[14] = (regs)->eflags;                                      \
+    (pr_reg)[15] = (regs)->esp;                                         \
+    (pr_reg)[16] = (regs)->ss;                                          \
+} while(0);
 
 #endif /* __X86_ELF_X86_32_H__ */
 
--- x/xen/include/asm-x86/x86_32/kexec.h
+++ x/xen/include/asm-x86/x86_32/kexec.h
@@ -3,39 +3,72 @@
  * 
  * Created By: Horms
  *
- * Should be based heavily on include/asm-i386/kexec.h from Linux 2.6.16
- *
+ * Based heavily on include/asm-i386/kexec.h from Linux 2.6.16
  */
 
-#ifndef __X86_32_KEXEC_H__
-#define __X86_32_KEXEC_H__
-
-#include <xen/lib.h>       /* for printk() used in stub */
-#include <xen/types.h>
-#include <public/xen.h>
+#ifndef __X86_KEXEC_X86_32_H__
+#define __X86_KEXEC_X86_32_H__
 
+/* CPU does not save ss and esp on stack if execution is already
+ * running in kernel mode at the time of NMI occurrence. This code
+ * fixes it.
+ */
 static void crash_fixup_ss_esp(struct cpu_user_regs *newregs,
-		    struct cpu_user_regs *oldregs)
+                    struct cpu_user_regs *oldregs)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
-    return;
-    crash_fixup_ss_esp(newregs, oldregs);
+    memcpy(newregs, oldregs, sizeof(*newregs));
+    newregs->esp = (unsigned long)&(oldregs->esp);
+    __asm__ __volatile__(
+            "xorl %%eax, %%eax\n\t"
+            "movw %%ss, %%ax\n\t"
+            :"=a"(newregs->ss));
 }
 
+/*
+ * This function is responsible for capturing register states if coming
+ * via panic otherwise just fix up the ss and esp if coming via kernel
+ * mode exception.
+ */
 static void crash_setup_regs(struct cpu_user_regs *newregs,
 			     struct cpu_user_regs *oldregs)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    if (oldregs)
+        crash_fixup_ss_esp(newregs, oldregs);
+    else {
+        __asm__ __volatile__("movl %%ebx,%0" : "=m"(newregs->ebx));
+        __asm__ __volatile__("movl %%ecx,%0" : "=m"(newregs->ecx));
+        __asm__ __volatile__("movl %%edx,%0" : "=m"(newregs->edx));
+        __asm__ __volatile__("movl %%esi,%0" : "=m"(newregs->esi));
+        __asm__ __volatile__("movl %%edi,%0" : "=m"(newregs->edi));
+        __asm__ __volatile__("movl %%ebp,%0" : "=m"(newregs->ebp));
+        __asm__ __volatile__("movl %%eax,%0" : "=m"(newregs->eax));
+        __asm__ __volatile__("movl %%esp,%0" : "=m"(newregs->esp));
+        __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(newregs->ss));
+        __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(newregs->cs));
+        __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(newregs->ds));
+        __asm__ __volatile__("movw %%es, %%ax;" :"=a"(newregs->es));
+        __asm__ __volatile__("pushfl; popl %0" :"=m"(newregs->eflags));
+
+        newregs->eip = (unsigned long)current_text_addr();
+    }
 }
 
+/*              
+ * From Linux 2.6.16's include/asm-i386/mach-xen/asm/ptrace.h
+ *
+ * user_mode_vm(regs) determines whether a register set came from user mode.
+ * This is true if V8086 mode was enabled OR if the register set was from
+ * protected mode with RPL-3 CS value.  This tricky test checks that with
+ * one comparison.  Many places in the kernel can bypass this full check
+ * if they have already ruled out V8086 mode, so user_mode(regs) can be used.
+ */
 static inline int user_mode(struct cpu_user_regs *regs)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
-    return -1;
+    return (regs->cs & 2) != 0;
 }
 
 
-#endif /* __X86_32_KEXEC_H__ */
+#endif /* __X86_KEXEC_X86_32_H__ */
 
 /*
  * Local variables:
--- /dev/null
+++ x/patches/linux-2.6.16.13/1-linux-2.6.16-kexec_page_table_a_i386.patch
@@ -0,0 +1,485 @@
+kexec: Avoid overwriting the current pgd (V2, i386)
+
+This patch upgrades the i386-specific kexec code to avoid overwriting the
+current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used
+to start a secondary kernel that dumps the memory of the previous kernel.
+
+The code introduces a new set of page tables called "page_table_a". These
+tables are used to provide an executable identity mapping without overwriting
+the current pgd. This updated version of the patch fixes a PAE bug and moves
+the segment handling code into relocate_kernel.S.
+
+Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
+---
+
+ The patch has been tested with regular kexec and CONFIG_CRASH_DUMP.
+ Both PAE and non-PAE configurations work well.
+ Applies on top of 2.6.16 and 2.6.17-rc4.
+
+ arch/i386/kernel/machine_kexec.c   |  249 ++++++++++++++++--------------------
+ arch/i386/kernel/relocate_kernel.S |   92 +++++++++++++
+ include/asm-i386/kexec.h           |   12 +
+ 3 files changed, 213 insertions(+), 140 deletions(-)
+
+--- x/arch/i386/kernel/machine_kexec.c
++++ x/arch/i386/kernel/machine_kexec.c
+@@ -2,6 +2,10 @@
+  * machine_kexec.c - handle transition of Linux booting another kernel
+  * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
+  *
++ * 2006-05-19 Magnus Damm <damm@opensource.se>:
++ * - rewrote identity map code to avoid overwriting current pgd
++ * - moved segment handling code into relocate_kernel.S
++ *
+  * This source code is licensed under the GNU General Public License,
+  * Version 2.  See the file COPYING for more details.
+  */
+@@ -19,123 +23,81 @@
+ #include <asm/desc.h>
+ #include <asm/system.h>
+ 
+-#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+-
+-#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+-#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+-#define L2_ATTR (_PAGE_PRESENT)
++#ifdef CONFIG_XEN
++#include <xen/interface/kexec.h>
++#endif
+ 
+-#define LEVEL0_SIZE (1UL << 12UL)
++#ifndef CONFIG_XEN
++typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)(
++					unsigned long indirection_page,
++					unsigned long reboot_code_buffer,
++					unsigned long start_address,
++					unsigned long page_table_a,
++					unsigned long has_pae) ATTRIB_NORET;
++#endif
+ 
+-#ifndef CONFIG_X86_PAE
+-#define LEVEL1_SIZE (1UL << 22UL)
+-static u32 pgtable_level1[1024] PAGE_ALIGNED;
++const extern unsigned char relocate_new_kernel[];
++#ifndef CONFIG_XEN
++extern void relocate_new_kernel_end(void);
++#endif
++const extern unsigned int relocate_new_kernel_size;
+ 
+-static void identity_map_page(unsigned long address)
++static int allocate_page_table_a(struct kimage *image)
+ {
+-	unsigned long level1_index, level2_index;
+-	u32 *pgtable_level2;
+-
+-	/* Find the current page table */
+-	pgtable_level2 = __va(read_cr3());
+-
+-	/* Find the indexes of the physical address to identity map */
+-	level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
+-	level2_index = address / LEVEL1_SIZE;
+-
+-	/* Identity map the page table entry */
+-	pgtable_level1[level1_index] = address | L0_ATTR;
+-	pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
++	struct kimage_arch *arch = &image->arch_data;
++	struct page *page;
++	int k = sizeof(arch->page_table_a) / sizeof(arch->page_table_a[0]);
++
++	for (; k > 0; k--) {
++		page = kimage_alloc_control_pages(image, 0);
++		if (!page)
++			return -ENOMEM;
++
++		clear_page(page_address(page));
++		arch->page_table_a[k - 1] = page;
++	}
+ 
+-	/* Flush the tlb so the new mapping takes effect.
+-	 * Global tlb entries are not flushed but that is not an issue.
+-	 */
+-	load_cr3(pgtable_level2);
++	return 0;
+ }
+ 
+-#else
+-#define LEVEL1_SIZE (1UL << 21UL)
+-#define LEVEL2_SIZE (1UL << 30UL)
+-static u64 pgtable_level1[512] PAGE_ALIGNED;
+-static u64 pgtable_level2[512] PAGE_ALIGNED;
+-
+-static void identity_map_page(unsigned long address)
+-{
+-	unsigned long level1_index, level2_index, level3_index;
+-	u64 *pgtable_level3;
+-
+-	/* Find the current page table */
+-	pgtable_level3 = __va(read_cr3());
+-
+-	/* Find the indexes of the physical address to identity map */
+-	level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
+-	level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE;
+-	level3_index = address / LEVEL2_SIZE;
+-
+-	/* Identity map the page table entry */
+-	pgtable_level1[level1_index] = address | L0_ATTR;
+-	pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
+-	set_64bit(&pgtable_level3[level3_index],
+-					       __pa(pgtable_level2) | L2_ATTR);
++/* workaround for include/asm-i386/pgtable-3level.h */
+ 
+-	/* Flush the tlb so the new mapping takes effect.
+-	 * Global tlb entries are not flushed but that is not an issue.
+-	 */
+-	load_cr3(pgtable_level3);
+-}
++#ifdef CONFIG_X86_PAE
++#undef pgd_present
++#define pgd_present(pgd) (pgd_val(pgd) & _PAGE_PRESENT)
++#define _PGD_ATTR _PAGE_PRESENT
++#else
++#define _PGD_ATTR _KERNPG_TABLE
+ #endif
+ 
+-static void set_idt(void *newidt, __u16 limit)
+-{
+-	struct Xgt_desc_struct curidt;
+-
+-	/* ia32 supports unaliged loads & stores */
+-	curidt.size    = limit;
+-	curidt.address = (unsigned long)newidt;
++#define pa_page(page) __pa(page_address(page))
+ 
+-	load_idt(&curidt);
+-};
+-
+-
+-static void set_gdt(void *newgdt, __u16 limit)
++static int create_mapping(struct page *root, struct page **pages, 
++			  unsigned long va, unsigned long pa)
+ {
+-	struct Xgt_desc_struct curgdt;
+-
+-	/* ia32 supports unaligned loads & stores */
+-	curgdt.size    = limit;
+-	curgdt.address = (unsigned long)newgdt;
++	pgd_t *pgd;
++	pud_t *pud;
++	pmd_t *pmd;
++	pte_t *pte;
++	int k = 0;
+ 
+-	load_gdt(&curgdt);
+-};
++	pgd = (pgd_t *)page_address(root) + pgd_index(va);
++	if (!pgd_present(*pgd))
++		set_pgd(pgd, __pgd(pa_page(pages[k++]) | _PGD_ATTR));
+ 
+-static void load_segments(void)
+-{
+-#define __STR(X) #X
+-#define STR(X) __STR(X)
++	pud = pud_offset(pgd, va);
++	if (!pud_present(*pud))
++		set_pud(pud, __pud(pa_page(pages[k++]) | _KERNPG_TABLE));
+ 
+-	__asm__ __volatile__ (
+-		"\tljmp $"STR(__KERNEL_CS)",$1f\n"
+-		"\t1:\n"
+-		"\tmovl $"STR(__KERNEL_DS)",%%eax\n"
+-		"\tmovl %%eax,%%ds\n"
+-		"\tmovl %%eax,%%es\n"
+-		"\tmovl %%eax,%%fs\n"
+-		"\tmovl %%eax,%%gs\n"
+-		"\tmovl %%eax,%%ss\n"
+-		::: "eax", "memory");
+-#undef STR
+-#undef __STR
+-}
++	pmd = pmd_offset(pud, va);
++	if (!pmd_present(*pmd))
++		set_pmd(pmd, __pmd(pa_page(pages[k++]) | _KERNPG_TABLE));
+ 
+-typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)(
+-					unsigned long indirection_page,
+-					unsigned long reboot_code_buffer,
+-					unsigned long start_address,
+-					unsigned int has_pae) ATTRIB_NORET;
++	pte = (pte_t *)page_address(pmd_page(*pmd)) + pte_index(va);
++	set_pte(pte, __pte(pa | _PAGE_KERNEL_EXEC));
+ 
+-const extern unsigned char relocate_new_kernel[];
+-extern void relocate_new_kernel_end(void);
+-const extern unsigned int relocate_new_kernel_size;
++	return k;
++}
+ 
+ /*
+  * A architecture hook called to validate the
+@@ -147,11 +109,38 @@ const extern unsigned int relocate_new_k
+  * Do what every setup is needed on image and the
+  * reboot code buffer to allow us to avoid allocations
+  * later.
+- *
+- * Currently nothing.
+  */
+ int machine_kexec_prepare(struct kimage *image)
+ {
++	void *control_page;
++	unsigned long pa;
++	int k;
++
++	memset(&image->arch_data, 0, sizeof(image->arch_data));
++
++	k = allocate_page_table_a(image);
++	if (k)
++		return k;
++
++	/* fill in control_page with assembly code */
++
++	control_page = page_address(image->control_code_page);
++	memcpy(control_page, relocate_new_kernel, relocate_new_kernel_size);
++
++	/* map the control_page at the virtual address of relocate_kernel.S */
++
++	pa = __pa(control_page);
++
++	k = create_mapping(image->arch_data.page_table_a[0], 
++			   &image->arch_data.page_table_a[1],
++			   (unsigned long)relocate_new_kernel, pa);
++
++	/* identity map the control_page */
++
++	create_mapping(image->arch_data.page_table_a[0], 
++		       &image->arch_data.page_table_a[k + 1],
++		       pa, pa);
++
+ 	return 0;
+ }
+ 
+@@ -163,6 +152,7 @@ void machine_kexec_cleanup(struct kimage
+ {
+ }
+ 
++#ifndef CONFIG_XEN
+ /*
+  * Do not allocate memory (or fail in any way) in machine_kexec().
+  * We are past the point of no return, committed to rebooting now.
+@@ -170,45 +160,30 @@ void machine_kexec_cleanup(struct kimage
+ NORET_TYPE void machine_kexec(struct kimage *image)
+ {
+ 	unsigned long page_list;
+-	unsigned long reboot_code_buffer;
+-
++	unsigned long control_code;
++	unsigned long page_table_a;
+ 	relocate_new_kernel_t rnk;
+ 
+-	/* Interrupts aren't acceptable while we reboot */
+-	local_irq_disable();
+-
+-	/* Compute some offsets */
+-	reboot_code_buffer = page_to_pfn(image->control_code_page)
+-								<< PAGE_SHIFT;
+ 	page_list = image->head;
++	control_code = __pa(page_address(image->control_code_page));
++	page_table_a = __pa(page_address(image->arch_data.page_table_a[0]));
+ 
+-	/* Set up an identity mapping for the reboot_code_buffer */
+-	identity_map_page(reboot_code_buffer);
++	/* now call it */
++	rnk = (relocate_new_kernel_t) relocate_new_kernel;
++	(*rnk)(page_list, control_code, image->start, 
++	       page_table_a, (unsigned long)cpu_has_pae);
++}
++#endif
+ 
+-	/* copy it out */
+-	memcpy((void *)reboot_code_buffer, relocate_new_kernel,
+-						relocate_new_kernel_size);
+-
+-	/* The segment registers are funny things, they are
+-	 * automatically loaded from a table, in memory wherever you
+-	 * set them to a specific selector, but this table is never
+-	 * accessed again you set the segment to a different selector.
+-	 *
+-	 * The more common model is are caches where the behide
+-	 * the scenes work is done, but is also dropped at arbitrary
+-	 * times.
+-	 *
+-	 * I take advantage of this here by force loading the
+-	 * segments, before I zap the gdt with an invalid value.
+-	 */
+-	load_segments();
+-	/* The gdt & idt are now invalid.
+-	 * If you want to load them you must set up your own idt & gdt.
+-	 */
+-	set_gdt(phys_to_virt(0),0);
+-	set_idt(phys_to_virt(0),0);
++#ifdef CONFIG_XEN
++void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
++{
++	struct kimage_arch *arch = &image->arch_data;
++	int k, n = sizeof(arch->page_table_a) / sizeof(arch->page_table_a[0]);
+ 
+-	/* now call it */
+-	rnk = (relocate_new_kernel_t) reboot_code_buffer;
+-	(*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae);
++	for (k = 0; k < n; k++)
++		xki->page_table_a[k] =
++			pfn_to_mfn(page_to_pfn(arch->page_table_a[k]))
++				<< PAGE_SHIFT;
+ }
++#endif
+--- x/arch/i386/kernel/relocate_kernel.S
++++ x/arch/i386/kernel/relocate_kernel.S
+@@ -2,12 +2,20 @@
+  * relocate_kernel.S - put the kernel image in place to boot
+  * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
+  *
++ * 2006-05-19 Magnus Damm <damm@opensource.se>:
++ * - moved segment handling code from machine_kexec.c
++ * - gdt tables stolen from arch/i386/boot/setup.S
++ *
+  * This source code is licensed under the GNU General Public License,
+  * Version 2.  See the file COPYING for more details.
+  */
+ 
+ #include <linux/linkage.h>
++#include <asm/page.h>
+ 
++.text
++.align (1 << PAGE_SHIFT)
++	
+ 	/*
+ 	 * Must be relocatable PIC code callable as a C function, that once
+ 	 * it starts can not use the previous processes stack.
+@@ -18,18 +26,68 @@ relocate_new_kernel:
+ 	movl  4(%esp), %ebx /* page_list */
+ 	movl  8(%esp), %ebp /* reboot_code_buffer */
+ 	movl  12(%esp), %edx /* start address */
+-	movl  16(%esp), %ecx /* cpu_has_pae */
++	movl  16(%esp), %edi /* page_table_a */
++	movl  20(%esp), %ecx /* cpu_has_pae */
+ 
+ 	/* zero out flags, and disable interrupts */
+ 	pushl $0
+ 	popfl
+ 
++	/* switch to page_table_a */
++	movl	%edi, %eax
++	movl	%eax, %cr3
++
++	/* setup idt */
++
++	movl	%ebp, %eax
++	addl	$(idt_48 - relocate_new_kernel), %eax
++	lidtl	(%eax)
++
++	/* setup gdt */
++
++	movl	%ebp, %eax
++	addl	$(gdt - relocate_new_kernel), %eax
++	movl	%ebp, %esi
++	addl	$((gdt_48 - relocate_new_kernel) + 2), %esi
++	movl	%eax, (%esi)
++	
++	movl	%ebp, %eax
++	addl	$(gdt_48 - relocate_new_kernel), %eax
++	lgdtl	(%eax)
++
++	/* setup data segment registers */
++	
++	mov	$(gdt_ds - gdt), %eax
++	mov	%eax, %ds
++	mov	%eax, %es
++	mov	%eax, %fs
++	mov	%eax, %gs
++	mov	%eax, %ss
++
+ 	/* set a new stack at the bottom of our page... */
+ 	lea   4096(%ebp), %esp
+ 
++	/* load new code segment */
++
++	movl	%ebp, %esi
++	xorl	%eax, %eax
++	pushl	%eax
++	pushl	%esi
++	pushl	%eax
++	
++	movl	$(gdt_cs - gdt), %eax
++	pushl	%eax
++	
++	movl	%ebp, %eax
++	addl	$(identity_mapped - relocate_new_kernel),%eax
++	pushl	%eax
++	iretl
++
++identity_mapped:	
++
+ 	/* store the parameters back on the stack */
+ 	pushl   %edx /* store the start address */
+-
++	
+ 	/* Set cr0 to a known state:
+ 	 * 31 0 == Paging disabled
+ 	 * 18 0 == Alignment check disabled
+@@ -113,6 +171,36 @@ relocate_new_kernel:
+ 	xorl    %edi, %edi
+ 	xorl    %ebp, %ebp
+ 	ret
++
++	.align	16
++gdt:
++	.fill	1,8,0
++
++gdt_cs:	
++	.word	0xFFFF				# 4Gb - (0x100000*0x1000 = 4Gb)
++	.word	0				# base address = 0
++	.word	0x9A00				# code read/exec
++	.word	0x00CF				# granularity = 4096, 386
++						#  (+5th nibble of limit)
++gdt_ds:
++	.word	0xFFFF				# 4Gb - (0x100000*0x1000 = 4Gb)
++	.word	0				# base address = 0
++	.word	0x9200				# data read/write
++	.word	0x00CF				# granularity = 4096, 386
++						#  (+5th nibble of limit)
++gdt_end:
++	.align	4
++	
++	.word	0				# alignment byte
++idt_48:
++	.word	0				# idt limit = 0
++	.word	0, 0				# idt base = 0L
++
++	.word	0				# alignment byte
++gdt_48:
++	.word	gdt_end - gdt - 1		# gdt limit
++	.word	0, 0				# gdt base (filled in later)
++	
+ relocate_new_kernel_end:
+ 
+ 	.globl relocate_new_kernel_size
+--- x/include/asm-i386/kexec.h
++++ x/include/asm-i386/kexec.h
+@@ -29,7 +29,17 @@
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
+-struct kimage_arch {};
++struct kimage_arch {
++       /* page_table_a[] holds enough pages to create a new page table
++        * that maps the control page twice..
++        */
++
++#if defined(CONFIG_X86_PAE)
++       struct page *page_table_a[5]; /* (2 * pte) + (2 * pmd) + pgd */
++#else
++       struct page *page_table_a[3]; /* (2 * pte) + pgd */
++#endif
++};
+ 
+ /* CPU does not save ss and esp on stack if execution is already
+  * running in kernel mode at the time of NMI occurrence. This code

[-- Attachment #6: 51.2.1.2-kexec-x86_64-upstream.patch --]
[-- Type: text/plain, Size: 29884 bytes --]

kexec: x86_64

This is the first x86_64 release of kexec for xen/dom0. The code is in an
early phase, but it compiles and kexec:ing into a Linux kernel seems to work 
well. Rebooting into a new kernel may work using kdump too, but register
saving support is still missing.

The x86 component is a prerequisite for this patch.

Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>
Signed-Off-By: Horms <horms@verge.net.au>

 buildconfigs/linux-defconfig_xen_x86_64                                    |    1 
 linux-2.6-xen-sparse/arch/x86_64/Kconfig                                   |    2 
 linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile                           |    6 
 linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c                        |   26 
 linux-2.6-xen-sparse/include/asm-x86_64/kexec-xen.h                        |   30 
 linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h           |    7 
 linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h              |    2 
 patches/linux-2.6.16.13/2-linux-2.6.16-kexec_page_table_a_x86_64.patch     |  421 ++++++++++
 patches/linux-2.6.16.13/3-linux-2.6.16-kexec_page_table_a_x86_64-xen.patch |  151 +++
 xen/arch/x86/x86_64/entry.S                                                |    2 
 xen/arch/x86/x86_64/machine_kexec.c                                        |   21 
 xen/include/asm-x86/x86_64/elf.h                                           |   48 +
 xen/include/asm-x86/x86_64/kexec.h                                         |   33 
 13 files changed, 736 insertions(+), 14 deletions(-)

--- x/buildconfigs/linux-defconfig_xen_x86_64
+++ x/buildconfigs/linux-defconfig_xen_x86_64
@@ -139,6 +139,7 @@ CONFIG_SWIOTLB=y
 CONFIG_PHYSICAL_START=0x100000
 CONFIG_SECCOMP=y
 CONFIG_HZ_100=y
+CONFIG_KEXEC=y
 # CONFIG_HZ_250 is not set
 # CONFIG_HZ_1000 is not set
 CONFIG_HZ=100
--- x/linux-2.6-xen-sparse/arch/x86_64/Kconfig
+++ x/linux-2.6-xen-sparse/arch/x86_64/Kconfig
@@ -433,7 +433,7 @@ config X86_MCE_AMD
 
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && !X86_64_XEN
+	depends on EXPERIMENTAL && !XEN_UNPRIVILEGED_GUEST
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
--- x/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile
+++ x/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile
@@ -23,7 +23,11 @@ obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o  
 obj-$(CONFIG_X86_XEN_GENAPIC)	+= genapic.o genapic_xen.o
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o mpparse.o \
 		genapic.o genapic_cluster.o genapic_flat.o
+ifdef CONFIG_XEN
+obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o
+else
 obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o crash.o
+endif
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
 obj-$(CONFIG_SOFTWARE_SUSPEND)	+= suspend.o
 obj-$(CONFIG_ACPI_SLEEP)	+= suspend.o
@@ -59,7 +63,7 @@ pci-dma-y			+= ../../i386/kernel/pci-dma
 microcode-$(subst m,y,$(CONFIG_MICROCODE))  := ../../i386/kernel/microcode-xen.o
 quirks-y			:= ../../i386/kernel/quirks-xen.o
 
-n-obj-xen := i8259.o reboot.o i8237.o smpboot.o trampoline.o
+n-obj-xen := i8259.o reboot.o i8237.o smpboot.o trampoline.o crash.o
 
 include $(srctree)/scripts/Makefile.xen
 
--- x/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c
+++ x/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c
@@ -79,6 +79,10 @@
 #include <asm/mach-xen/setup_arch_post.h>
 #include <xen/interface/memory.h>
 
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
 extern unsigned long start_pfn;
 extern struct edid_info edid_info;
 
@@ -446,6 +450,7 @@ static __init void parse_cmdline_early (
 		 * after a kernel panic.
 		 */
 		else if (!memcmp(from, "crashkernel=", 12)) {
+#ifndef CONFIG_XEN
 			unsigned long size, base;
 			size = memparse(from+12, &from);
 			if (*from == '@') {
@@ -456,6 +461,10 @@ static __init void parse_cmdline_early (
 				crashk_res.start = base;
 				crashk_res.end   = base + size - 1;
 			}
+#else
+			printk("Ignoring crashkernel command line, "
+			       "parameter will be supplied by xen\n");
+#endif
 		}
 #endif
 
@@ -801,10 +810,23 @@ void __init setup_arch(char **cmdline_p)
 #endif
 #endif	/* !CONFIG_XEN */
 #ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
 	if (crashk_res.start != crashk_res.end) {
 		reserve_bootmem(crashk_res.start,
 			crashk_res.end - crashk_res.start + 1);
 	}
+#else
+	{
+		xen_kexec_reserve_t reservation;
+		BUG_ON(HYPERVISOR_kexec(KEXEC_CMD_kexec_reserve, 0,
+					&reservation));
+		if (reservation.size) {
+			crashk_res.start = reservation.start;
+			crashk_res.end = reservation.start + 
+				reservation.size - 1;
+		}
+	}
+#endif
 #endif
 
 	paging_init();
@@ -950,6 +972,10 @@ void __init setup_arch(char **cmdline_p)
 	iommu_hole_init();
 #endif
 
+#ifdef CONFIG_KEXEC
+	request_resource(&ioport_resource, &crashk_res);
+#endif
+
 #ifdef CONFIG_XEN
 	{
 		struct physdev_set_iopl set_iopl;
--- /dev/null
+++ x/linux-2.6-xen-sparse/include/asm-x86_64/kexec-xen.h
@@ -0,0 +1,30 @@
+/*
+ * include/asm-x86_64/kexec-xen.h
+ *
+ * Created By: Horms <horms@verge.net.au>
+ */
+
+#ifndef _X86_64_KEXEC_XEN_H
+#define _X86_64_KEXEC_XEN_H
+
+#include <asm/ptrace.h>
+#include <asm/types.h>
+#include <xen/interface/arch-x86_64.h>
+
+static inline void crash_translate_regs(struct pt_regs *linux_regs,
+					struct cpu_user_regs *xen_regs)
+{
+#warning Implement me!
+}
+
+#endif /* _X86_64_KEXEC_XEN_H */
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- x/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h
+++ x/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h
@@ -369,4 +369,11 @@ HYPERVISOR_xenoprof_op(
 	return _hypercall2(int, xenoprof_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_kexec(
+	unsigned long op, unsigned int arg1, void * extra_args)
+{
+	return _hypercall3(int, kexec_op, op, arg1, extra_args);
+}
+
 #endif /* __HYPERCALL_H__ */
--- x/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h
+++ x/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h
@@ -90,6 +90,8 @@ extern unsigned long profile_pc(struct p
 #define profile_pc(regs) instruction_pointer(regs)
 #endif
 
+#include <linux/compiler.h>
+
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
 
 struct task_struct;
--- x/xen/arch/x86/x86_64/entry.S
+++ x/xen/arch/x86/x86_64/entry.S
@@ -566,6 +566,7 @@ ENTRY(hypercall_table)
         .quad do_xenoprof_op
         .quad do_event_channel_op
         .quad do_physdev_op
+        .quad do_kexec
         .rept NR_hypercalls-((.-hypercall_table)/8)
         .quad do_ni_hypercall
         .endr
@@ -605,6 +606,7 @@ ENTRY(hypercall_args_table)
         .byte 2 /* do_xenoprof_op       */
         .byte 2 /* do_event_channel_op  */
         .byte 2 /* do_physdev_op        */
+        .byte 3 /* do_kexec             */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
--- x/xen/arch/x86/x86_64/machine_kexec.c
+++ x/xen/arch/x86/x86_64/machine_kexec.c
@@ -4,18 +4,29 @@
  *
  * Created By: Horms <horms@verge.net.au>
  *
- * Should be losely based on arch/x86_64/kernel/machine_kexec.c
+ * Loosely based on arch/x86_64/kernel/machine_kexec.c
  */
-
-#include <xen/lib.h>       /* for printk() used in stub */
+  
 #include <xen/types.h>
 #include <public/kexec.h>
+#include <asm/fixmap.h>
+
+typedef void (*relocate_new_kernel_t)(unsigned long indirection_page,
+                                      unsigned long control_code_buffer,
+                                      unsigned long start_address,
+                                      unsigned long page_table_a,
+                                      unsigned long page_table_b);
 
 void machine_kexec(xen_kexec_image_t *image)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
-}
+    relocate_new_kernel_t rnk;
 
+    rnk = (relocate_new_kernel_t) fix_to_virt(FIX_KEXEC_PAGE);
+    (*rnk)(image->indirection_page, image->reboot_code_buffer, 
+           image->start_address, image->page_table_a[0],
+           image->page_table_b);
+  }
+  
 /*
  * Local variables:
  * mode: C
--- x/xen/include/asm-x86/x86_64/elf.h
+++ x/xen/include/asm-x86/x86_64/elf.h
@@ -3,17 +3,55 @@
  * 
  * Created By: Horms
  *
- * Should pull be based on include/asm-x86_64/elf.h:ELF_CORE_COPY_REGS
- * from Linux 2.6.16
+ * Based on include/asm-x86_64/elf.h:ELF_CORE_COPY_REGS from Linux 2.6.16
  */
 
 #ifndef __X86_ELF_X86_64_H__
 #define __X86_ELF_X86_64_H__
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/lib.h>
 
-#define ELF_CORE_COPY_REGS(pr_reg, regs)                                \
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+#include <xen/lib.h>
+
+/* XXX: Xen doesn't have orig_rax, so slot 15 is filled with 0.
+ *      Xen doesn't have threads, so fs and gs are read from the CPU and
+ *      thus values 21 and 22 are just duplicates of 25 and 26
+ *      respectively.  All these values could be passed from dom0 in the
+ *      case of it crashing, but does that help?
+ *
+ *      Lastly, I'm not sure why ds, es, fs and gs are read from
+ *      the CPU rather than regs, but linux does this */
+
+#define ELF_CORE_COPY_REGS(pr_reg, regs)  do { \
+	unsigned v;						\
+	(pr_reg)[0] = (regs)->r15;				\
+	(pr_reg)[1] = (regs)->r14;				\
+	(pr_reg)[2] = (regs)->r13;				\
+	(pr_reg)[3] = (regs)->r12;				\
+	(pr_reg)[4] = (regs)->rbp;				\
+	(pr_reg)[5] = (regs)->rbx;				\
+	(pr_reg)[6] = (regs)->r11;				\
+	(pr_reg)[7] = (regs)->r10;				\
+	(pr_reg)[8] = (regs)->r9;				\
+	(pr_reg)[9] = (regs)->r8;				\
+	(pr_reg)[10] = (regs)->rax;				\
+	(pr_reg)[11] = (regs)->rcx;				\
+	(pr_reg)[12] = (regs)->rdx;				\
+	(pr_reg)[13] = (regs)->rsi;				\
+	(pr_reg)[14] = (regs)->rdi;				\
+	(pr_reg)[15] = 0; /* orig_rax: not kept by Xen */	\
+	(pr_reg)[16] = (regs)->rip;			\
+	(pr_reg)[17] = (regs)->cs;			\
+	(pr_reg)[18] = (regs)->eflags;			\
+	(pr_reg)[19] = (regs)->rsp;			\
+	(pr_reg)[20] = (regs)->ss;			\
+	asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[21] = v;	\
+	asm("movl %%gs,%0" : "=r" (v)); (pr_reg)[22] = v;	\
+	asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v;	\
+	asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v;	\
+	asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v;	\
+	asm("movl %%gs,%0" : "=r" (v)); (pr_reg)[26] = v;	\
+} while(0);
 
 #endif /* __X86_ELF_X86_64_H__ */
 
--- x/xen/include/asm-x86/x86_64/kexec.h
+++ x/xen/include/asm-x86/x86_64/kexec.h
@@ -10,14 +10,43 @@
 #ifndef __X86_64_KEXEC_H__
 #define __X86_64_KEXEC_H__
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/lib.h>
 #include <xen/types.h>
 #include <public/xen.h>
 
+/*
+ * Save the registers of the cpu on which the panic occurred in
+ * crash_kexec, so that a valid sp is recorded. The registers of other
+ * cpus will be saved in machine_crash_shutdown while shooting them down.
+ */
 static void crash_setup_regs(struct cpu_user_regs *newregs,
 			     struct cpu_user_regs *oldregs)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+	if (oldregs)
+		memcpy(newregs, oldregs, sizeof(*newregs));
+	else {
+		__asm__ __volatile__("movq %%rbx,%0" : "=m"(newregs->rbx));
+		__asm__ __volatile__("movq %%rcx,%0" : "=m"(newregs->rcx));
+		__asm__ __volatile__("movq %%rdx,%0" : "=m"(newregs->rdx));
+		__asm__ __volatile__("movq %%rsi,%0" : "=m"(newregs->rsi));
+		__asm__ __volatile__("movq %%rdi,%0" : "=m"(newregs->rdi));
+		__asm__ __volatile__("movq %%rbp,%0" : "=m"(newregs->rbp));
+		__asm__ __volatile__("movq %%rax,%0" : "=m"(newregs->rax));
+		__asm__ __volatile__("movq %%rsp,%0" : "=m"(newregs->rsp));
+		__asm__ __volatile__("movq %%r8,%0" : "=m"(newregs->r8));
+		__asm__ __volatile__("movq %%r9,%0" : "=m"(newregs->r9));
+		__asm__ __volatile__("movq %%r10,%0" : "=m"(newregs->r10));
+		__asm__ __volatile__("movq %%r11,%0" : "=m"(newregs->r11));
+		__asm__ __volatile__("movq %%r12,%0" : "=m"(newregs->r12));
+		__asm__ __volatile__("movq %%r13,%0" : "=m"(newregs->r13));
+		__asm__ __volatile__("movq %%r14,%0" : "=m"(newregs->r14));
+		__asm__ __volatile__("movq %%r15,%0" : "=m"(newregs->r15));
+		__asm__ __volatile__("movl %%ss, %%eax;" :"=a"(newregs->ss));
+		__asm__ __volatile__("movl %%cs, %%eax;" :"=a"(newregs->cs));
+		__asm__ __volatile__("pushfq; popq %0" :"=m"(newregs->eflags));
+
+		newregs->rip = (unsigned long)current_text_addr();
+	}
 }
 
 #endif /* __X86_64_KEXEC_H__ */
--- /dev/null
+++ x/patches/linux-2.6.16.13/2-linux-2.6.16-kexec_page_table_a_x86_64.patch
@@ -0,0 +1,421 @@
+kexec: Avoid overwriting the current pgd (V2, x86_64)
+
+This patch upgrades the x86_64-specific kexec code to avoid overwriting the
+current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used
+to start a secondary kernel that dumps the memory of the previous kernel.
+
+The code introduces a new set of page tables called "page_table_a". These
+tables are used to provide an executable identity mapping without overwriting
+the current pgd. The already existing page table is renamed to "page_table_b".
+
+KEXEC_CONTROL_CODE_SIZE is changed into a single page. This updated version of
+the patch also moves the segment handling code into relocate_kernel.S.
+
+Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
+---
+
+ The patch has been tested with regular kexec and CONFIG_CRASH_DUMP.
+ Applies on top of 2.6.16 and 2.6.17-rc4.
+
+ arch/x86_64/kernel/machine_kexec.c   |  193 +++++++++++++++++-----------------
+ arch/x86_64/kernel/relocate_kernel.S |   84 +++++++++++++-
+ include/asm-x86_64/kexec.h           |   15 ++
+ 3 files changed, 189 insertions(+), 103 deletions(-)
+
+--- x/arch/x86_64/kernel/machine_kexec.c
++++ x/arch/x86_64/kernel/machine_kexec.c
+@@ -2,6 +2,10 @@
+  * machine_kexec.c - handle transition of Linux booting another kernel
+  * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
+  *
++ * 2006-05-19 Magnus Damm <damm@opensource.se>:
++ * - rewrote identity map code to avoid overwriting current pgd
++ * - moved segment handling code into relocate_kernel.S
++ *
+  * This source code is licensed under the GNU General Public License,
+  * Version 2.  See the file COPYING for more details.
+  */
+@@ -96,81 +100,110 @@ out:
+ }
+ 
+ 
+-static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
++static int create_page_table_b(struct kimage *image)
+ {
+-	pgd_t *level4p;
+-	level4p = (pgd_t *)__va(start_pgtable);
+- 	return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
+-}
++	struct kimage_arch *arch = &image->arch_data;
+ 
+-static void set_idt(void *newidt, u16 limit)
+-{
+-	struct desc_ptr curidt;
++	arch->page_table_b = kimage_alloc_control_pages(image, 0);
+ 
+-	/* x86-64 supports unaliged loads & stores */
+-	curidt.size    = limit;
+-	curidt.address = (unsigned long)newidt;
++	if (!arch->page_table_b)
++		return -ENOMEM;
+ 
+-	__asm__ __volatile__ (
+-		"lidtq %0\n"
+-		: : "m" (curidt)
+-		);
+-};
++ 	return init_level4_page(image, page_address(arch->page_table_b),
++				0, end_pfn << PAGE_SHIFT);
++}
+ 
++typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
++					unsigned long control_code_buffer,
++					unsigned long start_address,
++					unsigned long page_table_a,
++					unsigned long page_table_b) ATTRIB_NORET;
++
++const extern unsigned char relocate_new_kernel[];
++const extern unsigned long relocate_new_kernel_size;
+ 
+-static void set_gdt(void *newgdt, u16 limit)
++static int allocate_page_table_a(struct kimage *image)
+ {
+-	struct desc_ptr curgdt;
++	struct kimage_arch *arch = &image->arch_data;
++	struct page *page;
++	int k = sizeof(arch->page_table_a) / sizeof(arch->page_table_a[0]);
+ 
+-	/* x86-64 supports unaligned loads & stores */
+-	curgdt.size    = limit;
+-	curgdt.address = (unsigned long)newgdt;
++	for (; k > 0; k--) {
++		page = kimage_alloc_control_pages(image, 0);
++		if (!page)
++			return -ENOMEM;
+ 
+-	__asm__ __volatile__ (
+-		"lgdtq %0\n"
+-		: : "m" (curgdt)
+-		);
+-};
++		clear_page(page_address(page));
++		arch->page_table_a[k - 1] = page;
++	}
+ 
+-static void load_segments(void)
+-{
+-	__asm__ __volatile__ (
+-		"\tmovl %0,%%ds\n"
+-		"\tmovl %0,%%es\n"
+-		"\tmovl %0,%%ss\n"
+-		"\tmovl %0,%%fs\n"
+-		"\tmovl %0,%%gs\n"
+-		: : "a" (__KERNEL_DS) : "memory"
+-		);
++	return 0;
+ }
+ 
+-typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
+-					unsigned long control_code_buffer,
+-					unsigned long start_address,
+-					unsigned long pgtable) ATTRIB_NORET;
++#define _PAGE_KERNEL_EXEC __PAGE_KERNEL_EXEC
++#define pa_page(page) __pa_symbol(page_address(page)) /* __pa() miscompiles */
+ 
+-const extern unsigned char relocate_new_kernel[];
+-const extern unsigned long relocate_new_kernel_size;
++static int create_mapping(struct page *root, struct page **pages, 
++			  unsigned long va, unsigned long pa)
++{
++	pgd_t *pgd;
++	pud_t *pud;
++	pmd_t *pmd;
++	pte_t *pte;
++	int k = 0;
++
++	pgd = (pgd_t *)page_address(root) + pgd_index(va);
++	if (!pgd_present(*pgd))
++		set_pgd(pgd, __pgd(pa_page(pages[k++]) | _KERNPG_TABLE));
++
++	pud = pud_offset(pgd, va);
++	if (!pud_present(*pud))
++		set_pud(pud, __pud(pa_page(pages[k++]) | _KERNPG_TABLE));
++
++	pmd = pmd_offset(pud, va);
++	if (!pmd_present(*pmd))
++		set_pmd(pmd, __pmd(pa_page(pages[k++]) | _KERNPG_TABLE));
++
++	pte = (pte_t *)page_address(pmd_page(*pmd)) + pte_index(va);
++	set_pte(pte, __pte(pa | _PAGE_KERNEL_EXEC));
++
++	return k;
++}
+ 
+ int machine_kexec_prepare(struct kimage *image)
+ {
+-	unsigned long start_pgtable, control_code_buffer;
+-	int result;
++	void *control_page;
++	unsigned long pa;
++	int k;
+ 
+-	/* Calculate the offsets */
+-	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+-	control_code_buffer = start_pgtable + PAGE_SIZE;
+-
+-	/* Setup the identity mapped 64bit page table */
+-	result = init_pgtable(image, start_pgtable);
+-	if (result)
+-		return result;
+-
+-	/* Place the code in the reboot code buffer */
+-	memcpy(__va(control_code_buffer), relocate_new_kernel,
+-						relocate_new_kernel_size);
++	memset(&image->arch_data, 0, sizeof(image->arch_data));
+ 
+-	return 0;
++	k = allocate_page_table_a(image);
++	if (k)
++		return k;
++
++	/* fill in control_page with assembly code */
++
++	control_page = page_address(image->control_code_page);
++	memcpy(control_page, relocate_new_kernel, relocate_new_kernel_size);
++
++	/* map the control_page at the virtual address of relocate_kernel.S */
++
++	pa = __pa(control_page);
++
++	k = create_mapping(image->arch_data.page_table_a[0], 
++			   &image->arch_data.page_table_a[1],
++			   (unsigned long)relocate_new_kernel, pa);
++
++	/* identity map the control_page */
++
++	create_mapping(image->arch_data.page_table_a[0], 
++		       &image->arch_data.page_table_a[k + 1],
++		       pa, pa);
++
++	/* create identity mapped page table aka page_table_b */
++
++	return create_page_table_b(image);
+ }
+ 
+ void machine_kexec_cleanup(struct kimage *image)
+@@ -185,47 +218,17 @@ void machine_kexec_cleanup(struct kimage
+ NORET_TYPE void machine_kexec(struct kimage *image)
+ {
+ 	unsigned long page_list;
+-	unsigned long control_code_buffer;
+-	unsigned long start_pgtable;
++	unsigned long control_code;
++	unsigned long page_table_a;
++	unsigned long page_table_b;
+ 	relocate_new_kernel_t rnk;
+ 
+-	/* Interrupts aren't acceptable while we reboot */
+-	local_irq_disable();
+-
+-	/* Calculate the offsets */
+ 	page_list = image->head;
+-	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+-	control_code_buffer = start_pgtable + PAGE_SIZE;
++	control_code = __pa(page_address(image->control_code_page));
++	page_table_a = __pa(page_address(image->arch_data.page_table_a[0]));
++	page_table_b = __pa(page_address(image->arch_data.page_table_b));
+ 
+-	/* Set the low half of the page table to my identity mapped
+-	 * page table for kexec.  Leave the high half pointing at the
+-	 * kernel pages.   Don't bother to flush the global pages
+-	 * as that will happen when I fully switch to my identity mapped
+-	 * page table anyway.
+-	 */
+-	memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2);
+-	__flush_tlb();
+-
+-
+-	/* The segment registers are funny things, they are
+-	 * automatically loaded from a table, in memory wherever you
+-	 * set them to a specific selector, but this table is never
+-	 * accessed again unless you set the segment to a different selector.
+-	 *
+-	 * The more common model are caches where the behide
+-	 * the scenes work is done, but is also dropped at arbitrary
+-	 * times.
+-	 *
+-	 * I take advantage of this here by force loading the
+-	 * segments, before I zap the gdt with an invalid value.
+-	 */
+-	load_segments();
+-	/* The gdt & idt are now invalid.
+-	 * If you want to load them you must set up your own idt & gdt.
+-	 */
+-	set_gdt(phys_to_virt(0),0);
+-	set_idt(phys_to_virt(0),0);
+ 	/* now call it */
+-	rnk = (relocate_new_kernel_t) control_code_buffer;
+-	(*rnk)(page_list, control_code_buffer, image->start, start_pgtable);
++	rnk = (relocate_new_kernel_t) relocate_new_kernel;
++	(*rnk)(page_list, control_code, image->start, page_table_a, page_table_b);
+ }
+--- x/arch/x86_64/kernel/relocate_kernel.S
++++ x/arch/x86_64/kernel/relocate_kernel.S
+@@ -2,11 +2,18 @@
+  * relocate_kernel.S - put the kernel image in place to boot
+  * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
+  *
++ * 2006-05-19 Magnus Damm <damm@opensource.se>:
++ * - moved segment handling code from machine_kexec.c
++ *
+  * This source code is licensed under the GNU General Public License,
+  * Version 2.  See the file COPYING for more details.
+  */
+ 
+ #include <linux/linkage.h>
++#include <asm/page.h>
++
++.text
++.align (1 << PAGE_SHIFT)
+ 
+ 	/*
+ 	 * Must be relocatable PIC code callable as a C function, that once
+@@ -18,21 +25,69 @@ relocate_new_kernel:
+ 	/* %rdi page_list
+ 	 * %rsi reboot_code_buffer
+ 	 * %rdx start address
+-	 * %rcx page_table
+-	 * %r8  arg5
++	 * %rcx page_table_a
++	 * %r8  page_table_b
+ 	 * %r9  arg6
+ 	 */
+-
++	
+ 	/* zero out flags, and disable interrupts */
+ 	pushq $0
+ 	popfq
+ 
++	/* switch to page_table_a */
++	movq    %rcx, %cr3
++
++	/* setup idt */
++
++	movq	%rsi, %rax
++	addq	$(idt_48 - relocate_new_kernel), %rax
++	lidtq	(%rax)
++
++	/* setup gdt */
++
++	movq	%rsi, %rax
++	addq	$(gdt - relocate_new_kernel), %rax
++	movq	%rsi, %r9
++	addq	$((gdt_48 - relocate_new_kernel) + 2), %r9
++	movq	%rax, (%r9)
++	
++	movq	%rsi, %rax
++	addq	$(gdt_48 - relocate_new_kernel), %rax
++	lgdtq	(%rax)
++
++	/* setup data segment registers */
++
++	xorl	%eax,%eax
++	movl	%eax, %ds
++	movl	%eax, %es
++	movl	%eax, %fs
++	movl	%eax, %gs
++	movl	%eax, %ss
++
+ 	/* set a new stack at the bottom of our page... */
+ 	lea   4096(%rsi), %rsp
+ 
++	/* load new code segment */
++
++	movq	%rsp, %rcx
++	xorq	%rax, %rax
++	pushq	%rax                                              /* SS */
++	pushq	%rcx                                              /* RSP */
++	pushq	%rax                                              /* RFLAGS */
++
++	movq	$(gdt_code - gdt), %rax
++	pushq	%rax                                              /* CS */
++
++	movq	%rsi, %rax
++	addq	$(identity_mapped - relocate_new_kernel), %rax
++	pushq	%rax                                              /* RIP */
++
++	iretq
++	
++identity_mapped:
+ 	/* store the parameters back on the stack */
+ 	pushq	%rdx /* store the start address */
+-
++	
+ 	/* Set cr0 to a known state:
+ 	 * 31 1 == Paging enabled
+ 	 * 18 0 == Alignment check disabled
+@@ -69,7 +124,7 @@ relocate_new_kernel:
+ 	/* Switch to the identity mapped page tables,
+ 	 * and flush the TLB.
+ 	*/
+-	movq	%rcx, %cr3
++	movq	%r8, %cr3
+ 
+ 	/* Do the copies */
+ 	movq	%rdi, %rcx 	/* Put the page_list in %rcx */
+@@ -136,6 +191,25 @@ relocate_new_kernel:
+ 	xorq	%r15, %r15
+ 
+ 	ret
++	.align	16
++gdt:
++	.long   0x00000000  /* NULL descriptor */
++	.long   0x00000000
++gdt_code:
++	.long   0x00000000  /* code descriptor */
++	.long   0x00209800
++
++gdt_end:
++	.align	4
++	
++idt_48:
++	.word	0				# idt limit = 0
++	.quad	0, 0				# idt base = 0L
++
++gdt_48:
++	.word	gdt_end - gdt - 1		# gdt limit
++	.quad	0, 0				# gdt base (filled in later)
++
+ relocate_new_kernel_end:
+ 
+ 	.globl relocate_new_kernel_size
+--- x/include/asm-x86_64/kexec.h
++++ x/include/asm-x86_64/kexec.h
+@@ -21,15 +21,24 @@
+ /* Maximum address we can use for the control pages */
+ #define KEXEC_CONTROL_MEMORY_LIMIT     (0xFFFFFFFFFFUL)
+ 
+-/* Allocate one page for the pdp and the second for the code */
+-#define KEXEC_CONTROL_CODE_SIZE  (4096UL + 4096UL)
++#define KEXEC_CONTROL_CODE_SIZE  4096
+ 
+ /* The native architecture */
+ #define KEXEC_ARCH KEXEC_ARCH_X86_64
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
+-struct kimage_arch {};
++struct kimage_arch {
++	/* page_table_a[] holds enough pages to create a new page table
++	 * that maps the control page twice.
++	 *
++	 * page_table_b points to the root page of a page table which is used
++	 * to provide identity mapping of all ram.
++	 */
++
++	struct page *page_table_a[7]; /* 2 * (pte + pud + pmd) + pgd */
++	struct page *page_table_b;
++};
+ 
+ /*
+  * Saving the registers of the cpu on which panic occured in
--- /dev/null
+++ x/patches/linux-2.6.16.13/3-linux-2.6.16-kexec_page_table_a_x86_64-xen.patch
@@ -0,0 +1,151 @@
+ arch/x86_64/kernel/machine_kexec.c |   84 +++++++++++++++++++++++++++++++++---
+ 1 file changed, 77 insertions(+), 7 deletions(-)
+
+--- x/arch/x86_64/kernel/machine_kexec.c
++++ x/arch/x86_64/kernel/machine_kexec.c
+@@ -19,6 +19,50 @@
+ #include <asm/mmu_context.h>
+ #include <asm/io.h>
+ 
++#ifdef CONFIG_XEN
++#include <xen/interface/kexec.h>
++#include <xen/interface/memory.h>
++
++#define x__pmd(x) ((pmd_t) { (x) } )
++#define x__pud(x) ((pud_t) { (x) } )
++#define x__pgd(x) ((pgd_t) { (x) } )
++
++#define x_pmd_val(x)	((x).pmd)
++#define x_pud_val(x)	((x).pud)
++#define x_pgd_val(x)	((x).pgd)
++
++static inline void x_set_pmd(pmd_t *dst, pmd_t val)
++{
++        x_pmd_val(*dst) = x_pmd_val(val); 
++} 
++
++static inline void x_set_pud(pud_t *dst, pud_t val)
++{
++	x_pud_val(*dst) = phys_to_machine(x_pud_val(val));
++}
++
++static inline void x_pud_clear (pud_t *pud)
++{
++	x_pud_val(*pud) = 0;
++}
++
++static inline void x_set_pgd(pgd_t *dst, pgd_t val)
++{
++	x_pgd_val(*dst) = phys_to_machine(x_pgd_val(val)); 
++} 
++
++static inline void x_pgd_clear (pgd_t * pgd)
++{
++	x_pgd_val(*pgd) = 0; 
++}
++
++#define MY_LARGE_EXEC (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_PSE) /* large-page pmd entries */
++#define MY_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) /* pud/pgd table entries */
++#else
++#define MY_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
++#define MY_TABLE _KERNPG_TABLE
++#endif /* CONFIG_XEN */
++
+ static void init_level2_page(pmd_t *level2p, unsigned long addr)
+ {
+ 	unsigned long end_addr;
+@@ -26,7 +70,7 @@ static void init_level2_page(pmd_t *leve
+ 	addr &= PAGE_MASK;
+ 	end_addr = addr + PUD_SIZE;
+ 	while (addr < end_addr) {
+-		set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
++		x_set_pmd(level2p++, x__pmd(addr | MY_LARGE_EXEC));
+ 		addr += PMD_SIZE;
+ 	}
+ }
+@@ -51,12 +95,12 @@ static int init_level3_page(struct kimag
+ 		}
+ 		level2p = (pmd_t *)page_address(page);
+ 		init_level2_page(level2p, addr);
+-		set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
++		x_set_pud(level3p++, x__pud(__pa(level2p) | MY_TABLE));
+ 		addr += PUD_SIZE;
+ 	}
+ 	/* clear the unused entries */
+ 	while (addr < end_addr) {
+-		pud_clear(level3p++);
++		x_pud_clear(level3p++);
+ 		addr += PUD_SIZE;
+ 	}
+ out:
+@@ -87,12 +131,12 @@ static int init_level4_page(struct kimag
+ 		if (result) {
+ 			goto out;
+ 		}
+-		set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
++		x_set_pgd(level4p++, x__pgd(__pa(level3p) | MY_TABLE));
+ 		addr += PGDIR_SIZE;
+ 	}
+ 	/* clear the unused entries */
+ 	while (addr < end_addr) {
+-		pgd_clear(level4p++);
++		x_pgd_clear(level4p++);
+ 		addr += PGDIR_SIZE;
+ 	}
+ out:
+@@ -103,14 +147,21 @@ out:
+ static int create_page_table_b(struct kimage *image)
+ {
+ 	struct kimage_arch *arch = &image->arch_data;
++	unsigned long last_page;
+ 
+ 	arch->page_table_b = kimage_alloc_control_pages(image, 0);
+ 
+ 	if (!arch->page_table_b)
+ 		return -ENOMEM;
+ 
++#ifdef CONFIG_XEN
++	last_page = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
++#else
++	last_page = end_pfn;
++#endif
++
+  	return init_level4_page(image, page_address(arch->page_table_b),
+-				0, end_pfn << PAGE_SHIFT);
++				0, last_page << PAGE_SHIFT);
+ }
+ 
+ typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
+@@ -211,6 +262,7 @@ void machine_kexec_cleanup(struct kimage
+ 	return;
+ }
+ 
++#ifndef CONFIG_XEN
+ /*
+  * Do not allocate memory (or fail in any way) in machine_kexec().
+  * We are past the point of no return, committed to rebooting now.
+@@ -230,5 +282,23 @@ NORET_TYPE void machine_kexec(struct kim
+ 
+ 	/* now call it */
+ 	rnk = (relocate_new_kernel_t) relocate_new_kernel;
+-	(*rnk)(page_list, control_code, image->start, page_table_a, page_table_b);
++	(*rnk)(page_list, control_code, image->start, page_table_a,
++	       page_table_b);
++}
++#endif /* !CONFIG_XEN */
++
++#ifdef CONFIG_XEN
++void machine_kexec_setup_load_arg(xen_kexec_image_t *xki,struct kimage *image)
++{
++	struct kimage_arch *arch = &image->arch_data;
++	int k, n = sizeof(arch->page_table_a) / sizeof(arch->page_table_a[0]);
++
++	for (k = 0; k < n; k++)
++		xki->page_table_a[k] = 
++			pfn_to_mfn(page_to_pfn(arch->page_table_a[k]))
++				<< PAGE_SHIFT;
++
++	xki->page_table_b =
++		pfn_to_mfn(page_to_pfn(arch->page_table_b)) << PAGE_SHIFT;
+ }
++#endif /* CONFIG_XEN */

[-- Attachment #7: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] kexec: framework and i386 (Take XIII)
  2006-07-11  3:39                                                       ` [PATCH] kexec: framework and i386 (Take XII) Horms
@ 2006-08-11  7:48                                                         ` Horms
  2006-08-31  7:43                                                           ` [PATCH] kexec: framework and i386 (Take XIV) Horms
  0 siblings, 1 reply; 68+ messages in thread
From: Horms @ 2006-08-11  7:48 UTC (permalink / raw)
  To: Keir Fraser
  Cc: Ian Pratt, Kazuo Moriwaka, xen-devel, Akio Takebe, Isaku Yamahata,
	Magnus Damm, Mark Williamson

[-- Attachment #1: Type: text/plain, Size: 1789 bytes --]

Hi,

here is a minor update of the kexec/kdump patchset.
The changes since the last post are quite minimal.

* Forward port to xen-unstable-11059 from 10650
* Add powerpc stubs to 51.1-kexec-generic-upstream.patch;
  this is going to get messy as more architectures are added

The patches are currently:

   1. 51.1-kexec-generic-upstream.patch
      * Common code for all architectures,
        the basic plumbing for kexec/kdump

   2. 51.1.1-kexec-trigger_crash_dump.patch
      * xen-console trigger crash_dump
      * Depends on 1

   3. 51.2.1-kexec-x86-upstream.patch
      * Glue between 1, and 4 and 5.
        This would not be needed for ppc or ia64, but
	neither have been written yet.
	We are planning to commence work on ia64 soon.
      * Depends on 1

   4. 51.2.1.1-kexec-x86_32-upstream.patch
      * Kexec/kdump for x86_32
      * Depends on 3 (and 1)

   5. 51.2.31.2-kexec-x86_64-upstream.patch
      * Kexec/kdump for x86_64
      * Depends on 3 (and 1)

Things that are being worked on:

* Porting kexec for ia64. This is going somewhat slower than
  I had hoped. Partly because of my own schedule. And partly
  because the Linux code is flakier than I previously thought.

  If anyone cares, the problem that is currently bothering me
  most about linux ia64 kexec is that you can usually kexec once,
  but twice doesn't work.

  e.g. linux --kexec--> linux: ok
       linux --kexec--> linux --kexec fails--> linux: not ok

* Kdump for x86_64. My colleague Magnus is working on this.
  But he is seeing a very strange problem where kdumping
  into a bzimage works, while a vmlinux does not.
  
  Please prod him if you want more details.

Things that would be good to work on:

* PPC port

-- 
Horms
  H: http://www.vergenet.net/~horms/
  W: http://www.valinux.co.jp/en/


[-- Attachment #2: 51.1-kexec-generic-upstream.patch --]
[-- Type: text/plain, Size: 40657 bytes --]

kexec: framework

This is an implementation of kexec for dom0/xen, that allows
kexecing of the physical machine from xen. The approach taken is
to move the architecture-dependent kexec code into a new hypercall.

This patch only includes the framework; it cannot be used without
architecture-dependent hooks. However, the code should compile as is.

Signed-Off-By: Horms <horms@verge.net.au>
Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>

 linux-2.6-xen-sparse/drivers/xen/core/Makefile                        |    1 
 linux-2.6-xen-sparse/drivers/xen/core/crash.c                         |   49 +
 linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c                 |   81 ++
 linux-2.6-xen-sparse/drivers/xen/core/reboot.c                        |    4 
 patches/linux-2.6.16.13/0-linux-2.6.16-kexec_page_table_a_stubs.patch |   85 ++
 patches/linux-2.6.16.13/kexec-generic.patch                           |  294 ++++++++++
 xen/arch/ia64/xen/Makefile                                            |    2 
 xen/arch/ia64/xen/crash.c                                             |   26 
 xen/arch/ia64/xen/machine_kexec.c                                     |   46 +
 xen/arch/powerpc/Makefile                                             |    2 
 xen/arch/powerpc/crash.c                                              |   26 
 xen/arch/powerpc/machine_kexec.c                                      |   46 +
 xen/arch/x86/Makefile                                                 |    2 
 xen/arch/x86/crash.c                                                  |   26 
 xen/arch/x86/machine_kexec.c                                          |   46 +
 xen/common/Makefile                                                   |    1 
 xen/common/kexec.c                                                    |  211 +++++++
 xen/common/page_alloc.c                                               |   33 -
 xen/drivers/char/console.c                                            |    3 
 xen/include/asm-ia64/kexec.h                                          |   32 +
 xen/include/asm-x86/kexec.h                                           |   31 +
 xen/include/public/kexec.h                                            |   85 ++
 xen/include/public/xen.h                                              |    1 
 xen/include/xen/elfcore.h                                             |   73 ++
 xen/include/xen/kexec.h                                               |   33 +
 xen/include/xen/mm.h                                                  |    1 
 26 files changed, 1229 insertions(+), 11 deletions(-)

--- x/linux-2.6-xen-sparse/drivers/xen/core/Makefile
+++ x/linux-2.6-xen-sparse/drivers/xen/core/Makefile
@@ -11,3 +11,4 @@ obj-$(CONFIG_XEN_SYSFS)		+= xen_sysfs.o
 obj-$(CONFIG_XEN_SKBUFF)	+= skbuff.o
 obj-$(CONFIG_XEN_REBOOT)	+= reboot.o
 obj-$(CONFIG_XEN_SMPBOOT)	+= smpboot.o
+obj-$(CONFIG_KEXEC)		+= machine_kexec.o crash.o
--- /dev/null
+++ x/linux-2.6-xen-sparse/drivers/xen/core/crash.c
@@ -0,0 +1,49 @@
+/*
+ * drivers/xen/core/crash.c
+ * Architecture independent functions for kexec based crash dumps in xen.
+ *
+ * Created by: Horms <horms@verge.net.au>
+ *
+ */
+
+#include <asm/ptrace.h>
+#include <linux/types.h>
+#include <asm/kexec-xen.h>
+#include <asm/hypervisor.h>
+#include <asm/system.h>
+#include <linux/preempt.h>
+#include <linux/smp.h>
+#include <asm/hw_irq.h>
+#include <xen/interface/kexec.h>
+
+/* 
+ * This passes the registers down to the hypervisor and has it kexec().
+ * This is a bit different to the linux implementation, which
+ * has this call save registers and stop CPUs and then goes into
+ * machine_kexec() later. But for Xen it makes more sense to
+ * have the kexec hypercall do everything, and this call
+ * has the registers parameter that is needed. The registers are
+ * passed to the hypervisor to allow the hypervisor to kdump itself
+ * on an internal panic.
+ */
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+	struct cpu_user_regs xen_regs;
+	printk("machine_crash_shutdown: %d\n", smp_processor_id());
+	local_irq_disable();
+#ifdef CONFIG_X86_IO_APIC
+	disable_IO_APIC();
+#endif
+	crash_translate_regs(regs, &xen_regs);
+	HYPERVISOR_kexec(KEXEC_CMD_kexec, KEXEC_TYPE_CRASH, &xen_regs);
+}
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- /dev/null
+++ x/linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c
@@ -0,0 +1,81 @@
+/*
+ * drivers/xen/core/machine_kexec.c 
+ * handle transition of Linux booting another kernel
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Loosely based on arch/i386/kernel/machine_kexec.c
+ */
+
+#include <linux/kexec.h>
+#include <xen/interface/kexec.h>
+#include <linux/mm.h>
+#include <asm/hypercall.h>
+#include <asm/kexec-xen.h>
+
+const extern unsigned char relocate_new_kernel[];
+extern unsigned int relocate_new_kernel_size;
+
+extern void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, 
+					 struct kimage *image);
+
+static void setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
+{
+	memset(xki, 0, sizeof(*xki));
+
+	machine_kexec_setup_load_arg(xki, image);
+
+	xki->indirection_page = image->head;
+	xki->reboot_code_buffer = 
+		pfn_to_mfn(page_to_pfn(image->control_code_page)) << PAGE_SHIFT;
+	xki->start_address = image->start;
+}
+
+/*
+ * Load the image into xen so xen can kdump itself
+ * This might have been done in prepare, but prepare
+ * is currently called too early. It might make sense
+ * to move prepare, but for now, just add an extra hook.
+ */
+int xen_machine_kexec_load(struct kimage *image)
+{
+	xen_kexec_image_t xki;
+
+	setup_load_arg(&xki, image);
+	return HYPERVISOR_kexec(KEXEC_CMD_kexec_load, image->type, &xki);
+}
+
+/*
+ * Unload the image that was stored by machine_kexec_load().
+ * This might have been done in machine_kexec_cleanup() but it
+ * is called too late, and it's possible xen could try and kdump
+ * using resources that have been freed.
+ */
+void xen_machine_kexec_unload(struct kimage *image)
+{
+	HYPERVISOR_kexec(KEXEC_CMD_kexec_unload, image->type, NULL);
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ *
+ * This has the hypervisor move to the preferred reboot CPU,
+ * stop all CPUs and kexec. That is it combines machine_shutdown()
+ * and machine_kexec() in Linux kexec terms.
+ */
+NORET_TYPE void xen_machine_kexec(struct kimage *image)
+{
+	HYPERVISOR_kexec(KEXEC_CMD_kexec, image->type, NULL);
+	panic("KEXEC_CMD_kexec hypercall should not return\n");
+}
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- x/linux-2.6-xen-sparse/drivers/xen/core/reboot.c
+++ x/linux-2.6-xen-sparse/drivers/xen/core/reboot.c
@@ -65,6 +65,10 @@ void machine_power_off(void)
 	HYPERVISOR_shutdown(SHUTDOWN_poweroff);
 }
 
+#ifdef CONFIG_KEXEC
+void machine_shutdown(void) { }
+#endif
+
 int reboot_thru_bios = 0;	/* for dmi_scan.c */
 EXPORT_SYMBOL(machine_restart);
 EXPORT_SYMBOL(machine_halt);
--- x/xen/arch/ia64/xen/Makefile
+++ x/xen/arch/ia64/xen/Makefile
@@ -25,5 +25,7 @@ obj-y += xensetup.o
 obj-y += xentime.o
 obj-y += flushd.o
 obj-y += privop_stat.o
+obj-y += machine_kexec.o
+obj-y += crash.o
 
 obj-$(crash_debug) += gdbstub.o
--- /dev/null
+++ x/xen/arch/ia64/xen/crash.c
@@ -0,0 +1,26 @@
+/**********************************************************************
+ * arch/ia64/xen/crash.c
+ *
+ * Created By: Horms
+ * 
+ */
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/kexec.h>
+
+void machine_crash_shutdown(struct cpu_user_regs *regs)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
--- /dev/null
+++ x/xen/arch/ia64/xen/machine_kexec.c
@@ -0,0 +1,46 @@
+/**********************************************************************
+ * arch/ia64/xen/machine_kexec.c
+ *
+ * Created By: Horms
+ * 
+ */
+
+#include <xen/lib.h>       /* for printk() used in stubs */
+#include <xen/types.h>
+#include <public/kexec.h>
+
+int machine_kexec_load(int type, xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    return -1;
+}
+
+void machine_kexec_unload(int type, xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+void machine_kexec_reserved(xen_kexec_reserve_t *reservation)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+void machine_kexec(xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+void machine_shutdown(xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/arch/powerpc/Makefile
+++ x/xen/arch/powerpc/Makefile
@@ -35,6 +35,8 @@ obj-y += setup.o
 obj-y += smp.o
 obj-y += time.o
 obj-y += usercopy.o
+obj-y += machine_kexec.o
+obj-y += crash.o
 
 obj-$(debug) += 0opt.o
 obj-$(crash_debug) += gdbstub.o
--- /dev/null
+++ x/xen/arch/powerpc/crash.c
@@ -0,0 +1,26 @@
+/**********************************************************************
+ * arch/powerpc/crash.c
+ *
+ * Created By: Horms
+ * 
+ */
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/kexec.h>
+
+void machine_crash_shutdown(struct cpu_user_regs *regs)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
--- /dev/null
+++ x/xen/arch/powerpc/machine_kexec.c
@@ -0,0 +1,46 @@
+/**********************************************************************
+ * arch/powerpc/machine_kexec.c
+ *
+ * Created By: Horms
+ * 
+ */
+
+#include <xen/lib.h>       /* for printk() used in stubs */
+#include <xen/types.h>
+#include <public/kexec.h>
+
+int machine_kexec_load(int type, xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    return -1;
+}
+
+void machine_kexec_unload(int type, xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+void machine_kexec_reserved(xen_kexec_reserve_t *reservation)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+void machine_kexec(xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+void machine_shutdown(xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/arch/x86/Makefile
+++ x/xen/arch/x86/Makefile
@@ -39,6 +39,8 @@ obj-y += trampoline.o
 obj-y += traps.o
 obj-y += usercopy.o
 obj-y += x86_emulate.o
+obj-y += machine_kexec.o
+obj-y += crash.o
 
 ifneq ($(pae),n)
 obj-$(x86_32) += shadow.o shadow_public.o shadow_guest32.o shadow_guest32pae.o
--- /dev/null
+++ x/xen/arch/x86/crash.c
@@ -0,0 +1,26 @@
+/******************************************************************************
+ * arch/x86/crash.c
+ * 
+ * Created By: Horms
+ *
+ * Should be based heavily on arch/i386/kernel/crash.c from Linux 2.6.16
+ */
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/xen.h>
+
+void machine_crash_shutdown(struct cpu_user_regs *regs)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/arch/x86/machine_kexec.c
@@ -0,0 +1,46 @@
+/******************************************************************************
+ * arch/x86/machine_kexec.c
+ * 
+ * Created By: Horms
+ *
+ */
+
+#include <xen/lib.h>       /* for printk() used in stubs */
+#include <xen/types.h>
+#include <public/kexec.h>
+
+int machine_kexec_load(int type, xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    return -1;
+}
+
+void machine_kexec_unload(int type, xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+void machine_kexec_reserved(xen_kexec_reserve_t *reservation)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+void machine_kexec(xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+void machine_shutdown(xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/common/Makefile
+++ x/xen/common/Makefile
@@ -7,6 +7,7 @@ obj-y += event_channel.o
 obj-y += grant_table.o
 obj-y += kernel.o
 obj-y += keyhandler.o
+obj-y += kexec.o
 obj-y += lib.o
 obj-y += memory.o
 obj-y += multicall.o
--- /dev/null
+++ x/xen/common/kexec.c
@@ -0,0 +1,211 @@
+/******************************************************************************
+ * common/kexec.c - Architecture independent kexec code for Xen
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Based in part on Linux 2.6.16's kernel/kexec.c
+ */
+
+#include <asm/kexec.h>
+#include <xen/lib.h>
+#include <xen/errno.h>
+#include <xen/guest_access.h>
+#include <xen/sched.h>
+#include <xen/types.h>
+#include <xen/kexec.h>
+#include <public/kexec.h>
+
+DEFINE_PER_CPU (note_buf_t, crash_notes);
+
+static xen_kexec_image_t kexec_image;
+static int kexec_image_set = 0;
+static xen_kexec_image_t kexec_crash_image;
+static int kexec_crash_image_set = 0;
+static int kexec_crash_lock = 0;
+
+/* Must call with kexec_crash_lock held */
+void __crash_kexec(struct cpu_user_regs *regs)
+{
+    struct cpu_user_regs fixed_regs;
+
+    if (!kexec_crash_image_set)
+	    return;
+    crash_setup_regs(&fixed_regs, regs);
+    machine_crash_shutdown(&fixed_regs);
+    machine_kexec(&kexec_crash_image); /* Does not return */
+}
+
+void crash_kexec(struct cpu_user_regs *regs)
+{
+    int locked;
+
+    locked = xchg(&kexec_crash_lock, 1);
+    if (locked)
+        return;
+    __crash_kexec(regs);
+
+    /* The if() here is bogus, but gcc will throw a warning that the
+     * computed value is unused and xen compiles with -Werror.
+     * This seems like a viable work around.
+     * This did not seem to happen with slightly older gcc.
+     * Observed with: 
+     * gcc version 4.1.2 20060604 (prerelease) (Debian * 4.1.1-2) */
+    if (xchg(&kexec_crash_lock, 0)) ;
+
+    return;
+}
+
+static int get_crash_note(int vcpuid, XEN_GUEST_HANDLE(void) uarg)
+{
+    struct domain *domain = current->domain;
+    unsigned long crash_note;
+    struct vcpu *vcpu;
+    int locked;
+
+    /* vcpuid indexes domain->vcpu[]; MAX_VIRT_CPUS itself is out of range */
+    if (vcpuid < 0 || vcpuid >= MAX_VIRT_CPUS)
+        return -EINVAL;
+
+    if ( ! (vcpu = domain->vcpu[vcpuid]) )
+        return -EINVAL;
+
+    locked = xchg(&kexec_crash_lock, 1);
+    if (locked)
+    {
+       printk("do_kexec: (CMD_kexec_crash_note): dump is locked\n");
+       return -EFAULT;
+    }
+    /* __pa() of the per-CPU crash_notes buffer for vcpu->processor */
+    crash_note = __pa((unsigned long)per_cpu(crash_notes, vcpu->processor));
+
+    /* Release the lock. The if() is bogus, but otherwise gcc warns that
+     * the computed value is unused, and xen compiles with -Werror.
+     * Not seen with slightly older gcc; observed with:
+     * gcc version 4.1.2 20060604 (prerelease) (Debian * 4.1.1-2) */
+    if (xchg(&kexec_crash_lock, 0)) ;
+
+    if ( unlikely(copy_to_guest(uarg, &crash_note, 1) != 0) )
+    {
+        printk("do_kexec: (CMD_kexec_crash_note): copy_to_guest failed\n");
+        return -EFAULT;
+    }
+    
+    return 0;
+}
+
+static int get_reserve(XEN_GUEST_HANDLE(void) uarg)
+{
+    xen_kexec_reserve_t reservation;
+
+    machine_kexec_reserved(&reservation);
+    if ( unlikely(copy_to_guest(uarg, &reservation, 1) != 0) )
+    {
+        printk("do_kexec (CMD_kexec_reserve): copy_to_guest failed\n");
+        return -EFAULT;
+    }
+    
+    return 0;
+}
+
+static int __do_kexec(unsigned long type, XEN_GUEST_HANDLE(void) uarg,
+		      xen_kexec_image_t *image)
+{
+    cpu_user_regs_t regs;
+
+    if (type == KEXEC_TYPE_DEFAULT)
+        machine_shutdown(image); /* Does not return */
+    else
+    {
+        if ( unlikely(copy_from_guest(&regs, uarg, 1) != 0) )
+        {
+            printk("do_kexec (CMD_kexec): copy_from_guest failed\n");
+            return -EFAULT;
+        }
+        __crash_kexec(&regs); /* Does not return */
+    }
+
+    return -EINVAL;
+}
+
+int do_kexec(unsigned long op, int arg1, XEN_GUEST_HANDLE(void) uarg)
+{
+    xen_kexec_image_t *image;
+    int locked;
+    int *image_set;
+    int status = -EINVAL;
+
+    if ( !IS_PRIV(current->domain) )  
+        return -EPERM;
+
+    switch (op)
+    {
+    case KEXEC_CMD_kexec_crash_note:
+        return get_crash_note(arg1, uarg);
+    case KEXEC_CMD_kexec_reserve:
+	return get_reserve(uarg);
+    }
+
+    /* For all other ops, arg1 is the type of kexec, that is
+     * KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH */
+    if (arg1 == KEXEC_TYPE_CRASH)
+    {
+        image = &kexec_crash_image;
+        image_set = &kexec_crash_image_set;
+        locked = xchg(&kexec_crash_lock, 1);
+        if (locked)
+        {
+           printk("do_kexec: dump is locked\n");
+           return -EFAULT;
+        }
+    }
+    else
+    {
+        image = &kexec_image;
+        image_set = &kexec_image_set;
+    }
+
+    switch(op) {
+    case KEXEC_CMD_kexec:
+        BUG_ON(!*image_set);
+	status = __do_kexec(arg1, uarg, image);
+        break;
+    case KEXEC_CMD_kexec_load:
+        BUG_ON(*image_set);
+        if ( unlikely(copy_from_guest(image, uarg, 1) != 0) )
+        {
+            printk("do_kexec (CMD_kexec_load): copy_from_guest failed\n");
+            status = -EFAULT;
+	    break;
+        }
+        *image_set = 1;
+        status = machine_kexec_load(arg1, image);
+        break;
+    case KEXEC_CMD_kexec_unload:
+        BUG_ON(!*image_set);
+        *image_set = 0;
+        machine_kexec_unload(arg1, image);
+        status = 0;
+        break;
+    }
+
+    if (arg1 == KEXEC_TYPE_CRASH)
+        /* The if() here is bogus, but gcc will throw a warning that the
+         * computed value is unused and xen compiles with -Werror.
+         * This seems like a viable work around.
+         * This did not seem to happen with slightly older gcc.
+         * Observed with: 
+         * gcc version 4.1.2 20060604 (prerelease) (Debian * 4.1.1-2) */
+        if (xchg(&kexec_crash_lock, 0)) ;
+
+    return status;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/common/page_alloc.c
+++ x/xen/common/page_alloc.c
@@ -213,24 +213,35 @@ void init_boot_pages(paddr_t ps, paddr_t
     }
 }
 
+unsigned long alloc_boot_pages_at(unsigned long nr_pfns, unsigned long pfn_at)
+{
+    unsigned long i;
+
+    for ( i = 0; i < nr_pfns; i++ )
+        if ( allocated_in_map(pfn_at + i) )
+             break;
+
+    if ( i == nr_pfns )
+    {
+        map_alloc(pfn_at, nr_pfns);
+        return pfn_at;
+    }
+
+    return 0;
+}
+
 unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align)
 {
-    unsigned long pg, i;
+    unsigned long pg, i = 0;
 
     for ( pg = 0; (pg + nr_pfns) < max_page; pg += pfn_align )
     {
-        for ( i = 0; i < nr_pfns; i++ )
-            if ( allocated_in_map(pg + i) )
-                 break;
-
-        if ( i == nr_pfns )
-        {
-            map_alloc(pg, nr_pfns);
-            return pg;
-        }
+        i = alloc_boot_pages_at(nr_pfns, pg);
+        if (i != 0)
+            break;
     }
 
-    return 0;
+    return i;
 }
 
 
--- x/xen/drivers/char/console.c
+++ x/xen/drivers/char/console.c
@@ -716,6 +716,7 @@ void panic(const char *fmt, ...)
     char buf[128];
     unsigned long flags;
     static DEFINE_SPINLOCK(lock);
+    extern void crash_kexec(struct cpu_user_regs *regs);
     
     debugtrace_dump();
 
@@ -738,6 +739,8 @@ void panic(const char *fmt, ...)
 
     debugger_trap_immediate();
 
+    crash_kexec(NULL);
+
     if ( opt_noreboot )
     {
         machine_halt();
--- /dev/null
+++ x/xen/include/asm-ia64/kexec.h
@@ -0,0 +1,37 @@
+/******************************************************************************
+ * include/asm-ia64/kexec.h
+ * 
+ * Created By: Horms
+ *
+ */
+
+#ifndef __IA64_KEXEC_H__
+#define __IA64_KEXEC_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/xen.h>
+
+/*
+ * Stub: not implemented for ia64 yet.  It only logs that it was called and
+ * leaves *newregs untouched.  Declared inline so that a file including this
+ * header without calling it does not trip -Wunused-function under -Werror.
+ */
+static inline void crash_setup_regs(struct cpu_user_regs *newregs,
+                                    struct cpu_user_regs *oldregs)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+#endif /* __IA64_KEXEC_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
--- /dev/null
+++ x/xen/include/asm-x86/kexec.h
@@ -0,0 +1,31 @@
+/******************************************************************************
+ * include/asm-x86/kexec.h
+ * 
+ * Created By: Horms
+ *
+ */
+
+#ifndef __X86_KEXEC_H__
+#define __X86_KEXEC_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/xen.h>
+
+static void crash_setup_regs(struct cpu_user_regs *newregs,
+			     struct cpu_user_regs *oldregs)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+#endif /* __X86_KEXEC_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/public/kexec.h
@@ -0,0 +1,85 @@
+/******************************************************************************
+ * kexec.h - Public portion
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Types based on those in ./vcpu.h on request from Keir Fraser
+ */
+
+#ifndef _XEN_PUBLIC_KEXEC_H
+#define _XEN_PUBLIC_KEXEC_H
+
+#include "xen.h"
+
+/*
+ * Prototype for this hypercall is:
+ *  int kexec_op(int cmd, int arg1, void *extra_args)
+ * @cmd        == KEXEC_CMD_... 
+ *                KEXEC operation to perform
+ * @arg1       == Operation-specific unsigned long argument
+ *                This could be in extra_args, but by putting it here
+ *                copy_from_user can be avoided, in particular in
+ *                KEXEC_CMD_kexec during a crash dump, which is a fairly
+ *                critical section of code. If this turns out not to be
+ *                important then it can be collapsed into extra_args.
+ * @extra_args == Operation-specific extra arguments (NULL if none).
+ */
+
+#define KEXEC_TYPE_DEFAULT 0
+#define KEXEC_TYPE_CRASH   1
+
+/*
+ * Perform kexec having previously loaded a kexec or kdump kernel
+ * as appropriate.
+ * @arg1      == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH
+ * @extra_arg == pointer to cpu_user_regs_t structure.
+ */
+#define KEXEC_CMD_kexec                 0
+
+/*
+ * Load kernel image in preparation for kexec or kdump.
+ * @arg1      == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH
+ * @extra_arg == pointer to xen_kexec_image_t structure.
+ */
+#define KEXEC_CMD_kexec_load            1
+typedef struct xen_kexec_image {
+    unsigned long indirection_page;
+    unsigned long reboot_code_buffer;
+    unsigned long start_address;
+} xen_kexec_image_t;
+
+/*
+ * Clean up image loaded by KEXEC_CMD_kexec_load
+ * @arg1      == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH
+ */
+#define KEXEC_CMD_kexec_unload          2
+
+/*
+ * Find the base pointer and size of the area that xen has 
+ * reserved for use by the crash kernel.
+ * @extra_arg == pointer to xen_kexec_reserve_t structure.
+ */
+#define KEXEC_CMD_kexec_reserve         3
+typedef struct xen_kexec_reserve {
+    unsigned long size;
+    unsigned long start;
+} xen_kexec_reserve_t;
+
+/*
+ * Find the base pointer of the area that xen has 
+ * reserved for use by a crash note for a given VCPU
+ * @extra_arg == pointer to unsigned long.
+ */
+#define KEXEC_CMD_kexec_crash_note      4
+
+#endif /* _XEN_PUBLIC_KEXEC_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/include/public/xen.h
+++ x/xen/include/public/xen.h
@@ -67,6 +67,7 @@
 #define __HYPERVISOR_event_channel_op     32
 #define __HYPERVISOR_physdev_op           33
 #define __HYPERVISOR_hvm_op               34
+#define __HYPERVISOR_kexec_op             35
 
 /* Architecture-specific hypercall definitions. */
 #define __HYPERVISOR_arch_0               48
--- /dev/null
+++ x/xen/include/xen/elfcore.h
@@ -0,0 +1,73 @@
+/******************************************************************************
+ * include/xen/elfcore.h
+ * 
+ * Created By: Horms
+ *
+ * Based heavily on include/linux/elfcore.h from Linux 2.6.16
+ * Naming scheme based on include/xen/elf.h (not include/linux/elfcore.h)
+ *
+ */
+
+#ifndef __ELFCOREC_H__
+#define __ELFCOREC_H__
+
+#include <xen/types.h>
+#include <xen/elf.h>
+#include <public/xen.h>
+
+#define NT_PRSTATUS     1
+
+typedef struct
+{
+    int signo;                       /* signal number */
+    int code;                        /* extra code */
+    int errno;                       /* errno */
+} ELF_Signifo;
+
+/* These seem to be the same length on all architectures on Linux */
+typedef int ELF_Pid;
+typedef struct {
+	long tv_sec;
+	long tv_usec;
+} ELF_Timeval;
+typedef unsigned long ELF_Greg;
+#define ELF_NGREG (sizeof (struct cpu_user_regs) / sizeof(ELF_Greg))
+typedef ELF_Greg ELF_Gregset[ELF_NGREG];
+
+/*
+ * Definitions to generate Intel SVR4-like core files.
+ * These mostly have the same names as the SVR4 types with "elf_"
+ * tacked on the front to prevent clashes with linux definitions,
+ * and the typedef forms have been avoided.  This is mostly like
+ * the SVR4 structure, but more Linuxy, with things that Linux does
+ * not support and which gdb doesn't really use excluded.
+ */
+typedef struct
+{
+    ELF_Signifo pr_info;         /* Info associated with signal */
+    short pr_cursig;             /* Current signal */
+    unsigned long pr_sigpend;    /* Set of pending signals */
+    unsigned long pr_sighold;    /* Set of held signals */
+    ELF_Pid pr_pid;
+    ELF_Pid pr_ppid;
+    ELF_Pid pr_pgrp;
+    ELF_Pid pr_sid;
+    ELF_Timeval pr_utime;        /* User time */
+    ELF_Timeval pr_stime;        /* System time */
+    ELF_Timeval pr_cutime;       /* Cumulative user time */
+    ELF_Timeval pr_cstime;       /* Cumulative system time */
+    ELF_Gregset pr_reg;          /* GP registers */
+    int pr_fpvalid;              /* True if math co-processor being used.  */
+} ELF_Prstatus;
+
+#endif /* __ELFCOREC_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/xen/kexec.h
@@ -0,0 +1,33 @@
+/******************************************************************************
+ * include/xen/kexec.h - Internal architecture independent portion
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ */
+
+#include <public/kexec.h>
+
+#define MAX_NOTE_BYTES 1024
+
+typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
+DECLARE_PER_CPU (note_buf_t, crash_notes);
+
+int machine_kexec_load(int type, xen_kexec_image_t *image);
+void machine_kexec_unload(int type, xen_kexec_image_t *image);
+void machine_kexec_reserved(xen_kexec_reserve_t *reservation);
+void machine_kexec(xen_kexec_image_t *image);
+void machine_shutdown(xen_kexec_image_t *image);
+void machine_crash_shutdown(cpu_user_regs_t *regs);
+
+extern unsigned int opt_kdump_megabytes;
+extern unsigned int opt_kdump_megabytes_base;
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/include/xen/mm.h
+++ x/xen/include/xen/mm.h
@@ -40,6 +40,7 @@ struct page_info;
 paddr_t init_boot_allocator(paddr_t bitmap_start);
 void init_boot_pages(paddr_t ps, paddr_t pe);
 unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align);
+unsigned long alloc_boot_pages_at(unsigned long nr_pfns, unsigned long pfn_at);
 void end_boot_allocator(void);
 
 /* Generic allocator. These functions are *not* interrupt-safe. */
--- /dev/null
+++ x/patches/linux-2.6.16.13/kexec-generic.patch
@@ -0,0 +1,294 @@
+ drivers/base/cpu.c    |   20 ++++++++++++++
+ include/linux/kexec.h |    5 +++
+ kernel/kexec.c        |   68 ++++++++++++++++++++++++++++++++++++++++---------
+ kernel/sys.c          |    4 ++
+ 4 files changed, 85 insertions(+), 12 deletions(-)
+
+--- x/drivers/base/cpu.c
++++ x/drivers/base/cpu.c
+@@ -11,6 +11,10 @@
+ 
+ #include "base.h"
+ 
++#ifdef CONFIG_XEN
++#include <xen/interface/kexec.h>
++#endif
++
+ struct sysdev_class cpu_sysdev_class = {
+ 	set_kset_name("cpu"),
+ };
+@@ -86,6 +90,18 @@ static inline void register_cpu_control(
+ #ifdef CONFIG_KEXEC
+ #include <linux/kexec.h>
+ 
++#ifdef CONFIG_XEN
++static unsigned long get_crash_notes(int cpu)
++{
++	unsigned long crash_note;
++
++	if (HYPERVISOR_kexec(KEXEC_CMD_kexec_crash_note, cpu, &crash_note) < 0)
++		return 0UL;
++	return crash_note;
++}
++#endif
++
++/* XXX: This only finds dom0's CPUs */
+ static ssize_t show_crash_notes(struct sys_device *dev, char *buf)
+ {
+ 	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
+@@ -101,7 +117,11 @@ static ssize_t show_crash_notes(struct s
+ 	 * boot up and this data does not change there after. Hence this
+ 	 * operation should be safe. No locking required.
+ 	 */
++#ifndef CONFIG_XEN
+ 	addr = __pa(per_cpu_ptr(crash_notes, cpunum));
++#else
++	addr = (unsigned long long)get_crash_notes(cpunum);
++#endif
+ 	rc = sprintf(buf, "%Lx\n", addr);
+ 	return rc;
+ }
+--- x/include/linux/kexec.h
++++ x/include/linux/kexec.h
+@@ -91,6 +91,11 @@ struct kimage {
+ extern NORET_TYPE void machine_kexec(struct kimage *image) ATTRIB_NORET;
+ extern int machine_kexec_prepare(struct kimage *image);
+ extern void machine_kexec_cleanup(struct kimage *image);
++#ifdef CONFIG_XEN
++extern int xen_machine_kexec_load(struct kimage *image);
++extern void xen_machine_kexec_unload(struct kimage *image);
++extern NORET_TYPE void xen_machine_kexec(struct kimage *image) ATTRIB_NORET;
++#endif
+ extern asmlinkage long sys_kexec_load(unsigned long entry,
+ 					unsigned long nr_segments,
+ 					struct kexec_segment __user *segments,
+--- x/kernel/kexec.c
++++ x/kernel/kexec.c
+@@ -38,6 +38,20 @@ struct resource crashk_res = {
+ 	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
+ };
+ 
++/* Kexec needs to know about the actual physical address.
++ * But in xen, a physical address is a pseudo-physical address. */
++#ifndef CONFIG_XEN
++#define kexec_page_to_pfn(page)  page_to_pfn(page)
++#define kexec_pfn_to_page(pfn)   pfn_to_page(pfn)
++#define kexec_virt_to_phys(addr) virt_to_phys(addr)
++#define kexec_phys_to_virt(addr) phys_to_virt(addr)
++#else
++#define kexec_page_to_pfn(page)  pfn_to_mfn(page_to_pfn(page))
++#define kexec_pfn_to_page(pfn)   pfn_to_page(mfn_to_pfn(pfn))
++#define kexec_virt_to_phys(addr) virt_to_machine(addr)
++#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
++#endif
++
+ int kexec_should_crash(struct task_struct *p)
+ {
+ 	if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
+@@ -403,7 +417,7 @@ static struct page *kimage_alloc_normal_
+ 		pages = kimage_alloc_pages(GFP_KERNEL, order);
+ 		if (!pages)
+ 			break;
+-		pfn   = page_to_pfn(pages);
++		pfn   = kexec_page_to_pfn(pages);
+ 		epfn  = pfn + count;
+ 		addr  = pfn << PAGE_SHIFT;
+ 		eaddr = epfn << PAGE_SHIFT;
+@@ -437,6 +451,7 @@ static struct page *kimage_alloc_normal_
+ 	return pages;
+ }
+ 
++#ifndef CONFIG_XEN
+ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
+ 						      unsigned int order)
+ {
+@@ -490,7 +505,7 @@ static struct page *kimage_alloc_crash_c
+ 		}
+ 		/* If I don't overlap any segments I have found my hole! */
+ 		if (i == image->nr_segments) {
+-			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
++			pages = kexec_pfn_to_page(hole_start >> PAGE_SHIFT);
+ 			break;
+ 		}
+ 	}
+@@ -517,6 +532,13 @@ struct page *kimage_alloc_control_pages(
+ 
+ 	return pages;
+ }
++#else /* CONFIG_XEN */
++struct page *kimage_alloc_control_pages(struct kimage *image,
++					 unsigned int order)
++{
++	return kimage_alloc_normal_control_pages(image, order);
++}
++#endif
+ 
+ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+ {
+@@ -532,7 +554,7 @@ static int kimage_add_entry(struct kimag
+ 			return -ENOMEM;
+ 
+ 		ind_page = page_address(page);
+-		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
++		*image->entry = kexec_virt_to_phys(ind_page) | IND_INDIRECTION;
+ 		image->entry = ind_page;
+ 		image->last_entry = ind_page +
+ 				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+@@ -593,13 +615,13 @@ static int kimage_terminate(struct kimag
+ #define for_each_kimage_entry(image, ptr, entry) \
+ 	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+ 		ptr = (entry & IND_INDIRECTION)? \
+-			phys_to_virt((entry & PAGE_MASK)): ptr +1)
++			kexec_phys_to_virt((entry & PAGE_MASK)): ptr +1)
+ 
+ static void kimage_free_entry(kimage_entry_t entry)
+ {
+ 	struct page *page;
+ 
+-	page = pfn_to_page(entry >> PAGE_SHIFT);
++	page = kexec_pfn_to_page(entry >> PAGE_SHIFT);
+ 	kimage_free_pages(page);
+ }
+ 
+@@ -611,6 +633,10 @@ static void kimage_free(struct kimage *i
+ 	if (!image)
+ 		return;
+ 
++#ifdef CONFIG_XEN
++	xen_machine_kexec_unload(image);
++#endif
++
+ 	kimage_free_extra_pages(image);
+ 	for_each_kimage_entry(image, ptr, entry) {
+ 		if (entry & IND_INDIRECTION) {
+@@ -686,7 +712,7 @@ static struct page *kimage_alloc_page(st
+ 	 * have a match.
+ 	 */
+ 	list_for_each_entry(page, &image->dest_pages, lru) {
+-		addr = page_to_pfn(page) << PAGE_SHIFT;
++		addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
+ 		if (addr == destination) {
+ 			list_del(&page->lru);
+ 			return page;
+@@ -701,12 +727,12 @@ static struct page *kimage_alloc_page(st
+ 		if (!page)
+ 			return NULL;
+ 		/* If the page cannot be used file it away */
+-		if (page_to_pfn(page) >
++		if (kexec_page_to_pfn(page) >
+ 				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+ 			list_add(&page->lru, &image->unuseable_pages);
+ 			continue;
+ 		}
+-		addr = page_to_pfn(page) << PAGE_SHIFT;
++		addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
+ 
+ 		/* If it is the destination page we want use it */
+ 		if (addr == destination)
+@@ -729,7 +755,7 @@ static struct page *kimage_alloc_page(st
+ 			struct page *old_page;
+ 
+ 			old_addr = *old & PAGE_MASK;
+-			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
++			old_page = kexec_pfn_to_page(old_addr >> PAGE_SHIFT);
+ 			copy_highpage(page, old_page);
+ 			*old = addr | (*old & ~PAGE_MASK);
+ 
+@@ -779,7 +805,7 @@ static int kimage_load_normal_segment(st
+ 			result  = -ENOMEM;
+ 			goto out;
+ 		}
+-		result = kimage_add_page(image, page_to_pfn(page)
++		result = kimage_add_page(image, kexec_page_to_pfn(page)
+ 								<< PAGE_SHIFT);
+ 		if (result < 0)
+ 			goto out;
+@@ -811,6 +837,7 @@ out:
+ 	return result;
+ }
+ 
++#ifndef CONFIG_XEN
+ static int kimage_load_crash_segment(struct kimage *image,
+ 					struct kexec_segment *segment)
+ {
+@@ -833,7 +860,7 @@ static int kimage_load_crash_segment(str
+ 		char *ptr;
+ 		size_t uchunk, mchunk;
+ 
+-		page = pfn_to_page(maddr >> PAGE_SHIFT);
++		page = kexec_pfn_to_page(maddr >> PAGE_SHIFT);
+ 		if (page == 0) {
+ 			result  = -ENOMEM;
+ 			goto out;
+@@ -881,6 +908,13 @@ static int kimage_load_segment(struct ki
+ 
+ 	return result;
+ }
++#else /* CONFIG_XEN */
++static int kimage_load_segment(struct kimage *image,
++				struct kexec_segment *segment)
++{
++	return kimage_load_normal_segment(image, segment);
++}
++#endif
+ 
+ /*
+  * Exec Kernel system call: for obvious reasons only root may call it.
+@@ -991,6 +1025,11 @@ asmlinkage long sys_kexec_load(unsigned 
+ 		if (result)
+ 			goto out;
+ 	}
++#ifdef CONFIG_XEN
++	result = xen_machine_kexec_load(image);
++	if (result)
++		goto out;
++#endif
+ 	/* Install the new kernel, and  Uninstall the old */
+ 	image = xchg(dest_image, image);
+ 
+@@ -1045,7 +1084,6 @@ void crash_kexec(struct pt_regs *regs)
+ 	struct kimage *image;
+ 	int locked;
+ 
+-
+ 	/* Take the kexec_lock here to prevent sys_kexec_load
+ 	 * running on one cpu from replacing the crash kernel
+ 	 * we are using after a panic on a different cpu.
+@@ -1061,12 +1099,17 @@ void crash_kexec(struct pt_regs *regs)
+ 			struct pt_regs fixed_regs;
+ 			crash_setup_regs(&fixed_regs, regs);
+ 			machine_crash_shutdown(&fixed_regs);
++#ifdef CONFIG_XEN
++			xen_machine_kexec(image);
++#else
+ 			machine_kexec(image);
++#endif
+ 		}
+ 		xchg(&kexec_lock, 0);
+ 	}
+ }
+ 
++#ifndef CONFIG_XEN
+ static int __init crash_notes_memory_init(void)
+ {
+ 	/* Allocate memory for saving cpu registers. */
+@@ -1079,3 +1122,4 @@ static int __init crash_notes_memory_ini
+ 	return 0;
+ }
+ module_init(crash_notes_memory_init)
++#endif
+--- x/kernel/sys.c
++++ x/kernel/sys.c
+@@ -435,8 +435,12 @@ void kernel_kexec(void)
+ 	kernel_restart_prepare(NULL);
+ 	printk(KERN_EMERG "Starting new kernel\n");
+ 	machine_shutdown();
++#ifdef CONFIG_XEN
++	xen_machine_kexec(image);
++#else
+ 	machine_kexec(image);
+ #endif
++#endif
+ }
+ EXPORT_SYMBOL_GPL(kernel_kexec);
+ 
--- /dev/null
+++ x/patches/linux-2.6.16.13/0-linux-2.6.16-kexec_page_table_a_stubs.patch
@@ -0,0 +1,85 @@
+kexec: Avoid overwriting the current pgd (V2, stubs)
+
+This patch adds an architecture specific structure "struct kimage_arch" to
+struct kimage. This structure is filled in with members by the architecture
+specific patches followed by this one.
+
+Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
+---
+
+ Applies on top of 2.6.16 and 2.6.17-rc4.
+
+ include/asm-i386/kexec.h    |    2 ++
+ include/asm-powerpc/kexec.h |    2 ++
+ include/asm-s390/kexec.h    |    2 ++
+ include/asm-sh/kexec.h      |    2 ++
+ include/asm-x86_64/kexec.h  |    2 ++
+ include/linux/kexec.h       |    2 ++
+ 6 files changed, 12 insertions(+)
+
+--- x/include/asm-i386/kexec.h
++++ x/include/asm-i386/kexec.h
+@@ -29,6 +29,8 @@
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
++struct kimage_arch {};
++
+ /* CPU does not save ss and esp on stack if execution is already
+  * running in kernel mode at the time of NMI occurrence. This code
+  * fixes it.
+--- x/include/asm-powerpc/kexec.h
++++ x/include/asm-powerpc/kexec.h
+@@ -108,6 +108,8 @@ static inline void crash_setup_regs(stru
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
++struct kimage_arch {};
++
+ #ifdef __powerpc64__
+ extern void kexec_smp_wait(void);	/* get and clear naca physid, wait for
+ 					  master to copy new code to 0 */
+--- x/include/asm-s390/kexec.h
++++ x/include/asm-s390/kexec.h
+@@ -36,6 +36,8 @@
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
++struct kimage_arch {};
++
+ /* Provide a dummy definition to avoid build failures. */
+ static inline void crash_setup_regs(struct pt_regs *newregs,
+ 					struct pt_regs *oldregs) { }
+--- x/include/asm-sh/kexec.h
++++ x/include/asm-sh/kexec.h
+@@ -25,6 +25,8 @@
+ 
+ #ifndef __ASSEMBLY__
+ 
++struct kimage_arch {};
++
+ extern void machine_shutdown(void);
+ extern void *crash_notes;
+ 
+--- x/include/asm-x86_64/kexec.h
++++ x/include/asm-x86_64/kexec.h
+@@ -29,6 +29,8 @@
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
++struct kimage_arch {};
++
+ /*
+  * Saving the registers of the cpu on which panic occured in
+  * crash_kexec to save a valid sp. The registers of other cpus
+--- x/include/linux/kexec.h
++++ x/include/linux/kexec.h
+@@ -69,6 +69,8 @@ struct kimage {
+ 	unsigned long start;
+ 	struct page *control_code_page;
+ 
++	struct kimage_arch arch_data;
++
+ 	unsigned long nr_segments;
+ 	struct kexec_segment segment[KEXEC_SEGMENT_MAX];
+ 

[-- Attachment #3: 51.1.1-kexec-trigger_crash_dump.patch --]
[-- Type: text/plain, Size: 1624 bytes --]

console: allow a crash dump to be triggered from the xen console

This feature is needed to test crash dump. It is essential for development
(though developers could easily add the patch). It may also be of
use for testing of the roll-out of production systems (people who don't
want to add patches).

The original version of this patch triggered a panic, Keir Fraser
suggested changing it to trigger a crash dump in line with a
similar feature in Linux's sysrq.

Christian Limpach suggested changing the original trigger 'p' (for panic)
to 'D' for dump, as p is already used by the performance counters.
This patch uses 'c' for crashdump, again in line with the similar
feature in Linux's sysrq. On inspection of the code, 'c' does not
seem to be already taken.

Signed-Off-By: Horms <horms@verge.net.au>

 xen/common/kexec.c |   14 ++++++++++++++
 1 file changed, 14 insertions(+)

--- x/xen/common/kexec.c
+++ x/xen/common/kexec.c
@@ -13,6 +13,7 @@
 #include <xen/sched.h>
 #include <xen/types.h>
 #include <xen/kexec.h>
+#include <xen/keyhandler.h>
 #include <public/kexec.h>
 
 DEFINE_PER_CPU (note_buf_t, crash_notes);
@@ -55,6 +56,19 @@ void crash_kexec(struct cpu_user_regs *r
     return;
 }
 
+static void do_crashdump_trigger(unsigned char key)
+{
+	printk("triggering crashdump\n");
+	crash_kexec(NULL);
+}
+
+static __init int register_crashdump_trigger(void)
+{
+	register_keyhandler('c', do_crashdump_trigger, "trigger a crashdump");
+	return 0;
+}
+__initcall(register_crashdump_trigger);
+
 static int get_crash_note(int vcpuid, XEN_GUEST_HANDLE(void) uarg)
 {
     struct domain *domain = current->domain;

[-- Attachment #4: 51.2.1-kexec-x86-upstream.patch --]
[-- Type: text/plain, Size: 23789 bytes --]

kexec: x86

This is the x86 component of kexec for xen.
The generic component is a prerequisite for this patch.
The x86_64 or x86_32 (i386) patch is also needed
in order to use this code; however, the code should compile as is.

Signed-Off-By: Horms <horms@verge.net.au>
Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>

 xen/arch/x86/crash.c                |  173 ++++++++++++++++++++++++++++++++++-
 xen/arch/x86/dom0_ops.c             |    3 
 xen/arch/x86/machine_kexec.c        |  147 ++++++++++++++++++++++++++++-
 xen/arch/x86/setup.c                |   75 +++++++++++++--
 xen/arch/x86/x86_32/Makefile        |    1 
 xen/arch/x86/x86_32/machine_kexec.c |   26 +++++
 xen/arch/x86/x86_64/Makefile        |    1 
 xen/arch/x86/x86_64/machine_kexec.c |   27 +++++
 xen/include/asm-x86/elf.h           |   27 +++++
 xen/include/asm-x86/fixmap.h        |    1 
 xen/include/asm-x86/hypercall.h     |    5 +
 xen/include/asm-x86/kexec.h         |   13 +-
 xen/include/asm-x86/x86_32/elf.h    |   28 +++++
 xen/include/asm-x86/x86_32/kexec.h  |   48 +++++++++
 xen/include/asm-x86/x86_64/elf.h    |   28 +++++
 xen/include/asm-x86/x86_64/kexec.h  |   33 ++++++
 xen/include/public/kexec.h          |    2 
 xen/include/xen/elfcore.h           |    3 
 18 files changed, 615 insertions(+), 26 deletions(-)

--- x/xen/arch/x86/crash.c
+++ x/xen/arch/x86/crash.c
@@ -3,16 +3,183 @@
  * 
  * Created By: Horms
  *
- * Should be based heavily on arch/i386/kernel/crash.c from Linux 2.6.16
+ * Based heavily on arch/i386/kernel/crash.c from Linux 2.6.16
  */
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#include <asm/atomic.h>
+#include <asm/elf.h>
+#include <asm/percpu.h>
+#include <asm/kexec.h>
 #include <xen/types.h>
+#include <xen/irq.h>
+#include <asm/ipi.h>
+#include <asm/nmi.h>
+#include <xen/string.h>
+#include <xen/elf.h>
+#include <xen/elfcore.h>
+#include <xen/smp.h>
+#include <xen/delay.h>
+#include <xen/perfc.h>
+#include <xen/kexec.h>
 #include <public/xen.h>
 
+static int crashing_cpu;
+
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+							       size_t data_len)
+{
+	Elf_Note note;
+
+	note.namesz = strlen(name) + 1;
+	note.descsz = data_len;
+	note.type   = type;
+	memcpy(buf, &note, sizeof(note));
+	buf += (sizeof(note) +3)/4;
+	memcpy(buf, name, note.namesz);
+	buf += (note.namesz + 3)/4;
+	memcpy(buf, data, note.descsz);
+	buf += (note.descsz + 3)/4;
+
+	return buf;
+}
+
+static void final_note(u32 *buf)
+{
+	Elf_Note note;
+
+	note.namesz = 0;
+	note.descsz = 0;
+	note.type   = 0;
+	memcpy(buf, &note, sizeof(note));
+}
+
+static void crash_save_this_cpu(struct cpu_user_regs *regs, int cpu)
+{
+	ELF_Prstatus prstatus;
+	uint32_t *buf;
+
+	printk("crash_save_this_cpu: %d\n",  cpu);
+
+	if ((cpu < 0) || (cpu >= NR_CPUS))
+		return;
+
+	/* Using ELF notes here is opportunistic.
+	 * A well defined structure format with tags is needed
+	 * ELF notes happen to provide this and there is infastructure
+	 * in the Linux kernel to supprot them. In order to make
+	 * crash dumps produced by xen the same, the same
+	 * technique is used here.
+	 */
+
+	/* It should be safe to use per_cpu() here instead of per_cpu_ptr()
+	 * (which does not exist in xen) as kexecing_lock must be held in
+	 * order to get anywhere near here */
+	buf = (uint32_t *)per_cpu(crash_notes, cpu);
+	if (!buf) /* XXX: Can this ever occur? */
+		return;
+	memset(&prstatus, 0, sizeof(prstatus));
+	/* XXX: Xen does not have processes. For the crashing CPU on a dom0
+	 * crash this could be pased down from dom0, but is this
+	 * neccessary?
+	 * prstatus.pr_pid = current->pid; */
+	ELF_CORE_COPY_REGS(prstatus.pr_reg, regs);
+	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
+				sizeof(prstatus));
+	final_note(buf);
+}
+
+static void crash_save_self(struct cpu_user_regs *regs)
+{
+	crash_save_this_cpu(regs, smp_processor_id());
+}
+
+#ifdef CONFIG_SMP
+static atomic_t waiting_for_crash_ipi;
+
+static int crash_nmi_callback(struct cpu_user_regs *regs, int cpu)
+{
+	struct cpu_user_regs fixed_regs;
+
+	/* Don't do anything if this handler is invoked on crashing cpu.
+	 * Otherwise, system will completely hang. Crashing cpu can get
+	 * an NMI if system was initially booted with nmi_watchdog parameter.
+	 */
+	if (cpu == crashing_cpu)
+		return 1;
+	local_irq_disable();
+
+#ifdef CONFIG_X86_32
+	if (!user_mode(regs)) {
+		crash_fixup_ss_esp(&fixed_regs, regs);
+		regs = &fixed_regs;
+	}
+#endif
+	crash_save_this_cpu(regs, cpu);
+	disable_local_APIC();
+	atomic_dec(&waiting_for_crash_ipi);
+	/* Assume hlt works */
+	__asm__ __volatile__ ( "hlt" );
+	for(;;);
+
+	return 1;
+
+	/* Need to use this somewhere as Xen builds with -Werror */
+	crash_setup_regs(&fixed_regs, regs);
+}
+
+/*
+ * By using the NMI code instead of a vector we just sneak thru the
+ * word generator coming out with just what we want.  AND it does
+ * not matter if clustered_apic_mode is set or not.
+ */
+static void smp_send_nmi_allbutself(void)
+{
+	cpumask_t allbutself = cpu_online_map;
+    	cpu_clear(smp_processor_id(), allbutself);
+	send_IPI_mask(allbutself, APIC_DM_NMI);
+}
+
+static void nmi_shootdown_cpus(void)
+{
+	unsigned long msecs;
+
+	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
+	/* Would it be better to replace the trap vector here? */
+	set_nmi_callback(crash_nmi_callback);
+	/* Ensure the new callback function is set before sending
+	 * out the NMI
+	 */
+	wmb();
+
+	smp_send_nmi_allbutself();
+
+	msecs = 1000; /* Wait at most a second for the other cpus to stop */
+	while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
+		mdelay(1);
+		msecs--;
+	}
+
+	/* Leave the nmi callback set */
+	disable_local_APIC();
+}
+#else
+static void nmi_shootdown_cpus(void)
+{
+	/* There are no cpus to shootdown */
+}
+#endif
+
 void machine_crash_shutdown(struct cpu_user_regs *regs)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+	printk("machine_crash_shutdown: %d\n", smp_processor_id());
+	local_irq_disable();
+
+	crashing_cpu = smp_processor_id();
+	nmi_shootdown_cpus();
+#ifdef CONFIG_X86_IO_APIC
+        disable_IO_APIC();
+#endif
+	crash_save_self(regs);
 }
 
 /*
--- x/xen/arch/x86/dom0_ops.c
+++ x/xen/arch/x86/dom0_ops.c
@@ -29,6 +29,9 @@
 #include <asm/mtrr.h>
 #include "cpu/mtrr/mtrr.h"
 
+extern unsigned int opt_kdump_megabytes;
+extern unsigned int opt_kdump_megabytes_base;
+
 #define TRC_DOM0OP_ENTER_BASE  0x00020000
 #define TRC_DOM0OP_LEAVE_BASE  0x00030000
 
--- x/xen/arch/x86/machine_kexec.c
+++ x/xen/arch/x86/machine_kexec.c
@@ -5,34 +5,165 @@
  *
  */
 
-#include <xen/lib.h>       /* for printk() used in stubs */
+#include <xen/lib.h>
+#include <asm/irq.h>
+#include <asm/page.h>
+#include <asm/flushtlb.h>
+#include <xen/lib.h>
+#include <xen/smp.h>
+#include <xen/nmi.h>
 #include <xen/types.h>
+#include <xen/console.h>
+#include <xen/kexec.h>
 #include <public/kexec.h>
+#include <xen/domain_page.h>
+#include <asm/fixmap.h>
+ 
+#define create_level_mapping(lvl, next, pages, nopages, k, va)               \
+{                                                                            \
+    lvl##_pgentry_t *table;                                                  \
+    void *old = next;                                                        \
+                                                                             \
+    table = (lvl##_pgentry_t *)next + lvl##_table_offset(va);                \
+    if (!(lvl##e_get_flags(*table) & _PAGE_PRESENT)) {                       \
+        if (k >= nopages || pages[k] == 0)                                   \
+            return -1;                                                       \
+        *table = lvl##e_from_pfn(pages[k++]>>PAGE_SHIFT, __PAGE_HYPERVISOR); \
+    }                                                                        \
+    next = map_domain_page(lvl##e_get_pfn(*table));                          \
+    unmap_domain_page(old);                                                  \
+}                                                                            
+
+#define create_level_1_mapping(next, nopages, va, pa)               \
+{                                                                   \
+    l1_pgentry_t *table;                                            \
+                                                                    \
+    table = (l1_pgentry_t *)next + l1_table_offset(va);             \
+    if (!(l1e_get_flags(*table) & _PAGE_PRESENT)) {                 \
+        *table = l1e_from_pfn(pa >> PAGE_SHIFT, __PAGE_HYPERVISOR); \
+    }                                                               \
+    unmap_domain_page(next);                                        \
+}
+
+static int create_mapping(unsigned long root,
+                          unsigned long *pages, int nopages,
+                          unsigned long va, unsigned long pa)
+{
+    void *next = map_domain_page(root >> PAGE_SHIFT);
+    int k = 0;
+
+#if CONFIG_PAGING_LEVELS >= 4
+    create_level_mapping(l4, next, pages, nopages, k, va);
+#endif
+#if CONFIG_PAGING_LEVELS >= 3
+    create_level_mapping(l3, next, pages, nopages, k, va);
+#endif
+    create_level_mapping(l2, next, pages, nopages, k, va);
+
+    create_level_1_mapping(next, nopages, va, pa);
+
+    return k;
+}
+
+static int setup_page_table_a(xen_kexec_image_t *image)
+{
+    void *page;
+    int k, n = sizeof(image->page_table_a) / sizeof(image->page_table_a[0]);
+
+    /* clear page_table_a pages */
+
+    for (k = 0; k < n; k++) {
+        if (!image->page_table_a[k])
+            break;
+
+        page = map_domain_page(image->page_table_a[k] >> PAGE_SHIFT);
+        clear_page(page);
+        unmap_domain_page(page);
+    }
+
+    /* check that the first page (root page) is actually non-zero */
+
+    if (k == 0)
+        return -1;
+
+    /* setup fixmap to point to our control page */
+
+    set_fixmap(FIX_KEXEC_PAGE, image->reboot_code_buffer);
+
+    /* fill in page_table_a: create mapping at fixmap address */
+
+    k = create_mapping(image->page_table_a[0],
+                       &image->page_table_a[1],
+                       n - 1, fix_to_virt(FIX_KEXEC_PAGE),
+                       image->reboot_code_buffer);
+    if (k < 0)
+        return -1;
+
+    /* fill in page_table_a: create identity mapping */
+
+    k = create_mapping(image->page_table_a[0],
+                       &image->page_table_a[1 + k],
+                       n - (1 + k), image->reboot_code_buffer,
+                       image->reboot_code_buffer);
+    if (k < 0)
+        return -1;
+
+	return 0;
+}
 
 int machine_kexec_load(int type, xen_kexec_image_t *image)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
-    return -1;
+    return setup_page_table_a(image);
 }
 
 void machine_kexec_unload(int type, xen_kexec_image_t *image)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
 }
 
 void machine_kexec_reserved(xen_kexec_reserve_t *reservation)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    reservation->size = opt_kdump_megabytes << 20;
+    reservation->start = opt_kdump_megabytes_base << 20;
 }
 
-void machine_kexec(xen_kexec_image_t *image)
+static void __machine_shutdown(void *data)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    xen_kexec_image_t *image = (xen_kexec_image_t *)data;
+
+    printk("__machine_shutdown: cpu=%u\n", smp_processor_id());
+
+    watchdog_disable();
+    console_start_sync();
+
+    smp_send_stop();
+
+#ifdef CONFIG_X86_IO_APIC
+    disable_IO_APIC();
+#endif   
+
+    machine_kexec(image);
 }
 
 void machine_shutdown(xen_kexec_image_t *image)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    int reboot_cpu_id;
+    cpumask_t reboot_cpu;
+
+    reboot_cpu_id = 0;
+
+    if (!cpu_isset(reboot_cpu_id, cpu_online_map))
+        reboot_cpu_id = smp_processor_id();
+    
+    if (reboot_cpu_id != smp_processor_id()) {
+        cpus_clear(reboot_cpu);
+        cpu_set(reboot_cpu_id, reboot_cpu);
+        on_selected_cpus(reboot_cpu, __machine_shutdown, image, 1, 0);
+	for (;;)
+		; /* nothing */
+    }
+    else
+        __machine_shutdown(image);
+    BUG();
 }
 
 /*
--- x/xen/arch/x86/setup.c
+++ x/xen/arch/x86/setup.c
@@ -38,6 +38,11 @@ static unsigned int opt_xenheap_megabyte
 integer_param("xenheap_megabytes", opt_xenheap_megabytes);
 #endif
 
+unsigned int opt_kdump_megabytes = 0;
+integer_param("kdump_megabytes", opt_kdump_megabytes);
+unsigned int opt_kdump_megabytes_base = 0;
+integer_param("kdump_megabytes_base", opt_kdump_megabytes_base);
+
 /* opt_nosmp: If true, secondary processors are ignored. */
 static int opt_nosmp = 0;
 boolean_param("nosmp", opt_nosmp);
@@ -190,6 +195,20 @@ static void percpu_free_unused_areas(voi
                        __pa(__per_cpu_end));
 }
 
+void __init move_memory(unsigned long dst, 
+                          unsigned long src_start, unsigned long src_end)
+{
+#if defined(CONFIG_X86_32)
+    memmove((void *)dst,  /* use low mapping */
+            (void *)src_start,      /* use low mapping */
+            src_end - src_start);
+#elif defined(CONFIG_X86_64)
+    memmove(__va(dst),
+            __va(src_start),
+            src_end - src_start);
+#endif
+}
+
 void __init __start_xen(multiboot_info_t *mbi)
 {
     char __cmdline[] = "", *cmdline = __cmdline;
@@ -325,15 +344,8 @@ void __init __start_xen(multiboot_info_t
         initial_images_start = xenheap_phys_end;
     initial_images_end = initial_images_start + modules_length;
 
-#if defined(CONFIG_X86_32)
-    memmove((void *)initial_images_start,  /* use low mapping */
-            (void *)mod[0].mod_start,      /* use low mapping */
-            mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
-#elif defined(CONFIG_X86_64)
-    memmove(__va(initial_images_start),
-            __va(mod[0].mod_start),
-            mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
-#endif
+    move_memory(initial_images_start, 
+                mod[0].mod_start, mod[mbi->mods_count-1].mod_end);
 
     /* Initialise boot-time allocator with all RAM situated after modules. */
     xenheap_phys_start = init_boot_allocator(__pa(&_end));
@@ -381,6 +393,51 @@ void __init __start_xen(multiboot_info_t
 #endif
     }
 
+    if (opt_kdump_megabytes) {
+        unsigned long kdump_start, kdump_size, k;
+
+        /* mark images pages as free for now */
+
+        init_boot_pages(initial_images_start, initial_images_end);
+
+        kdump_start = opt_kdump_megabytes_base << 20;
+        kdump_size = opt_kdump_megabytes << 20;
+
+        printk("Kdump: %luMB (%lukB) at 0x%lx\n", 
+               kdump_size >> 20,
+               kdump_size >> 10,
+               kdump_start);
+
+        if ((kdump_start & ~PAGE_MASK) || (kdump_size & ~PAGE_MASK))
+            panic("Kdump parameters not page aligned\n");
+
+        kdump_start >>= PAGE_SHIFT;
+        kdump_size >>= PAGE_SHIFT;
+
+        /* allocate pages for Kdump memory area */
+
+        k = alloc_boot_pages_at(kdump_size, kdump_start);
+
+        if (k != kdump_start)
+            panic("Unable to reserve Kdump memory\n");
+
+        /* allocate pages for relocated initial images */
+
+        k = ((initial_images_end - initial_images_start) & ~PAGE_MASK) ? 1 : 0;
+        k += (initial_images_end - initial_images_start) >> PAGE_SHIFT;
+
+        k = alloc_boot_pages(k, 1);
+
+        if (!k)
+            panic("Unable to allocate initial images memory\n");
+
+        move_memory(k << PAGE_SHIFT, initial_images_start, initial_images_end);
+
+        initial_images_end -= initial_images_start;
+        initial_images_start = k << PAGE_SHIFT;
+        initial_images_end += initial_images_start;
+    }        
+
     memguard_init();
 
     printk("System RAM: %luMB (%lukB)\n", 
--- x/xen/arch/x86/x86_32/Makefile
+++ x/xen/arch/x86/x86_32/Makefile
@@ -3,5 +3,6 @@ obj-y += entry.o
 obj-y += mm.o
 obj-y += seg_fixup.o
 obj-y += traps.o
+obj-y += machine_kexec.o
 
 obj-$(supervisor_mode_kernel) += supervisor_mode_kernel.o
--- /dev/null
+++ x/xen/arch/x86/x86_32/machine_kexec.c
@@ -0,0 +1,26 @@
+/*
+ * arch/x86/x86_32/machine_kexec.c
+ * Handle transition of Linux booting another kernel
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Should be losely based on arch/i386/kernel/machine_kexec.c
+ */
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <public/kexec.h>
+
+void machine_kexec(xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/arch/x86/x86_64/Makefile
+++ x/xen/arch/x86/x86_64/Makefile
@@ -1,3 +1,4 @@
 obj-y += entry.o
 obj-y += mm.o
 obj-y += traps.o
+obj-y += machine_kexec.o
--- /dev/null
+++ x/xen/arch/x86/x86_64/machine_kexec.c
@@ -0,0 +1,27 @@
+/******************************************************************************
+ * arch/x86/x86_64/machine_kexec.c
+ * Handle transition of Linux booting another kernel
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Should be losely based on arch/x86_64/kernel/machine_kexec.c
+ */
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/kexec.h>
+
+void machine_kexec(xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/asm-x86/elf.h
@@ -0,0 +1,27 @@
+/******************************************************************************
+ * include/asm-x86/elf.h
+ * 
+ * Created By: Horms
+ *
+ */
+
+#ifndef __X86_ELF_H__
+#define __X86_ELF_H__
+
+#ifdef __x86_64__
+#include <asm/x86_64/elf.h>
+#else
+#include <asm/x86_32/elf.h>
+#endif
+
+#endif /* __X86_ELF_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/include/asm-x86/fixmap.h
+++ x/xen/include/asm-x86/fixmap.h
@@ -36,6 +36,7 @@ enum fixed_addresses {
     FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
     FIX_HPET_BASE,
     FIX_CYCLONE_TIMER,
+    FIX_KEXEC_PAGE,
     __end_of_fixed_addresses
 };
 
--- x/xen/include/asm-x86/hypercall.h
+++ x/xen/include/asm-x86/hypercall.h
@@ -6,6 +6,7 @@
 #define __ASM_X86_HYPERCALL_H__
 
 #include <public/physdev.h>
+#include <xen/types.h>
 
 extern long
 do_event_channel_op_compat(
@@ -87,6 +88,10 @@ extern long
 arch_do_vcpu_op(
     int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg);
 
+extern int
+do_kexec(
+    unsigned long op, unsigned arg1, XEN_GUEST_HANDLE(void) uarg);
+
 #ifdef __x86_64__
 
 extern long
--- x/xen/include/asm-x86/kexec.h
+++ x/xen/include/asm-x86/kexec.h
@@ -8,15 +8,16 @@
 #ifndef __X86_KEXEC_H__
 #define __X86_KEXEC_H__
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#include <asm/processor.h>
 #include <xen/types.h>
+#include <xen/string.h>
 #include <public/xen.h>
 
-static void crash_setup_regs(struct cpu_user_regs *newregs,
-			     struct cpu_user_regs *oldregs)
-{
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
-}
+#ifdef __x86_64__
+#include <asm/x86_64/kexec.h>
+#else
+#include <asm/x86_32/kexec.h>
+#endif
 
 #endif /* __X86_KEXEC_H__ */
 
--- /dev/null
+++ x/xen/include/asm-x86/x86_32/elf.h
@@ -0,0 +1,28 @@
+/******************************************************************************
+ * include/asm-x86/x86_32/elf.h
+ * 
+ * Created By: Horms
+ *
+ * Should pull be based on include/asm-i386/elf.h:ELF_CORE_COPY_REGS
+ * from Linux 2.6.16
+ */
+
+#ifndef __X86_ELF_X86_32_H__
+#define __X86_ELF_X86_32_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+
+#define ELF_CORE_COPY_REGS(pr_reg, regs)                                \
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+
+#endif /* __X86_ELF_X86_32_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/asm-x86/x86_32/kexec.h
@@ -0,0 +1,48 @@
+/******************************************************************************
+ * include/asm-x86/x86_32/kexec.h
+ * 
+ * Created By: Horms
+ *
+ * Should be based heavily on include/asm-i386/kexec.h from Linux 2.6.16
+ *
+ */
+
+#ifndef __X86_32_KEXEC_H__
+#define __X86_32_KEXEC_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/xen.h>
+
+static void crash_fixup_ss_esp(struct cpu_user_regs *newregs,
+		    struct cpu_user_regs *oldregs)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    return;
+    crash_fixup_ss_esp(newregs, oldregs);
+}
+
+static void crash_setup_regs(struct cpu_user_regs *newregs,
+			     struct cpu_user_regs *oldregs)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+static inline int user_mode(struct cpu_user_regs *regs)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    return -1;
+}
+
+
+#endif /* __X86_32_KEXEC_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/asm-x86/x86_64/elf.h
@@ -0,0 +1,28 @@
+/******************************************************************************
+ * include/asm-x86/x86_64/elf.h
+ * 
+ * Created By: Horms
+ *
+ * Should pull be based on include/asm-x86_64/elf.h:ELF_CORE_COPY_REGS
+ * from Linux 2.6.16
+ */
+
+#ifndef __X86_ELF_X86_64_H__
+#define __X86_ELF_X86_64_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+
+#define ELF_CORE_COPY_REGS(pr_reg, regs)                                \
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+
+#endif /* __X86_ELF_X86_64_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/asm-x86/x86_64/kexec.h
@@ -0,0 +1,33 @@
+/******************************************************************************
+ * include/asm-x86/x86_64/kexec.h
+ * 
+ * Created By: Horms
+ *
+ * Should be based heavily on include/asm-x86_64/kexec.h from Linux 2.6.16
+ *
+ */
+
+#ifndef __X86_64_KEXEC_H__
+#define __X86_64_KEXEC_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/xen.h>
+
+static void crash_setup_regs(struct cpu_user_regs *newregs,
+			     struct cpu_user_regs *oldregs)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+#endif /* __X86_64_KEXEC_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/include/public/kexec.h
+++ x/xen/include/public/kexec.h
@@ -43,6 +43,8 @@
  */
 #define KEXEC_CMD_kexec_load            1
 typedef struct xen_kexec_image {
+    unsigned long page_table_a[7];
+    unsigned long page_table_b;
     unsigned long indirection_page;
     unsigned long reboot_code_buffer;
     unsigned long start_address;
--- x/xen/include/xen/elfcore.h
+++ x/xen/include/xen/elfcore.h
@@ -16,6 +16,9 @@
 #include <public/xen.h>
 
 #define NT_PRSTATUS     1
+#define NT_XEN_DOM0_CR3 0x10000001 /* XXX: Hopefully this is unused,
+					   feel free to change to a 
+					   better/different value */
 
 typedef struct
 {

[-- Attachment #5: 51.2.1.1-kexec-x86_32-upstream.patch --]
[-- Type: text/plain, Size: 31739 bytes --]

kexec: x86_32

This is the x86_32 component of kexec for xen.
The x86 component is a prerequisite for this patch.

Signed-Off-By: Horms <horms@verge.net.au>
Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>

 buildconfigs/linux-defconfig_xen_x86_32                                  |    2 
 linux-2.6-xen-sparse/arch/i386/Kconfig                                   |    2 
 linux-2.6-xen-sparse/arch/i386/kernel/Makefile                           |    2 
 linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c                        |   29 
 linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h                        |   42 
 linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h           |    8 
 patches/linux-2.6.16.13/1-linux-2.6.16-kexec_page_table_a_i386.patch     |  457 ++++++++++
 patches/linux-2.6.16.13/4-linux-2.6.16-kexec_page_table_a_i386-xen.patch |   59 +
 xen/arch/x86/crash.c                                                     |   47 +
 xen/arch/x86/x86_32/entry.S                                              |    2 
 xen/arch/x86/x86_32/machine_kexec.c                                      |   27 
 xen/include/asm-x86/x86_32/elf.h                                         |   32 
 xen/include/asm-x86/x86_32/kexec.h                                       |   65 +
 13 files changed, 741 insertions(+), 33 deletions(-)

--- x/buildconfigs/linux-defconfig_xen_x86_32
+++ x/buildconfigs/linux-defconfig_xen_x86_32
@@ -184,6 +184,7 @@ CONFIG_MTRR=y
 CONFIG_REGPARM=y
 CONFIG_SECCOMP=y
 CONFIG_HZ_100=y
+CONFIG_KEXEC=y
 # CONFIG_HZ_250 is not set
 # CONFIG_HZ_1000 is not set
 CONFIG_HZ=100
@@ -2775,6 +2776,7 @@ CONFIG_NTFS_FS=m
 #
 CONFIG_PROC_FS=y
 CONFIG_PROC_KCORE=y
+# CONFIG_PROC_VMCORE is not set
 CONFIG_SYSFS=y
 CONFIG_TMPFS=y
 # CONFIG_HUGETLB_PAGE is not set
--- x/linux-2.6-xen-sparse/arch/i386/Kconfig
+++ x/linux-2.6-xen-sparse/arch/i386/Kconfig
@@ -726,7 +726,7 @@ source kernel/Kconfig.hz
 
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && !X86_XEN
+	depends on EXPERIMENTAL && !XEN_UNPRIVILEGED_GUEST
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
--- x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
+++ x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
@@ -89,7 +89,7 @@ include $(srctree)/scripts/Makefile.xen
 
 obj-y += fixup.o
 microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o
-n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o
+n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o crash.o
 
 obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
 obj-y := $(call cherrypickxen, $(obj-y))
--- x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
+++ x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
@@ -68,6 +68,10 @@
 #include "setup_arch_pre.h"
 #include <bios_ebda.h>
 
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
 /* Forward Declaration. */
 void __init find_max_pfn(void);
 
@@ -941,6 +945,7 @@ static void __init parse_cmdline_early (
 		 * after a kernel panic.
 		 */
 		else if (!memcmp(from, "crashkernel=", 12)) {
+#ifndef CONFIG_XEN
 			unsigned long size, base;
 			size = memparse(from+12, &from);
 			if (*from == '@') {
@@ -951,6 +956,10 @@ static void __init parse_cmdline_early (
 				crashk_res.start = base;
 				crashk_res.end   = base + size - 1;
 			}
+#else
+			printk("Ignoring crashkernel command line, "
+			       "parameter will be supplied by xen\n");
+#endif
 		}
 #endif
 #ifdef CONFIG_PROC_VMCORE
@@ -1320,9 +1329,22 @@ void __init setup_bootmem_allocator(void
 	}
 #endif
 #ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
 	if (crashk_res.start != crashk_res.end)
 		reserve_bootmem(crashk_res.start,
 			crashk_res.end - crashk_res.start + 1);
+#else
+	{
+		xen_kexec_reserve_t reservation;
+		BUG_ON(HYPERVISOR_kexec(KEXEC_CMD_kexec_reserve, 0,
+					&reservation));
+		if (reservation.size) {
+			crashk_res.start = reservation.start;
+			crashk_res.end = reservation.start + 
+				reservation.size - 1;
+		}
+	}
+#endif
 #endif
 
 	if (!xen_feature(XENFEAT_auto_translated_physmap))
@@ -1378,6 +1400,10 @@ legacy_init_iomem_resources(struct e820e
 		res->end = res->start + e820[i].size - 1;
 		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 		request_resource(&iomem_resource, res);
+#ifdef CONFIG_KEXEC
+		request_resource(res, &crashk_res);
+#endif
+
 		if (e820[i].type == E820_RAM) {
 			/*
 			 *  We don't know which RAM region contains kernel data,
@@ -1386,9 +1412,6 @@ legacy_init_iomem_resources(struct e820e
 			 */
 			request_resource(res, code_resource);
 			request_resource(res, data_resource);
-#ifdef CONFIG_KEXEC
-			request_resource(res, &crashk_res);
-#endif
 		}
 	}
 }
--- /dev/null
+++ x/linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h
@@ -0,0 +1,42 @@
+/*
+ * include/asm-i386/kexec-xen.h
+ *
+ * Created By: Horms <horms@verge.net.au>
+ */
+
+#ifndef _I386_KEXEC_XEN_H
+#define _I386_KEXEC_XEN_H
+
+#include <asm/ptrace.h>
+#include <asm/types.h>
+#include <xen/interface/arch-x86_32.h>
+
+static inline void crash_translate_regs(struct pt_regs *linux_regs,
+					struct cpu_user_regs *xen_regs)
+{
+	xen_regs->ebx    = linux_regs->ebx;
+	xen_regs->ecx    = linux_regs->ecx;
+	xen_regs->edx    = linux_regs->edx;
+	xen_regs->esi    = linux_regs->esi;
+	xen_regs->edi    = linux_regs->edi;
+	xen_regs->ebp    = linux_regs->ebp;
+	xen_regs->eax    = linux_regs->eax;
+	xen_regs->esp    = linux_regs->esp;
+	xen_regs->ss     = linux_regs->xss;
+	xen_regs->cs     = linux_regs->xcs;
+	xen_regs->ds     = linux_regs->xds;
+	xen_regs->es     = linux_regs->xes;
+	xen_regs->eflags = linux_regs->eflags;
+}
+
+#endif /* _I386_KEXEC_XEN_H */
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
+++ x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
@@ -368,5 +368,13 @@ HYPERVISOR_xenoprof_op(
 	return _hypercall2(int, xenoprof_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_kexec(
+	unsigned long op, unsigned int arg1, void * extra_args)
+{
+	return _hypercall3(int, kexec_op, op, arg1, extra_args);
+}
+
+
 
 #endif /* __HYPERCALL_H__ */
--- x/xen/arch/x86/crash.c
+++ x/xen/arch/x86/crash.c
@@ -21,6 +21,7 @@
 #include <xen/delay.h>
 #include <xen/perfc.h>
 #include <xen/kexec.h>
+#include <xen/sched.h>
 #include <public/xen.h>
 
 static int crashing_cpu;
@@ -169,6 +170,51 @@ static void nmi_shootdown_cpus(void)
 }
 #endif
 
+/* The cr3 for dom0 on each of its vcpus
+ * It is added as ELF_Prstatus prstatus.pr_reg[ELF_NGREG-1)], where
+ * prstatus is the data of the elf note, and ELF_NGREG was extended
+ * by one to allow extra space.
+ * This code runs after all cpus except the crashing one have
+ * been shutdown so as to avoid having to hold domlist_lock,
+ * as locking after a crash is playing with fire */
+void find_dom0_cr3(void)
+{
+	struct domain *d;
+	struct vcpu   *v;
+	uint32_t *buf;
+	uint32_t cr3;
+	Elf_Note note;
+
+	/* Don't need to grab domlist_lock as we are the only thing running */
+
+	/* No need to traverse domain_list, as dom0 is always first */
+	d = domain_list;
+	BUG_ON(d->domain_id);
+
+	for_each_vcpu ( d, v ) {
+		if ( test_bit(_VCPUF_down, &v->vcpu_flags) )
+			continue;
+		buf = (uint32_t *)per_cpu(crash_notes, v->processor);
+		if (!buf) /* XXX: Can this ever occur? */
+			continue;
+
+		memcpy(&note, buf, sizeof(Elf_Note));
+		buf += (sizeof(Elf_Note) +3)/4 + (note.namesz + 3)/4 +
+			(note.descsz + 3)/4;
+
+		/* XXX: This probably doesn't take into account shadow mode,
+		 * but that might not be a problem */
+		cr3 = pagetable_get_pfn(v->arch.guest_table);
+
+		buf = append_elf_note(buf, "Xen Domanin-0 CR3",
+			NT_XEN_DOM0_CR3, &cr3, 4);
+		final_note(buf);
+
+		printk("domain:%i vcpu:%u processor:%u cr3:%08x\n", 
+		       d->domain_id, v->vcpu_id, v->processor, cr3);
+	}
+}
+
 void machine_crash_shutdown(struct cpu_user_regs *regs)
 {
 	printk("machine_crash_shutdown: %d\n", smp_processor_id());
@@ -180,6 +226,7 @@ void machine_crash_shutdown(struct cpu_u
         disable_IO_APIC();
 #endif
 	crash_save_self(regs);
+	find_dom0_cr3();
 }
 
 /*
--- x/xen/arch/x86/x86_32/entry.S
+++ x/xen/arch/x86/x86_32/entry.S
@@ -657,6 +657,7 @@ ENTRY(hypercall_table)
         .long do_event_channel_op
         .long do_physdev_op
         .long do_hvm_op             /* 34 */
+        .long do_kexec
         .rept NR_hypercalls-((.-hypercall_table)/4)
         .long do_ni_hypercall
         .endr
@@ -697,6 +698,7 @@ ENTRY(hypercall_args_table)
         .byte 2 /* do_event_channel_op  */
         .byte 2 /* do_physdev_op        */
         .byte 2 /* do_hvm_op            */  /* 34 */
+        .byte 3 /* do_kexec_op          */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
--- x/xen/arch/x86/x86_32/machine_kexec.c
+++ x/xen/arch/x86/x86_32/machine_kexec.c
@@ -1,18 +1,31 @@
-/*
+/******************************************************************************
  * arch/x86/x86_32/machine_kexec.c
- * Handle transition of Linux booting another kernel
- *
- * Created By: Horms <horms@verge.net.au>
+ * 
+ * Created By: Horms
  *
- * Should be losely based on arch/i386/kernel/machine_kexec.c
+ * Based heavily on arch/i386/kernel/machine_kexec.c from Linux 2.6.16
  */
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
 #include <public/kexec.h>
+#include <asm/fixmap.h>
+#include <asm/processor.h>
+
+typedef asmlinkage void (*relocate_new_kernel_t)(
+                    unsigned long indirection_page,
+                    unsigned long reboot_code_buffer,
+                    unsigned long start_address,
+                    unsigned long page_table_a,
+                    unsigned long has_pae);
 
 void machine_kexec(xen_kexec_image_t *image)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    relocate_new_kernel_t rnk;
+
+    rnk = (relocate_new_kernel_t) fix_to_virt(FIX_KEXEC_PAGE);
+    (*rnk)(image->indirection_page, image->reboot_code_buffer, 
+           image->start_address, image->page_table_a[0],
+           (unsigned long)cpu_has_pae);
 }
 
 /*
--- x/xen/include/asm-x86/x86_32/elf.h
+++ x/xen/include/asm-x86/x86_32/elf.h
@@ -3,17 +3,39 @@
  * 
  * Created By: Horms
  *
- * Should pull be based on include/asm-i386/elf.h:ELF_CORE_COPY_REGS
- * from Linux 2.6.16
+ * Based heavily on include/asm-i386/elf.h and 
+ * include/asm-i386/system.h from Linux 2.6.16
  */
 
 #ifndef __X86_ELF_X86_32_H__
 #define __X86_ELF_X86_32_H__
 
-#include <xen/lib.h>       /* for printk() used in stub */
+/* XXX: Xen doesn't have orig_eax.  For kdump, on a dom0 crash, the values
+ * for the crashing CPU could be passed down from dom0, but is that
+ * necessary?
+ * Also, I'm not sure why fs and gs are derived from the CPU
+ * rather than regs */
 
-#define ELF_CORE_COPY_REGS(pr_reg, regs)                                \
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+#define ELF_CORE_COPY_REGS(pr_reg, regs) do {                           \
+    unsigned i;                                                         \
+    pr_reg[0] = regs->ebx;                                              \
+    pr_reg[1] = regs->ecx;                                              \
+    pr_reg[2] = regs->edx;                                              \
+    pr_reg[3] = regs->esi;                                              \
+    pr_reg[4] = regs->edi;                                              \
+    pr_reg[5] = regs->ebp;                                              \
+    pr_reg[6] = regs->eax;                                              \
+    pr_reg[7] = regs->ds;                                               \
+    pr_reg[8] = regs->es;                                               \
+    asm volatile("mov %%fs,%0":"=rm" (i)); pr_reg[9]  = i;              \
+    asm volatile("mov %%gs,%0":"=rm" (i)); pr_reg[10] = i;              \
+    pr_reg[11] = 0; /* regs->orig_eax; */                               \
+    pr_reg[12] = regs->eip;                                             \
+    pr_reg[13] = regs->cs;                                              \
+    pr_reg[14] = regs->eflags;                                          \
+    pr_reg[15] = regs->esp;                                             \
+    pr_reg[16] = regs->ss;                                              \
+} while(0);
 
 #endif /* __X86_ELF_X86_32_H__ */
 
--- x/xen/include/asm-x86/x86_32/kexec.h
+++ x/xen/include/asm-x86/x86_32/kexec.h
@@ -3,39 +3,72 @@
  * 
  * Created By: Horms
  *
- * Should be based heavily on include/asm-i386/kexec.h from Linux 2.6.16
- *
+ * Based heavily on include/asm-i386/kexec.h from Linux 2.6.16
  */
 
-#ifndef __X86_32_KEXEC_H__
-#define __X86_32_KEXEC_H__
-
-#include <xen/lib.h>       /* for printk() used in stub */
-#include <xen/types.h>
-#include <public/xen.h>
+#ifndef __X86_KEXEC_X86_32_H__
+#define __X86_KEXEC_X86_32_H__
 
+/* CPU does not save ss and esp on stack if execution is already
+ * running in kernel mode at the time of NMI occurrence. This code
+ * fixes it.
+ */
 static void crash_fixup_ss_esp(struct cpu_user_regs *newregs,
-		    struct cpu_user_regs *oldregs)
+                    struct cpu_user_regs *oldregs)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
-    return;
-    crash_fixup_ss_esp(newregs, oldregs);
+    memcpy(newregs, oldregs, sizeof(*newregs));
+    newregs->esp = (unsigned long)&(oldregs->esp);
+    __asm__ __volatile__(
+            "xorl %%eax, %%eax\n\t"
+            "movw %%ss, %%ax\n\t"
+            :"=a"(newregs->ss));
 }
 
+/*
+ * This function is responsible for capturing register states if coming
+ * via panic otherwise just fix up the ss and esp if coming via kernel
+ * mode exception.
+ */
 static void crash_setup_regs(struct cpu_user_regs *newregs,
 			     struct cpu_user_regs *oldregs)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    if (oldregs)
+        crash_fixup_ss_esp(newregs, oldregs);
+    else {
+        __asm__ __volatile__("movl %%ebx,%0" : "=m"(newregs->ebx));
+        __asm__ __volatile__("movl %%ecx,%0" : "=m"(newregs->ecx));
+        __asm__ __volatile__("movl %%edx,%0" : "=m"(newregs->edx));
+        __asm__ __volatile__("movl %%esi,%0" : "=m"(newregs->esi));
+        __asm__ __volatile__("movl %%edi,%0" : "=m"(newregs->edi));
+        __asm__ __volatile__("movl %%ebp,%0" : "=m"(newregs->ebp));
+        __asm__ __volatile__("movl %%eax,%0" : "=m"(newregs->eax));
+        __asm__ __volatile__("movl %%esp,%0" : "=m"(newregs->esp));
+        __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(newregs->ss));
+        __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(newregs->cs));
+        __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(newregs->ds));
+        __asm__ __volatile__("movw %%es, %%ax;" :"=a"(newregs->es));
+        __asm__ __volatile__("pushfl; popl %0" :"=m"(newregs->eflags));
+
+        newregs->eip = (unsigned long)current_text_addr();
+    }
 }
 
+/*              
+ * From Linux 2.6.16's include/asm-i386/mach-xen/asm/ptrace.h
+ *
+ * user_mode_vm(regs) determines whether a register set came from user mode.
+ * This is true if V8086 mode was enabled OR if the register set was from
+ * protected mode with RPL-3 CS value.  This tricky test checks that with
+ * one comparison.  Many places in the kernel can bypass this full check
+ * if they have already ruled out V8086 mode, so user_mode(regs) can be used.
+ */
 static inline int user_mode(struct cpu_user_regs *regs)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
-    return -1;
+    return (regs->cs & 2) != 0;
 }
 
 
-#endif /* __X86_32_KEXEC_H__ */
+#endif /* __X86_KEXEC_X86_32_H__ */
 
 /*
  * Local variables:
--- /dev/null
+++ x/patches/linux-2.6.16.13/1-linux-2.6.16-kexec_page_table_a_i386.patch
@@ -0,0 +1,457 @@
+kexec: Avoid overwriting the current pgd (V2, i386)
+
+This patch upgrades the i386-specific kexec code to avoid overwriting the
+current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used
+to start a secondary kernel that dumps the memory of the previous kernel.
+
+The code introduces a new set of page tables called "page_table_a". These
+tables are used to provide an executable identity mapping without overwriting
+the current pgd. This updated version of the patch fixes a PAE bug and moves
+the segment handling code into relocate_kernel.S.
+
+Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
+---
+
+ The patch has been tested with regular kexec and CONFIG_CRASH_DUMP.
+ Both PAE and non-PAE configurations work well.
+ Applies on top of 2.6.16 and 2.6.17-rc4.
+
+ arch/i386/kernel/machine_kexec.c   |  230 ++++++++++++++----------------------
+ arch/i386/kernel/relocate_kernel.S |   92 ++++++++++++++
+ include/asm-i386/kexec.h           |   12 +
+ 3 files changed, 192 insertions(+), 142 deletions(-)
+
+--- x/arch/i386/kernel/machine_kexec.c
++++ x/arch/i386/kernel/machine_kexec.c
+@@ -2,6 +2,10 @@
+  * machine_kexec.c - handle transition of Linux booting another kernel
+  * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
+  *
++ * 2006-05-19 Magnus Damm <damm@opensource.se>:
++ * - rewrote identity map code to avoid overwriting current pgd
++ * - moved segment handling code into relocate_kernel.S
++ *
+  * This source code is licensed under the GNU General Public License,
+  * Version 2.  See the file COPYING for more details.
+  */
+@@ -19,123 +23,73 @@
+ #include <asm/desc.h>
+ #include <asm/system.h>
+ 
+-#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+-
+-#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+-#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+-#define L2_ATTR (_PAGE_PRESENT)
+-
+-#define LEVEL0_SIZE (1UL << 12UL)
++typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)(
++					unsigned long indirection_page,
++					unsigned long reboot_code_buffer,
++					unsigned long start_address,
++					unsigned long page_table_a,
++					unsigned long has_pae) ATTRIB_NORET;
+ 
+-#ifndef CONFIG_X86_PAE
+-#define LEVEL1_SIZE (1UL << 22UL)
+-static u32 pgtable_level1[1024] PAGE_ALIGNED;
++const extern unsigned char relocate_new_kernel[];
++extern void relocate_new_kernel_end(void);
++const extern unsigned int relocate_new_kernel_size;
+ 
+-static void identity_map_page(unsigned long address)
++static int allocate_page_table_a(struct kimage *image)
+ {
+-	unsigned long level1_index, level2_index;
+-	u32 *pgtable_level2;
+-
+-	/* Find the current page table */
+-	pgtable_level2 = __va(read_cr3());
++	struct kimage_arch *arch = &image->arch_data;
++	struct page *page;
++	int k = sizeof(arch->page_table_a) / sizeof(arch->page_table_a[0]);
++
++	for (; k > 0; k--) {
++		page = kimage_alloc_control_pages(image, 0);
++		if (!page)
++			return -ENOMEM;
++
++		clear_page(page_address(page));
++		arch->page_table_a[k - 1] = page;
++	}
+ 
+-	/* Find the indexes of the physical address to identity map */
+-	level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
+-	level2_index = address / LEVEL1_SIZE;
+-
+-	/* Identity map the page table entry */
+-	pgtable_level1[level1_index] = address | L0_ATTR;
+-	pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
+-
+-	/* Flush the tlb so the new mapping takes effect.
+-	 * Global tlb entries are not flushed but that is not an issue.
+-	 */
+-	load_cr3(pgtable_level2);
++	return 0;
+ }
+ 
+-#else
+-#define LEVEL1_SIZE (1UL << 21UL)
+-#define LEVEL2_SIZE (1UL << 30UL)
+-static u64 pgtable_level1[512] PAGE_ALIGNED;
+-static u64 pgtable_level2[512] PAGE_ALIGNED;
+-
+-static void identity_map_page(unsigned long address)
+-{
+-	unsigned long level1_index, level2_index, level3_index;
+-	u64 *pgtable_level3;
++/* workaround for include/asm-i386/pgtable-3level.h */
+ 
+-	/* Find the current page table */
+-	pgtable_level3 = __va(read_cr3());
+-
+-	/* Find the indexes of the physical address to identity map */
+-	level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
+-	level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE;
+-	level3_index = address / LEVEL2_SIZE;
+-
+-	/* Identity map the page table entry */
+-	pgtable_level1[level1_index] = address | L0_ATTR;
+-	pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
+-	set_64bit(&pgtable_level3[level3_index],
+-					       __pa(pgtable_level2) | L2_ATTR);
+-
+-	/* Flush the tlb so the new mapping takes effect.
+-	 * Global tlb entries are not flushed but that is not an issue.
+-	 */
+-	load_cr3(pgtable_level3);
+-}
++#ifdef CONFIG_X86_PAE
++#undef pgd_present
++#define pgd_present(pgd) (pgd_val(pgd) & _PAGE_PRESENT)
++#define _PGD_ATTR _PAGE_PRESENT
++#else
++#define _PGD_ATTR _KERNPG_TABLE
+ #endif
+ 
+-static void set_idt(void *newidt, __u16 limit)
+-{
+-	struct Xgt_desc_struct curidt;
+-
+-	/* ia32 supports unaliged loads & stores */
+-	curidt.size    = limit;
+-	curidt.address = (unsigned long)newidt;
+-
+-	load_idt(&curidt);
+-};
++#define pa_page(page) __pa(page_address(page))
+ 
+-
+-static void set_gdt(void *newgdt, __u16 limit)
++static int create_mapping(struct page *root, struct page **pages, 
++			  unsigned long va, unsigned long pa)
+ {
+-	struct Xgt_desc_struct curgdt;
+-
+-	/* ia32 supports unaligned loads & stores */
+-	curgdt.size    = limit;
+-	curgdt.address = (unsigned long)newgdt;
++	pgd_t *pgd;
++	pud_t *pud;
++	pmd_t *pmd;
++	pte_t *pte;
++	int k = 0;
+ 
+-	load_gdt(&curgdt);
+-};
++	pgd = (pgd_t *)page_address(root) + pgd_index(va);
++	if (!pgd_present(*pgd))
++		set_pgd(pgd, __pgd(pa_page(pages[k++]) | _PGD_ATTR));
+ 
+-static void load_segments(void)
+-{
+-#define __STR(X) #X
+-#define STR(X) __STR(X)
++	pud = pud_offset(pgd, va);
++	if (!pud_present(*pud))
++		set_pud(pud, __pud(pa_page(pages[k++]) | _KERNPG_TABLE));
+ 
+-	__asm__ __volatile__ (
+-		"\tljmp $"STR(__KERNEL_CS)",$1f\n"
+-		"\t1:\n"
+-		"\tmovl $"STR(__KERNEL_DS)",%%eax\n"
+-		"\tmovl %%eax,%%ds\n"
+-		"\tmovl %%eax,%%es\n"
+-		"\tmovl %%eax,%%fs\n"
+-		"\tmovl %%eax,%%gs\n"
+-		"\tmovl %%eax,%%ss\n"
+-		::: "eax", "memory");
+-#undef STR
+-#undef __STR
+-}
++	pmd = pmd_offset(pud, va);
++	if (!pmd_present(*pmd))
++		set_pmd(pmd, __pmd(pa_page(pages[k++]) | _KERNPG_TABLE));
+ 
+-typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)(
+-					unsigned long indirection_page,
+-					unsigned long reboot_code_buffer,
+-					unsigned long start_address,
+-					unsigned int has_pae) ATTRIB_NORET;
++	pte = (pte_t *)page_address(pmd_page(*pmd)) + pte_index(va);
++	set_pte(pte, __pte(pa | _PAGE_KERNEL_EXEC));
+ 
+-const extern unsigned char relocate_new_kernel[];
+-extern void relocate_new_kernel_end(void);
+-const extern unsigned int relocate_new_kernel_size;
++	return k;
++}
+ 
+ /*
+  * A architecture hook called to validate the
+@@ -147,11 +101,38 @@ const extern unsigned int relocate_new_k
+  * Do what every setup is needed on image and the
+  * reboot code buffer to allow us to avoid allocations
+  * later.
+- *
+- * Currently nothing.
+  */
+ int machine_kexec_prepare(struct kimage *image)
+ {
++	void *control_page;
++	unsigned long pa;
++	int k;
++
++	memset(&image->arch_data, 0, sizeof(image->arch_data));
++
++	k = allocate_page_table_a(image);
++	if (k)
++		return k;
++
++	/* fill in control_page with assembly code */
++
++	control_page = page_address(image->control_code_page);
++	memcpy(control_page, relocate_new_kernel, relocate_new_kernel_size);
++
++	/* map the control_page at the virtual address of relocate_kernel.S */
++
++	pa = __pa(control_page);
++
++	k = create_mapping(image->arch_data.page_table_a[0], 
++			   &image->arch_data.page_table_a[1],
++			   (unsigned long)relocate_new_kernel, pa);
++
++	/* identity map the control_page */
++
++	create_mapping(image->arch_data.page_table_a[0], 
++		       &image->arch_data.page_table_a[k + 1],
++		       pa, pa);
++
+ 	return 0;
+ }
+ 
+@@ -170,45 +151,16 @@ void machine_kexec_cleanup(struct kimage
+ NORET_TYPE void machine_kexec(struct kimage *image)
+ {
+ 	unsigned long page_list;
+-	unsigned long reboot_code_buffer;
+-
++	unsigned long control_code;
++	unsigned long page_table_a;
+ 	relocate_new_kernel_t rnk;
+ 
+-	/* Interrupts aren't acceptable while we reboot */
+-	local_irq_disable();
+-
+-	/* Compute some offsets */
+-	reboot_code_buffer = page_to_pfn(image->control_code_page)
+-								<< PAGE_SHIFT;
+ 	page_list = image->head;
+-
+-	/* Set up an identity mapping for the reboot_code_buffer */
+-	identity_map_page(reboot_code_buffer);
+-
+-	/* copy it out */
+-	memcpy((void *)reboot_code_buffer, relocate_new_kernel,
+-						relocate_new_kernel_size);
+-
+-	/* The segment registers are funny things, they are
+-	 * automatically loaded from a table, in memory wherever you
+-	 * set them to a specific selector, but this table is never
+-	 * accessed again you set the segment to a different selector.
+-	 *
+-	 * The more common model is are caches where the behide
+-	 * the scenes work is done, but is also dropped at arbitrary
+-	 * times.
+-	 *
+-	 * I take advantage of this here by force loading the
+-	 * segments, before I zap the gdt with an invalid value.
+-	 */
+-	load_segments();
+-	/* The gdt & idt are now invalid.
+-	 * If you want to load them you must set up your own idt & gdt.
+-	 */
+-	set_gdt(phys_to_virt(0),0);
+-	set_idt(phys_to_virt(0),0);
++	control_code = __pa(page_address(image->control_code_page));
++	page_table_a = __pa(page_address(image->arch_data.page_table_a[0]));
+ 
+ 	/* now call it */
+-	rnk = (relocate_new_kernel_t) reboot_code_buffer;
+-	(*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae);
++	rnk = (relocate_new_kernel_t) relocate_new_kernel;
++	(*rnk)(page_list, control_code, image->start, 
++	       page_table_a, (unsigned long)cpu_has_pae);
+ }
+--- x/arch/i386/kernel/relocate_kernel.S
++++ x/arch/i386/kernel/relocate_kernel.S
+@@ -2,12 +2,20 @@
+  * relocate_kernel.S - put the kernel image in place to boot
+  * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
+  *
++ * 2006-05-19 Magnus Damm <damm@opensource.se>:
++ * - moved segment handling code from machine_kexec.c
++ * - gdt tables stolen from arch/i386/boot/setup.S
++ *
+  * This source code is licensed under the GNU General Public License,
+  * Version 2.  See the file COPYING for more details.
+  */
+ 
+ #include <linux/linkage.h>
++#include <asm/page.h>
+ 
++.text
++.align (1 << PAGE_SHIFT)
++	
+ 	/*
+ 	 * Must be relocatable PIC code callable as a C function, that once
+ 	 * it starts can not use the previous processes stack.
+@@ -18,18 +26,68 @@ relocate_new_kernel:
+ 	movl  4(%esp), %ebx /* page_list */
+ 	movl  8(%esp), %ebp /* reboot_code_buffer */
+ 	movl  12(%esp), %edx /* start address */
+-	movl  16(%esp), %ecx /* cpu_has_pae */
++	movl  16(%esp), %edi /* page_table_a */
++	movl  20(%esp), %ecx /* cpu_has_pae */
+ 
+ 	/* zero out flags, and disable interrupts */
+ 	pushl $0
+ 	popfl
+ 
++	/* switch to page_table_a */
++	movl	%edi, %eax
++	movl	%eax, %cr3
++
++	/* setup idt */
++
++	movl	%ebp, %eax
++	addl	$(idt_48 - relocate_new_kernel), %eax
++	lidtl	(%eax)
++
++	/* setup gdt */
++
++	movl	%ebp, %eax
++	addl	$(gdt - relocate_new_kernel), %eax
++	movl	%ebp, %esi
++	addl	$((gdt_48 - relocate_new_kernel) + 2), %esi
++	movl	%eax, (%esi)
++	
++	movl	%ebp, %eax
++	addl	$(gdt_48 - relocate_new_kernel), %eax
++	lgdtl	(%eax)
++
++	/* setup data segment registers */
++	
++	mov	$(gdt_ds - gdt), %eax
++	mov	%eax, %ds
++	mov	%eax, %es
++	mov	%eax, %fs
++	mov	%eax, %gs
++	mov	%eax, %ss
++
+ 	/* set a new stack at the bottom of our page... */
+ 	lea   4096(%ebp), %esp
+ 
++	/* load new code segment */
++
++	movl	%ebp, %esi
++	xorl	%eax, %eax
++	pushl	%eax
++	pushl	%esi
++	pushl	%eax
++	
++	movl	$(gdt_cs - gdt), %eax
++	pushl	%eax
++	
++	movl	%ebp, %eax
++	addl	$(identity_mapped - relocate_new_kernel),%eax
++	pushl	%eax
++	iretl
++
++identity_mapped:	
++
+ 	/* store the parameters back on the stack */
+ 	pushl   %edx /* store the start address */
+-
++	
+ 	/* Set cr0 to a known state:
+ 	 * 31 0 == Paging disabled
+ 	 * 18 0 == Alignment check disabled
+@@ -113,6 +171,36 @@ relocate_new_kernel:
+ 	xorl    %edi, %edi
+ 	xorl    %ebp, %ebp
+ 	ret
++
++	.align	16
++gdt:
++	.fill	1,8,0
++
++gdt_cs:	
++	.word	0xFFFF				# 4Gb - (0x100000*0x1000 = 4Gb)
++	.word	0				# base address = 0
++	.word	0x9A00				# code read/exec
++	.word	0x00CF				# granularity = 4096, 386
++						#  (+5th nibble of limit)
++gdt_ds:
++	.word	0xFFFF				# 4Gb - (0x100000*0x1000 = 4Gb)
++	.word	0				# base address = 0
++	.word	0x9200				# data read/write
++	.word	0x00CF				# granularity = 4096, 386
++						#  (+5th nibble of limit)
++gdt_end:
++	.align	4
++	
++	.word	0				# alignment byte
++idt_48:
++	.word	0				# idt limit = 0
++	.word	0, 0				# idt base = 0L
++
++	.word	0				# alignment byte
++gdt_48:
++	.word	gdt_end - gdt - 1		# gdt limit
++	.word	0, 0				# gdt base (filled in later)
++	
+ relocate_new_kernel_end:
+ 
+ 	.globl relocate_new_kernel_size
+--- x/include/asm-i386/kexec.h
++++ x/include/asm-i386/kexec.h
+@@ -29,7 +29,17 @@
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
+-struct kimage_arch {};
++struct kimage_arch {
++       /* page_table_a[] holds enough pages to create a new page table
++        * that maps the control page twice.
++        */
++
++#if defined(CONFIG_X86_PAE)
++       struct page *page_table_a[5]; /* (2 * pte) + (2 * pmd) + pgd */
++#else
++       struct page *page_table_a[3]; /* (2 * pte) + pgd */
++#endif
++};
+ 
+ /* CPU does not save ss and esp on stack if execution is already
+  * running in kernel mode at the time of NMI occurrence. This code
--- /dev/null
+++ x/patches/linux-2.6.16.13/4-linux-2.6.16-kexec_page_table_a_i386-xen.patch
@@ -0,0 +1,59 @@
+kexec: xen specific portions of the page table a patch for kexec
+
+Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
+
+ arch/i386/kernel/machine_kexec.c |   23 +++++++++++++++++++++++
+ 1 file changed, 23 insertions(+)
+
+--- x/arch/i386/kernel/machine_kexec.c	2006-07-12 13:16:20.000000000 +0900
++++ x/arch/i386/kernel/machine_kexec.c	2006-07-12 13:16:38.000000000 +0900
+@@ -23,15 +23,23 @@
+ #include <asm/desc.h>
+ #include <asm/system.h>
+ 
++#ifdef CONFIG_XEN
++#include <xen/interface/kexec.h>
++#endif
++
++#ifndef CONFIG_XEN
+ typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)(
+ 					unsigned long indirection_page,
+ 					unsigned long reboot_code_buffer,
+ 					unsigned long start_address,
+ 					unsigned long page_table_a,
+ 					unsigned long has_pae) ATTRIB_NORET;
++#endif
+ 
+ const extern unsigned char relocate_new_kernel[];
++#ifndef CONFIG_XEN
+ extern void relocate_new_kernel_end(void);
++#endif
+ const extern unsigned int relocate_new_kernel_size;
+ 
+ static int allocate_page_table_a(struct kimage *image)
+@@ -144,6 +152,7 @@
+ {
+ }
+ 
++#ifndef CONFIG_XEN
+ /*
+  * Do not allocate memory (or fail in any way) in machine_kexec().
+  * We are past the point of no return, committed to rebooting now.
+@@ -164,3 +173,17 @@
+ 	(*rnk)(page_list, control_code, image->start, 
+ 	       page_table_a, (unsigned long)cpu_has_pae);
+ }
++#endif
++
++#ifdef CONFIG_XEN
++void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
++{
++	struct kimage_arch *arch = &image->arch_data;
++	int k, n = sizeof(arch->page_table_a) / sizeof(arch->page_table_a[0]);
++
++	for (k = 0; k < n; k++)
++		xki->page_table_a[k] =
++			pfn_to_mfn(page_to_pfn(arch->page_table_a[k]))
++				<< PAGE_SHIFT;
++}
++#endif

[-- Attachment #6: 51.2.1.2-kexec-x86_64-upstream.patch --]
[-- Type: text/plain, Size: 30055 bytes --]

kexec: x86_64

This is the first x86_64 release of kexec for xen/dom0. The code is in an
early phase, but it compiles and kexec:ing into a Linux kernel seems to work 
well. Rebooting into a new kernel may work using kdump too, but register
saving support is still missing.

The x86 component is a prerequisite for this patch.

Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>
Signed-Off-By: Horms <horms@verge.net.au>

 buildconfigs/linux-defconfig_xen_x86_64                                    |    1 
 linux-2.6-xen-sparse/arch/x86_64/Kconfig                                   |    2 
 linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile                           |    2 
 linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c                        |   26 
 linux-2.6-xen-sparse/include/asm-x86_64/kexec-xen.h                        |   49 +
 linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h           |    7 
 linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h              |    2 
 patches/linux-2.6.16.13/2-linux-2.6.16-kexec_page_table_a_x86_64.patch     |  421 ++++++++++
 patches/linux-2.6.16.13/3-linux-2.6.16-kexec_page_table_a_x86_64-xen.patch |  151 +++
 xen/arch/x86/x86_64/entry.S                                                |    2 
 xen/arch/x86/x86_64/machine_kexec.c                                        |   21 
 xen/include/asm-x86/x86_64/elf.h                                           |   48 +
 xen/include/asm-x86/x86_64/kexec.h                                         |   33 
 13 files changed, 751 insertions(+), 14 deletions(-)

--- x/buildconfigs/linux-defconfig_xen_x86_64
+++ x/buildconfigs/linux-defconfig_xen_x86_64
@@ -139,6 +139,7 @@ CONFIG_SWIOTLB=y
 CONFIG_PHYSICAL_START=0x100000
 CONFIG_SECCOMP=y
 CONFIG_HZ_100=y
+CONFIG_KEXEC=y
 # CONFIG_HZ_250 is not set
 # CONFIG_HZ_1000 is not set
 CONFIG_HZ=100
--- x/linux-2.6-xen-sparse/arch/x86_64/Kconfig
+++ x/linux-2.6-xen-sparse/arch/x86_64/Kconfig
@@ -433,7 +433,7 @@ config X86_MCE_AMD
 
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && !X86_64_XEN
+	depends on EXPERIMENTAL && !XEN_UNPRIVILEGED_GUEST
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
--- x/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile
+++ x/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile
@@ -59,7 +59,7 @@ pci-dma-y			+= ../../i386/kernel/pci-dma
 microcode-$(subst m,y,$(CONFIG_MICROCODE))  := ../../i386/kernel/microcode-xen.o
 quirks-y			:= ../../i386/kernel/quirks-xen.o
 
-n-obj-xen := i8259.o reboot.o i8237.o smpboot.o trampoline.o
+n-obj-xen := i8259.o reboot.o i8237.o smpboot.o trampoline.o crash.o
 
 include $(srctree)/scripts/Makefile.xen
 
--- x/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c
+++ x/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c
@@ -79,6 +79,10 @@
 #include <asm/mach-xen/setup_arch_post.h>
 #include <xen/interface/memory.h>
 
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
 extern unsigned long start_pfn;
 extern struct edid_info edid_info;
 
@@ -446,6 +450,7 @@ static __init void parse_cmdline_early (
 		 * after a kernel panic.
 		 */
 		else if (!memcmp(from, "crashkernel=", 12)) {
+#ifndef CONFIG_XEN
 			unsigned long size, base;
 			size = memparse(from+12, &from);
 			if (*from == '@') {
@@ -456,6 +461,10 @@ static __init void parse_cmdline_early (
 				crashk_res.start = base;
 				crashk_res.end   = base + size - 1;
 			}
+#else
+			printk("Ignoring crashkernel command line, "
+			       "parameter will be supplied by xen\n");
+#endif
 		}
 #endif
 
@@ -801,10 +810,23 @@ void __init setup_arch(char **cmdline_p)
 #endif
 #endif	/* !CONFIG_XEN */
 #ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
 	if (crashk_res.start != crashk_res.end) {
 		reserve_bootmem(crashk_res.start,
 			crashk_res.end - crashk_res.start + 1);
 	}
+#else
+	{
+		xen_kexec_reserve_t reservation;
+		BUG_ON(HYPERVISOR_kexec(KEXEC_CMD_kexec_reserve, 0,
+					&reservation));
+		if (reservation.size) {
+			crashk_res.start = reservation.start;
+			crashk_res.end = reservation.start + 
+				reservation.size - 1;
+		}
+	}
+#endif
 #endif
 
 	paging_init();
@@ -950,6 +972,10 @@ void __init setup_arch(char **cmdline_p)
 	iommu_hole_init();
 #endif
 
+#ifdef CONFIG_KEXEC
+	request_resource(&ioport_resource, &crashk_res);
+#endif
+
 #ifdef CONFIG_XEN
 	{
 		struct physdev_set_iopl set_iopl;
--- /dev/null
+++ x/linux-2.6-xen-sparse/include/asm-x86_64/kexec-xen.h
@@ -0,0 +1,49 @@
+/*
+ * include/asm-x86_64/kexec-xen.h
+ *
+ * Created By: Horms <horms@verge.net.au>
+ */
+
+#ifndef _X86_64_KEXEC_XEN_H
+#define _X86_64_KEXEC_XEN_H
+
+#include <asm/ptrace.h>
+#include <asm/types.h>
+#include <xen/interface/arch-x86_64.h>
+
+static inline void crash_translate_regs(struct pt_regs *linux_regs,
+					struct cpu_user_regs *xen_regs)
+{
+	xen_regs->r15 = linux_regs->r15;
+	xen_regs->r14 = linux_regs->r14;
+	xen_regs->r13 = linux_regs->r13;
+	xen_regs->r12 = linux_regs->r12;
+	xen_regs->rbp = linux_regs->rbp;
+	xen_regs->rbx = linux_regs->rbx;
+	xen_regs->r11 = linux_regs->r11;
+	xen_regs->r10 = linux_regs->r10;
+	xen_regs->r9 = linux_regs->r9;
+	xen_regs->r8 = linux_regs->r8;
+	xen_regs->rax = linux_regs->rax;
+	xen_regs->rcx = linux_regs->rcx;
+	xen_regs->rdx = linux_regs->rdx;
+	xen_regs->rsi = linux_regs->rsi;
+	xen_regs->rdi = linux_regs->rdi;
+	xen_regs->rip = linux_regs->rip;
+	xen_regs->cs = linux_regs->cs;
+	xen_regs->rflags = linux_regs->eflags;
+	xen_regs->rsp = linux_regs->rsp;
+	xen_regs->ss = linux_regs->ss;
+}
+
+#endif /* _X86_64_KEXEC_XEN_H */
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- x/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h
+++ x/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h
@@ -369,4 +369,11 @@ HYPERVISOR_xenoprof_op(
 	return _hypercall2(int, xenoprof_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_kexec(
+	unsigned long op, unsigned int arg1, void * extra_args)
+{
+	return _hypercall3(int, kexec_op, op, arg1, extra_args);
+}
+
 #endif /* __HYPERCALL_H__ */
--- x/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h
+++ x/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h
@@ -90,6 +90,8 @@ extern unsigned long profile_pc(struct p
 #define profile_pc(regs) instruction_pointer(regs)
 #endif
 
+#include <linux/compiler.h>
+
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
 
 struct task_struct;
--- x/xen/arch/x86/x86_64/entry.S
+++ x/xen/arch/x86/x86_64/entry.S
@@ -569,6 +569,7 @@ ENTRY(hypercall_table)
         .quad do_event_channel_op
         .quad do_physdev_op
         .quad do_hvm_op
+        .quad do_kexec
         .rept NR_hypercalls-((.-hypercall_table)/8)
         .quad do_ni_hypercall
         .endr
@@ -609,6 +610,7 @@ ENTRY(hypercall_args_table)
         .byte 2 /* do_event_channel_op  */
         .byte 2 /* do_physdev_op        */
         .byte 2 /* do_hvm_op            */
+        .byte 3 /* do_kexec             */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
--- x/xen/arch/x86/x86_64/machine_kexec.c
+++ x/xen/arch/x86/x86_64/machine_kexec.c
@@ -4,18 +4,29 @@
  *
  * Created By: Horms <horms@verge.net.au>
  *
- * Should be losely based on arch/x86_64/kernel/machine_kexec.c
+ * Loosely based on arch/x86_64/kernel/machine_kexec.c
  */
-
-#include <xen/lib.h>       /* for printk() used in stub */
+  
 #include <xen/types.h>
 #include <public/kexec.h>
+#include <asm/fixmap.h>
+
+typedef void (*relocate_new_kernel_t)(unsigned long indirection_page,
+                                      unsigned long control_code_buffer,
+                                      unsigned long start_address,
+                                      unsigned long page_table_a,
+                                      unsigned long page_table_b);
 
 void machine_kexec(xen_kexec_image_t *image)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
-}
+    relocate_new_kernel_t rnk;
 
+    rnk = (relocate_new_kernel_t) fix_to_virt(FIX_KEXEC_PAGE);
+    (*rnk)(image->indirection_page, image->reboot_code_buffer, 
+           image->start_address, image->page_table_a[0],
+           image->page_table_b);
+  }
+  
 /*
  * Local variables:
  * mode: C
--- x/xen/include/asm-x86/x86_64/elf.h
+++ x/xen/include/asm-x86/x86_64/elf.h
@@ -3,17 +3,55 @@
  * 
  * Created By: Horms
  *
- * Should pull be based on include/asm-x86_64/elf.h:ELF_CORE_COPY_REGS
- * from Linux 2.6.16
+ * Based on include/asm-x86_64/elf.h:ELF_CORE_COPY_REGS from Linux 2.6.16
  */
 
 #ifndef __X86_ELF_X86_64_H__
 #define __X86_ELF_X86_64_H__
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/lib.h>
 
-#define ELF_CORE_COPY_REGS(pr_reg, regs)                                \
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+#include <xen/lib.h>
+
+/* XXX: Xen doesn't have orig_rax, so it is omitted.
+ *      Xen doesn't have threads, so fs and gs are read from the CPU and
+ *      thus values 21 and 22 are just duplicates of 25 and 26
+ *      respectively.  All these values could be passed from dom0 in the
+ *      case of it crashing, but does that help?
+ *
+ *      Lastly, I'm not sure why ds, es, fs and gs are read from
+ *      the CPU rather than regs, but linux does this
+ */
+
+#define ELF_CORE_COPY_REGS(pr_reg, regs)  do { \
+	unsigned v;						\
+	(pr_reg)[0] = (regs)->r15;				\
+	(pr_reg)[1] = (regs)->r14;				\
+	(pr_reg)[2] = (regs)->r13;				\
+	(pr_reg)[3] = (regs)->r12;				\
+	(pr_reg)[4] = (regs)->rbp;				\
+	(pr_reg)[5] = (regs)->rbx;				\
+	(pr_reg)[6] = (regs)->r11;				\
+	(pr_reg)[7] = (regs)->r10;				\
+	(pr_reg)[8] = (regs)->r9;				\
+	(pr_reg)[9] = (regs)->r8;				\
+	(pr_reg)[10] = (regs)->rax;				\
+	(pr_reg)[11] = (regs)->rcx;				\
+	(pr_reg)[12] = (regs)->rdx;				\
+	(pr_reg)[13] = (regs)->rsi;				\
+	(pr_reg)[14] = (regs)->rdi;				\
+	(pr_reg)[16] = (regs)->rip;			\
+	(pr_reg)[17] = (regs)->cs;			\
+	(pr_reg)[18] = (regs)->eflags;			\
+	(pr_reg)[19] = (regs)->rsp;			\
+	(pr_reg)[20] = (regs)->ss;			\
+	asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[21] = v;	\
+	asm("movl %%gs,%0" : "=r" (v)); (pr_reg)[22] = v;	\
+	asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v;	\
+	asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v;	\
+	asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v;	\
+	asm("movl %%gs,%0" : "=r" (v)); (pr_reg)[26] = v;	\
+} while(0);
 
 #endif /* __X86_ELF_X86_64_H__ */
 
--- x/xen/include/asm-x86/x86_64/kexec.h
+++ x/xen/include/asm-x86/x86_64/kexec.h
@@ -10,14 +10,43 @@
 #ifndef __X86_64_KEXEC_H__
 #define __X86_64_KEXEC_H__
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/lib.h>
 #include <xen/types.h>
 #include <public/xen.h>
 
+/*
+ * Saving the registers of the cpu on which panic occured in
+ * crash_kexec to save a valid sp. The registers of other cpus
+ * will be saved in machine_crash_shutdown while shooting down them.
+ */
 static void crash_setup_regs(struct cpu_user_regs *newregs,
 			     struct cpu_user_regs *oldregs)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+	if (oldregs)
+		memcpy(newregs, oldregs, sizeof(*newregs));
+	else {
+		__asm__ __volatile__("movq %%rbx,%0" : "=m"(newregs->rbx));
+		__asm__ __volatile__("movq %%rcx,%0" : "=m"(newregs->rcx));
+		__asm__ __volatile__("movq %%rdx,%0" : "=m"(newregs->rdx));
+		__asm__ __volatile__("movq %%rsi,%0" : "=m"(newregs->rsi));
+		__asm__ __volatile__("movq %%rdi,%0" : "=m"(newregs->rdi));
+		__asm__ __volatile__("movq %%rbp,%0" : "=m"(newregs->rbp));
+		__asm__ __volatile__("movq %%rax,%0" : "=m"(newregs->rax));
+		__asm__ __volatile__("movq %%rsp,%0" : "=m"(newregs->rsp));
+		__asm__ __volatile__("movq %%r8,%0" : "=m"(newregs->r8));
+		__asm__ __volatile__("movq %%r9,%0" : "=m"(newregs->r9));
+		__asm__ __volatile__("movq %%r10,%0" : "=m"(newregs->r10));
+		__asm__ __volatile__("movq %%r11,%0" : "=m"(newregs->r11));
+		__asm__ __volatile__("movq %%r12,%0" : "=m"(newregs->r12));
+		__asm__ __volatile__("movq %%r13,%0" : "=m"(newregs->r13));
+		__asm__ __volatile__("movq %%r14,%0" : "=m"(newregs->r14));
+		__asm__ __volatile__("movq %%r15,%0" : "=m"(newregs->r15));
+		__asm__ __volatile__("movl %%ss, %%eax;" :"=a"(newregs->ss));
+		__asm__ __volatile__("movl %%cs, %%eax;" :"=a"(newregs->cs));
+		__asm__ __volatile__("pushfq; popq %0" :"=m"(newregs->eflags));
+
+		newregs->rip = (unsigned long)current_text_addr();
+	}
 }
 
 #endif /* __X86_64_KEXEC_H__ */
--- /dev/null
+++ x/patches/linux-2.6.16.13/2-linux-2.6.16-kexec_page_table_a_x86_64.patch
@@ -0,0 +1,421 @@
+kexec: Avoid overwriting the current pgd (V2, x86_64)
+
+This patch upgrades the x86_64-specific kexec code to avoid overwriting the
+current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used
+to start a secondary kernel that dumps the memory of the previous kernel.
+
+The code introduces a new set of page tables called "page_table_a". These
+tables are used to provide an executable identity mapping without overwriting
+the current pgd. The already existing page table is renamed to "page_table_b".
+
+KEXEC_CONTROL_CODE_SIZE is changed into a single page. This updated version of
+the patch also moves the segment handling code into relocate_kernel.S.
+
+Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
+---
+
+ The patch has been tested with regular kexec and CONFIG_CRASH_DUMP.
+ Applies on top of 2.6.16 and 2.6.17-rc4.
+
+ arch/x86_64/kernel/machine_kexec.c   |  193 +++++++++++++++++-----------------
+ arch/x86_64/kernel/relocate_kernel.S |   84 +++++++++++++-
+ include/asm-x86_64/kexec.h           |   15 ++
+ 3 files changed, 189 insertions(+), 103 deletions(-)
+
+--- x/arch/x86_64/kernel/machine_kexec.c
++++ x/arch/x86_64/kernel/machine_kexec.c
+@@ -2,6 +2,10 @@
+  * machine_kexec.c - handle transition of Linux booting another kernel
+  * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
+  *
++ * 2006-05-19 Magnus Damm <damm@opensource.se>:
++ * - rewrote identity map code to avoid overwriting current pgd
++ * - moved segment handling code into relocate_kernel.S
++ *
+  * This source code is licensed under the GNU General Public License,
+  * Version 2.  See the file COPYING for more details.
+  */
+@@ -96,81 +100,110 @@ out:
+ }
+ 
+ 
+-static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
++static int create_page_table_b(struct kimage *image)
+ {
+-	pgd_t *level4p;
+-	level4p = (pgd_t *)__va(start_pgtable);
+- 	return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
+-}
++	struct kimage_arch *arch = &image->arch_data;
+ 
+-static void set_idt(void *newidt, u16 limit)
+-{
+-	struct desc_ptr curidt;
++	arch->page_table_b = kimage_alloc_control_pages(image, 0);
+ 
+-	/* x86-64 supports unaliged loads & stores */
+-	curidt.size    = limit;
+-	curidt.address = (unsigned long)newidt;
++	if (!arch->page_table_b)
++		return -ENOMEM;
+ 
+-	__asm__ __volatile__ (
+-		"lidtq %0\n"
+-		: : "m" (curidt)
+-		);
+-};
++ 	return init_level4_page(image, page_address(arch->page_table_b),
++				0, end_pfn << PAGE_SHIFT);
++}
+ 
++typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
++					unsigned long control_code_buffer,
++					unsigned long start_address,
++					unsigned long page_table_a,
++					unsigned long page_table_b) ATTRIB_NORET;
++
++const extern unsigned char relocate_new_kernel[];
++const extern unsigned long relocate_new_kernel_size;
+ 
+-static void set_gdt(void *newgdt, u16 limit)
++static int allocate_page_table_a(struct kimage *image)
+ {
+-	struct desc_ptr curgdt;
++	struct kimage_arch *arch = &image->arch_data;
++	struct page *page;
++	int k = sizeof(arch->page_table_a) / sizeof(arch->page_table_a[0]);
+ 
+-	/* x86-64 supports unaligned loads & stores */
+-	curgdt.size    = limit;
+-	curgdt.address = (unsigned long)newgdt;
++	for (; k > 0; k--) {
++		page = kimage_alloc_control_pages(image, 0);
++		if (!page)
++			return -ENOMEM;
+ 
+-	__asm__ __volatile__ (
+-		"lgdtq %0\n"
+-		: : "m" (curgdt)
+-		);
+-};
++		clear_page(page_address(page));
++		arch->page_table_a[k - 1] = page;
++	}
+ 
+-static void load_segments(void)
+-{
+-	__asm__ __volatile__ (
+-		"\tmovl %0,%%ds\n"
+-		"\tmovl %0,%%es\n"
+-		"\tmovl %0,%%ss\n"
+-		"\tmovl %0,%%fs\n"
+-		"\tmovl %0,%%gs\n"
+-		: : "a" (__KERNEL_DS) : "memory"
+-		);
++	return 0;
+ }
+ 
+-typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
+-					unsigned long control_code_buffer,
+-					unsigned long start_address,
+-					unsigned long pgtable) ATTRIB_NORET;
++#define _PAGE_KERNEL_EXEC __PAGE_KERNEL_EXEC
++#define pa_page(page) __pa_symbol(page_address(page)) /* __pa() miscompiles */
+ 
+-const extern unsigned char relocate_new_kernel[];
+-const extern unsigned long relocate_new_kernel_size;
++static int create_mapping(struct page *root, struct page **pages, 
++			  unsigned long va, unsigned long pa)
++{
++	pgd_t *pgd;
++	pud_t *pud;
++	pmd_t *pmd;
++	pte_t *pte;
++	int k = 0;
++
++	pgd = (pgd_t *)page_address(root) + pgd_index(va);
++	if (!pgd_present(*pgd))
++		set_pgd(pgd, __pgd(pa_page(pages[k++]) | _KERNPG_TABLE));
++
++	pud = pud_offset(pgd, va);
++	if (!pud_present(*pud))
++		set_pud(pud, __pud(pa_page(pages[k++]) | _KERNPG_TABLE));
++
++	pmd = pmd_offset(pud, va);
++	if (!pmd_present(*pmd))
++		set_pmd(pmd, __pmd(pa_page(pages[k++]) | _KERNPG_TABLE));
++
++	pte = (pte_t *)page_address(pmd_page(*pmd)) + pte_index(va);
++	set_pte(pte, __pte(pa | _PAGE_KERNEL_EXEC));
++
++	return k;
++}
+ 
+ int machine_kexec_prepare(struct kimage *image)
+ {
+-	unsigned long start_pgtable, control_code_buffer;
+-	int result;
++	void *control_page;
++	unsigned long pa;
++	int k;
+ 
+-	/* Calculate the offsets */
+-	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+-	control_code_buffer = start_pgtable + PAGE_SIZE;
+-
+-	/* Setup the identity mapped 64bit page table */
+-	result = init_pgtable(image, start_pgtable);
+-	if (result)
+-		return result;
+-
+-	/* Place the code in the reboot code buffer */
+-	memcpy(__va(control_code_buffer), relocate_new_kernel,
+-						relocate_new_kernel_size);
++	memset(&image->arch_data, 0, sizeof(image->arch_data));
+ 
+-	return 0;
++	k = allocate_page_table_a(image);
++	if (k)
++		return k;
++
++	/* fill in control_page with assembly code */
++
++	control_page = page_address(image->control_code_page);
++	memcpy(control_page, relocate_new_kernel, relocate_new_kernel_size);
++
++	/* map the control_page at the virtual address of relocate_kernel.S */
++
++	pa = __pa(control_page);
++
++	k = create_mapping(image->arch_data.page_table_a[0], 
++			   &image->arch_data.page_table_a[1],
++			   (unsigned long)relocate_new_kernel, pa);
++
++	/* identity map the control_page */
++
++	create_mapping(image->arch_data.page_table_a[0], 
++		       &image->arch_data.page_table_a[k + 1],
++		       pa, pa);
++
++	/* create identity mapped page table aka page_table_b */
++
++	return create_page_table_b(image);
+ }
+ 
+ void machine_kexec_cleanup(struct kimage *image)
+@@ -185,47 +218,17 @@ void machine_kexec_cleanup(struct kimage
+ NORET_TYPE void machine_kexec(struct kimage *image)
+ {
+ 	unsigned long page_list;
+-	unsigned long control_code_buffer;
+-	unsigned long start_pgtable;
++	unsigned long control_code;
++	unsigned long page_table_a;
++	unsigned long page_table_b;
+ 	relocate_new_kernel_t rnk;
+ 
+-	/* Interrupts aren't acceptable while we reboot */
+-	local_irq_disable();
+-
+-	/* Calculate the offsets */
+ 	page_list = image->head;
+-	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+-	control_code_buffer = start_pgtable + PAGE_SIZE;
++	control_code = __pa(page_address(image->control_code_page));
++	page_table_a = __pa(page_address(image->arch_data.page_table_a[0]));
++	page_table_b = __pa(page_address(image->arch_data.page_table_b));
+ 
+-	/* Set the low half of the page table to my identity mapped
+-	 * page table for kexec.  Leave the high half pointing at the
+-	 * kernel pages.   Don't bother to flush the global pages
+-	 * as that will happen when I fully switch to my identity mapped
+-	 * page table anyway.
+-	 */
+-	memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2);
+-	__flush_tlb();
+-
+-
+-	/* The segment registers are funny things, they are
+-	 * automatically loaded from a table, in memory wherever you
+-	 * set them to a specific selector, but this table is never
+-	 * accessed again unless you set the segment to a different selector.
+-	 *
+-	 * The more common model are caches where the behide
+-	 * the scenes work is done, but is also dropped at arbitrary
+-	 * times.
+-	 *
+-	 * I take advantage of this here by force loading the
+-	 * segments, before I zap the gdt with an invalid value.
+-	 */
+-	load_segments();
+-	/* The gdt & idt are now invalid.
+-	 * If you want to load them you must set up your own idt & gdt.
+-	 */
+-	set_gdt(phys_to_virt(0),0);
+-	set_idt(phys_to_virt(0),0);
+ 	/* now call it */
+-	rnk = (relocate_new_kernel_t) control_code_buffer;
+-	(*rnk)(page_list, control_code_buffer, image->start, start_pgtable);
++	rnk = (relocate_new_kernel_t) relocate_new_kernel;
++	(*rnk)(page_list, control_code, image->start, page_table_a, page_table_b);
+ }
+--- x/arch/x86_64/kernel/relocate_kernel.S
++++ x/arch/x86_64/kernel/relocate_kernel.S
+@@ -2,11 +2,18 @@
+  * relocate_kernel.S - put the kernel image in place to boot
+  * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
+  *
++ * 2006-05-19 Magnus Damm <damm@opensource.se>:
++ * - moved segment handling code from machine_kexec.c
++ *
+  * This source code is licensed under the GNU General Public License,
+  * Version 2.  See the file COPYING for more details.
+  */
+ 
+ #include <linux/linkage.h>
++#include <asm/page.h>
++
++.text
++.align (1 << PAGE_SHIFT)
+ 
+ 	/*
+ 	 * Must be relocatable PIC code callable as a C function, that once
+@@ -18,21 +25,69 @@ relocate_new_kernel:
+ 	/* %rdi page_list
+ 	 * %rsi reboot_code_buffer
+ 	 * %rdx start address
+-	 * %rcx page_table
+-	 * %r8  arg5
++	 * %rcx page_table_a
++	 * %r8  page_table_b
+ 	 * %r9  arg6
+ 	 */
+-
++	
+ 	/* zero out flags, and disable interrupts */
+ 	pushq $0
+ 	popfq
+ 
++	/* switch to page_table_a */
++	movq    %rcx, %cr3
++
++	/* setup idt */
++
++	movq	%rsi, %rax
++	addq	$(idt_48 - relocate_new_kernel), %rax
++	lidtq	(%rax)
++
++	/* setup gdt */
++
++	movq	%rsi, %rax
++	addq	$(gdt - relocate_new_kernel), %rax
++	movq	%rsi, %r9
++	addq	$((gdt_48 - relocate_new_kernel) + 2), %r9
++	movq	%rax, (%r9)
++	
++	movq	%rsi, %rax
++	addq	$(gdt_48 - relocate_new_kernel), %rax
++	lgdtq	(%rax)
++
++	/* setup data segment registers */
++
++	xorl	%eax,%eax
++	movl	%eax, %ds
++	movl	%eax, %es
++	movl	%eax, %fs
++	movl	%eax, %gs
++	movl	%eax, %ss
++
+ 	/* set a new stack at the bottom of our page... */
+ 	lea   4096(%rsi), %rsp
+ 
++	/* load new code segment */
++
++	movq	%rsp, %rcx
++	xorq	%rax, %rax
++	pushq	%rax                                              /* SS */
++	pushq	%rcx                                              /* ESP */
++	pushq	%rax                                              /* RFLAGS */
++
++	movq	$(gdt_code - gdt), %rax
++	pushq	%rax                                              /* CS */
++
++	movq	%rsi, %rax
++	addq	$(identity_mapped - relocate_new_kernel), %rax
++	pushq	%rax                                              /* RIP */
++
++	iretq
++	
++identity_mapped:
+ 	/* store the parameters back on the stack */
+ 	pushq	%rdx /* store the start address */
+-
++	
+ 	/* Set cr0 to a known state:
+ 	 * 31 1 == Paging enabled
+ 	 * 18 0 == Alignment check disabled
+@@ -69,7 +124,7 @@ relocate_new_kernel:
+ 	/* Switch to the identity mapped page tables,
+ 	 * and flush the TLB.
+ 	*/
+-	movq	%rcx, %cr3
++	movq	%r8, %cr3
+ 
+ 	/* Do the copies */
+ 	movq	%rdi, %rcx 	/* Put the page_list in %rcx */
+@@ -136,6 +191,25 @@ relocate_new_kernel:
+ 	xorq	%r15, %r15
+ 
+ 	ret
++	.align	16
++gdt:
++	.long   0x00000000  /* NULL descriptor */
++	.long   0x00000000
++gdt_code:
++	.long   0x00000000  /* code descriptor */
++	.long   0x00209800
++
++gdt_end:
++	.align	4
++	
++idt_48:
++	.word	0				# idt limit = 0
++	.quad	0, 0				# idt base = 0L
++
++gdt_48:
++	.word	gdt_end - gdt - 1		# gdt limit
++	.quad	0, 0				# gdt base (filled in later)
++
+ relocate_new_kernel_end:
+ 
+ 	.globl relocate_new_kernel_size
+--- x/include/asm-x86_64/kexec.h
++++ x/include/asm-x86_64/kexec.h
+@@ -21,15 +21,24 @@
+ /* Maximum address we can use for the control pages */
+ #define KEXEC_CONTROL_MEMORY_LIMIT     (0xFFFFFFFFFFUL)
+ 
+-/* Allocate one page for the pdp and the second for the code */
+-#define KEXEC_CONTROL_CODE_SIZE  (4096UL + 4096UL)
++#define KEXEC_CONTROL_CODE_SIZE  4096
+ 
+ /* The native architecture */
+ #define KEXEC_ARCH KEXEC_ARCH_X86_64
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
+-struct kimage_arch {};
++struct kimage_arch {
++	/* page_table_a[] holds enough pages to create a new page table
++	 * that maps the control page twice..
++	 *
++	 * page_table_b points to the root page of a page table which is used
++	 * to provide identity mapping of all ram.
++	 */
++
++	struct page *page_table_a[7]; /* 2 * (pte + pud + pmd) + pgd */
++	struct page *page_table_b;
++};
+ 
+ /*
+  * Saving the registers of the cpu on which panic occured in
--- /dev/null
+++ x/patches/linux-2.6.16.13/3-linux-2.6.16-kexec_page_table_a_x86_64-xen.patch
@@ -0,0 +1,151 @@
+ arch/x86_64/kernel/machine_kexec.c |   84 +++++++++++++++++++++++++++++++++---
+ 1 file changed, 77 insertions(+), 7 deletions(-)
+
+--- x/arch/x86_64/kernel/machine_kexec.c
++++ x/arch/x86_64/kernel/machine_kexec.c
+@@ -19,6 +19,50 @@
+ #include <asm/mmu_context.h>
+ #include <asm/io.h>
+ 
++#ifdef CONFIG_XEN
++#include <xen/interface/kexec.h>
++#include <xen/interface/memory.h>
++
++#define x__pmd(x) ((pmd_t) { (x) } )
++#define x__pud(x) ((pud_t) { (x) } )
++#define x__pgd(x) ((pgd_t) { (x) } )
++
++#define x_pmd_val(x)	((x).pmd)
++#define x_pud_val(x)	((x).pud)
++#define x_pgd_val(x)	((x).pgd)
++
++static inline void x_set_pmd(pmd_t *dst, pmd_t val)
++{
++        x_pmd_val(*dst) = x_pmd_val(val); 
++} 
++
++static inline void x_set_pud(pud_t *dst, pud_t val)
++{
++	x_pud_val(*dst) = phys_to_machine(x_pud_val(val));
++}
++
++static inline void x_pud_clear (pud_t *pud)
++{
++	x_pud_val(*pud) = 0;
++}
++
++static inline void x_set_pgd(pgd_t *dst, pgd_t val)
++{
++	x_pgd_val(*dst) = phys_to_machine(x_pgd_val(val)); 
++} 
++
++static inline void x_pgd_clear (pgd_t * pgd)
++{
++	x_pgd_val(*pgd) = 0; 
++}
++
++#define MY_LARGE_EXEC _PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_PSE
++#define MY_TABLE _PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY
++#else
++#define MY_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
++#define MY_TABLE _KERNPG_TABLE
++#endif /* CONFIG_XEN */
++
+ static void init_level2_page(pmd_t *level2p, unsigned long addr)
+ {
+ 	unsigned long end_addr;
+@@ -26,7 +70,7 @@ static void init_level2_page(pmd_t *leve
+ 	addr &= PAGE_MASK;
+ 	end_addr = addr + PUD_SIZE;
+ 	while (addr < end_addr) {
+-		set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
++		x_set_pmd(level2p++, x__pmd(addr | MY_LARGE_EXEC));
+ 		addr += PMD_SIZE;
+ 	}
+ }
+@@ -51,12 +95,12 @@ static int init_level3_page(struct kimag
+ 		}
+ 		level2p = (pmd_t *)page_address(page);
+ 		init_level2_page(level2p, addr);
+-		set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
++		x_set_pud(level3p++, x__pud(__pa(level2p) | MY_TABLE));
+ 		addr += PUD_SIZE;
+ 	}
+ 	/* clear the unused entries */
+ 	while (addr < end_addr) {
+-		pud_clear(level3p++);
++		x_pud_clear(level3p++);
+ 		addr += PUD_SIZE;
+ 	}
+ out:
+@@ -87,12 +131,12 @@ static int init_level4_page(struct kimag
+ 		if (result) {
+ 			goto out;
+ 		}
+-		set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
++		x_set_pgd(level4p++, x__pgd(__pa(level3p) | MY_TABLE));
+ 		addr += PGDIR_SIZE;
+ 	}
+ 	/* clear the unused entries */
+ 	while (addr < end_addr) {
+-		pgd_clear(level4p++);
++		x_pgd_clear(level4p++);
+ 		addr += PGDIR_SIZE;
+ 	}
+ out:
+@@ -103,14 +147,21 @@ out:
+ static int create_page_table_b(struct kimage *image)
+ {
+ 	struct kimage_arch *arch = &image->arch_data;
++	unsigned long last_page;
+ 
+ 	arch->page_table_b = kimage_alloc_control_pages(image, 0);
+ 
+ 	if (!arch->page_table_b)
+ 		return -ENOMEM;
+ 
++#ifdef CONFIG_XEN
++	last_page = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
++#else
++	last_page = end_pfn;
++#endif
++
+  	return init_level4_page(image, page_address(arch->page_table_b),
+-				0, end_pfn << PAGE_SHIFT);
++				0, last_page << PAGE_SHIFT);
+ }
+ 
+ typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
+@@ -211,6 +262,7 @@ void machine_kexec_cleanup(struct kimage
+ 	return;
+ }
+ 
++#ifndef CONFIG_XEN
+ /*
+  * Do not allocate memory (or fail in any way) in machine_kexec().
+  * We are past the point of no return, committed to rebooting now.
+@@ -230,5 +282,23 @@ NORET_TYPE void machine_kexec(struct kim
+ 
+ 	/* now call it */
+ 	rnk = (relocate_new_kernel_t) relocate_new_kernel;
+-	(*rnk)(page_list, control_code, image->start, page_table_a, page_table_b);
++	(*rnk)(page_list, control_code, image->start, page_table_a,
++	       page_table_b);
++}
++#endif /* !CONFIG_XEN */
++
++#ifdef CONFIG_XEN
++void machine_kexec_setup_load_arg(xen_kexec_image_t *xki,struct kimage *image)
++{
++	struct kimage_arch *arch = &image->arch_data;
++	int k, n = sizeof(arch->page_table_a) / sizeof(arch->page_table_a[0]);
++
++	for (k = 0; k < n; k++)
++		xki->page_table_a[k] = 
++			pfn_to_mfn(page_to_pfn(arch->page_table_a[k]))
++				<< PAGE_SHIFT;
++
++	xki->page_table_b =
++		pfn_to_mfn(page_to_pfn(arch->page_table_b)) << PAGE_SHIFT;
+ }
++#endif /* CONFIG_XEN */

[-- Attachment #7: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 68+ messages in thread

* [PATCH] kexec: framework and i386 (Take XIV)
  2006-08-11  7:48                                                         ` [PATCH] kexec: framework and i386 (Take XIII) Horms
@ 2006-08-31  7:43                                                           ` Horms
  2006-08-31  8:55                                                             ` Akio Takebe
  2006-09-05 11:43                                                             ` [Xen-devel] " Kazuo Moriwaka
  0 siblings, 2 replies; 68+ messages in thread
From: Horms @ 2006-08-31  7:43 UTC (permalink / raw)
  To: Keir Fraser
  Cc: Ian Pratt, Kazuo Moriwaka, xen-devel, Zou, Nanhai, Akio Takebe,
	Isaku Yamahata, Magnus Damm, Mark Williamson, xen-ia64-devel

[-- Attachment #1: Type: text/plain, Size: 2519 bytes --]

Hi,

here is an update of the kexec/kdump patchset.

Summary:

* Up port to xen-unstable.hg-11296 (45f6ee334fcc)
  - kexec hypercall number fragment is now in xen-unstable
* Make kexec_page_to_pfn and friends architecture specific
  - this abstraction is needed to support ia64
* Use kexec_page_to_pfn in machine_kexec_setup_load_arg()
  - this abstraction is needed to support ia64
* Rename do_kexec to do_kexec_op to make it consistent with other
  hypercalls
* Add ppc stubs
* Add ia64 support

Architectures:

x86_32: 

Seems to be working fine

x86_64:

Probably working fine, but I can't test this as dom0 refuses to boot for
me on xen-unstable-11388 (50aea0ec406b).  That is, even without the
kexec patches. I'm not sure what the problem is and I've decided to
get these patches out now and investigate later.

ia64:

This patchset also, for the first time, includes ia64 code.
Please note that this currently does _not_ work. I am actually
struggling to work out why, and would really appreciate it
if someone could cast an eye over it.

One possible area of concern is that relocate_kernel wipes out TLB
entries. However many of the entries instated in
arch/ia64/xen/xenasm.S:ia64_new_rr7() are not wiped. In particular,
VHPT_ADDR, Shared info, and Map mapped_reg are not handled by
relocate_kernel(), and the handling of current seems to be different.

There are also problems with constants inside kexec_fake_sal_rendez.
However this function probably also suffers the same problems as
relocate_kernel. And it is easy not to run kexec_fake_sal_rendez
by booting xen with maxcpus=1, thus avoiding calling
kexec_fake_sal_rendez, which is used in cpu shutdown.

ppc:

stubs only

Patches

   1. 51.1-kexec-generic-upstream.patch
      * Common code for all architectures,
        the basic plumbing for kexec/kdump

   2. 51.1.1-kexec-trigger_crash_dump.patch
      * xen-console trigger crash_dump
      * Depends on 1

   3. 51.2.1-kexec-x86-upstream.patch
      * Glue between 1, and 4 and 5.
      * Depends on 1

   4. 51.2.1.1-kexec-x86_32-upstream.patch
      * Kexec/kdump for x86_32
      * Depends on 3 (and 1)

   5. 51.2.31.2-kexec-x86_64-upstream.patch
      * Kexec/kdump for x86_64
      * Depends on 3 (and 1)

   6. 51.2.2-kexec-ia64-upstream.patch
      * Kexec/kdump for ia64
      * Depends on 1

Discussion:

Email is always good. Also my partner in crime, Magnus Damm,
will be at Xen Summit.

-- 
Horms
  H: http://www.vergenet.net/~horms/
  W: http://www.valinux.co.jp/en/


[-- Attachment #2: 51.1-kexec-generic-upstream.patch --]
[-- Type: text/plain, Size: 40083 bytes --]

kexec: framework

This is an implementation of kexec for dom0/xen, that allows
kexecing of the physical machine from xen. The approach taken is
to move the architecture-dependent kexec code into a new hypercall.

This patch only includes the framework; it cannot be used without
architecture-dependent hooks. However, the code should compile as is.

Signed-Off-By: Horms <horms@verge.net.au>
Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>

 linux-2.6-xen-sparse/drivers/xen/core/Makefile                      |    1 
 linux-2.6-xen-sparse/drivers/xen/core/crash.c                       |   49 +
 linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c               |   78 ++
 linux-2.6-xen-sparse/drivers/xen/core/reboot.c                      |    4 
 patches/linux-2.6.16.13/kexec-generic.patch                         |  283 ++++++++++
 patches/linux-2.6.16.13/linux-2.6.16-kexec_page_table_a_stubs.patch |   85 +++
 patches/linux-2.6.16.13/series                                      |    2 
 xen/arch/ia64/xen/Makefile                                          |    2 
 xen/arch/ia64/xen/crash.c                                           |   26 
 xen/arch/ia64/xen/machine_kexec.c                                   |   46 +
 xen/arch/powerpc/Makefile                                           |    2 
 xen/arch/powerpc/crash.c                                            |   26 
 xen/arch/powerpc/machine_kexec.c                                    |   46 +
 xen/arch/x86/Makefile                                               |    2 
 xen/arch/x86/crash.c                                                |   26 
 xen/arch/x86/machine_kexec.c                                        |   46 +
 xen/common/Makefile                                                 |    1 
 xen/common/kexec.c                                                  |  211 +++++++
 xen/common/page_alloc.c                                             |   33 -
 xen/drivers/char/console.c                                          |    3 
 xen/include/asm-ia64/kexec.h                                        |   32 +
 xen/include/asm-x86/kexec.h                                         |   31 +
 xen/include/public/kexec.h                                          |   85 +++
 xen/include/xen/elfcore.h                                           |   73 ++
 xen/include/xen/hypercall.h                                         |    6 
 xen/include/xen/kexec.h                                             |   33 +
 xen/include/xen/mm.h                                                |    1 
 27 files changed, 1222 insertions(+), 11 deletions(-)

--- x/linux-2.6-xen-sparse/drivers/xen/core/Makefile
+++ x/linux-2.6-xen-sparse/drivers/xen/core/Makefile
@@ -11,3 +11,4 @@ obj-$(CONFIG_XEN_SYSFS)		+= xen_sysfs.o
 obj-$(CONFIG_XEN_SKBUFF)	+= skbuff.o
 obj-$(CONFIG_XEN_REBOOT)	+= reboot.o
 obj-$(CONFIG_XEN_SMPBOOT)	+= smpboot.o
+obj-$(CONFIG_KEXEC)		+= machine_kexec.o crash.o
--- /dev/null
+++ x/linux-2.6-xen-sparse/drivers/xen/core/crash.c
@@ -0,0 +1,49 @@
+/*
+ * drivers/xen/core/crash.c
+ * Architecture independent functions for kexec based crash dumps in xen.
+ *
+ * Created by: Horms <horms@verge.net.au>
+ *
+ */
+
+#include <asm/ptrace.h>
+#include <linux/types.h>
+#include <asm/kexec-xen.h>
+#include <asm/hypervisor.h>
+#include <asm/system.h>
+#include <linux/preempt.h>
+#include <linux/smp.h>
+#include <asm/hw_irq.h>
+#include <xen/interface/kexec.h>
+
+/* 
+ * This passes the registers down to the hypervisor and has it kexec().
+ * This is a bit different to the linux implementation which
+ * has this call save registers and stop CPUs and then goes into
+ * machine_kexec() later. But for Xen it makes more sense to
+ * have the kexec hypercall do everything, and this call
+ * has the registers parameter that is needed.
+ * to the hypervisor to allow the hypervisor to kdump itself
+ * on an internal panic 
+ */
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+	struct cpu_user_regs xen_regs;
+	printk("machine_crash_shutdown: %d\n", smp_processor_id());
+	local_irq_disable();
+#ifdef CONFIG_X86_IO_APIC
+	disable_IO_APIC();
+#endif
+	crash_translate_regs(regs, &xen_regs);
+	HYPERVISOR_kexec(KEXEC_CMD_kexec, KEXEC_TYPE_CRASH, &xen_regs);
+}
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- /dev/null
+++ x/linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c
@@ -0,0 +1,78 @@
+/*
+ * drivers/xen/core/machine_kexec.c 
+ * handle transition of Linux booting another kernel
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Loosely based on arch/i386/kernel/machine_kexec.c
+ */
+
+#include <linux/kexec.h>
+#include <xen/interface/kexec.h>
+#include <linux/mm.h>
+#include <asm/hypercall.h>
+#include <asm/kexec-xen.h>
+
+extern void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, 
+					 struct kimage *image);
+
+static void setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
+{
+	memset(xki, 0, sizeof(*xki));
+
+	machine_kexec_setup_load_arg(xki, image);
+
+	xki->indirection_page = image->head;
+	xki->reboot_code_buffer = 
+		kexec_page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+	xki->start_address = image->start;
+}
+
+/*
+ * Load the image into xen so xen can kdump itself
+ * This might have been done in prepare, but prepare
+ * is currently called too early. It might make sense
+ * to move prepare, but for now, just add an extra hook.
+ */
+int xen_machine_kexec_load(struct kimage *image)
+{
+	xen_kexec_image_t xki;
+
+	setup_load_arg(&xki, image);
+	return HYPERVISOR_kexec(KEXEC_CMD_kexec_load, image->type, &xki);
+}
+
+/*
+ * Unload the image that was stored by machine_kexec_load()
+ * This might have been done in machine_kexec_cleanup() but it
+ * is called too late, and it's possible xen could try and kdump
+ * using resources that have been freed.
+ */
+void xen_machine_kexec_unload(struct kimage *image)
+{
+	HYPERVISOR_kexec(KEXEC_CMD_kexec_unload, image->type, NULL);
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ *
+ * This has the hypervisor move to the preferred reboot CPU,
+ * stop all CPUs and kexec. That is it combines machine_shutdown()
+ * and machine_kexec() in Linux kexec terms.
+ */
+NORET_TYPE void xen_machine_kexec(struct kimage *image)
+{
+	HYPERVISOR_kexec(KEXEC_CMD_kexec, image->type, NULL);
+	panic("KEXEC_CMD_kexec hypercall should not return\n");
+}
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- x/linux-2.6-xen-sparse/drivers/xen/core/reboot.c
+++ x/linux-2.6-xen-sparse/drivers/xen/core/reboot.c
@@ -65,6 +65,10 @@ void machine_power_off(void)
 	HYPERVISOR_shutdown(SHUTDOWN_poweroff);
 }
 
+#ifdef CONFIG_KEXEC
+void machine_shutdown(void) { }
+#endif
+
 int reboot_thru_bios = 0;	/* for dmi_scan.c */
 EXPORT_SYMBOL(machine_restart);
 EXPORT_SYMBOL(machine_halt);
--- x/xen/arch/ia64/xen/Makefile
+++ x/xen/arch/ia64/xen/Makefile
@@ -25,5 +25,7 @@ obj-y += xensetup.o
 obj-y += xentime.o
 obj-y += flushd.o
 obj-y += privop_stat.o
+obj-y += machine_kexec.o
+obj-y += crash.o
 
 obj-$(crash_debug) += gdbstub.o
--- /dev/null
+++ x/xen/arch/ia64/xen/crash.c
@@ -0,0 +1,26 @@
+/**********************************************************************
+ * arch/ia64/xen/crash.c
+ *
+ * Created By: Horms
+ * 
+ */
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/kexec.h>
+
+void machine_crash_shutdown(struct cpu_user_regs *regs)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
--- /dev/null
+++ x/xen/arch/ia64/xen/machine_kexec.c
@@ -0,0 +1,46 @@
+/**********************************************************************
+ * arch/ia64/xen/machine_kexec.c
+ *
+ * Created By: Horms
+ * 
+ */
+
+#include <xen/lib.h>       /* for printk() used in stubs */
+#include <xen/types.h>
+#include <public/kexec.h>
+
+int machine_kexec_load(int type, xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    return -1;
+}
+
+void machine_kexec_unload(int type, xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+void machine_kexec_reserved(xen_kexec_reserve_t *reservation)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+void machine_kexec(xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+void machine_shutdown(xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/arch/powerpc/Makefile
+++ x/xen/arch/powerpc/Makefile
@@ -35,6 +35,8 @@ obj-y += setup.o
 obj-y += smp.o
 obj-y += time.o
 obj-y += usercopy.o
+obj-y += machine_kexec.o
+obj-y += crash.o
 
 obj-$(debug) += 0opt.o
 obj-$(crash_debug) += gdbstub.o
--- /dev/null
+++ x/xen/arch/powerpc/crash.c
@@ -0,0 +1,26 @@
+/**********************************************************************
+ * arch/powerpc/crash.c
+ *
+ * Created By: Horms
+ * 
+ */
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/kexec.h>
+
+void machine_crash_shutdown(struct cpu_user_regs *regs)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
--- /dev/null
+++ x/xen/arch/powerpc/machine_kexec.c
@@ -0,0 +1,46 @@
+/**********************************************************************
+ * arch/powerpc/machine_kexec.c
+ *
+ * Created By: Horms
+ * 
+ */
+
+#include <xen/lib.h>       /* for printk() used in stubs */
+#include <xen/types.h>
+#include <public/kexec.h>
+
+int machine_kexec_load(int type, xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    return -1;
+}
+
+void machine_kexec_unload(int type, xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+void machine_kexec_reserved(xen_kexec_reserve_t *reservation)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+void machine_kexec(xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+void machine_shutdown(xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/arch/x86/Makefile
+++ x/xen/arch/x86/Makefile
@@ -41,6 +41,8 @@ obj-y += trampoline.o
 obj-y += traps.o
 obj-y += usercopy.o
 obj-y += x86_emulate.o
+obj-y += machine_kexec.o
+obj-y += crash.o
 
 obj-$(crash_debug) += gdbstub.o
 
--- /dev/null
+++ x/xen/arch/x86/crash.c
@@ -0,0 +1,26 @@
+/******************************************************************************
+ * arch/x86/crash.c
+ * 
+ * Created By: Horms
+ *
+ * Should be based heavily on arch/i386/kernel/crash.c from Linux 2.6.16
+ */
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/xen.h>
+
+void machine_crash_shutdown(struct cpu_user_regs *regs)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/arch/x86/machine_kexec.c
@@ -0,0 +1,46 @@
+/******************************************************************************
+ * arch/x86/machine_kexec.c
+ * 
+ * Created By: Horms
+ *
+ */
+
+#include <xen/lib.h>       /* for printk() used in stubs */
+#include <xen/types.h>
+#include <public/kexec.h>
+
+int machine_kexec_load(int type, xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    return -1;
+}
+
+void machine_kexec_unload(int type, xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+void machine_kexec_reserved(xen_kexec_reserve_t *reservation)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+void machine_kexec(xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+void machine_shutdown(xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/common/Makefile
+++ x/xen/common/Makefile
@@ -7,6 +7,7 @@ obj-y += event_channel.o
 obj-y += grant_table.o
 obj-y += kernel.o
 obj-y += keyhandler.o
+obj-y += kexec.o
 obj-y += lib.o
 obj-y += memory.o
 obj-y += multicall.o
--- /dev/null
+++ x/xen/common/kexec.c
@@ -0,0 +1,211 @@
+/******************************************************************************
+ * common/kexec.c - Architecture independent kexec code for Xen
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Based in part on Linux 2.6.16's kernel/kexec.c
+ */
+
+#include <asm/kexec.h>
+#include <xen/lib.h>
+#include <xen/errno.h>
+#include <xen/guest_access.h>
+#include <xen/sched.h>
+#include <xen/types.h>
+#include <xen/kexec.h>
+#include <public/kexec.h>
+
+DEFINE_PER_CPU (note_buf_t, crash_notes);
+
+static xen_kexec_image_t kexec_image;
+static int kexec_image_set = 0;
+static xen_kexec_image_t kexec_crash_image;
+static int kexec_crash_image_set = 0;
+static int kexec_crash_lock = 0;
+
+/* Must call with kexec_crash_lock held */
+void __crash_kexec(struct cpu_user_regs *regs)
+{
+    struct cpu_user_regs fixed_regs;
+
+    if (!kexec_crash_image_set)
+	    return;
+    crash_setup_regs(&fixed_regs, regs);
+    machine_crash_shutdown(&fixed_regs);
+    machine_kexec(&kexec_crash_image); /* Does not return */
+}
+
+void crash_kexec(struct cpu_user_regs *regs)
+{
+    int locked;
+
+    locked = xchg(&kexec_crash_lock, 1);
+    if (locked)
+        return;
+    __crash_kexec(regs);
+
+    /* The if() here is bogus, but gcc will throws a warning that the
+     * computed value is unused and xen compiles with -Werror.
+     * This seems like a viable work around.
+     * This did not seem to happen with slightly older gcc.
+     * Observed with: 
+     * gcc version 4.1.2 20060604 (prerelease) (Debian * 4.1.1-2) */
+    if (xchg(&kexec_crash_lock, 0)) ;
+
+    return;
+}
+
+/*
+ * Copy to uarg the physical address of the crash note buffer belonging to
+ * the processor recorded in VCPU vcpuid of the calling domain.
+ * Returns 0 on success, -EINVAL for a bad or absent VCPU, and -EFAULT if
+ * the crash lock is held or the copy to the guest fails.
+ */
+static int get_crash_note(int vcpuid, XEN_GUEST_HANDLE(void) uarg)
+{
+    struct domain *domain = current->domain;
+    unsigned long crash_note;
+    struct vcpu *vcpu;
+
+    /* vcpuid indexes domain->vcpu[]: MAX_VIRT_CPUS itself is out of range */
+    if ( vcpuid < 0 || vcpuid >= MAX_VIRT_CPUS )
+        return -EINVAL;
+
+    if ( !(vcpu = domain->vcpu[vcpuid]) )
+        return -EINVAL;
+
+    if ( xchg(&kexec_crash_lock, 1) )
+    {
+       printk("do_kexec_op: (CMD_kexec_crash_note): dump is locked\n");
+       return -EFAULT;
+    }
+    crash_note = __pa((unsigned long)per_cpu(crash_notes, vcpu->processor));
+
+    /* Bogus if(): keeps gcc 4.1 -Werror quiet about the unused xchg() value */
+    if (xchg(&kexec_crash_lock, 0)) ;
+
+    if ( unlikely(copy_to_guest(uarg, &crash_note, 1) != 0) )
+    {
+        printk("do_kexec_op: (CMD_kexec_crash_note): copy_to_guest failed\n");
+        return -EFAULT;
+    }
+
+    return 0;
+}
+
+/* Copy the reservation, zeroed first as stubs fill nothing: 0 or -EFAULT. */
+static int get_reserve(XEN_GUEST_HANDLE(void) uarg)
+{
+    xen_kexec_reserve_t reservation = { 0 };
+
+    machine_kexec_reserved(&reservation);
+    if ( unlikely(copy_to_guest(uarg, &reservation, 1) != 0) )
+    {
+        printk("do_kexec_op (CMD_kexec_reserve): copy_to_guest failed\n");
+        return -EFAULT;
+    }
+    return 0;
+}
+
+static int __do_kexec(unsigned long type, XEN_GUEST_HANDLE(void) uarg,
+		      xen_kexec_image_t *image)
+{
+    cpu_user_regs_t regs;
+
+    if (type == KEXEC_TYPE_DEFAULT)
+        machine_shutdown(image); /* Does not return */
+    else
+    {
+        if ( unlikely(copy_from_guest(&regs, uarg, 1) != 0) )
+        {
+            printk("do_kexec_op (CMD_kexec): copy_from_guest failed\n");
+            return -EFAULT;
+        }
+        __crash_kexec(&regs); /* Does not return */
+    }
+
+    return -EINVAL;
+}
+
+/*
+ * kexec hypercall: privileged domains only.  For kexec/load/unload, arg1 is
+ * the KEXEC_TYPE_* and kexec_crash_lock is held while a crash image is used.
+ * Image state errors return -EBUSY/-ENOENT rather than hitting BUG_ON().
+ */
+long do_kexec_op(unsigned long op, int arg1, XEN_GUEST_HANDLE(void) uarg)
+{
+    xen_kexec_image_t *image;
+    int *image_set;
+    int status = -EINVAL;
+
+    if ( !IS_PRIV(current->domain) )
+        return -EPERM;
+
+    switch (op)
+    {
+    case KEXEC_CMD_kexec_crash_note:
+        return get_crash_note(arg1, uarg);
+    case KEXEC_CMD_kexec_reserve:
+        return get_reserve(uarg);
+    }
+
+    /* For all other ops, arg1 is the type of kexec, that is
+     * KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH */
+    if (arg1 == KEXEC_TYPE_CRASH)
+    {
+        image = &kexec_crash_image;
+        image_set = &kexec_crash_image_set;
+        if (xchg(&kexec_crash_lock, 1))
+        {
+           printk("do_kexec_op: dump is locked\n");
+           return -EFAULT;
+        }
+    }
+    else
+    {
+        image = &kexec_image;
+        image_set = &kexec_image_set;
+    }
+
+    switch (op)
+    {
+    case KEXEC_CMD_kexec:
+        status = *image_set ? __do_kexec(arg1, uarg, image) : -ENOENT;
+        break;
+    case KEXEC_CMD_kexec_load:
+        if ( *image_set )
+            status = -EBUSY;
+        else if ( unlikely(copy_from_guest(image, uarg, 1) != 0) )
+        {
+            printk("do_kexec_op (CMD_kexec_load): copy_from_guest failed\n");
+            status = -EFAULT;
+        }
+        else if ( (status = machine_kexec_load(arg1, image)) == 0 )
+            *image_set = 1; /* loaded only once the arch code accepted it */
+        break;
+    case KEXEC_CMD_kexec_unload:
+        status = -ENOENT;
+        if ( !*image_set )
+            break;
+        *image_set = 0;
+        machine_kexec_unload(arg1, image);
+        status = 0;
+        break;
+    }
+
+    /* Bogus inner if(): gcc 4.1 -Werror dislikes an unused xchg() value */
+    if (arg1 == KEXEC_TYPE_CRASH)
+        if (xchg(&kexec_crash_lock, 0)) ;
+
+    return status;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/common/page_alloc.c
+++ x/xen/common/page_alloc.c
@@ -213,24 +213,35 @@ void init_boot_pages(paddr_t ps, paddr_t
     }
 }
 
+unsigned long alloc_boot_pages_at(unsigned long nr_pfns, unsigned long pfn_at)
+{
+    unsigned long i;
+
+    for ( i = 0; i < nr_pfns; i++ )
+        if ( allocated_in_map(pfn_at + i) )
+             break;
+
+    if ( i == nr_pfns )
+    {
+        map_alloc(pfn_at, nr_pfns);
+        return pfn_at;
+    }
+
+    return 0;
+}
+
 unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align)
 {
-    unsigned long pg, i;
+    unsigned long pg, i = 0;
 
     for ( pg = 0; (pg + nr_pfns) < max_page; pg += pfn_align )
     {
-        for ( i = 0; i < nr_pfns; i++ )
-            if ( allocated_in_map(pg + i) )
-                 break;
-
-        if ( i == nr_pfns )
-        {
-            map_alloc(pg, nr_pfns);
-            return pg;
-        }
+        i = alloc_boot_pages_at(nr_pfns, pg);
+        if (i != 0)
+            break;
     }
 
-    return 0;
+    return i;
 }
 
 
--- x/xen/drivers/char/console.c
+++ x/xen/drivers/char/console.c
@@ -585,6 +585,7 @@ void panic(const char *fmt, ...)
     char buf[128];
     unsigned long flags;
     static DEFINE_SPINLOCK(lock);
+    extern void crash_kexec(struct cpu_user_regs *regs);
     
     debugtrace_dump();
 
@@ -607,6 +608,8 @@ void panic(const char *fmt, ...)
 
     debugger_trap_immediate();
 
+    crash_kexec(NULL);
+
     if ( opt_noreboot )
     {
         machine_halt();
--- /dev/null
+++ x/xen/include/asm-ia64/kexec.h
@@ -0,0 +1,32 @@
+/******************************************************************************
+ * include/asm-ia64/kexec.h
+ * 
+ * Created By: Horms
+ *
+ */
+
+#ifndef __IA64_KEXEC_H__
+#define __IA64_KEXEC_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/xen.h>
+
+static void crash_setup_regs(struct cpu_user_regs *newregs,
+                            struct cpu_user_regs *oldregs)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+#endif /* __IA64_KEXEC_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
--- /dev/null
+++ x/xen/include/asm-x86/kexec.h
@@ -0,0 +1,31 @@
+/******************************************************************************
+ * include/asm-x86/kexec.h
+ * 
+ * Created By: Horms
+ *
+ */
+
+#ifndef __X86_KEXEC_H__
+#define __X86_KEXEC_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/xen.h>
+
+static void crash_setup_regs(struct cpu_user_regs *newregs,
+			     struct cpu_user_regs *oldregs)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+#endif /* __X86_KEXEC_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/public/kexec.h
@@ -0,0 +1,85 @@
+/******************************************************************************
+ * kexec.h - Public portion
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Types based on those in ./vcpu.h on request from Keir Fraser
+ */
+
+#ifndef _XEN_PUBLIC_KEXEC_H
+#define _XEN_PUBLIC_KEXEC_H
+
+#include "xen.h"
+
+/*
+ * Prototype for this hypercall is:
+ *  int kexec_op(int cmd, int type, void *extra_args)
+ * @cmd        == KEXEC_CMD_... 
+ *                KEXEC operation to perform
+ * @arg1       == Operation-specific int argument (type in the prototype)
+ *                This could be in extra_args, but by putting it here
+ *                copy_from_user can be avoided, in particular in
+ *                KEXEC_CMD_kexec during a crash dump, which is a fairly
+ *                critical section of code. If this turns out not to be
+ *                important then it can be collapsed into extra_args.
+ * @extra_args == Operation-specific extra arguments (NULL if none).
+ */
+
+#define KEXEC_TYPE_DEFAULT 0
+#define KEXEC_TYPE_CRASH   1
+
+/*
+ * Perform kexec having previously loaded a kexec or kdump kernel
+ * as appropriate.
+ * @arg1      == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH
+ * @extra_arg == pointer to cpu_user_regs_t structure.
+ */
+#define KEXEC_CMD_kexec                 0
+
+/*
+ * Load kernel image in preparation for kexec or kdump.
+ * @arg1      == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH
+ * @extra_arg == pointer to xen_kexec_image_t structure.
+ */
+#define KEXEC_CMD_kexec_load            1
+typedef struct xen_kexec_image {
+    unsigned long indirection_page;
+    unsigned long reboot_code_buffer;
+    unsigned long start_address;
+} xen_kexec_image_t;
+
+/*
+ * Clean up image loaded by KEXEC_CMD_kexec_load
+ * @arg1      == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH
+ */
+#define KEXEC_CMD_kexec_unload          2
+
+/*
+ * Find the base pointer and size of the area that xen has 
+ * reserved for use by the crash kernel.
+ * @extra_arg == pointer to xen_kexec_reserve_t structure.
+ */
+#define KEXEC_CMD_kexec_reserve         3
+typedef struct xen_kexec_reserve {
+    unsigned long size;
+    unsigned long start;
+} xen_kexec_reserve_t;
+
+/*
+ * Find the physical address of the crash note area xen reserved for a VCPU.
+ * @arg1      == ID of the VCPU in the calling domain
+ * @extra_arg == pointer to unsigned long.
+ */
+#define KEXEC_CMD_kexec_crash_note      4
+
+#endif /* _XEN_PUBLIC_KEXEC_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/xen/elfcore.h
@@ -0,0 +1,73 @@
+/******************************************************************************
+ * include/xen/elfcore.h
+ * 
+ * Created By: Horms
+ *
+ * Based heavily on include/linux/elfcore.h from Linux 2.6.16
+ * Naming scheme based on include/xen/elf.h (not include/linux/elfcore.h)
+ *
+ */
+
+#ifndef __ELFCOREC_H__
+#define __ELFCOREC_H__
+
+#include <xen/types.h>
+#include <xen/elf.h>
+#include <public/xen.h>
+
+#define NT_PRSTATUS     1
+
+typedef struct
+{
+    int signo;                       /* signal number */
+    int code;                        /* extra code */
+    int errno;                       /* errno */
+} ELF_Signifo;
+
+/* These seem to be the same length on all architectures on Linux */
+typedef int ELF_Pid;
+typedef struct {
+	long tv_sec;
+	long tv_usec;
+} ELF_Timeval;
+typedef unsigned long ELF_Greg;
+#define ELF_NGREG (sizeof (struct cpu_user_regs) / sizeof(ELF_Greg))
+typedef ELF_Greg ELF_Gregset[ELF_NGREG];
+
+/*
+ * Definitions to generate Intel SVR4-like core files.
+ * These mostly have the same names as the SVR4 types with "elf_"
+ * tacked on the front to prevent clashes with linux definitions,
+ * and the typedef forms have been avoided.  This is mostly like
+ * the SVR4 structure, but more Linuxy, with things that Linux does
+ * not support and which gdb doesn't really use excluded.
+ */
+typedef struct
+{
+    ELF_Signifo pr_info;         /* Info associated with signal */
+    short pr_cursig;             /* Current signal */
+    unsigned long pr_sigpend;    /* Set of pending signals */
+    unsigned long pr_sighold;    /* Set of held signals */
+    ELF_Pid pr_pid;
+    ELF_Pid pr_ppid;
+    ELF_Pid pr_pgrp;
+    ELF_Pid pr_sid;
+    ELF_Timeval pr_utime;        /* User time */
+    ELF_Timeval pr_stime;        /* System time */
+    ELF_Timeval pr_cutime;       /* Cumulative user time */
+    ELF_Timeval pr_cstime;       /* Cumulative system time */
+    ELF_Gregset pr_reg;          /* GP registers */
+    int pr_fpvalid;              /* True if math co-processor being used.  */
+} ELF_Prstatus;
+
+#endif /* __ELFCOREC_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/include/xen/hypercall.h
+++ x/xen/include/xen/hypercall.h
@@ -102,4 +102,10 @@ do_hvm_op(
     unsigned long op,
     XEN_GUEST_HANDLE(void) arg);
 
+extern long
+do_kexec_op(
+    unsigned long op,
+    int arg1,
+    XEN_GUEST_HANDLE(void) arg);
+
 #endif /* __XEN_HYPERCALL_H__ */
--- /dev/null
+++ x/xen/include/xen/kexec.h
@@ -0,0 +1,33 @@
+/******************************************************************************
+ * include/xen/kexec.h - Internal architecture independent portion
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ */
+
+#include <public/kexec.h>
+
+#define MAX_NOTE_BYTES 1024
+
+typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
+DECLARE_PER_CPU (note_buf_t, crash_notes);
+
+int machine_kexec_load(int type, xen_kexec_image_t *image);
+void machine_kexec_unload(int type, xen_kexec_image_t *image);
+void machine_kexec_reserved(xen_kexec_reserve_t *reservation);
+void machine_kexec(xen_kexec_image_t *image);
+void machine_shutdown(xen_kexec_image_t *image);
+void machine_crash_shutdown(cpu_user_regs_t *regs);
+
+extern unsigned int opt_kdump_megabytes;
+extern unsigned int opt_kdump_megabytes_base;
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/include/xen/mm.h
+++ x/xen/include/xen/mm.h
@@ -40,6 +40,7 @@ struct page_info;
 paddr_t init_boot_allocator(paddr_t bitmap_start);
 void init_boot_pages(paddr_t ps, paddr_t pe);
 unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align);
+unsigned long alloc_boot_pages_at(unsigned long nr_pfns, unsigned long pfn_at);
 void end_boot_allocator(void);
 
 /* Generic allocator. These functions are *not* interrupt-safe. */
--- x/patches/linux-2.6.16.13/series
+++ x/patches/linux-2.6.16.13/series
@@ -1,3 +1,5 @@
+kexec-generic.patch
+linux-2.6.16-kexec_page_table_a_stubs.patch
 blktap-aio-16_03_06.patch
 device_bind.patch
 fix-hz-suspend.patch
--- /dev/null
+++ x/patches/linux-2.6.16.13/kexec-generic.patch
@@ -0,0 +1,283 @@
+ drivers/base/cpu.c    |   20 +++++++++++++++++
+ include/linux/kexec.h |    5 ++++
+ kernel/kexec.c        |   57 ++++++++++++++++++++++++++++++++++++++-----------
+ kernel/sys.c          |    4 +++
+ 4 files changed, 74 insertions(+), 12 deletions(-)
+
+--- x/drivers/base/cpu.c
++++ x/drivers/base/cpu.c
+@@ -11,6 +11,10 @@
+ 
+ #include "base.h"
+ 
++#ifdef CONFIG_XEN
++#include <xen/interface/kexec.h>
++#endif
++
+ struct sysdev_class cpu_sysdev_class = {
+ 	set_kset_name("cpu"),
+ };
+@@ -86,6 +90,18 @@ static inline void register_cpu_control(
+ #ifdef CONFIG_KEXEC
+ #include <linux/kexec.h>
+ 
++#ifdef CONFIG_XEN
++static unsigned long get_crash_notes(int cpu)
++{
++	unsigned long crash_note;
++
++	if (HYPERVISOR_kexec(KEXEC_CMD_kexec_crash_note, cpu, &crash_note) < 0)
++		return 0UL;
++	return crash_note;
++}
++#endif
++
++/* XXX: This only finds dom0's CPU's */
+ static ssize_t show_crash_notes(struct sys_device *dev, char *buf)
+ {
+ 	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
+@@ -101,7 +117,11 @@ static ssize_t show_crash_notes(struct s
+ 	 * boot up and this data does not change there after. Hence this
+ 	 * operation should be safe. No locking required.
+ 	 */
++#ifndef CONFIG_XEN
+ 	addr = __pa(per_cpu_ptr(crash_notes, cpunum));
++#else
++	addr = (unsigned long long)get_crash_notes(cpunum);
++#endif
+ 	rc = sprintf(buf, "%Lx\n", addr);
+ 	return rc;
+ }
+--- x/include/linux/kexec.h
++++ x/include/linux/kexec.h
+@@ -91,6 +91,11 @@ struct kimage {
+ extern NORET_TYPE void machine_kexec(struct kimage *image) ATTRIB_NORET;
+ extern int machine_kexec_prepare(struct kimage *image);
+ extern void machine_kexec_cleanup(struct kimage *image);
++#ifdef CONFIG_XEN
++extern int xen_machine_kexec_load(struct kimage *image);
++extern void xen_machine_kexec_unload(struct kimage *image);
++extern NORET_TYPE void xen_machine_kexec(struct kimage *image) ATTRIB_NORET;
++#endif
+ extern asmlinkage long sys_kexec_load(unsigned long entry,
+ 					unsigned long nr_segments,
+ 					struct kexec_segment __user *segments,
+--- x/kernel/kexec.c
++++ x/kernel/kexec.c
+@@ -26,6 +26,9 @@
+ #include <asm/io.h>
+ #include <asm/system.h>
+ #include <asm/semaphore.h>
++#ifdef CONFIG_XEN
++#include <asm/kexec-xen.h>
++#endif
+ 
+ /* Per cpu memory for storing cpu states in case of system crash. */
+ note_buf_t* crash_notes;
+@@ -403,7 +406,7 @@ static struct page *kimage_alloc_normal_
+ 		pages = kimage_alloc_pages(GFP_KERNEL, order);
+ 		if (!pages)
+ 			break;
+-		pfn   = page_to_pfn(pages);
++		pfn   = kexec_page_to_pfn(pages);
+ 		epfn  = pfn + count;
+ 		addr  = pfn << PAGE_SHIFT;
+ 		eaddr = epfn << PAGE_SHIFT;
+@@ -437,6 +440,7 @@ static struct page *kimage_alloc_normal_
+ 	return pages;
+ }
+ 
++#ifndef CONFIG_XEN
+ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
+ 						      unsigned int order)
+ {
+@@ -490,7 +494,7 @@ static struct page *kimage_alloc_crash_c
+ 		}
+ 		/* If I don't overlap any segments I have found my hole! */
+ 		if (i == image->nr_segments) {
+-			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
++			pages = kexec_pfn_to_page(hole_start >> PAGE_SHIFT);
+ 			break;
+ 		}
+ 	}
+@@ -517,6 +521,13 @@ struct page *kimage_alloc_control_pages(
+ 
+ 	return pages;
+ }
++#else /* !CONFIG_XEN */
++struct page *kimage_alloc_control_pages(struct kimage *image,
++					 unsigned int order)
++{
++	return kimage_alloc_normal_control_pages(image, order);
++}
++#endif
+ 
+ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+ {
+@@ -532,7 +543,7 @@ static int kimage_add_entry(struct kimag
+ 			return -ENOMEM;
+ 
+ 		ind_page = page_address(page);
+-		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
++		*image->entry = kexec_virt_to_phys(ind_page) | IND_INDIRECTION;
+ 		image->entry = ind_page;
+ 		image->last_entry = ind_page +
+ 				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+@@ -593,13 +604,13 @@ static int kimage_terminate(struct kimag
+ #define for_each_kimage_entry(image, ptr, entry) \
+ 	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+ 		ptr = (entry & IND_INDIRECTION)? \
+-			phys_to_virt((entry & PAGE_MASK)): ptr +1)
++			kexec_phys_to_virt((entry & PAGE_MASK)): ptr +1)
+ 
+ static void kimage_free_entry(kimage_entry_t entry)
+ {
+ 	struct page *page;
+ 
+-	page = pfn_to_page(entry >> PAGE_SHIFT);
++	page = kexec_pfn_to_page(entry >> PAGE_SHIFT);
+ 	kimage_free_pages(page);
+ }
+ 
+@@ -611,6 +622,10 @@ static void kimage_free(struct kimage *i
+ 	if (!image)
+ 		return;
+ 
++#ifdef CONFIG_XEN
++	xen_machine_kexec_unload(image);
++#endif
++
+ 	kimage_free_extra_pages(image);
+ 	for_each_kimage_entry(image, ptr, entry) {
+ 		if (entry & IND_INDIRECTION) {
+@@ -686,7 +701,7 @@ static struct page *kimage_alloc_page(st
+ 	 * have a match.
+ 	 */
+ 	list_for_each_entry(page, &image->dest_pages, lru) {
+-		addr = page_to_pfn(page) << PAGE_SHIFT;
++		addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
+ 		if (addr == destination) {
+ 			list_del(&page->lru);
+ 			return page;
+@@ -701,12 +716,12 @@ static struct page *kimage_alloc_page(st
+ 		if (!page)
+ 			return NULL;
+ 		/* If the page cannot be used file it away */
+-		if (page_to_pfn(page) >
++		if (kexec_page_to_pfn(page) >
+ 				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+ 			list_add(&page->lru, &image->unuseable_pages);
+ 			continue;
+ 		}
+-		addr = page_to_pfn(page) << PAGE_SHIFT;
++		addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
+ 
+ 		/* If it is the destination page we want use it */
+ 		if (addr == destination)
+@@ -729,7 +744,7 @@ static struct page *kimage_alloc_page(st
+ 			struct page *old_page;
+ 
+ 			old_addr = *old & PAGE_MASK;
+-			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
++			old_page = kexec_pfn_to_page(old_addr >> PAGE_SHIFT);
+ 			copy_highpage(page, old_page);
+ 			*old = addr | (*old & ~PAGE_MASK);
+ 
+@@ -779,7 +794,7 @@ static int kimage_load_normal_segment(st
+ 			result  = -ENOMEM;
+ 			goto out;
+ 		}
+-		result = kimage_add_page(image, page_to_pfn(page)
++		result = kimage_add_page(image, kexec_page_to_pfn(page)
+ 								<< PAGE_SHIFT);
+ 		if (result < 0)
+ 			goto out;
+@@ -811,6 +826,7 @@ out:
+ 	return result;
+ }
+ 
++#ifndef CONFIG_XEN
+ static int kimage_load_crash_segment(struct kimage *image,
+ 					struct kexec_segment *segment)
+ {
+@@ -833,7 +849,7 @@ static int kimage_load_crash_segment(str
+ 		char *ptr;
+ 		size_t uchunk, mchunk;
+ 
+-		page = pfn_to_page(maddr >> PAGE_SHIFT);
++		page = kexec_pfn_to_page(maddr >> PAGE_SHIFT);
+ 		if (page == 0) {
+ 			result  = -ENOMEM;
+ 			goto out;
+@@ -881,6 +897,13 @@ static int kimage_load_segment(struct ki
+ 
+ 	return result;
+ }
++#else /* CONFIG_XEN */
++static int kimage_load_segment(struct kimage *image,
++				struct kexec_segment *segment)
++{
++	return kimage_load_normal_segment(image, segment);
++}
++#endif
+ 
+ /*
+  * Exec Kernel system call: for obvious reasons only root may call it.
+@@ -991,6 +1014,11 @@ asmlinkage long sys_kexec_load(unsigned 
+ 		if (result)
+ 			goto out;
+ 	}
++#ifdef CONFIG_XEN
++	result = xen_machine_kexec_load(image);
++	if (result)
++		goto out;
++#endif
+ 	/* Install the new kernel, and  Uninstall the old */
+ 	image = xchg(dest_image, image);
+ 
+@@ -1045,7 +1073,6 @@ void crash_kexec(struct pt_regs *regs)
+ 	struct kimage *image;
+ 	int locked;
+ 
+-
+ 	/* Take the kexec_lock here to prevent sys_kexec_load
+ 	 * running on one cpu from replacing the crash kernel
+ 	 * we are using after a panic on a different cpu.
+@@ -1061,12 +1088,17 @@ void crash_kexec(struct pt_regs *regs)
+ 			struct pt_regs fixed_regs;
+ 			crash_setup_regs(&fixed_regs, regs);
+ 			machine_crash_shutdown(&fixed_regs);
++#ifdef CONFIG_XEN
++			xen_machine_kexec(image);
++#else
+ 			machine_kexec(image);
++#endif
+ 		}
+ 		xchg(&kexec_lock, 0);
+ 	}
+ }
+ 
++#ifndef CONFIG_XEN
+ static int __init crash_notes_memory_init(void)
+ {
+ 	/* Allocate memory for saving cpu registers. */
+@@ -1079,3 +1111,4 @@ static int __init crash_notes_memory_ini
+ 	return 0;
+ }
+ module_init(crash_notes_memory_init)
++#endif
+--- x/kernel/sys.c
++++ x/kernel/sys.c
+@@ -435,8 +435,12 @@ void kernel_kexec(void)
+ 	kernel_restart_prepare(NULL);
+ 	printk(KERN_EMERG "Starting new kernel\n");
+ 	machine_shutdown();
++#ifdef CONFIG_XEN
++	xen_machine_kexec(image);
++#else
+ 	machine_kexec(image);
+ #endif
++#endif
+ }
+ EXPORT_SYMBOL_GPL(kernel_kexec);
+ 
--- /dev/null
+++ x/patches/linux-2.6.16.13/linux-2.6.16-kexec_page_table_a_stubs.patch
@@ -0,0 +1,85 @@
+kexec: Avoid overwriting the current pgd (V2, stubs)
+
+This patch adds an architecture specific structure "struct kimage_arch" to
+struct kimage. This structure is filled in with members by the architecture
+specific patches that follow this one.
+
+Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
+---
+
+ Applies on top of 2.6.16 and 2.6.17-rc4.
+
+ include/asm-i386/kexec.h    |    2 ++
+ include/asm-powerpc/kexec.h |    2 ++
+ include/asm-s390/kexec.h    |    2 ++
+ include/asm-sh/kexec.h      |    2 ++
+ include/asm-x86_64/kexec.h  |    2 ++
+ include/linux/kexec.h       |    2 ++
+ 6 files changed, 12 insertions(+)
+
+--- x/include/asm-i386/kexec.h
++++ x/include/asm-i386/kexec.h
+@@ -29,6 +29,8 @@
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
++struct kimage_arch {};
++
+ /* CPU does not save ss and esp on stack if execution is already
+  * running in kernel mode at the time of NMI occurrence. This code
+  * fixes it.
+--- x/include/asm-powerpc/kexec.h
++++ x/include/asm-powerpc/kexec.h
+@@ -108,6 +108,8 @@ static inline void crash_setup_regs(stru
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
++struct kimage_arch {};
++
+ #ifdef __powerpc64__
+ extern void kexec_smp_wait(void);	/* get and clear naca physid, wait for
+ 					  master to copy new code to 0 */
+--- x/include/asm-s390/kexec.h
++++ x/include/asm-s390/kexec.h
+@@ -36,6 +36,8 @@
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
++struct kimage_arch {};
++
+ /* Provide a dummy definition to avoid build failures. */
+ static inline void crash_setup_regs(struct pt_regs *newregs,
+ 					struct pt_regs *oldregs) { }
+--- x/include/asm-sh/kexec.h
++++ x/include/asm-sh/kexec.h
+@@ -25,6 +25,8 @@
+ 
+ #ifndef __ASSEMBLY__
+ 
++struct kimage_arch {};
++
+ extern void machine_shutdown(void);
+ extern void *crash_notes;
+ 
+--- x/include/asm-x86_64/kexec.h
++++ x/include/asm-x86_64/kexec.h
+@@ -29,6 +29,8 @@
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
++struct kimage_arch {};
++
+ /*
+  * Saving the registers of the cpu on which panic occured in
+  * crash_kexec to save a valid sp. The registers of other cpus
+--- x/include/linux/kexec.h
++++ x/include/linux/kexec.h
+@@ -69,6 +69,8 @@ struct kimage {
+ 	unsigned long start;
+ 	struct page *control_code_page;
+ 
++	struct kimage_arch arch_data;
++
+ 	unsigned long nr_segments;
+ 	struct kexec_segment segment[KEXEC_SEGMENT_MAX];
+ 

[-- Attachment #3: 51.1.1-kexec-trigger_crash_dump.patch --]
[-- Type: text/plain, Size: 1624 bytes --]

console: allow a crash dump to be triggered from the xen console

This feature is needed to test crash dump. It is essential for development
(though developers could easily add the patch). It may also be of
use for testing of the roll-out of production systems (people who don't
want to add patches).

The original version of this patch triggered a panic; Keir Fraser
suggested changing it to trigger a crash dump in line with a
similar feature in Linux's sysrq.

Christian Limpach suggested changing the original trigger 'p' (for panic)
to 'D' for dump, as p is already used by the performance counters.
This patch uses 'c' for crashdump, again in line with the similar
feature in Linux's sysrq. On inspection of the code, 'c' does not
seem to be already taken.

Signed-Off-By: Horms <horms@verge.net.au>

 xen/common/kexec.c |   14 ++++++++++++++
 1 file changed, 14 insertions(+)

--- x/xen/common/kexec.c
+++ x/xen/common/kexec.c
@@ -13,6 +13,7 @@
 #include <xen/sched.h>
 #include <xen/types.h>
 #include <xen/kexec.h>
+#include <xen/keyhandler.h>
 #include <public/kexec.h>
 
 DEFINE_PER_CPU (note_buf_t, crash_notes);
@@ -55,6 +56,19 @@ void crash_kexec(struct cpu_user_regs *r
     return;
 }
 
+static void do_crashdump_trigger(unsigned char key)
+{
+	printk("triggering crashdump\n");
+	crash_kexec(NULL);
+}
+
+static __init int register_crashdump_trigger(void)
+{
+	register_keyhandler('c', do_crashdump_trigger, "trigger a crashdump");
+	return 0;
+}
+__initcall(register_crashdump_trigger);
+
 static int get_crash_note(int vcpuid, XEN_GUEST_HANDLE(void) uarg)
 {
     struct domain *domain = current->domain;

[-- Attachment #4: 51.2.1-kexec-x86-upstream.patch --]
[-- Type: text/plain, Size: 23382 bytes --]

kexec: x86

This is the x86 component of kexec for xen.
The generic component is a prerequisite for this patch.
The x86_64 or x86_32 (i386) patch is also needed
in order to use this code; however, the code should compile as is.

Signed-Off-By: Horms <horms@verge.net.au>
Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>

 xen/arch/x86/crash.c                |  173 ++++++++++++++++++++++++++++++++++-
 xen/arch/x86/machine_kexec.c        |  145 +++++++++++++++++++++++++++--
 xen/arch/x86/setup.c                |   75 +++++++++++++--
 xen/arch/x86/x86_32/Makefile        |    1 
 xen/arch/x86/x86_32/machine_kexec.c |   26 +++++
 xen/arch/x86/x86_64/Makefile        |    1 
 xen/arch/x86/x86_64/machine_kexec.c |   27 +++++
 xen/include/asm-x86/elf.h           |   27 +++++
 xen/include/asm-x86/fixmap.h        |    1 
 xen/include/asm-x86/hypercall.h     |    5 +
 xen/include/asm-x86/kexec.h         |   13 +-
 xen/include/asm-x86/x86_32/elf.h    |   28 +++++
 xen/include/asm-x86/x86_32/kexec.h  |   48 +++++++++
 xen/include/asm-x86/x86_64/elf.h    |   28 +++++
 xen/include/asm-x86/x86_64/kexec.h  |   33 ++++++
 xen/include/public/kexec.h          |    3 
 xen/include/xen/elfcore.h           |    3 
 17 files changed, 611 insertions(+), 26 deletions(-)

--- x/xen/arch/x86/crash.c
+++ x/xen/arch/x86/crash.c
@@ -3,16 +3,183 @@
  * 
  * Created By: Horms
  *
- * Should be based heavily on arch/i386/kernel/crash.c from Linux 2.6.16
+ * Based heavily on arch/i386/kernel/crash.c from Linux 2.6.16
  */
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#include <asm/atomic.h>
+#include <asm/elf.h>
+#include <asm/percpu.h>
+#include <asm/kexec.h>
 #include <xen/types.h>
+#include <xen/irq.h>
+#include <asm/ipi.h>
+#include <asm/nmi.h>
+#include <xen/string.h>
+#include <xen/elf.h>
+#include <xen/elfcore.h>
+#include <xen/smp.h>
+#include <xen/delay.h>
+#include <xen/perfc.h>
+#include <xen/kexec.h>
 #include <public/xen.h>
 
+static int crashing_cpu;
+
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+							       size_t data_len)
+{
+	Elf_Note note;
+
+	note.namesz = strlen(name) + 1;
+	note.descsz = data_len;
+	note.type   = type;
+	memcpy(buf, &note, sizeof(note));
+	buf += (sizeof(note) +3)/4;
+	memcpy(buf, name, note.namesz);
+	buf += (note.namesz + 3)/4;
+	memcpy(buf, data, note.descsz);
+	buf += (note.descsz + 3)/4;
+
+	return buf;
+}
+
+static void final_note(u32 *buf)
+{
+	Elf_Note note;
+
+	note.namesz = 0;
+	note.descsz = 0;
+	note.type   = 0;
+	memcpy(buf, &note, sizeof(note));
+}
+
+static void crash_save_this_cpu(struct cpu_user_regs *regs, int cpu)
+{
+	ELF_Prstatus prstatus;
+	uint32_t *buf;
+
+	printk("crash_save_this_cpu: %d\n",  cpu);
+
+	if ((cpu < 0) || (cpu >= NR_CPUS))
+		return;
+
+	/* Using ELF notes here is opportunistic.
+	 * A well defined structure format with tags is needed.
+	 * ELF notes happen to provide this and there is infrastructure
+	 * in the Linux kernel to support them. In order to make
+	 * crash dumps produced by xen the same, the same
+	 * technique is used here.
+	 */
+
+	/* It should be safe to use per_cpu() here instead of per_cpu_ptr()
+	 * (which does not exist in xen) as kexecing_lock must be held in
+	 * order to get anywhere near here */
+	buf = (uint32_t *)per_cpu(crash_notes, cpu);
+	if (!buf) /* XXX: Can this ever occur? */
+		return;
+	memset(&prstatus, 0, sizeof(prstatus));
+	/* XXX: Xen does not have processes. For the crashing CPU on a dom0
+	 * crash this could be passed down from dom0, but is this
+	 * necessary?
+	 * prstatus.pr_pid = current->pid; */
+	ELF_CORE_COPY_REGS(prstatus.pr_reg, regs);
+	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
+				sizeof(prstatus));
+	final_note(buf);
+}
+
+static void crash_save_self(struct cpu_user_regs *regs)
+{
+	crash_save_this_cpu(regs, smp_processor_id());
+}
+
+#ifdef CONFIG_SMP
+static atomic_t waiting_for_crash_ipi;
+
+static int crash_nmi_callback(struct cpu_user_regs *regs, int cpu)
+{
+	struct cpu_user_regs fixed_regs;
+
+	/* Don't do anything if this handler is invoked on crashing cpu.
+	 * Otherwise, system will completely hang. Crashing cpu can get
+	 * an NMI if system was initially booted with nmi_watchdog parameter.
+	 */
+	if (cpu == crashing_cpu)
+		return 1;
+	local_irq_disable();
+
+#ifdef CONFIG_X86_32
+	if (!user_mode(regs)) {
+		crash_fixup_ss_esp(&fixed_regs, regs);
+		regs = &fixed_regs;
+	}
+#endif
+	crash_save_this_cpu(regs, cpu);
+	disable_local_APIC();
+	atomic_dec(&waiting_for_crash_ipi);
+	/* Assume hlt works */
+	__asm__ __volatile__ ( "hlt" );
+	for(;;);
+
+	return 1;
+
+	/* Need to use this somewhere as Xen builds with -Werror */
+	crash_setup_regs(&fixed_regs, regs);
+}
+
+/*
+ * By using the NMI code instead of a vector we just sneak thru the
+ * word generator coming out with just what we want.  AND it does
+ * not matter if clustered_apic_mode is set or not.
+ */
+static void smp_send_nmi_allbutself(void)
+{
+	cpumask_t allbutself = cpu_online_map;
+    	cpu_clear(smp_processor_id(), allbutself);
+	send_IPI_mask(allbutself, APIC_DM_NMI);
+}
+
+static void nmi_shootdown_cpus(void)
+{
+	unsigned long msecs;
+
+	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
+	/* Would it be better to replace the trap vector here? */
+	set_nmi_callback(crash_nmi_callback);
+	/* Ensure the new callback function is set before sending
+	 * out the NMI
+	 */
+	wmb();
+
+	smp_send_nmi_allbutself();
+
+	msecs = 1000; /* Wait at most a second for the other cpus to stop */
+	while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
+		mdelay(1);
+		msecs--;
+	}
+
+	/* Leave the nmi callback set */
+	disable_local_APIC();
+}
+#else
+static void nmi_shootdown_cpus(void)
+{
+	/* There are no cpus to shootdown */
+}
+#endif
+
 void machine_crash_shutdown(struct cpu_user_regs *regs)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+	printk("machine_crash_shutdown: %d\n", smp_processor_id());
+	local_irq_disable();
+
+	crashing_cpu = smp_processor_id();
+	nmi_shootdown_cpus();
+#ifdef CONFIG_X86_IO_APIC
+        disable_IO_APIC();
+#endif
+	crash_save_self(regs);
 }
 
 /*
--- x/xen/arch/x86/machine_kexec.c
+++ x/xen/arch/x86/machine_kexec.c
@@ -5,34 +5,163 @@
  *
  */
 
-#include <xen/lib.h>       /* for printk() used in stubs */
+#include <xen/lib.h>
+#include <asm/irq.h>
+#include <asm/page.h>
+#include <asm/flushtlb.h>
+#include <xen/lib.h>
+#include <xen/smp.h>
+#include <xen/nmi.h>
 #include <xen/types.h>
+#include <xen/console.h>
+#include <xen/kexec.h>
 #include <public/kexec.h>
+#include <xen/domain_page.h>
+#include <asm/fixmap.h>
+ 
+#define create_level_mapping(lvl, next, pages, nopages, k, va)               \
+{                                                                            \
+    lvl##_pgentry_t *table;                                                  \
+    void *old = next;                                                        \
+                                                                             \
+    table = (lvl##_pgentry_t *)next + lvl##_table_offset(va);                \
+    if (!(lvl##e_get_flags(*table) & _PAGE_PRESENT)) {                       \
+        if (k >= nopages || pages[k] == 0)                                   \
+            return -1;                                                       \
+        *table = lvl##e_from_pfn(pages[k++]>>PAGE_SHIFT, __PAGE_HYPERVISOR); \
+    }                                                                        \
+    next = map_domain_page(lvl##e_get_pfn(*table));                          \
+    unmap_domain_page(old);                                                  \
+}                                                                            
+
+#define create_level_1_mapping(next, nopages, va, pa)               \
+{                                                                   \
+    l1_pgentry_t *table;                                            \
+                                                                    \
+    table = (l1_pgentry_t *)next + l1_table_offset(va);             \
+    if (!(l1e_get_flags(*table) & _PAGE_PRESENT)) {                 \
+        *table = l1e_from_pfn(pa >> PAGE_SHIFT, __PAGE_HYPERVISOR); \
+    }                                                               \
+    unmap_domain_page(next);                                        \
+}
+
+static int create_mapping(unsigned long root,
+                          unsigned long *pages, int nopages,
+                          unsigned long va, unsigned long pa)
+{
+    void *next = map_domain_page(root >> PAGE_SHIFT);
+    int k = 0;
+
+#if CONFIG_PAGING_LEVELS >= 4
+    create_level_mapping(l4, next, pages, nopages, k, va);
+#endif
+#if CONFIG_PAGING_LEVELS >= 3
+    create_level_mapping(l3, next, pages, nopages, k, va);
+#endif
+    create_level_mapping(l2, next, pages, nopages, k, va);
+
+    create_level_1_mapping(next, nopages, va, pa);
+
+    return k;
+}
+
+static int setup_page_table_a(xen_kexec_image_t *image)
+{
+    void *page;
+    int k, n = sizeof(image->page_table_a) / sizeof(image->page_table_a[0]);
+
+    /* clear page_table_a pages */
+
+    for (k = 0; k < n; k++) {
+        if (!image->page_table_a[k])
+            break;
+
+        page = map_domain_page(image->page_table_a[k] >> PAGE_SHIFT);
+        clear_page(page);
+        unmap_domain_page(page);
+    }
+
+    /* check that the first page (root page) is actually non-zero */
+
+    if (k == 0)
+        return -1;
+
+    /* setup fixmap to point to our control page */
+
+    set_fixmap(FIX_KEXEC_PAGE, image->reboot_code_buffer);
+
+    /* fill in page_table_a: create mapping at fixmap address */
+
+    k = create_mapping(image->page_table_a[0],
+                       &image->page_table_a[1],
+                       n - 1, fix_to_virt(FIX_KEXEC_PAGE),
+                       image->reboot_code_buffer);
+    if (k < 0)
+        return -1;
+
+    /* fill in page_table_a: create identity mapping */
+
+    k = create_mapping(image->page_table_a[0],
+                       &image->page_table_a[1 + k],
+                       n - (1 + k), image->reboot_code_buffer,
+                       image->reboot_code_buffer);
+    if (k < 0)
+        return -1;
+
+        return 0;
+}
 
 int machine_kexec_load(int type, xen_kexec_image_t *image)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
-    return -1;
+    return setup_page_table_a(image);
 }
 
 void machine_kexec_unload(int type, xen_kexec_image_t *image)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
 }
 
 void machine_kexec_reserved(xen_kexec_reserve_t *reservation)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    reservation->size = opt_kdump_megabytes << 20;
+    reservation->start = opt_kdump_megabytes_base << 20;
 }
 
-void machine_kexec(xen_kexec_image_t *image)
+static void __machine_shutdown(void *data)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    xen_kexec_image_t *image = (xen_kexec_image_t *)data;
+
+    watchdog_disable();
+    console_start_sync();
+
+    smp_send_stop();
+
+#ifdef CONFIG_X86_IO_APIC
+    disable_IO_APIC();
+#endif   
+
+    machine_kexec(image);
 }
 
 void machine_shutdown(xen_kexec_image_t *image)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    int reboot_cpu_id;
+    cpumask_t reboot_cpu;
+
+    reboot_cpu_id = 0;
+
+    if (!cpu_isset(reboot_cpu_id, cpu_online_map))
+        reboot_cpu_id = smp_processor_id();
+    
+    if (reboot_cpu_id != smp_processor_id()) {
+        cpus_clear(reboot_cpu);
+        cpu_set(reboot_cpu_id, reboot_cpu);
+        on_selected_cpus(reboot_cpu, __machine_shutdown, image, 1, 0);
+        for (;;)
+                ; /* nothing */
+    }
+    else
+        __machine_shutdown(image);
+    BUG();
 }
 
 /*
--- x/xen/arch/x86/setup.c
+++ x/xen/arch/x86/setup.c
@@ -39,6 +39,11 @@ static unsigned int opt_xenheap_megabyte
 integer_param("xenheap_megabytes", opt_xenheap_megabytes);
 #endif
 
+unsigned int opt_kdump_megabytes = 0;
+integer_param("kdump_megabytes", opt_kdump_megabytes);
+unsigned int opt_kdump_megabytes_base = 0;
+integer_param("kdump_megabytes_base", opt_kdump_megabytes_base);
+
 /* opt_nosmp: If true, secondary processors are ignored. */
 static int opt_nosmp = 0;
 boolean_param("nosmp", opt_nosmp);
@@ -220,6 +225,20 @@ static void __init init_idle_domain(void
     setup_idle_pagetable();
 }
 
+void __init move_memory(unsigned long dst, 
+                          unsigned long src_start, unsigned long src_end)
+{
+#if defined(CONFIG_X86_32)
+    memmove((void *)dst,  /* use low mapping */
+            (void *)src_start,      /* use low mapping */
+            src_end - src_start);
+#elif defined(CONFIG_X86_64)
+    memmove(__va(dst),
+            __va(src_start),
+            src_end - src_start);
+#endif
+}
+
 void __init __start_xen(multiboot_info_t *mbi)
 {
     char __cmdline[] = "", *cmdline = __cmdline;
@@ -353,15 +372,8 @@ void __init __start_xen(multiboot_info_t
         initial_images_start = xenheap_phys_end;
     initial_images_end = initial_images_start + modules_length;
 
-#if defined(CONFIG_X86_32)
-    memmove((void *)initial_images_start,  /* use low mapping */
-            (void *)mod[0].mod_start,      /* use low mapping */
-            mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
-#elif defined(CONFIG_X86_64)
-    memmove(__va(initial_images_start),
-            __va(mod[0].mod_start),
-            mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
-#endif
+    move_memory(initial_images_start, 
+                mod[0].mod_start, mod[mbi->mods_count-1].mod_end);
 
     /* Initialise boot-time allocator with all RAM situated after modules. */
     xenheap_phys_start = init_boot_allocator(__pa(&_end));
@@ -409,6 +421,51 @@ void __init __start_xen(multiboot_info_t
 #endif
     }
 
+    if (opt_kdump_megabytes) {
+        unsigned long kdump_start, kdump_size, k;
+
+        /* mark images pages as free for now */
+
+        init_boot_pages(initial_images_start, initial_images_end);
+
+        kdump_start = opt_kdump_megabytes_base << 20;
+        kdump_size = opt_kdump_megabytes << 20;
+
+        printk("Kdump: %luMB (%lukB) at 0x%lx\n", 
+               kdump_size >> 20,
+               kdump_size >> 10,
+               kdump_start);
+
+        if ((kdump_start & ~PAGE_MASK) || (kdump_size & ~PAGE_MASK))
+            panic("Kdump parameters not page aligned\n");
+
+        kdump_start >>= PAGE_SHIFT;
+        kdump_size >>= PAGE_SHIFT;
+
+        /* allocate pages for Kdump memory area */
+
+        k = alloc_boot_pages_at(kdump_size, kdump_start);
+
+        if (k != kdump_start)
+            panic("Unable to reserve Kdump memory\n");
+
+        /* allocate pages for relocated initial images */
+
+        k = ((initial_images_end - initial_images_start) & ~PAGE_MASK) ? 1 : 0;
+        k += (initial_images_end - initial_images_start) >> PAGE_SHIFT;
+
+        k = alloc_boot_pages(k, 1);
+
+        if (!k)
+            panic("Unable to allocate initial images memory\n");
+
+        move_memory(k << PAGE_SHIFT, initial_images_start, initial_images_end);
+
+        initial_images_end -= initial_images_start;
+        initial_images_start = k << PAGE_SHIFT;
+        initial_images_end += initial_images_start;
+    }        
+
     memguard_init();
     percpu_guard_areas();
 
--- x/xen/arch/x86/x86_32/Makefile
+++ x/xen/arch/x86/x86_32/Makefile
@@ -3,5 +3,6 @@ obj-y += entry.o
 obj-y += mm.o
 obj-y += seg_fixup.o
 obj-y += traps.o
+obj-y += machine_kexec.o
 
 obj-$(supervisor_mode_kernel) += supervisor_mode_kernel.o
--- /dev/null
+++ x/xen/arch/x86/x86_32/machine_kexec.c
@@ -0,0 +1,26 @@
+/*
+ * arch/x86/x86_32/machine_kexec.c
+ * Handle transition of Linux booting another kernel
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Should be loosely based on arch/i386/kernel/machine_kexec.c
+ */
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <public/kexec.h>
+
+void machine_kexec(xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/arch/x86/x86_64/Makefile
+++ x/xen/arch/x86/x86_64/Makefile
@@ -1,3 +1,4 @@
 obj-y += entry.o
 obj-y += mm.o
 obj-y += traps.o
+obj-y += machine_kexec.o
--- /dev/null
+++ x/xen/arch/x86/x86_64/machine_kexec.c
@@ -0,0 +1,27 @@
+/******************************************************************************
+ * arch/x86/x86_64/machine_kexec.c
+ * Handle transition of Linux booting another kernel
+ *
+ * Created By: Horms <horms@verge.net.au>
+ *
+ * Should be loosely based on arch/x86_64/kernel/machine_kexec.c
+ */
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/kexec.h>
+
+void machine_kexec(xen_kexec_image_t *image)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/asm-x86/elf.h
@@ -0,0 +1,27 @@
+/******************************************************************************
+ * include/asm-x86/elf.h
+ * 
+ * Created By: Horms
+ *
+ */
+
+#ifndef __X86_ELF_H__
+#define __X86_ELF_H__
+
+#ifdef __x86_64__
+#include <asm/x86_64/elf.h>
+#else
+#include <asm/x86_32/elf.h>
+#endif
+
+#endif /* __X86_ELF_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/include/asm-x86/fixmap.h
+++ x/xen/include/asm-x86/fixmap.h
@@ -36,6 +36,7 @@ enum fixed_addresses {
     FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
     FIX_HPET_BASE,
     FIX_CYCLONE_TIMER,
+    FIX_KEXEC_PAGE,
     __end_of_fixed_addresses
 };
 
--- x/xen/include/asm-x86/hypercall.h
+++ x/xen/include/asm-x86/hypercall.h
@@ -6,6 +6,7 @@
 #define __ASM_X86_HYPERCALL_H__
 
 #include <public/physdev.h>
+#include <xen/types.h>
 
 extern long
 do_event_channel_op_compat(
@@ -87,6 +88,10 @@ extern long
 arch_do_vcpu_op(
     int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg);
 
+extern int
+do_kexec(
+    unsigned long op, unsigned arg1, XEN_GUEST_HANDLE(void) uarg);
+
 #ifdef __x86_64__
 
 extern long
--- x/xen/include/asm-x86/kexec.h
+++ x/xen/include/asm-x86/kexec.h
@@ -8,15 +8,16 @@
 #ifndef __X86_KEXEC_H__
 #define __X86_KEXEC_H__
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#include <asm/processor.h>
 #include <xen/types.h>
+#include <xen/string.h>
 #include <public/xen.h>
 
-static void crash_setup_regs(struct cpu_user_regs *newregs,
-			     struct cpu_user_regs *oldregs)
-{
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
-}
+#ifdef __x86_64__
+#include <asm/x86_64/kexec.h>
+#else
+#include <asm/x86_32/kexec.h>
+#endif
 
 #endif /* __X86_KEXEC_H__ */
 
--- /dev/null
+++ x/xen/include/asm-x86/x86_32/elf.h
@@ -0,0 +1,28 @@
+/******************************************************************************
+ * include/asm-x86/x86_32/elf.h
+ * 
+ * Created By: Horms
+ *
+ * Should be based on include/asm-i386/elf.h:ELF_CORE_COPY_REGS
+ * from Linux 2.6.16
+ */
+
+#ifndef __X86_ELF_X86_32_H__
+#define __X86_ELF_X86_32_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+
+#define ELF_CORE_COPY_REGS(pr_reg, regs)                                \
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+
+#endif /* __X86_ELF_X86_32_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/asm-x86/x86_32/kexec.h
@@ -0,0 +1,48 @@
+/******************************************************************************
+ * include/asm-x86/x86_32/kexec.h
+ * 
+ * Created By: Horms
+ *
+ * Should be based heavily on include/asm-i386/kexec.h from Linux 2.6.16
+ *
+ */
+
+#ifndef __X86_32_KEXEC_H__
+#define __X86_32_KEXEC_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/xen.h>
+
+static void crash_fixup_ss_esp(struct cpu_user_regs *newregs,
+		    struct cpu_user_regs *oldregs)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    return;
+    crash_fixup_ss_esp(newregs, oldregs);
+}
+
+static void crash_setup_regs(struct cpu_user_regs *newregs,
+			     struct cpu_user_regs *oldregs)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+static inline int user_mode(struct cpu_user_regs *regs)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    return -1;
+}
+
+
+#endif /* __X86_32_KEXEC_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/asm-x86/x86_64/elf.h
@@ -0,0 +1,28 @@
+/******************************************************************************
+ * include/asm-x86/x86_64/elf.h
+ * 
+ * Created By: Horms
+ *
+ * Should be based on include/asm-x86_64/elf.h:ELF_CORE_COPY_REGS
+ * from Linux 2.6.16
+ */
+
+#ifndef __X86_ELF_X86_64_H__
+#define __X86_ELF_X86_64_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+
+#define ELF_CORE_COPY_REGS(pr_reg, regs)                                \
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+
+#endif /* __X86_ELF_X86_64_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ x/xen/include/asm-x86/x86_64/kexec.h
@@ -0,0 +1,33 @@
+/******************************************************************************
+ * include/asm-x86/x86_64/kexec.h
+ * 
+ * Created By: Horms
+ *
+ * Should be based heavily on include/asm-x86_64/kexec.h from Linux 2.6.16
+ *
+ */
+
+#ifndef __X86_64_KEXEC_H__
+#define __X86_64_KEXEC_H__
+
+#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
+#include <public/xen.h>
+
+static void crash_setup_regs(struct cpu_user_regs *newregs,
+			     struct cpu_user_regs *oldregs)
+{
+    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+#endif /* __X86_64_KEXEC_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/include/public/kexec.h
+++ x/xen/include/public/kexec.h
@@ -43,6 +43,9 @@
  */
 #define KEXEC_CMD_kexec_load            1
 typedef struct xen_kexec_image {
+#if defined(__i386__) || defined(__x86_64__)
+    unsigned long page_table_a[7];
+#endif
     unsigned long indirection_page;
     unsigned long reboot_code_buffer;
     unsigned long start_address;
--- x/xen/include/xen/elfcore.h
+++ x/xen/include/xen/elfcore.h
@@ -16,6 +16,9 @@
 #include <public/xen.h>
 
 #define NT_PRSTATUS     1
+#define NT_XEN_DOM0_CR3 0x10000001 /* XXX: Hopefully this is unused,
+					   feel free to change to a 
+					   better/different value */
 
 typedef struct
 {

[-- Attachment #5: 51.2.1.1-kexec-x86_32-upstream.patch --]
[-- Type: text/plain, Size: 32749 bytes --]

kexec: x86_32

This is the x86_32 component of kexec for xen.
The x86 component is a prerequisite for this patch.

Signed-Off-By: Horms <horms@verge.net.au>
Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>

 buildconfigs/linux-defconfig_xen_x86_32                                |    2 
 linux-2.6-xen-sparse/arch/i386/Kconfig                                 |    2 
 linux-2.6-xen-sparse/arch/i386/kernel/Makefile                         |    2 
 linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c                      |   29 
 linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h                      |   57 +
 linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h         |    8 
 patches/linux-2.6.16.13/linux-2.6.16-kexec_page_table_a_i386-xen.patch |   59 +
 patches/linux-2.6.16.13/linux-2.6.16-kexec_page_table_a_i386.patch     |  457 ++++++++++
 patches/linux-2.6.16.13/series                                         |    2 
 xen/arch/x86/crash.c                                                   |   47 +
 xen/arch/x86/x86_32/entry.S                                            |    2 
 xen/arch/x86/x86_32/machine_kexec.c                                    |   27 
 xen/include/asm-x86/x86_32/elf.h                                       |   32 
 xen/include/asm-x86/x86_32/kexec.h                                     |   65 +
 14 files changed, 758 insertions(+), 33 deletions(-)

--- x/buildconfigs/linux-defconfig_xen_x86_32
+++ x/buildconfigs/linux-defconfig_xen_x86_32
@@ -184,6 +184,7 @@ CONFIG_MTRR=y
 CONFIG_REGPARM=y
 CONFIG_SECCOMP=y
 CONFIG_HZ_100=y
+CONFIG_KEXEC=y
 # CONFIG_HZ_250 is not set
 # CONFIG_HZ_1000 is not set
 CONFIG_HZ=100
@@ -2775,6 +2776,7 @@ CONFIG_NTFS_FS=m
 #
 CONFIG_PROC_FS=y
 CONFIG_PROC_KCORE=y
+# CONFIG_PROC_VMCORE is not set
 CONFIG_SYSFS=y
 CONFIG_TMPFS=y
 # CONFIG_HUGETLB_PAGE is not set
--- x/linux-2.6-xen-sparse/arch/i386/Kconfig
+++ x/linux-2.6-xen-sparse/arch/i386/Kconfig
@@ -726,7 +726,7 @@ source kernel/Kconfig.hz
 
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && !X86_XEN
+	depends on EXPERIMENTAL && !XEN_UNPRIVILEGED_GUEST
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
--- x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
+++ x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
@@ -89,7 +89,7 @@ include $(srctree)/scripts/Makefile.xen
 
 obj-y += fixup.o
 microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o
-n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o
+n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o crash.o
 
 obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
 obj-y := $(call cherrypickxen, $(obj-y))
--- x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
+++ x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
@@ -68,6 +68,10 @@
 #include "setup_arch_pre.h"
 #include <bios_ebda.h>
 
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
 /* Forward Declaration. */
 void __init find_max_pfn(void);
 
@@ -939,6 +943,7 @@ static void __init parse_cmdline_early (
 		 * after a kernel panic.
 		 */
 		else if (!memcmp(from, "crashkernel=", 12)) {
+#ifndef CONFIG_XEN
 			unsigned long size, base;
 			size = memparse(from+12, &from);
 			if (*from == '@') {
@@ -949,6 +954,10 @@ static void __init parse_cmdline_early (
 				crashk_res.start = base;
 				crashk_res.end   = base + size - 1;
 			}
+#else
+			printk("Ignoring crashkernel command line, "
+			       "parameter will be supplied by xen\n");
+#endif
 		}
 #endif
 #ifdef CONFIG_PROC_VMCORE
@@ -1318,9 +1327,22 @@ void __init setup_bootmem_allocator(void
 	}
 #endif
 #ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
 	if (crashk_res.start != crashk_res.end)
 		reserve_bootmem(crashk_res.start,
 			crashk_res.end - crashk_res.start + 1);
+#else
+	{
+		xen_kexec_reserve_t reservation;
+		BUG_ON(HYPERVISOR_kexec(KEXEC_CMD_kexec_reserve, 0,
+					&reservation));
+		if (reservation.size) {
+			crashk_res.start = reservation.start;
+			crashk_res.end = reservation.start + 
+				reservation.size - 1;
+		}
+	}
+#endif
 #endif
 
 	if (!xen_feature(XENFEAT_auto_translated_physmap))
@@ -1374,6 +1396,10 @@ legacy_init_iomem_resources(struct e820e
 		res->end = res->start + e820[i].size - 1;
 		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 		request_resource(&iomem_resource, res);
+#ifdef CONFIG_KEXEC
+		request_resource(res, &crashk_res);
+#endif
+
 		if (e820[i].type == E820_RAM) {
 			/*
 			 *  We don't know which RAM region contains kernel data,
@@ -1382,9 +1408,6 @@ legacy_init_iomem_resources(struct e820e
 			 */
 			request_resource(res, code_resource);
 			request_resource(res, data_resource);
-#ifdef CONFIG_KEXEC
-			request_resource(res, &crashk_res);
-#endif
 		}
 	}
 }
--- /dev/null
+++ x/linux-2.6-xen-sparse/include/asm-i386/kexec-xen.h
@@ -0,0 +1,57 @@
+/*
+ * include/asm-i386/kexec-xen.h
+ *
+ * Created By: Horms <horms@verge.net.au>
+ */
+
+#ifndef _I386_KEXEC_XEN_H
+#define _I386_KEXEC_XEN_H
+
+#include <asm/ptrace.h>
+#include <asm/types.h>
+#include <xen/interface/arch-x86_32.h>
+
+static inline void crash_translate_regs(struct pt_regs *linux_regs,
+					struct cpu_user_regs *xen_regs)
+{
+	xen_regs->ebx    = linux_regs->ebx;
+	xen_regs->ecx    = linux_regs->ecx;
+	xen_regs->edx    = linux_regs->edx;
+	xen_regs->esi    = linux_regs->esi;
+	xen_regs->edi    = linux_regs->edi;
+	xen_regs->ebp    = linux_regs->ebp;
+	xen_regs->eax    = linux_regs->eax;
+	xen_regs->esp    = linux_regs->esp;
+	xen_regs->ss     = linux_regs->xss;
+	xen_regs->cs     = linux_regs->xcs;
+	xen_regs->ds     = linux_regs->xds;
+	xen_regs->es     = linux_regs->xes;
+	xen_regs->eflags = linux_regs->eflags;
+}
+
+/* Kexec needs to know about the actual physical address.
+ * But in xen, on some architectures, a physical address is a
+ * pseudo-physical address. */
+#ifdef CONFIG_XEN
+#define kexec_page_to_pfn(page)  pfn_to_mfn(page_to_pfn(page))
+#define kexec_pfn_to_page(pfn)   pfn_to_page(mfn_to_pfn(pfn))
+#define kexec_virt_to_phys(addr) virt_to_machine(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
+#else
+#define kexec_page_to_pfn(page)  page_to_pfn(page)
+#define kexec_pfn_to_page(pfn)   pfn_to_page(pfn)
+#define kexec_virt_to_phys(addr) virt_to_phys(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(addr)
+#endif
+
+#endif /* _I386_KEXEC_XEN_H */
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
+++ x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
@@ -385,5 +385,13 @@ HYPERVISOR_xenoprof_op(
 	return _hypercall2(int, xenoprof_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_kexec(
+	unsigned long op, unsigned int arg1, void * extra_args)
+{
+	return _hypercall3(int, kexec_op, op, arg1, extra_args);
+}
+
+
 
 #endif /* __HYPERCALL_H__ */
--- x/xen/arch/x86/crash.c
+++ x/xen/arch/x86/crash.c
@@ -21,6 +21,7 @@
 #include <xen/delay.h>
 #include <xen/perfc.h>
 #include <xen/kexec.h>
+#include <xen/sched.h>
 #include <public/xen.h>
 
 static int crashing_cpu;
@@ -169,6 +170,51 @@ static void nmi_shootdown_cpus(void)
 }
 #endif
 
+/* The cr3 for dom0 on each of its vcpus
+ * It is added as ELF_Prstatus prstatus.pr_reg[ELF_NGREG-1], where
+ * prstatus is the data of the elf note, and ELF_NGREG was extended
+ * by one to allow extra space.
+ * This code runs after all cpus except the crashing one have
+ * been shutdown so as to avoid having to hold domlist_lock,
+ * as locking after a crash is playing with fire */
+void find_dom0_cr3(void)
+{
+	struct domain *d;
+	struct vcpu   *v;
+	uint32_t *buf;
+	uint32_t cr3;
+	Elf_Note note;
+
+	/* Don't need to grab domlist_lock as we are the only thing running */
+
+	/* No need to traverse domain_list, as dom0 is always first */
+	d = domain_list;
+	BUG_ON(d->domain_id);
+
+	for_each_vcpu ( d, v ) {
+		if ( test_bit(_VCPUF_down, &v->vcpu_flags) )
+			continue;
+		buf = (uint32_t *)per_cpu(crash_notes, v->processor);
+		if (!buf) /* XXX: Can this ever occur? */
+			continue;
+
+		memcpy(&note, buf, sizeof(Elf_Note));
+		buf += (sizeof(Elf_Note) +3)/4 + (note.namesz + 3)/4 +
+			(note.descsz + 3)/4;
+
+		/* XXX: This probably doesn't take into account shadow mode,
+		 * but that might not be a problem */
+		cr3 = pagetable_get_pfn(v->arch.guest_table);
+
+		buf = append_elf_note(buf, "Xen Domanin-0 CR3",
+			NT_XEN_DOM0_CR3, &cr3, 4);
+		final_note(buf);
+
+		printk("domain:%i vcpu:%u processor:%u cr3:%08x\n", 
+		       d->domain_id, v->vcpu_id, v->processor, cr3);
+	}
+}
+
 void machine_crash_shutdown(struct cpu_user_regs *regs)
 {
 	printk("machine_crash_shutdown: %d\n", smp_processor_id());
@@ -180,6 +226,7 @@ void machine_crash_shutdown(struct cpu_u
         disable_IO_APIC();
 #endif
 	crash_save_self(regs);
+	find_dom0_cr3();
 }
 
 /*
--- x/xen/arch/x86/x86_32/entry.S
+++ x/xen/arch/x86/x86_32/entry.S
@@ -660,6 +660,7 @@ ENTRY(hypercall_table)
         .long do_hvm_op
         .long do_sysctl             /* 35 */
         .long do_domctl
+        .long do_kexec_op
         .rept NR_hypercalls-((.-hypercall_table)/4)
         .long do_ni_hypercall
         .endr
@@ -702,6 +703,7 @@ ENTRY(hypercall_args_table)
         .byte 2 /* do_hvm_op            */
         .byte 1 /* do_sysctl            */  /* 35 */
         .byte 1 /* do_domctl            */
+        .byte 1 /* do_kexec_op          */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
--- x/xen/arch/x86/x86_32/machine_kexec.c
+++ x/xen/arch/x86/x86_32/machine_kexec.c
@@ -1,18 +1,31 @@
-/*
+/******************************************************************************
  * arch/x86/x86_32/machine_kexec.c
- * Handle transition of Linux booting another kernel
- *
- * Created By: Horms <horms@verge.net.au>
+ * 
+ * Created By: Horms
  *
- * Should be losely based on arch/i386/kernel/machine_kexec.c
+ * Based heavily on arch/i386/kernel/machine_kexec.c from Linux 2.6.16
  */
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/types.h>
 #include <public/kexec.h>
+#include <asm/fixmap.h>
+#include <asm/processor.h>
+
+typedef asmlinkage void (*relocate_new_kernel_t)(
+                    unsigned long indirection_page,
+                    unsigned long reboot_code_buffer,
+                    unsigned long start_address,
+                    unsigned long page_table_a,
+                    unsigned long has_pae);
 
 void machine_kexec(xen_kexec_image_t *image)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    relocate_new_kernel_t rnk;
+
+    rnk = (relocate_new_kernel_t) fix_to_virt(FIX_KEXEC_PAGE);
+    (*rnk)(image->indirection_page, image->reboot_code_buffer, 
+           image->start_address, image->page_table_a[0],
+           (unsigned long)cpu_has_pae);
 }
 
 /*
--- x/xen/include/asm-x86/x86_32/elf.h
+++ x/xen/include/asm-x86/x86_32/elf.h
@@ -3,17 +3,39 @@
  * 
  * Created By: Horms
  *
- * Should pull be based on include/asm-i386/elf.h:ELF_CORE_COPY_REGS
- * from Linux 2.6.16
+ * Based heavily on include/asm-i386/elf.h and 
+ * include/asm-i386/system.h from Linux 2.6.16
  */
 
 #ifndef __X86_ELF_X86_32_H__
 #define __X86_ELF_X86_32_H__
 
-#include <xen/lib.h>       /* for printk() used in stub */
+/* XXX: Xen doesn't have orig_eax.  For kdump, on a dom0 crash, the values
+ * for the crashing CPU could be passed down from dom0, but is that
+ * necessary?
+ * Also, I'm not sure why fs and gs are derived from the CPU
+ * rather than regs */
 
-#define ELF_CORE_COPY_REGS(pr_reg, regs)                                \
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+#define ELF_CORE_COPY_REGS(pr_reg, regs) do {                           \
+    unsigned i;                                                         \
+    pr_reg[0] = regs->ebx;                                              \
+    pr_reg[1] = regs->ecx;                                              \
+    pr_reg[2] = regs->edx;                                              \
+    pr_reg[3] = regs->esi;                                              \
+    pr_reg[4] = regs->edi;                                              \
+    pr_reg[5] = regs->ebp;                                              \
+    pr_reg[6] = regs->eax;                                              \
+    pr_reg[7] = regs->ds;                                               \
+    pr_reg[8] = regs->es;                                               \
+    asm volatile("mov %%fs,%0":"=rm" (i)); pr_reg[9]  = i;              \
+    asm volatile("mov %%gs,%0":"=rm" (i)); pr_reg[10] = i;              \
+    pr_reg[11] = 0; /* regs->orig_eax; */                               \
+    pr_reg[12] = regs->eip;                                             \
+    pr_reg[13] = regs->cs;                                              \
+    pr_reg[14] = regs->eflags;                                          \
+    pr_reg[15] = regs->esp;                                             \
+    pr_reg[16] = regs->ss;                                              \
+} while(0);
 
 #endif /* __X86_ELF_X86_32_H__ */
 
--- x/xen/include/asm-x86/x86_32/kexec.h
+++ x/xen/include/asm-x86/x86_32/kexec.h
@@ -3,39 +3,72 @@
  * 
  * Created By: Horms
  *
- * Should be based heavily on include/asm-i386/kexec.h from Linux 2.6.16
- *
+ * Based heavily on include/asm-i386/kexec.h from Linux 2.6.16
  */
 
-#ifndef __X86_32_KEXEC_H__
-#define __X86_32_KEXEC_H__
-
-#include <xen/lib.h>       /* for printk() used in stub */
-#include <xen/types.h>
-#include <public/xen.h>
+#ifndef __X86_KEXEC_X86_32_H__
+#define __X86_KEXEC_X86_32_H__
 
+/* CPU does not save ss and esp on stack if execution is already
+ * running in kernel mode at the time of NMI occurrence. This code
+ * fixes it.
+ */
 static void crash_fixup_ss_esp(struct cpu_user_regs *newregs,
-		    struct cpu_user_regs *oldregs)
+                    struct cpu_user_regs *oldregs)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
-    return;
-    crash_fixup_ss_esp(newregs, oldregs);
+    memcpy(newregs, oldregs, sizeof(*newregs));
+    newregs->esp = (unsigned long)&(oldregs->esp);
+    __asm__ __volatile__(
+            "xorl %%eax, %%eax\n\t"
+            "movw %%ss, %%ax\n\t"
+            :"=a"(newregs->ss));
 }
 
+/*
+ * This function is responsible for capturing register states if coming
+ * via panic otherwise just fix up the ss and esp if coming via kernel
+ * mode exception.
+ */
 static void crash_setup_regs(struct cpu_user_regs *newregs,
 			     struct cpu_user_regs *oldregs)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    if (oldregs)
+        crash_fixup_ss_esp(newregs, oldregs);
+    else {
+        __asm__ __volatile__("movl %%ebx,%0" : "=m"(newregs->ebx));
+        __asm__ __volatile__("movl %%ecx,%0" : "=m"(newregs->ecx));
+        __asm__ __volatile__("movl %%edx,%0" : "=m"(newregs->edx));
+        __asm__ __volatile__("movl %%esi,%0" : "=m"(newregs->esi));
+        __asm__ __volatile__("movl %%edi,%0" : "=m"(newregs->edi));
+        __asm__ __volatile__("movl %%ebp,%0" : "=m"(newregs->ebp));
+        __asm__ __volatile__("movl %%eax,%0" : "=m"(newregs->eax));
+        __asm__ __volatile__("movl %%esp,%0" : "=m"(newregs->esp));
+        __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(newregs->ss));
+        __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(newregs->cs));
+        __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(newregs->ds));
+        __asm__ __volatile__("movw %%es, %%ax;" :"=a"(newregs->es));
+        __asm__ __volatile__("pushfl; popl %0" :"=m"(newregs->eflags));
+
+        newregs->eip = (unsigned long)current_text_addr();
+    }
 }
 
+/*              
+ * From Linux 2.6.16's include/asm-i386/mach-xen/asm/ptrace.h
+ *
+ * user_mode_vm(regs) determines whether a register set came from user mode.
+ * This is true if V8086 mode was enabled OR if the register set was from
+ * protected mode with RPL-3 CS value.  This tricky test checks that with
+ * one comparison.  Many places in the kernel can bypass this full check
+ * if they have already ruled out V8086 mode, so user_mode(regs) can be used.
+ */
 static inline int user_mode(struct cpu_user_regs *regs)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
-    return -1;
+    return (regs->cs & 2) != 0;
 }
 
 
-#endif /* __X86_32_KEXEC_H__ */
+#endif /* __X86_KEXEC_X86_32_H__ */
 
 /*
  * Local variables:
--- x/patches/linux-2.6.16.13/series
+++ x/patches/linux-2.6.16.13/series
@@ -1,5 +1,7 @@
 kexec-generic.patch
 linux-2.6.16-kexec_page_table_a_stubs.patch
+linux-2.6.16-kexec_page_table_a_i386.patch
+linux-2.6.16-kexec_page_table_a_i386-xen.patch
 blktap-aio-16_03_06.patch
 device_bind.patch
 fix-hz-suspend.patch
--- /dev/null
+++ x/patches/linux-2.6.16.13/linux-2.6.16-kexec_page_table_a_i386.patch
@@ -0,0 +1,457 @@
+kexec: Avoid overwriting the current pgd (V2, i386)
+
+This patch upgrades the i386-specific kexec code to avoid overwriting the
+current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used
+to start a secondary kernel that dumps the memory of the previous kernel.
+
+The code introduces a new set of page tables called "page_table_a". These
+tables are used to provide an executable identity mapping without overwriting
+the current pgd. This updated version of the patch fixes a PAE bug and moves
+the segment handling code into relocate_kernel.S.
+
+Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
+---
+
+ The patch has been tested with regular kexec and CONFIG_CRASH_DUMP.
+ Both PAE and non-PAE configurations work well.
+ Applies on top of 2.6.16 and 2.6.17-rc4.
+
+ arch/i386/kernel/machine_kexec.c   |  230 ++++++++++++++----------------------
+ arch/i386/kernel/relocate_kernel.S |   92 ++++++++++++++
+ include/asm-i386/kexec.h           |   12 +
+ 3 files changed, 192 insertions(+), 142 deletions(-)
+
+--- x/arch/i386/kernel/machine_kexec.c
++++ x/arch/i386/kernel/machine_kexec.c
+@@ -2,6 +2,10 @@
+  * machine_kexec.c - handle transition of Linux booting another kernel
+  * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
+  *
++ * 2006-05-19 Magnus Damm <damm@opensource.se>:
++ * - rewrote identity map code to avoid overwriting current pgd
++ * - moved segment handling code into relocate_kernel.S
++ *
+  * This source code is licensed under the GNU General Public License,
+  * Version 2.  See the file COPYING for more details.
+  */
+@@ -19,123 +23,73 @@
+ #include <asm/desc.h>
+ #include <asm/system.h>
+ 
+-#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+-
+-#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+-#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+-#define L2_ATTR (_PAGE_PRESENT)
+-
+-#define LEVEL0_SIZE (1UL << 12UL)
++typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)(
++					unsigned long indirection_page,
++					unsigned long reboot_code_buffer,
++					unsigned long start_address,
++					unsigned long page_table_a,
++					unsigned long has_pae) ATTRIB_NORET;
+ 
+-#ifndef CONFIG_X86_PAE
+-#define LEVEL1_SIZE (1UL << 22UL)
+-static u32 pgtable_level1[1024] PAGE_ALIGNED;
++const extern unsigned char relocate_new_kernel[];
++extern void relocate_new_kernel_end(void);
++const extern unsigned int relocate_new_kernel_size;
+ 
+-static void identity_map_page(unsigned long address)
++static int allocate_page_table_a(struct kimage *image)
+ {
+-	unsigned long level1_index, level2_index;
+-	u32 *pgtable_level2;
+-
+-	/* Find the current page table */
+-	pgtable_level2 = __va(read_cr3());
++	struct kimage_arch *arch = &image->arch_data;
++	struct page *page;
++	int k = sizeof(arch->page_table_a) / sizeof(arch->page_table_a[0]);
++
++	for (; k > 0; k--) {
++		page = kimage_alloc_control_pages(image, 0);
++		if (!page)
++			return -ENOMEM;
++
++		clear_page(page_address(page));
++		arch->page_table_a[k - 1] = page;
++	}
+ 
+-	/* Find the indexes of the physical address to identity map */
+-	level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
+-	level2_index = address / LEVEL1_SIZE;
+-
+-	/* Identity map the page table entry */
+-	pgtable_level1[level1_index] = address | L0_ATTR;
+-	pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
+-
+-	/* Flush the tlb so the new mapping takes effect.
+-	 * Global tlb entries are not flushed but that is not an issue.
+-	 */
+-	load_cr3(pgtable_level2);
++	return 0;
+ }
+ 
+-#else
+-#define LEVEL1_SIZE (1UL << 21UL)
+-#define LEVEL2_SIZE (1UL << 30UL)
+-static u64 pgtable_level1[512] PAGE_ALIGNED;
+-static u64 pgtable_level2[512] PAGE_ALIGNED;
+-
+-static void identity_map_page(unsigned long address)
+-{
+-	unsigned long level1_index, level2_index, level3_index;
+-	u64 *pgtable_level3;
++/* workaround for include/asm-i386/pgtable-3level.h */
+ 
+-	/* Find the current page table */
+-	pgtable_level3 = __va(read_cr3());
+-
+-	/* Find the indexes of the physical address to identity map */
+-	level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
+-	level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE;
+-	level3_index = address / LEVEL2_SIZE;
+-
+-	/* Identity map the page table entry */
+-	pgtable_level1[level1_index] = address | L0_ATTR;
+-	pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
+-	set_64bit(&pgtable_level3[level3_index],
+-					       __pa(pgtable_level2) | L2_ATTR);
+-
+-	/* Flush the tlb so the new mapping takes effect.
+-	 * Global tlb entries are not flushed but that is not an issue.
+-	 */
+-	load_cr3(pgtable_level3);
+-}
++#ifdef CONFIG_X86_PAE
++#undef pgd_present
++#define pgd_present(pgd) (pgd_val(pgd) & _PAGE_PRESENT)
++#define _PGD_ATTR _PAGE_PRESENT
++#else
++#define _PGD_ATTR _KERNPG_TABLE
+ #endif
+ 
+-static void set_idt(void *newidt, __u16 limit)
+-{
+-	struct Xgt_desc_struct curidt;
+-
+-	/* ia32 supports unaliged loads & stores */
+-	curidt.size    = limit;
+-	curidt.address = (unsigned long)newidt;
+-
+-	load_idt(&curidt);
+-};
++#define pa_page(page) __pa(page_address(page))
+ 
+-
+-static void set_gdt(void *newgdt, __u16 limit)
++static int create_mapping(struct page *root, struct page **pages, 
++			  unsigned long va, unsigned long pa)
+ {
+-	struct Xgt_desc_struct curgdt;
+-
+-	/* ia32 supports unaligned loads & stores */
+-	curgdt.size    = limit;
+-	curgdt.address = (unsigned long)newgdt;
++	pgd_t *pgd;
++	pud_t *pud;
++	pmd_t *pmd;
++	pte_t *pte;
++	int k = 0;
+ 
+-	load_gdt(&curgdt);
+-};
++	pgd = (pgd_t *)page_address(root) + pgd_index(va);
++	if (!pgd_present(*pgd))
++		set_pgd(pgd, __pgd(pa_page(pages[k++]) | _PGD_ATTR));
+ 
+-static void load_segments(void)
+-{
+-#define __STR(X) #X
+-#define STR(X) __STR(X)
++	pud = pud_offset(pgd, va);
++	if (!pud_present(*pud))
++		set_pud(pud, __pud(pa_page(pages[k++]) | _KERNPG_TABLE));
+ 
+-	__asm__ __volatile__ (
+-		"\tljmp $"STR(__KERNEL_CS)",$1f\n"
+-		"\t1:\n"
+-		"\tmovl $"STR(__KERNEL_DS)",%%eax\n"
+-		"\tmovl %%eax,%%ds\n"
+-		"\tmovl %%eax,%%es\n"
+-		"\tmovl %%eax,%%fs\n"
+-		"\tmovl %%eax,%%gs\n"
+-		"\tmovl %%eax,%%ss\n"
+-		::: "eax", "memory");
+-#undef STR
+-#undef __STR
+-}
++	pmd = pmd_offset(pud, va);
++	if (!pmd_present(*pmd))
++		set_pmd(pmd, __pmd(pa_page(pages[k++]) | _KERNPG_TABLE));
+ 
+-typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)(
+-					unsigned long indirection_page,
+-					unsigned long reboot_code_buffer,
+-					unsigned long start_address,
+-					unsigned int has_pae) ATTRIB_NORET;
++	pte = (pte_t *)page_address(pmd_page(*pmd)) + pte_index(va);
++	set_pte(pte, __pte(pa | _PAGE_KERNEL_EXEC));
+ 
+-const extern unsigned char relocate_new_kernel[];
+-extern void relocate_new_kernel_end(void);
+-const extern unsigned int relocate_new_kernel_size;
++	return k;
++}
+ 
+ /*
+  * A architecture hook called to validate the
+@@ -147,11 +101,38 @@ const extern unsigned int relocate_new_k
+  * Do what every setup is needed on image and the
+  * reboot code buffer to allow us to avoid allocations
+  * later.
+- *
+- * Currently nothing.
+  */
+ int machine_kexec_prepare(struct kimage *image)
+ {
++	void *control_page;
++	unsigned long pa;
++	int k;
++
++	memset(&image->arch_data, 0, sizeof(image->arch_data));
++
++	k = allocate_page_table_a(image);
++	if (k)
++		return k;
++
++	/* fill in control_page with assembly code */
++
++	control_page = page_address(image->control_code_page);
++	memcpy(control_page, relocate_new_kernel, relocate_new_kernel_size);
++
++	/* map the control_page at the virtual address of relocate_kernel.S */
++
++	pa = __pa(control_page);
++
++	k = create_mapping(image->arch_data.page_table_a[0], 
++			   &image->arch_data.page_table_a[1],
++			   (unsigned long)relocate_new_kernel, pa);
++
++	/* identity map the control_page */
++
++	create_mapping(image->arch_data.page_table_a[0], 
++		       &image->arch_data.page_table_a[k + 1],
++		       pa, pa);
++
+ 	return 0;
+ }
+ 
+@@ -170,45 +151,16 @@ void machine_kexec_cleanup(struct kimage
+ NORET_TYPE void machine_kexec(struct kimage *image)
+ {
+ 	unsigned long page_list;
+-	unsigned long reboot_code_buffer;
+-
++	unsigned long control_code;
++	unsigned long page_table_a;
+ 	relocate_new_kernel_t rnk;
+ 
+-	/* Interrupts aren't acceptable while we reboot */
+-	local_irq_disable();
+-
+-	/* Compute some offsets */
+-	reboot_code_buffer = page_to_pfn(image->control_code_page)
+-								<< PAGE_SHIFT;
+ 	page_list = image->head;
+-
+-	/* Set up an identity mapping for the reboot_code_buffer */
+-	identity_map_page(reboot_code_buffer);
+-
+-	/* copy it out */
+-	memcpy((void *)reboot_code_buffer, relocate_new_kernel,
+-						relocate_new_kernel_size);
+-
+-	/* The segment registers are funny things, they are
+-	 * automatically loaded from a table, in memory wherever you
+-	 * set them to a specific selector, but this table is never
+-	 * accessed again you set the segment to a different selector.
+-	 *
+-	 * The more common model is are caches where the behide
+-	 * the scenes work is done, but is also dropped at arbitrary
+-	 * times.
+-	 *
+-	 * I take advantage of this here by force loading the
+-	 * segments, before I zap the gdt with an invalid value.
+-	 */
+-	load_segments();
+-	/* The gdt & idt are now invalid.
+-	 * If you want to load them you must set up your own idt & gdt.
+-	 */
+-	set_gdt(phys_to_virt(0),0);
+-	set_idt(phys_to_virt(0),0);
++	control_code = __pa(page_address(image->control_code_page));
++	page_table_a = __pa(page_address(image->arch_data.page_table_a[0]));
+ 
+ 	/* now call it */
+-	rnk = (relocate_new_kernel_t) reboot_code_buffer;
+-	(*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae);
++	rnk = (relocate_new_kernel_t) relocate_new_kernel;
++	(*rnk)(page_list, control_code, image->start, 
++	       page_table_a, (unsigned long)cpu_has_pae);
+ }
+--- x/arch/i386/kernel/relocate_kernel.S
++++ x/arch/i386/kernel/relocate_kernel.S
+@@ -2,12 +2,20 @@
+  * relocate_kernel.S - put the kernel image in place to boot
+  * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
+  *
++ * 2006-05-19 Magnus Damm <damm@opensource.se>:
++ * - moved segment handling code from machine_kexec.c
++ * - gdt tables stolen from arch/i386/boot/setup.S
++ *
+  * This source code is licensed under the GNU General Public License,
+  * Version 2.  See the file COPYING for more details.
+  */
+ 
+ #include <linux/linkage.h>
++#include <asm/page.h>
+ 
++.text
++.align (1 << PAGE_SHIFT)
++	
+ 	/*
+ 	 * Must be relocatable PIC code callable as a C function, that once
+ 	 * it starts can not use the previous processes stack.
+@@ -18,18 +26,68 @@ relocate_new_kernel:
+ 	movl  4(%esp), %ebx /* page_list */
+ 	movl  8(%esp), %ebp /* reboot_code_buffer */
+ 	movl  12(%esp), %edx /* start address */
+-	movl  16(%esp), %ecx /* cpu_has_pae */
++	movl  16(%esp), %edi /* page_table_a */
++	movl  20(%esp), %ecx /* cpu_has_pae */
+ 
+ 	/* zero out flags, and disable interrupts */
+ 	pushl $0
+ 	popfl
+ 
++	/* switch to page_table_a */
++	movl	%edi, %eax
++	movl	%eax, %cr3
++
++	/* setup idt */
++
++	movl	%ebp, %eax
++	addl	$(idt_48 - relocate_new_kernel), %eax
++	lidtl	(%eax)
++
++	/* setup gdt */
++
++	movl	%ebp, %eax
++	addl	$(gdt - relocate_new_kernel), %eax
++	movl	%ebp, %esi
++	addl	$((gdt_48 - relocate_new_kernel) + 2), %esi
++	movl	%eax, (%esi)
++	
++	movl	%ebp, %eax
++	addl	$(gdt_48 - relocate_new_kernel), %eax
++	lgdtl	(%eax)
++
++	/* setup data segment registers */
++	
++	mov	$(gdt_ds - gdt), %eax
++	mov	%eax, %ds
++	mov	%eax, %es
++	mov	%eax, %fs
++	mov	%eax, %gs
++	mov	%eax, %ss
++
+ 	/* set a new stack at the bottom of our page... */
+ 	lea   4096(%ebp), %esp
+ 
++	/* load new code segment */
++
++	movl	%ebp, %esi
++	xorl	%eax, %eax
++	pushl	%eax
++	pushl	%esi
++	pushl	%eax
++	
++	movl	$(gdt_cs - gdt), %eax
++	pushl	%eax
++	
++	movl	%ebp, %eax
++	addl	$(identity_mapped - relocate_new_kernel),%eax
++	pushl	%eax
++	iretl
++
++identity_mapped:	
++
+ 	/* store the parameters back on the stack */
+ 	pushl   %edx /* store the start address */
+-
++	
+ 	/* Set cr0 to a known state:
+ 	 * 31 0 == Paging disabled
+ 	 * 18 0 == Alignment check disabled
+@@ -113,6 +171,36 @@ relocate_new_kernel:
+ 	xorl    %edi, %edi
+ 	xorl    %ebp, %ebp
+ 	ret
++
++	.align	16
++gdt:
++	.fill	1,8,0
++
++gdt_cs:	
++	.word	0xFFFF				# 4Gb - (0x100000*0x1000 = 4Gb)
++	.word	0				# base address = 0
++	.word	0x9A00				# code read/exec
++	.word	0x00CF				# granularity = 4096, 386
++						#  (+5th nibble of limit)
++gdt_ds:
++	.word	0xFFFF				# 4Gb - (0x100000*0x1000 = 4Gb)
++	.word	0				# base address = 0
++	.word	0x9200				# data read/write
++	.word	0x00CF				# granularity = 4096, 386
++						#  (+5th nibble of limit)
++gdt_end:
++	.align	4
++	
++	.word	0				# alignment byte
++idt_48:
++	.word	0				# idt limit = 0
++	.word	0, 0				# idt base = 0L
++
++	.word	0				# alignment byte
++gdt_48:
++	.word	gdt_end - gdt - 1		# gdt limit
++	.word	0, 0				# gdt base (filled in later)
++	
+ relocate_new_kernel_end:
+ 
+ 	.globl relocate_new_kernel_size
+--- x/include/asm-i386/kexec.h
++++ x/include/asm-i386/kexec.h
+@@ -29,7 +29,17 @@
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
+-struct kimage_arch {};
++struct kimage_arch {
++       /* page_table_a[] holds enough pages to create a new page table
++        * that maps the control page twice..
++        */
++
++#if defined(CONFIG_X86_PAE)
++       struct page *page_table_a[5]; /* (2 * pte) + (2 * pmd) + pgd */
++#else
++       struct page *page_table_a[3]; /* (2 * pte) + pgd */
++#endif
++};
+ 
+ /* CPU does not save ss and esp on stack if execution is already
+  * running in kernel mode at the time of NMI occurrence. This code
--- /dev/null
+++ x/patches/linux-2.6.16.13/linux-2.6.16-kexec_page_table_a_i386-xen.patch
@@ -0,0 +1,59 @@
+kexec: xen specific portions of the page table a patch for kexec
+
+Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
+
+ arch/i386/kernel/machine_kexec.c |   23 +++++++++++++++++++++++
+ 1 file changed, 23 insertions(+)
+
+--- x/arch/i386/kernel/machine_kexec.c	2006-07-12 13:16:20.000000000 +0900
++++ x/arch/i386/kernel/machine_kexec.c	2006-07-12 13:16:38.000000000 +0900
+@@ -23,15 +23,23 @@
+ #include <asm/desc.h>
+ #include <asm/system.h>
+ 
++#ifdef CONFIG_XEN
++#include <xen/interface/kexec.h>
++#endif
++
++#ifndef CONFIG_XEN
+ typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)(
+ 					unsigned long indirection_page,
+ 					unsigned long reboot_code_buffer,
+ 					unsigned long start_address,
+ 					unsigned long page_table_a,
+ 					unsigned long has_pae) ATTRIB_NORET;
++#endif
+ 
+ const extern unsigned char relocate_new_kernel[];
++#ifndef CONFIG_XEN
+ extern void relocate_new_kernel_end(void);
++#endif
+ const extern unsigned int relocate_new_kernel_size;
+ 
+ static int allocate_page_table_a(struct kimage *image)
+@@ -144,6 +152,7 @@
+ {
+ }
+ 
++#ifndef CONFIG_XEN
+ /*
+  * Do not allocate memory (or fail in any way) in machine_kexec().
+  * We are past the point of no return, committed to rebooting now.
+@@ -164,3 +173,17 @@
+ 	(*rnk)(page_list, control_code, image->start, 
+ 	       page_table_a, (unsigned long)cpu_has_pae);
+ }
++#endif
++
++#ifdef CONFIG_XEN
++void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
++{
++	struct kimage_arch *arch = &image->arch_data;
++	int k, n = sizeof(arch->page_table_a) / sizeof(arch->page_table_a[0]);
++
++	for (k = 0; k < n; k++)
++		xki->page_table_a[k] =
++			pfn_to_mfn(page_to_pfn(arch->page_table_a[k]))
++				<< PAGE_SHIFT;
++}
++#endif

[-- Attachment #6: 51.2.1.2-kexec-x86_64-upstream.patch --]
[-- Type: text/plain, Size: 31657 bytes --]

kexec: x86_64

This is the first x86_64 release of kexec for xen/dom0. The code is in an
early phase, but it compiles and kexec:ing into a Linux kernel seems to work 
well. Rebooting into a new kernel may work using kdump too, but register
saving support is still missing.

The x86 component is a prerequisite for this patch.

Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>
Signed-Off-By: Horms <horms@verge.net.au>

 buildconfigs/linux-defconfig_xen_x86_64                                  |    1 
 linux-2.6-xen-sparse/arch/x86_64/Kconfig                                 |    2 
 linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile                         |    2 
 linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c                      |   26 
 linux-2.6-xen-sparse/include/asm-x86_64/kexec-xen.h                      |   64 +
 linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h         |    7 
 linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h            |    2 
 patches/linux-2.6.16.13/linux-2.6.16-kexec_page_table_a_x86_64-xen.patch |  151 +++
 patches/linux-2.6.16.13/linux-2.6.16-kexec_page_table_a_x86_64.patch     |  421 ++++++++++
 patches/linux-2.6.16.13/series                                           |    2 
 xen/arch/x86/x86_64/entry.S                                              |    2 
 xen/arch/x86/x86_64/machine_kexec.c                                      |   21 
 xen/include/asm-x86/x86_64/elf.h                                         |   48 +
 xen/include/asm-x86/x86_64/kexec.h                                       |   33 
 xen/include/public/kexec.h                                               |    3 
 15 files changed, 771 insertions(+), 14 deletions(-)

--- x/buildconfigs/linux-defconfig_xen_x86_64
+++ x/buildconfigs/linux-defconfig_xen_x86_64
@@ -139,6 +139,7 @@ CONFIG_SWIOTLB=y
 CONFIG_PHYSICAL_START=0x100000
 CONFIG_SECCOMP=y
 CONFIG_HZ_100=y
+CONFIG_KEXEC=y
 # CONFIG_HZ_250 is not set
 # CONFIG_HZ_1000 is not set
 CONFIG_HZ=100
--- x/linux-2.6-xen-sparse/arch/x86_64/Kconfig
+++ x/linux-2.6-xen-sparse/arch/x86_64/Kconfig
@@ -433,7 +433,7 @@ config X86_MCE_AMD
 
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && !X86_64_XEN
+	depends on EXPERIMENTAL && !XEN_UNPRIVILEGED_GUEST
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
--- x/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile
+++ x/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile
@@ -59,7 +59,7 @@ pci-dma-y			+= ../../i386/kernel/pci-dma
 microcode-$(subst m,y,$(CONFIG_MICROCODE))  := ../../i386/kernel/microcode-xen.o
 quirks-y			:= ../../i386/kernel/quirks-xen.o
 
-n-obj-xen := i8259.o reboot.o i8237.o smpboot.o trampoline.o
+n-obj-xen := i8259.o reboot.o i8237.o smpboot.o trampoline.o crash.o
 
 include $(srctree)/scripts/Makefile.xen
 
--- x/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c
+++ x/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c
@@ -79,6 +79,10 @@
 #include <asm/mach-xen/setup_arch_post.h>
 #include <xen/interface/memory.h>
 
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
 extern unsigned long start_pfn;
 extern struct edid_info edid_info;
 
@@ -446,6 +450,7 @@ static __init void parse_cmdline_early (
 		 * after a kernel panic.
 		 */
 		else if (!memcmp(from, "crashkernel=", 12)) {
+#ifndef CONFIG_XEN
 			unsigned long size, base;
 			size = memparse(from+12, &from);
 			if (*from == '@') {
@@ -456,6 +461,10 @@ static __init void parse_cmdline_early (
 				crashk_res.start = base;
 				crashk_res.end   = base + size - 1;
 			}
+#else
+			printk("Ignoring crashkernel command line, "
+			       "parameter will be supplied by xen\n");
+#endif
 		}
 #endif
 
@@ -827,10 +836,23 @@ void __init setup_arch(char **cmdline_p)
 #endif
 #endif	/* !CONFIG_XEN */
 #ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
 	if (crashk_res.start != crashk_res.end) {
 		reserve_bootmem(crashk_res.start,
 			crashk_res.end - crashk_res.start + 1);
 	}
+#else
+	{
+		xen_kexec_reserve_t reservation;
+		BUG_ON(HYPERVISOR_kexec(KEXEC_CMD_kexec_reserve, 0,
+					&reservation));
+		if (reservation.size) {
+			crashk_res.start = reservation.start;
+			crashk_res.end = reservation.start + 
+				reservation.size - 1;
+		}
+	}
+#endif
 #endif
 
 	paging_init();
@@ -970,6 +992,10 @@ void __init setup_arch(char **cmdline_p)
 	iommu_hole_init();
 #endif
 
+#ifdef CONFIG_KEXEC
+	request_resource(&ioport_resource, &crashk_res);
+#endif
+
 #ifdef CONFIG_XEN
 	{
 		struct physdev_set_iopl set_iopl;
--- /dev/null
+++ x/linux-2.6-xen-sparse/include/asm-x86_64/kexec-xen.h
@@ -0,0 +1,64 @@
+/*
+ * include/asm-x86_64/kexec-xen.h
+ *
+ * Created By: Horms <horms@verge.net.au>
+ */
+
+#ifndef _X86_64_KEXEC_XEN_H
+#define _X86_64_KEXEC_XEN_H
+
+#include <asm/ptrace.h>
+#include <asm/types.h>
+#include <xen/interface/arch-x86_64.h>
+
+static inline void crash_translate_regs(struct pt_regs *linux_regs,
+					struct cpu_user_regs *xen_regs)
+{
+	xen_regs->r15 = linux_regs->r15;
+	xen_regs->r14 = linux_regs->r14;
+	xen_regs->r13 = linux_regs->r13;
+	xen_regs->r12 = linux_regs->r12;
+	xen_regs->rbp = linux_regs->rbp;
+	xen_regs->rbx = linux_regs->rbx;
+	xen_regs->r11 = linux_regs->r11;
+	xen_regs->r10 = linux_regs->r10;
+	xen_regs->r9 = linux_regs->r9;
+	xen_regs->r8 = linux_regs->r8;
+	xen_regs->rax = linux_regs->rax;
+	xen_regs->rcx = linux_regs->rcx;
+	xen_regs->rdx = linux_regs->rdx;
+	xen_regs->rsi = linux_regs->rsi;
+	xen_regs->rdi = linux_regs->rdi;
+	xen_regs->rip = linux_regs->rip;
+	xen_regs->cs = linux_regs->cs;
+	xen_regs->rflags = linux_regs->eflags;
+	xen_regs->rsp = linux_regs->rsp;
+	xen_regs->ss = linux_regs->ss;
+}
+
+/* Kexec needs to know about the actual physical address.
+ * But in xen, on some architectures, a physical address is a
+ * pseudo-physical address. */
+#ifdef CONFIG_XEN
+#define kexec_page_to_pfn(page)  pfn_to_mfn(page_to_pfn(page))
+#define kexec_pfn_to_page(pfn)   pfn_to_page(mfn_to_pfn(pfn))
+#define kexec_virt_to_phys(addr) virt_to_machine(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
+#else
+#define kexec_page_to_pfn(page)  page_to_pfn(page)
+#define kexec_pfn_to_page(pfn)   pfn_to_page(pfn)
+#define kexec_virt_to_phys(addr) virt_to_phys(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(addr)
+#endif
+
+#endif /* _X86_64_KEXEC_XEN_H */
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- x/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h
+++ x/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h
@@ -386,4 +386,11 @@ HYPERVISOR_xenoprof_op(
 	return _hypercall2(int, xenoprof_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_kexec(
+	unsigned long op, unsigned int arg1, void * extra_args)
+{
+	return _hypercall3(int, kexec_op, op, arg1, extra_args);
+}
+
 #endif /* __HYPERCALL_H__ */
--- x/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h
+++ x/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h
@@ -90,6 +90,8 @@ extern unsigned long profile_pc(struct p
 #define profile_pc(regs) instruction_pointer(regs)
 #endif
 
+#include <linux/compiler.h>
+
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
 
 struct task_struct;
--- x/xen/arch/x86/x86_64/entry.S
+++ x/xen/arch/x86/x86_64/entry.S
@@ -573,6 +573,7 @@ ENTRY(hypercall_table)
         .quad do_hvm_op
         .quad do_sysctl             /* 35 */
         .quad do_domctl
+        .quad do_kexec_op
         .rept NR_hypercalls-((.-hypercall_table)/8)
         .quad do_ni_hypercall
         .endr
@@ -615,6 +616,7 @@ ENTRY(hypercall_args_table)
         .byte 2 /* do_hvm_op            */
         .byte 1 /* do_sysctl            */  /* 35 */
         .byte 1 /* do_domctl            */
+        .byte 3 /* do_kexec             */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
--- x/xen/arch/x86/x86_64/machine_kexec.c
+++ x/xen/arch/x86/x86_64/machine_kexec.c
@@ -4,18 +4,29 @@
  *
  * Created By: Horms <horms@verge.net.au>
  *
- * Should be losely based on arch/x86_64/kernel/machine_kexec.c
+ * Loosely based on arch/x86_64/kernel/machine_kexec.c
  */
-
-#include <xen/lib.h>       /* for printk() used in stub */
+  
 #include <xen/types.h>
 #include <public/kexec.h>
+#include <asm/fixmap.h>
+
+typedef void (*relocate_new_kernel_t)(unsigned long indirection_page,
+                                      unsigned long control_code_buffer,
+                                      unsigned long start_address,
+                                      unsigned long page_table_a,
+                                      unsigned long page_table_b);
 
 void machine_kexec(xen_kexec_image_t *image)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
-}
+    relocate_new_kernel_t rnk;
 
+    rnk = (relocate_new_kernel_t) fix_to_virt(FIX_KEXEC_PAGE);
+    (*rnk)(image->indirection_page, image->reboot_code_buffer, 
+           image->start_address, image->page_table_a[0],
+           image->page_table_b);
+  }
+  
 /*
  * Local variables:
  * mode: C
--- x/xen/include/asm-x86/x86_64/elf.h
+++ x/xen/include/asm-x86/x86_64/elf.h
@@ -3,17 +3,55 @@
  * 
  * Created By: Horms
  *
- * Should pull be based on include/asm-x86_64/elf.h:ELF_CORE_COPY_REGS
- * from Linux 2.6.16
+ * Based on include/asm-x86_64/elf.h:ELF_CORE_COPY_REGS from Linux 2.6.16
  */
 
 #ifndef __X86_ELF_X86_64_H__
 #define __X86_ELF_X86_64_H__
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/lib.h>
 
-#define ELF_CORE_COPY_REGS(pr_reg, regs)                                \
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+#include <xen/lib.h>
+
+/* XXX: Xen doesn't have orig_rax, so it is omitted.
+ *      Xen doesn't have threads, so fs and gs are read from the CPU and
+ *      thus values 21 and 22 are just duplicates of 25 and 26
+ *      respectively.  All these values could be passed from dom0 in the
+ *      case of it crashing, but does that help?
+ *
+ *      Lastly, I'm not sure why ds, es, fs and gs are read from
+ *      the CPU rather than regs, but linux does this
+ */
+
+#define ELF_CORE_COPY_REGS(pr_reg, regs)  do { \
+	unsigned v;						\
+	(pr_reg)[0] = (regs)->r15;				\
+	(pr_reg)[1] = (regs)->r14;				\
+	(pr_reg)[2] = (regs)->r13;				\
+	(pr_reg)[3] = (regs)->r12;				\
+	(pr_reg)[4] = (regs)->rbp;				\
+	(pr_reg)[5] = (regs)->rbx;				\
+	(pr_reg)[6] = (regs)->r11;				\
+	(pr_reg)[7] = (regs)->r10;				\
+	(pr_reg)[8] = (regs)->r9;				\
+	(pr_reg)[9] = (regs)->r8;				\
+	(pr_reg)[10] = (regs)->rax;				\
+	(pr_reg)[11] = (regs)->rcx;				\
+	(pr_reg)[12] = (regs)->rdx;				\
+	(pr_reg)[13] = (regs)->rsi;				\
+	(pr_reg)[14] = (regs)->rdi;				\
+	(pr_reg)[16] = (regs)->rip;			\
+	(pr_reg)[17] = (regs)->cs;			\
+	(pr_reg)[18] = (regs)->eflags;			\
+	(pr_reg)[19] = (regs)->rsp;			\
+	(pr_reg)[20] = (regs)->ss;			\
+	asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[21] = v;	\
+	asm("movl %%gs,%0" : "=r" (v)); (pr_reg)[22] = v;	\
+	asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v;	\
+	asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v;	\
+	asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v;	\
+	asm("movl %%gs,%0" : "=r" (v)); (pr_reg)[26] = v;	\
+} while(0);
 
 #endif /* __X86_ELF_X86_64_H__ */
 
--- x/xen/include/asm-x86/x86_64/kexec.h
+++ x/xen/include/asm-x86/x86_64/kexec.h
@@ -10,14 +10,43 @@
 #ifndef __X86_64_KEXEC_H__
 #define __X86_64_KEXEC_H__
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#include <xen/lib.h>
 #include <xen/types.h>
 #include <public/xen.h>
 
+/*
+ * Saving the registers of the cpu on which panic occurred in
+ * crash_kexec to save a valid sp. The registers of other cpus
+ * will be saved in machine_crash_shutdown while shooting them down.
+ */
 static void crash_setup_regs(struct cpu_user_regs *newregs,
 			     struct cpu_user_regs *oldregs)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+	if (oldregs)
+		memcpy(newregs, oldregs, sizeof(*newregs));
+	else {
+		__asm__ __volatile__("movq %%rbx,%0" : "=m"(newregs->rbx));
+		__asm__ __volatile__("movq %%rcx,%0" : "=m"(newregs->rcx));
+		__asm__ __volatile__("movq %%rdx,%0" : "=m"(newregs->rdx));
+		__asm__ __volatile__("movq %%rsi,%0" : "=m"(newregs->rsi));
+		__asm__ __volatile__("movq %%rdi,%0" : "=m"(newregs->rdi));
+		__asm__ __volatile__("movq %%rbp,%0" : "=m"(newregs->rbp));
+		__asm__ __volatile__("movq %%rax,%0" : "=m"(newregs->rax));
+		__asm__ __volatile__("movq %%rsp,%0" : "=m"(newregs->rsp));
+		__asm__ __volatile__("movq %%r8,%0" : "=m"(newregs->r8));
+		__asm__ __volatile__("movq %%r9,%0" : "=m"(newregs->r9));
+		__asm__ __volatile__("movq %%r10,%0" : "=m"(newregs->r10));
+		__asm__ __volatile__("movq %%r11,%0" : "=m"(newregs->r11));
+		__asm__ __volatile__("movq %%r12,%0" : "=m"(newregs->r12));
+		__asm__ __volatile__("movq %%r13,%0" : "=m"(newregs->r13));
+		__asm__ __volatile__("movq %%r14,%0" : "=m"(newregs->r14));
+		__asm__ __volatile__("movq %%r15,%0" : "=m"(newregs->r15));
+		__asm__ __volatile__("movl %%ss, %%eax;" :"=a"(newregs->ss));
+		__asm__ __volatile__("movl %%cs, %%eax;" :"=a"(newregs->cs));
+		__asm__ __volatile__("pushfq; popq %0" :"=m"(newregs->eflags));
+
+		newregs->rip = (unsigned long)current_text_addr();
+	}
 }
 
 #endif /* __X86_64_KEXEC_H__ */
--- x/xen/include/public/kexec.h
+++ x/xen/include/public/kexec.h
@@ -46,6 +46,9 @@ typedef struct xen_kexec_image {
 #if defined(__i386__) || defined(__x86_64__)
     unsigned long page_table_a[7];
 #endif
+#if defined(__x86_64__)
+    unsigned long page_table_b;
+#endif
     unsigned long indirection_page;
     unsigned long reboot_code_buffer;
     unsigned long start_address;
--- x/patches/linux-2.6.16.13/series
+++ x/patches/linux-2.6.16.13/series
@@ -2,6 +2,8 @@ kexec-generic.patch
 linux-2.6.16-kexec_page_table_a_stubs.patch
 linux-2.6.16-kexec_page_table_a_i386.patch
 linux-2.6.16-kexec_page_table_a_i386-xen.patch
+linux-2.6.16-kexec_page_table_a_x86_64.patch
+linux-2.6.16-kexec_page_table_a_x86_64-xen.patch
 blktap-aio-16_03_06.patch
 device_bind.patch
 fix-hz-suspend.patch
--- /dev/null
+++ x/patches/linux-2.6.16.13/linux-2.6.16-kexec_page_table_a_x86_64.patch
@@ -0,0 +1,421 @@
+kexec: Avoid overwriting the current pgd (V2, x86_64)
+
+This patch upgrades the x86_64-specific kexec code to avoid overwriting the
+current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used
+to start a secondary kernel that dumps the memory of the previous kernel.
+
+The code introduces a new set of page tables called "page_table_a". These
+tables are used to provide an executable identity mapping without overwriting
+the current pgd. The already existing page table is renamed to "page_table_b".
+
+KEXEC_CONTROL_CODE_SIZE is changed into a single page. This updated version of
+the patch also moves the segment handling code into relocate_kernel.S.
+
+Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
+---
+
+ The patch has been tested with regular kexec and CONFIG_CRASH_DUMP.
+ Applies on top of 2.6.16 and 2.6.17-rc4.
+
+ arch/x86_64/kernel/machine_kexec.c   |  193 +++++++++++++++++-----------------
+ arch/x86_64/kernel/relocate_kernel.S |   84 +++++++++++++-
+ include/asm-x86_64/kexec.h           |   15 ++
+ 3 files changed, 189 insertions(+), 103 deletions(-)
+
+--- x/arch/x86_64/kernel/machine_kexec.c
++++ x/arch/x86_64/kernel/machine_kexec.c
+@@ -2,6 +2,10 @@
+  * machine_kexec.c - handle transition of Linux booting another kernel
+  * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
+  *
++ * 2006-05-19 Magnus Damm <damm@opensource.se>:
++ * - rewrote identity map code to avoid overwriting current pgd
++ * - moved segment handling code into relocate_kernel.S
++ *
+  * This source code is licensed under the GNU General Public License,
+  * Version 2.  See the file COPYING for more details.
+  */
+@@ -96,81 +100,110 @@ out:
+ }
+ 
+ 
+-static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
++static int create_page_table_b(struct kimage *image)
+ {
+-	pgd_t *level4p;
+-	level4p = (pgd_t *)__va(start_pgtable);
+- 	return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
+-}
++	struct kimage_arch *arch = &image->arch_data;
+ 
+-static void set_idt(void *newidt, u16 limit)
+-{
+-	struct desc_ptr curidt;
++	arch->page_table_b = kimage_alloc_control_pages(image, 0);
+ 
+-	/* x86-64 supports unaliged loads & stores */
+-	curidt.size    = limit;
+-	curidt.address = (unsigned long)newidt;
++	if (!arch->page_table_b)
++		return -ENOMEM;
+ 
+-	__asm__ __volatile__ (
+-		"lidtq %0\n"
+-		: : "m" (curidt)
+-		);
+-};
++ 	return init_level4_page(image, page_address(arch->page_table_b),
++				0, end_pfn << PAGE_SHIFT);
++}
+ 
++typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
++					unsigned long control_code_buffer,
++					unsigned long start_address,
++					unsigned long page_table_a,
++					unsigned long page_table_b) ATTRIB_NORET;
++
++const extern unsigned char relocate_new_kernel[];
++const extern unsigned long relocate_new_kernel_size;
+ 
+-static void set_gdt(void *newgdt, u16 limit)
++static int allocate_page_table_a(struct kimage *image)
+ {
+-	struct desc_ptr curgdt;
++	struct kimage_arch *arch = &image->arch_data;
++	struct page *page;
++	int k = sizeof(arch->page_table_a) / sizeof(arch->page_table_a[0]);
+ 
+-	/* x86-64 supports unaligned loads & stores */
+-	curgdt.size    = limit;
+-	curgdt.address = (unsigned long)newgdt;
++	for (; k > 0; k--) {
++		page = kimage_alloc_control_pages(image, 0);
++		if (!page)
++			return -ENOMEM;
+ 
+-	__asm__ __volatile__ (
+-		"lgdtq %0\n"
+-		: : "m" (curgdt)
+-		);
+-};
++		clear_page(page_address(page));
++		arch->page_table_a[k - 1] = page;
++	}
+ 
+-static void load_segments(void)
+-{
+-	__asm__ __volatile__ (
+-		"\tmovl %0,%%ds\n"
+-		"\tmovl %0,%%es\n"
+-		"\tmovl %0,%%ss\n"
+-		"\tmovl %0,%%fs\n"
+-		"\tmovl %0,%%gs\n"
+-		: : "a" (__KERNEL_DS) : "memory"
+-		);
++	return 0;
+ }
+ 
+-typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
+-					unsigned long control_code_buffer,
+-					unsigned long start_address,
+-					unsigned long pgtable) ATTRIB_NORET;
++#define _PAGE_KERNEL_EXEC __PAGE_KERNEL_EXEC
++#define pa_page(page) __pa_symbol(page_address(page)) /* __pa() miscompiles */
+ 
+-const extern unsigned char relocate_new_kernel[];
+-const extern unsigned long relocate_new_kernel_size;
++static int create_mapping(struct page *root, struct page **pages, 
++			  unsigned long va, unsigned long pa)
++{
++	pgd_t *pgd;
++	pud_t *pud;
++	pmd_t *pmd;
++	pte_t *pte;
++	int k = 0;
++
++	pgd = (pgd_t *)page_address(root) + pgd_index(va);
++	if (!pgd_present(*pgd))
++		set_pgd(pgd, __pgd(pa_page(pages[k++]) | _KERNPG_TABLE));
++
++	pud = pud_offset(pgd, va);
++	if (!pud_present(*pud))
++		set_pud(pud, __pud(pa_page(pages[k++]) | _KERNPG_TABLE));
++
++	pmd = pmd_offset(pud, va);
++	if (!pmd_present(*pmd))
++		set_pmd(pmd, __pmd(pa_page(pages[k++]) | _KERNPG_TABLE));
++
++	pte = (pte_t *)page_address(pmd_page(*pmd)) + pte_index(va);
++	set_pte(pte, __pte(pa | _PAGE_KERNEL_EXEC));
++
++	return k;
++}
+ 
+ int machine_kexec_prepare(struct kimage *image)
+ {
+-	unsigned long start_pgtable, control_code_buffer;
+-	int result;
++	void *control_page;
++	unsigned long pa;
++	int k;
+ 
+-	/* Calculate the offsets */
+-	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+-	control_code_buffer = start_pgtable + PAGE_SIZE;
+-
+-	/* Setup the identity mapped 64bit page table */
+-	result = init_pgtable(image, start_pgtable);
+-	if (result)
+-		return result;
+-
+-	/* Place the code in the reboot code buffer */
+-	memcpy(__va(control_code_buffer), relocate_new_kernel,
+-						relocate_new_kernel_size);
++	memset(&image->arch_data, 0, sizeof(image->arch_data));
+ 
+-	return 0;
++	k = allocate_page_table_a(image);
++	if (k)
++		return k;
++
++	/* fill in control_page with assembly code */
++
++	control_page = page_address(image->control_code_page);
++	memcpy(control_page, relocate_new_kernel, relocate_new_kernel_size);
++
++	/* map the control_page at the virtual address of relocate_kernel.S */
++
++	pa = __pa(control_page);
++
++	k = create_mapping(image->arch_data.page_table_a[0], 
++			   &image->arch_data.page_table_a[1],
++			   (unsigned long)relocate_new_kernel, pa);
++
++	/* identity map the control_page */
++
++	create_mapping(image->arch_data.page_table_a[0], 
++		       &image->arch_data.page_table_a[k + 1],
++		       pa, pa);
++
++	/* create identity mapped page table aka page_table_b */
++
++	return create_page_table_b(image);
+ }
+ 
+ void machine_kexec_cleanup(struct kimage *image)
+@@ -185,47 +218,17 @@ void machine_kexec_cleanup(struct kimage
+ NORET_TYPE void machine_kexec(struct kimage *image)
+ {
+ 	unsigned long page_list;
+-	unsigned long control_code_buffer;
+-	unsigned long start_pgtable;
++	unsigned long control_code;
++	unsigned long page_table_a;
++	unsigned long page_table_b;
+ 	relocate_new_kernel_t rnk;
+ 
+-	/* Interrupts aren't acceptable while we reboot */
+-	local_irq_disable();
+-
+-	/* Calculate the offsets */
+ 	page_list = image->head;
+-	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+-	control_code_buffer = start_pgtable + PAGE_SIZE;
++	control_code = __pa(page_address(image->control_code_page));
++	page_table_a = __pa(page_address(image->arch_data.page_table_a[0]));
++	page_table_b = __pa(page_address(image->arch_data.page_table_b));
+ 
+-	/* Set the low half of the page table to my identity mapped
+-	 * page table for kexec.  Leave the high half pointing at the
+-	 * kernel pages.   Don't bother to flush the global pages
+-	 * as that will happen when I fully switch to my identity mapped
+-	 * page table anyway.
+-	 */
+-	memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2);
+-	__flush_tlb();
+-
+-
+-	/* The segment registers are funny things, they are
+-	 * automatically loaded from a table, in memory wherever you
+-	 * set them to a specific selector, but this table is never
+-	 * accessed again unless you set the segment to a different selector.
+-	 *
+-	 * The more common model are caches where the behide
+-	 * the scenes work is done, but is also dropped at arbitrary
+-	 * times.
+-	 *
+-	 * I take advantage of this here by force loading the
+-	 * segments, before I zap the gdt with an invalid value.
+-	 */
+-	load_segments();
+-	/* The gdt & idt are now invalid.
+-	 * If you want to load them you must set up your own idt & gdt.
+-	 */
+-	set_gdt(phys_to_virt(0),0);
+-	set_idt(phys_to_virt(0),0);
+ 	/* now call it */
+-	rnk = (relocate_new_kernel_t) control_code_buffer;
+-	(*rnk)(page_list, control_code_buffer, image->start, start_pgtable);
++	rnk = (relocate_new_kernel_t) relocate_new_kernel;
++	(*rnk)(page_list, control_code, image->start, page_table_a, page_table_b);
+ }
+--- x/arch/x86_64/kernel/relocate_kernel.S
++++ x/arch/x86_64/kernel/relocate_kernel.S
+@@ -2,11 +2,18 @@
+  * relocate_kernel.S - put the kernel image in place to boot
+  * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
+  *
++ * 2006-05-19 Magnus Damm <damm@opensource.se>:
++ * - moved segment handling code from machine_kexec.c
++ *
+  * This source code is licensed under the GNU General Public License,
+  * Version 2.  See the file COPYING for more details.
+  */
+ 
+ #include <linux/linkage.h>
++#include <asm/page.h>
++
++.text
++.align (1 << PAGE_SHIFT)
+ 
+ 	/*
+ 	 * Must be relocatable PIC code callable as a C function, that once
+@@ -18,21 +25,69 @@ relocate_new_kernel:
+ 	/* %rdi page_list
+ 	 * %rsi reboot_code_buffer
+ 	 * %rdx start address
+-	 * %rcx page_table
+-	 * %r8  arg5
++	 * %rcx page_table_a
++	 * %r8  page_table_b
+ 	 * %r9  arg6
+ 	 */
+-
++	
+ 	/* zero out flags, and disable interrupts */
+ 	pushq $0
+ 	popfq
+ 
++	/* switch to page_table_a */
++	movq    %rcx, %cr3
++
++	/* setup idt */
++
++	movq	%rsi, %rax
++	addq	$(idt_48 - relocate_new_kernel), %rax
++	lidtq	(%rax)
++
++	/* setup gdt */
++
++	movq	%rsi, %rax
++	addq	$(gdt - relocate_new_kernel), %rax
++	movq	%rsi, %r9
++	addq	$((gdt_48 - relocate_new_kernel) + 2), %r9
++	movq	%rax, (%r9)
++	
++	movq	%rsi, %rax
++	addq	$(gdt_48 - relocate_new_kernel), %rax
++	lgdtq	(%rax)
++
++	/* setup data segment registers */
++
++	xorl	%eax,%eax
++	movl	%eax, %ds
++	movl	%eax, %es
++	movl	%eax, %fs
++	movl	%eax, %gs
++	movl	%eax, %ss
++
+ 	/* set a new stack at the bottom of our page... */
+ 	lea   4096(%rsi), %rsp
+ 
++	/* load new code segment */
++
++	movq	%rsp, %rcx
++	xorq	%rax, %rax
++	pushq	%rax                                              /* SS */
++	pushq	%rcx                                              /* ESP */
++	pushq	%rax                                              /* RFLAGS */
++
++	movq	$(gdt_code - gdt), %rax
++	pushq	%rax                                              /* CS */
++
++	movq	%rsi, %rax
++	addq	$(identity_mapped - relocate_new_kernel), %rax
++	pushq	%rax                                              /* RIP */
++
++	iretq
++	
++identity_mapped:
+ 	/* store the parameters back on the stack */
+ 	pushq	%rdx /* store the start address */
+-
++	
+ 	/* Set cr0 to a known state:
+ 	 * 31 1 == Paging enabled
+ 	 * 18 0 == Alignment check disabled
+@@ -69,7 +124,7 @@ relocate_new_kernel:
+ 	/* Switch to the identity mapped page tables,
+ 	 * and flush the TLB.
+ 	*/
+-	movq	%rcx, %cr3
++	movq	%r8, %cr3
+ 
+ 	/* Do the copies */
+ 	movq	%rdi, %rcx 	/* Put the page_list in %rcx */
+@@ -136,6 +191,25 @@ relocate_new_kernel:
+ 	xorq	%r15, %r15
+ 
+ 	ret
++	.align	16
++gdt:
++	.long   0x00000000  /* NULL descriptor */
++	.long   0x00000000
++gdt_code:
++	.long   0x00000000  /* code descriptor */
++	.long   0x00209800
++
++gdt_end:
++	.align	4
++	
++idt_48:
++	.word	0				# idt limit = 0
++	.quad	0, 0				# idt base = 0L
++
++gdt_48:
++	.word	gdt_end - gdt - 1		# gdt limit
++	.quad	0, 0				# gdt base (filled in later)
++
+ relocate_new_kernel_end:
+ 
+ 	.globl relocate_new_kernel_size
+--- x/include/asm-x86_64/kexec.h
++++ x/include/asm-x86_64/kexec.h
+@@ -21,15 +21,24 @@
+ /* Maximum address we can use for the control pages */
+ #define KEXEC_CONTROL_MEMORY_LIMIT     (0xFFFFFFFFFFUL)
+ 
+-/* Allocate one page for the pdp and the second for the code */
+-#define KEXEC_CONTROL_CODE_SIZE  (4096UL + 4096UL)
++#define KEXEC_CONTROL_CODE_SIZE  4096
+ 
+ /* The native architecture */
+ #define KEXEC_ARCH KEXEC_ARCH_X86_64
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
+-struct kimage_arch {};
++struct kimage_arch {
++	/* page_table_a[] holds enough pages to create a new page table
++	 * that maps the control page twice..
++	 *
++	 * page_table_b points to the root page of a page table which is used
++	 * to provide identity mapping of all ram.
++	 */
++
++	struct page *page_table_a[7]; /* 2 * (pte + pud + pmd) + pgd */
++	struct page *page_table_b;
++};
+ 
+ /*
+  * Saving the registers of the cpu on which panic occured in
--- /dev/null
+++ x/patches/linux-2.6.16.13/linux-2.6.16-kexec_page_table_a_x86_64-xen.patch
@@ -0,0 +1,151 @@
+ arch/x86_64/kernel/machine_kexec.c |   84 +++++++++++++++++++++++++++++++++---
+ 1 file changed, 77 insertions(+), 7 deletions(-)
+
+--- x/arch/x86_64/kernel/machine_kexec.c
++++ x/arch/x86_64/kernel/machine_kexec.c
+@@ -19,6 +19,50 @@
+ #include <asm/mmu_context.h>
+ #include <asm/io.h>
+ 
++#ifdef CONFIG_XEN
++#include <xen/interface/kexec.h>
++#include <xen/interface/memory.h>
++
++#define x__pmd(x) ((pmd_t) { (x) } )
++#define x__pud(x) ((pud_t) { (x) } )
++#define x__pgd(x) ((pgd_t) { (x) } )
++
++#define x_pmd_val(x)	((x).pmd)
++#define x_pud_val(x)	((x).pud)
++#define x_pgd_val(x)	((x).pgd)
++
++static inline void x_set_pmd(pmd_t *dst, pmd_t val)
++{
++        x_pmd_val(*dst) = x_pmd_val(val); 
++} 
++
++static inline void x_set_pud(pud_t *dst, pud_t val)
++{
++	x_pud_val(*dst) = phys_to_machine(x_pud_val(val));
++}
++
++static inline void x_pud_clear (pud_t *pud)
++{
++	x_pud_val(*pud) = 0;
++}
++
++static inline void x_set_pgd(pgd_t *dst, pgd_t val)
++{
++	x_pgd_val(*dst) = phys_to_machine(x_pgd_val(val)); 
++} 
++
++static inline void x_pgd_clear (pgd_t * pgd)
++{
++	x_pgd_val(*pgd) = 0; 
++}
++
++#define MY_LARGE_EXEC (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_PSE) /* large-page (PSE) entry flags */
++#define MY_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) /* page-table entry flags */
++#else
++#define MY_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
++#define MY_TABLE _KERNPG_TABLE
++#endif /* CONFIG_XEN */
++
+ static void init_level2_page(pmd_t *level2p, unsigned long addr)
+ {
+ 	unsigned long end_addr;
+@@ -26,7 +70,7 @@ static void init_level2_page(pmd_t *leve
+ 	addr &= PAGE_MASK;
+ 	end_addr = addr + PUD_SIZE;
+ 	while (addr < end_addr) {
+-		set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
++		x_set_pmd(level2p++, x__pmd(addr | MY_LARGE_EXEC));
+ 		addr += PMD_SIZE;
+ 	}
+ }
+@@ -51,12 +95,12 @@ static int init_level3_page(struct kimag
+ 		}
+ 		level2p = (pmd_t *)page_address(page);
+ 		init_level2_page(level2p, addr);
+-		set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
++		x_set_pud(level3p++, x__pud(__pa(level2p) | MY_TABLE));
+ 		addr += PUD_SIZE;
+ 	}
+ 	/* clear the unused entries */
+ 	while (addr < end_addr) {
+-		pud_clear(level3p++);
++		x_pud_clear(level3p++);
+ 		addr += PUD_SIZE;
+ 	}
+ out:
+@@ -87,12 +131,12 @@ static int init_level4_page(struct kimag
+ 		if (result) {
+ 			goto out;
+ 		}
+-		set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
++		x_set_pgd(level4p++, x__pgd(__pa(level3p) | MY_TABLE));
+ 		addr += PGDIR_SIZE;
+ 	}
+ 	/* clear the unused entries */
+ 	while (addr < end_addr) {
+-		pgd_clear(level4p++);
++		x_pgd_clear(level4p++);
+ 		addr += PGDIR_SIZE;
+ 	}
+ out:
+@@ -103,14 +147,21 @@ out:
+ static int create_page_table_b(struct kimage *image)
+ {
+ 	struct kimage_arch *arch = &image->arch_data;
++	unsigned long last_page;
+ 
+ 	arch->page_table_b = kimage_alloc_control_pages(image, 0);
+ 
+ 	if (!arch->page_table_b)
+ 		return -ENOMEM;
+ 
++#ifdef CONFIG_XEN
++	last_page = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
++#else
++	last_page = end_pfn;
++#endif
++
+  	return init_level4_page(image, page_address(arch->page_table_b),
+-				0, end_pfn << PAGE_SHIFT);
++				0, last_page << PAGE_SHIFT);
+ }
+ 
+ typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
+@@ -211,6 +262,7 @@ void machine_kexec_cleanup(struct kimage
+ 	return;
+ }
+ 
++#ifndef CONFIG_XEN
+ /*
+  * Do not allocate memory (or fail in any way) in machine_kexec().
+  * We are past the point of no return, committed to rebooting now.
+@@ -230,5 +282,23 @@ NORET_TYPE void machine_kexec(struct kim
+ 
+ 	/* now call it */
+ 	rnk = (relocate_new_kernel_t) relocate_new_kernel;
+-	(*rnk)(page_list, control_code, image->start, page_table_a, page_table_b);
++	(*rnk)(page_list, control_code, image->start, page_table_a,
++	       page_table_b);
++}
++#endif /* !CONFIG_XEN */
++
++#ifdef CONFIG_XEN
++void machine_kexec_setup_load_arg(xen_kexec_image_t *xki,struct kimage *image)
++{
++	struct kimage_arch *arch = &image->arch_data;
++	int k, n = sizeof(arch->page_table_a) / sizeof(arch->page_table_a[0]);
++
++	for (k = 0; k < n; k++)
++		xki->page_table_a[k] = 
++			pfn_to_mfn(page_to_pfn(arch->page_table_a[k]))
++				<< PAGE_SHIFT;
++
++	xki->page_table_b =
++		pfn_to_mfn(page_to_pfn(arch->page_table_b)) << PAGE_SHIFT;
+ }
++#endif /* CONFIG_XEN */

[-- Attachment #7: 51.2.2-kexec-ia64-upstream.patch --]
[-- Type: text/plain, Size: 61602 bytes --]

kexec: ia64

This is the ia64 component of kexec for ia64.
The generic component is a prerequisite for this patch.

Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>
Signed-Off-By: Horms <horms@verge.net.au>

 buildconfigs/linux-defconfig_xen0_ia64            |    3 
 linux-2.6-xen-sparse/arch/ia64/Kconfig            |   23 
 linux-2.6-xen-sparse/arch/ia64/kernel/entry.S     |    2 
 linux-2.6-xen-sparse/include/asm-ia64/hypercall.h |    7 
 linux-2.6-xen-sparse/include/asm-ia64/kexec-xen.h |   36 
 patches/linux-2.6.16.13/kexec-ia64-1.patch        |  767 +++++++++++++++++++++
 patches/linux-2.6.16.13/kexec-ia64-2.patch        |  596 ++++++++++++++++
 patches/linux-2.6.16.13/kexec-ia64-xen.patch      |  266 +++++++
 patches/linux-2.6.16.13/series                    |    3 
 xen/arch/ia64/asm-offsets.c                       |    2 
 xen/arch/ia64/linux-xen/smp.c                     |   32 
 xen/arch/ia64/xen/crash.c                         |  110 ++-
 xen/arch/ia64/xen/hypercall.c                     |    2 
 xen/arch/ia64/xen/machine_kexec.c                 |   60 +
 xen/include/asm-ia64/kexec.h                      |   62 +
 xen/include/asm-ia64/linux-xen/asm/smp.h          |    9 
 xen/include/public/kexec.h                        |    3 
 17 files changed, 1965 insertions(+), 18 deletions(-)

--- x/buildconfigs/linux-defconfig_xen0_ia64
+++ x/buildconfigs/linux-defconfig_xen0_ia64
@@ -138,6 +138,8 @@ CONFIG_ARCH_SPARSEMEM_ENABLE=y
 CONFIG_IA64_MCA_RECOVERY=y
 CONFIG_PERFMON=y
 CONFIG_IA64_PALINFO=y
+CONFIG_KEXEC=y
+CONFIG_CRASH_DUMP=y
 
 #
 # Firmware Drivers
@@ -1304,6 +1306,7 @@ CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1"
 #
 CONFIG_PROC_FS=y
 CONFIG_PROC_KCORE=y
+# CONFIG_PROC_VMCORE is not set
 CONFIG_SYSFS=y
 CONFIG_TMPFS=y
 # CONFIG_HUGETLB_PAGE is not set
--- x/linux-2.6-xen-sparse/arch/ia64/Kconfig
+++ x/linux-2.6-xen-sparse/arch/ia64/Kconfig
@@ -390,6 +390,29 @@ config IA64_PALINFO
 config SGI_SN
 	def_bool y if (IA64_SGI_SN2 || IA64_GENERIC)
 
+config KEXEC
+	bool "kexec system call (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	help
+	  kexec is a system call that implements the ability to shutdown your
+	  current kernel, and to start another kernel.  It is like a reboot
+	  but it is independent of the system firmware.   And like a reboot
+	  you can start any kernel with it, not just Linux.
+
+	  The name comes from the similarity to the exec system call.
+
+	  It is an ongoing process to be certain the hardware in a machine
+	  is properly shutdown, so do not be surprised if this code does not
+	  initially work for you.  It may help to enable device hotplugging
+	  support.  As of this writing the exact hardware interface is
+	  strongly in flux, so no good recommendation can be made.
+
+config CRASH_DUMP
+	bool "kernel crash dumps (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	help
+	  Generate crash dump after being started by kexec.
+ 
 source "drivers/firmware/Kconfig"
 
 source "fs/Kconfig.binfmt"
--- x/linux-2.6-xen-sparse/arch/ia64/kernel/entry.S
+++ x/linux-2.6-xen-sparse/arch/ia64/kernel/entry.S
@@ -1596,7 +1596,7 @@ sys_call_table:
 	data8 sys_mq_timedreceive		// 1265
 	data8 sys_mq_notify
 	data8 sys_mq_getsetattr
-	data8 sys_ni_syscall			// reserved for kexec_load
+	data8 sys_kexec_load
 	data8 sys_ni_syscall			// reserved for vserver
 	data8 sys_waitid			// 1270
 	data8 sys_add_key
--- x/linux-2.6-xen-sparse/include/asm-ia64/hypercall.h
+++ x/linux-2.6-xen-sparse/include/asm-ia64/hypercall.h
@@ -416,4 +416,11 @@ HYPERVISOR_add_physmap(unsigned long gpf
 // for balloon driver
 #define HYPERVISOR_update_va_mapping(va, new_val, flags) (0)
 
+static inline int
+HYPERVISOR_kexec(
+	unsigned long op, unsigned int arg1, void * extra_args)
+{
+	return _hypercall3(int, kexec_op, op, arg1, extra_args);
+}
+
 #endif /* __HYPERCALL_H__ */
--- /dev/null
+++ x/linux-2.6-xen-sparse/include/asm-ia64/kexec-xen.h
@@ -0,0 +1,36 @@
+/*
+ * include/asm-ia64/kexec-xen.h
+ *
+ * Created By: Horms <horms@verge.net.au>
+ */
+
+#ifndef _IA64_KEXEC_XEN_H
+#define _IA64_KEXEC_XEN_H
+
+#include <linux/kernel.h>	/* for printk() used in stub */
+
+static inline void crash_translate_regs(struct pt_regs *linux_regs,
+					struct cpu_user_regs *xen_regs)
+{
+	printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+/* Kexec needs to know about the actual physical address.
+ * But in xen, on some architectures, a physical address is a
+ * pseudo-physical address. */
+#define kexec_page_to_pfn(page)  page_to_pfn(page)
+#define kexec_pfn_to_page(pfn)   pfn_to_page(pfn)
+#define kexec_virt_to_phys(addr) virt_to_phys(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(addr)
+
+#endif /* _IA64_KEXEC_XEN_H */
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- x/xen/arch/ia64/asm-offsets.c
+++ x/xen/arch/ia64/asm-offsets.c
@@ -195,6 +195,8 @@ void foo(void)
 
 	BLANK();
 	DEFINE(IA64_CPUINFO_NSEC_PER_CYC_OFFSET, offsetof (struct cpuinfo_ia64, nsec_per_cyc));
+	DEFINE(IA64_CPUINFO_PTCE_BASE_OFFSET, offsetof (struct cpuinfo_ia64, ptce_base));
+	DEFINE(IA64_CPUINFO_PTCE_COUNT_OFFSET, offsetof (struct cpuinfo_ia64, ptce_count));
 	DEFINE(IA64_TIMESPEC_TV_NSEC_OFFSET, offsetof (struct timespec, tv_nsec));
 
 
--- x/xen/arch/ia64/linux-xen/smp.c
+++ x/xen/arch/ia64/linux-xen/smp.c
@@ -113,6 +113,38 @@ unlock_ipi_calllock(void)
 	spin_unlock_irq(&call_lock);
 }
 
+#ifdef XEN
+/*
+ * Stop the CPU and put it in fake SAL rendezvous, from which it can be woken
+ * up with an IPI from the boot processor.  data is a kexec_stop_this_cpu_arg.
+ */
+void
+kexec_stop_this_cpu (void *data)
+{
+	void *pal_addr;
+	struct kexec_stop_this_cpu_arg *arg =
+			(struct kexec_stop_this_cpu_arg *)data;
+
+	if (pal_vaddr)
+		pal_addr = pal_vaddr;
+	else
+		pal_addr = efi_get_pal_addr();
+
+	/* Remove this CPU by putting it into fake SAL rendezvous */
+	cpu_clear(smp_processor_id(), cpu_online_map);
+	max_xtp();
+	ia64_eoi();
+
+	/* Disable VHPT */
+	ia64_disable_vhpt();
+
+	local_irq_disable();
+	/* pal_base is declared unsigned long, so convert the pointer explicitly */
+	arg->fake_sal_rendez(arg->func, ap_wakeup_vector,
+			     (unsigned long)pal_addr);
+}
+#endif
+
 static void
 stop_this_cpu (void)
 {
--- x/xen/arch/ia64/xen/crash.c
+++ x/xen/arch/ia64/xen/crash.c
@@ -1,17 +1,113 @@
-/**********************************************************************
- * arch/ia64/xen/crash.c
- *
- * Created By: Horms
+/******************************************************************************
+ * arch/ia64/crash.c
  * 
+ * Created By: Horms
+ *
+ * Based heavily on arch/ia64/kernel/crash.c from 
+ * Tony Luck's ia64 test tree, circa Linux 2.6.17
  */
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#include <asm/atomic.h>
+#include <asm/percpu.h>
+#include <asm/kexec.h>
 #include <xen/types.h>
-#include <public/kexec.h>
+#include <xen/irq.h>
+#include <xen/string.h>
+#include <xen/elf.h>
+#include <xen/elfcore.h>
+#include <xen/smp.h>
+#include <xen/delay.h>
+#include <xen/perfc.h>
+#include <xen/kexec.h>
+#include <xen/sched.h>
+#include <public/xen.h>
+
+static void device_shootdown(void)
+{
+	printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+}
+
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+							       size_t data_len)
+{
+	Elf_Note note;
+
+	note.namesz = strlen(name) + 1;
+	note.descsz = data_len;
+	note.type   = type;
+	memcpy(buf, &note, sizeof(note));
+	buf += (sizeof(note) +3)/4;
+	memcpy(buf, name, note.namesz);
+	buf += (note.namesz + 3)/4;
+	memcpy(buf, data, note.descsz);
+	buf += (note.descsz + 3)/4;
+
+	return buf;
+}
+
+static void final_note(u32 *buf)
+{
+	Elf_Note note;
+
+	note.namesz = 0;
+	note.descsz = 0;
+	note.type   = 0;
+	memcpy(buf, &note, sizeof(note));
+}
+
+/* Write this CPU's registers into its crash note as an NT_PRSTATUS note. */
+static void crash_save_this_cpu(struct cpu_user_regs *regs)
+{
+	u32 *buf;
+	ELF_Prstatus prstatus;
+	int cpu = smp_processor_id();
+	ELF_Greg *dst = (ELF_Greg *)&prstatus.pr_reg;
+
+	/* Most of prstatus is never assigned; don't dump stack garbage. */
+	memset(&prstatus, 0, sizeof(prstatus));
+	dst[1] = regs->r1;
+	dst[12] = regs->r12;
+	dst[13] = regs->r13;
+	dst[42] = regs->cr_iip;
+	dst[45] = regs->ar_rsc;
+
+	ia64_setreg(_IA64_REG_AR_RSC, 0);
+	ia64_srlz_i();
+
+	dst[46] = regs->r30;
+	dst[47] = regs->ar_bspstore;
+	dst[48] = regs->ar_rnat;
+	dst[49] = regs->ar_ccv;
+	dst[50] = regs->ar_unat;
+	dst[51] = regs->ar_fpsr;
+	dst[52] = regs->ar_pfs;
+	/* NOTE(review): 53 and 54 both get r31 -- confirm slot 54's source. */
+	dst[53] = regs->r31;
+	dst[54] = regs->r31;
+	dst[55] = regs->ar_csd;
+	dst[56] = regs->ar_ssd;
+
+	buf = (u32 *) per_cpu(crash_notes, cpu);
+	if (!buf)
+		return;
+	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
+		sizeof(prstatus));
+	final_note(buf);
+}
 
 void machine_crash_shutdown(struct cpu_user_regs *regs)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+	printk("machine_crash_shutdown: %d\n", smp_processor_id());
+
+	if (in_interrupt()) {
+		ia64_eoi();
+	}
+	crash_save_this_cpu(regs);
+	device_shootdown();
+#ifdef CONFIG_SMP
+	smp_send_stop();
+#endif
+
 }
 
 /*
--- x/xen/arch/ia64/xen/hypercall.c
+++ x/xen/arch/ia64/xen/hypercall.c
@@ -75,7 +75,7 @@ const hypercall_t ia64_hypercall_table[N
 	(hypercall_t)do_hvm_op,			/*  */
 	(hypercall_t)do_sysctl,			/*  */                  /* 35 */
 	(hypercall_t)do_domctl,			/*  */
-	(hypercall_t)do_ni_hypercall,		/*  */
+	(hypercall_t)do_kexec_op,		/*  */                 /* 37 */
 	(hypercall_t)do_ni_hypercall,		/*  */
 	(hypercall_t)do_ni_hypercall,		/*  */
 	(hypercall_t)do_ni_hypercall,		/*  */                 /* 40 */
--- x/xen/arch/ia64/xen/machine_kexec.c
+++ x/xen/arch/ia64/xen/machine_kexec.c
@@ -5,19 +5,32 @@
  * 
  */
 
+#include <asm/smp.h>
 #include <xen/lib.h>       /* for printk() used in stubs */
 #include <xen/types.h>
+#include <xen/smp.h>
 #include <public/kexec.h>
+#include <linux/efi.h>
+#include <asm/delay.h>
+#include <asm/meminit.h>
+
+typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)(
+                                        unsigned long indirection_page,
+					unsigned long start_address,
+					struct ia64_boot_param *boot_param,
+					unsigned long pal_addr,
+					unsigned long cpu_data_pa,
+					unsigned long kernel_start,
+					unsigned long page_offset)
+					ATTRIB_NORET;
 
 int machine_kexec_load(int type, xen_kexec_image_t *image)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
-    return -1;
+    return 0;
 }
 
 void machine_kexec_unload(int type, xen_kexec_image_t *image)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
 }
 
 void machine_kexec_reserved(xen_kexec_reserve_t *reservation)
@@ -27,12 +40,61 @@ void machine_kexec_reserved(xen_kexec_re
 
 void machine_kexec(xen_kexec_image_t *image)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    relocate_new_kernel_t rnk;
+    void *pal_addr;
+    unsigned long code_addr = (unsigned long)__va(image->reboot_code_buffer);
+    unsigned long cpu_data_pa = (unsigned long)
+                                __pa(cpu_data(smp_processor_id()));
+
+    printk(__FILE__ ": %s: call unw_init_running()?\n", __FUNCTION__);
+
+    if (pal_vaddr)
+        pal_addr = pal_vaddr;
+    else
+        pal_addr = efi_get_pal_addr();
+
+    /* Interrupts aren't acceptable while we reboot */
+    ia64_set_itv(1<<16);
+    local_irq_disable();
+    rnk = (relocate_new_kernel_t)&code_addr;
+    (*rnk)(image->indirection_page, image->start_address, ia64_boot_param,
+           GRANULEROUNDDOWN((unsigned long) pal_addr),
+           cpu_data_pa, KERNEL_START, PAGE_OFFSET);
+    BUG();
 }
 
+#ifdef CONFIG_SMP
+/*
+ * Send every other CPU into kexec_stop_this_cpu().
+ *
+ * smp_call_function() is invoked with its last argument (wait) set to 0,
+ * so this function may return, and machine_kexec() may reuse the stack,
+ * while the other CPUs are still reading *arg and the word that
+ * arg.fake_sal_rendez points at.  Both objects are therefore static
+ * rather than automatic.
+ */
+static void machine_shutdown_smp(xen_kexec_image_t *image)
+{
+    static struct kexec_stop_this_cpu_arg arg;
+    /* &code_addr is called through as a function pointer by the other
+     * CPUs; code_addr holds the virtual address of the rendezvous code. */
+    static unsigned long code_addr;
+
+    code_addr = (unsigned long)__va(image->fake_sal_rendez);
+    arg.func = (void *)image->start_address;
+    arg.fake_sal_rendez = (kexec_fake_sal_rendez_t)&code_addr;
+    smp_call_function(kexec_stop_this_cpu, (void *)&arg, 0, 0);
+}
+#else /* !CONFIG_SMP */
+static void machine_shutdown_smp(xen_kexec_image_t *image) { }
+#endif /* CONFIG_SMP */
+
+
 void machine_shutdown(xen_kexec_image_t *image)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+    printk(__FILE__ ": %s: need to shutdown pci devices\n", __FUNCTION__);
+    machine_shutdown_smp(image);
+    machine_kexec(image);
 }
 
 /*
--- x/xen/include/asm-ia64/kexec.h
+++ x/xen/include/asm-ia64/kexec.h
@@ -6,16 +6,71 @@
  */
 
 #ifndef __IA64_KEXEC_H__
 #define __IA64_KEXEC_H__
 
-#include <xen/lib.h>       /* for printk() used in stub */
+#include <asm/ia64regs.h>
+#include <asm/gcc_intrin.h>
+#include <asm/percpu.h>
+#include <xen/lib.h>
 #include <xen/types.h>
 #include <public/xen.h>
 
+#define pte_bits	3
+#define vmlpt_bits	(impl_va_bits - PAGE_SHIFT + pte_bits)
+#define POW2(n)		(1ULL << (n))
+
+DECLARE_PER_CPU(u64, ia64_mca_pal_base);
+const extern unsigned int relocate_new_kernel_size;
+volatile extern long kexec_rendez;
+extern void relocate_new_kernel(unsigned long, unsigned long,
+		struct ia64_boot_param *, unsigned long);
+extern void kexec_fake_sal_rendez(void *start, unsigned long wake_up,
+		unsigned long pal_base);
+
+/*
+ * Save the registers of the cpu on which the panic occurred in
+ * crash_kexec so that a valid sp is recorded. The registers of other cpus
+ * will be saved in machine_crash_shutdown while shooting them down.
+ */
 static void crash_setup_regs(struct cpu_user_regs *newregs,
-                            struct cpu_user_regs *oldregs)
+			     struct cpu_user_regs *oldregs)
 {
-    printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__);
+	if (oldregs) {
+		memcpy(newregs, oldregs, sizeof(*newregs));
+		return;
+	}
+
+	newregs->r1 = ia64_getreg(_IA64_REG_GP);
+	newregs->r12 = ia64_getreg(_IA64_REG_SP);
+	newregs->r13 = ia64_getreg(_IA64_REG_TP);
+
+	newregs->cr_iip = ia64_getreg(_IA64_REG_IP);
+	newregs->ar_rsc = ia64_getreg(_IA64_REG_AR_RSC);
+
+	ia64_setreg(_IA64_REG_AR_RSC, 0);
+	ia64_srlz_i();
+
+	/* struct cpu_user_regs does not have a ar_bsp element,
+	 * so just use the otherwise unused r30 instead.
+	 * This decision is arbitrary, r30 is not related to ar_bsp in
+	 * any way */
+	newregs->r30 = ia64_getreg(_IA64_REG_AR_BSP);
+	newregs->ar_bspstore = ia64_getreg(_IA64_REG_AR_BSPSTORE);
+
+	newregs->ar_rnat = ia64_getreg(_IA64_REG_AR_RNAT);
+	newregs->ar_ccv = ia64_getreg(_IA64_REG_AR_CCV);
+	newregs->ar_unat = ia64_getreg(_IA64_REG_AR_UNAT);
+
+	newregs->ar_fpsr = ia64_getreg(_IA64_REG_AR_FPSR);
+	newregs->ar_pfs = ia64_getreg(_IA64_REG_AR_PFS);
+
+	/* struct cpu_user_regs does not have a ar_lc element,
+	 * so just use the otherwise unused r31 instead.
+	 * This decision is arbitrary, r31 is not related to ar_lc in
+	 * any way */
+	newregs->r31 = ia64_getreg(_IA64_REG_AR_LC);
+	newregs->ar_csd = ia64_getreg(_IA64_REG_AR_CSD);
+	newregs->ar_ssd = ia64_getreg(_IA64_REG_AR_SSD);
 }
 
 #endif /* __IA64_KEXEC_H__ */
--- x/xen/include/asm-ia64/linux-xen/asm/smp.h
+++ x/xen/include/asm-ia64/linux-xen/asm/smp.h
@@ -133,6 +133,15 @@ extern void smp_send_reschedule (int cpu
 extern void lock_ipi_calllock(void);
 extern void unlock_ipi_calllock(void);
 extern void identify_siblings (struct cpuinfo_ia64 *);
+#ifdef XEN
+typedef void (*kexec_fake_sal_rendez_t) (void *start, unsigned long wake_up,
+			unsigned long pal_base);
+struct kexec_stop_this_cpu_arg {
+	void *func;
+	kexec_fake_sal_rendez_t fake_sal_rendez;
+};
+extern void kexec_stop_this_cpu(void *data);
+#endif /* XEN */
 
 #else
 
--- x/xen/include/public/kexec.h
+++ x/xen/include/public/kexec.h
@@ -49,6 +49,9 @@ typedef struct xen_kexec_image {
 #if defined(__x86_64__)
     unsigned long page_table_b;
 #endif
+#if defined(__ia64__)
+    unsigned long fake_sal_rendez;
+#endif
     unsigned long indirection_page;
     unsigned long reboot_code_buffer;
     unsigned long start_address;
--- x/patches/linux-2.6.16.13/series
+++ x/patches/linux-2.6.16.13/series
@@ -4,6 +4,9 @@ linux-2.6.16-kexec_page_table_a_i386.pat
 linux-2.6.16-kexec_page_table_a_i386-xen.patch
 linux-2.6.16-kexec_page_table_a_x86_64.patch
 linux-2.6.16-kexec_page_table_a_x86_64-xen.patch
+kexec-ia64-1.patch
+kexec-ia64-2.patch
+kexec-ia64-xen.patch
 blktap-aio-16_03_06.patch
 device_bind.patch
 fix-hz-suspend.patch
--- /dev/null
+++ x/patches/linux-2.6.16.13/kexec-ia64-1.patch
@@ -0,0 +1,767 @@
+commit b373e385743597f576b67c423807bbdfe3b862e7
+tree c1e50c5f8f38e934cd3595fe9cb01b06549b4fac
+parent 3cd73eedde34c5fd88d62d8523c4260970fdc6fb
+author Khalid Aziz <khalid.aziz@hp.com> 1147215202 -0700
+committer Tony Luck <tony.luck@intel.com> 1147215202 -0700
+
+[IA64] kexec for ia64
+
+Enable kexec for ia64.
+
+Signed-off-by: Khalid Aziz <khalid.aziz@hp.com>
+Signed-off-by: Nanhai Zou <nanhai.zou@intel.com>
+Signed-off-by: Tony Luck <tony.luck@intel.com>
+
+This patch is from Tony Luck's ia64 test git tree circa 2.6.17.
+It has been rediffed and trivially backported to xen 2.6.16.3
+
+Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>
+Signed-Off-By: Horms <horms@verge.net.au>
+
+ arch/ia64/Kconfig                  |   17 +
+ arch/ia64/hp/common/sba_iommu.c    |   22 ++
+ arch/ia64/kernel/Makefile          |    1 
+ arch/ia64/kernel/crash.c           |   43 ++++
+ arch/ia64/kernel/entry.S           |    2 
+ arch/ia64/kernel/machine_kexec.c   |  140 ++++++++++++++
+ arch/ia64/kernel/relocate_kernel.S |  359 ++++++++++++++++++++++++++++++++++++
+ arch/ia64/kernel/smp.c             |   29 ++
+ include/asm-ia64/kexec.h           |   36 +++
+ include/asm-ia64/machvec_hpzx1.h   |    2 
+ include/asm-ia64/smp.h             |    3 
+ 11 files changed, 653 insertions(+), 1 deletion(-)
+
+--- x/arch/ia64/Kconfig
++++ x/arch/ia64/Kconfig
+@@ -376,6 +376,23 @@ config IA64_PALINFO
+ config SGI_SN
+ 	def_bool y if (IA64_SGI_SN2 || IA64_GENERIC)
+ 
++config KEXEC
++	bool "kexec system call (EXPERIMENTAL)"
++	depends on EXPERIMENTAL
++	help
++	  kexec is a system call that implements the ability to shutdown your
++	  current kernel, and to start another kernel.  It is like a reboot
++	  but it is indepedent of the system firmware.   And like a reboot
++	  you can start any kernel with it, not just Linux.
++
++	  The name comes from the similiarity to the exec system call.
++
++	  It is an ongoing process to be certain the hardware in a machine
++	  is properly shutdown, so do not be surprised if this code does not
++	  initially work for you.  It may help to enable device hotplugging
++	  support.  As of this writing the exact hardware interface is
++	  strongly in flux, so no good recommendation can be made.
++ 
+ source "drivers/firmware/Kconfig"
+ 
+ source "fs/Kconfig.binfmt"
+--- x/arch/ia64/hp/common/sba_iommu.c
++++ x/arch/ia64/hp/common/sba_iommu.c
+@@ -1624,6 +1624,28 @@ ioc_iova_init(struct ioc *ioc)
+ 	READ_REG(ioc->ioc_hpa + IOC_IBASE);
+ }
+ 
++#ifdef CONFIG_KEXEC
++void
++ioc_iova_disable(void)
++{
++	struct ioc *ioc;
++
++	ioc = ioc_list;
++
++	while (ioc != NULL) {
++		/* Disable IOVA translation */
++		WRITE_REG(ioc->ibase & 0xfffffffffffffffe, ioc->ioc_hpa + IOC_IBASE);
++		READ_REG(ioc->ioc_hpa + IOC_IBASE);
++
++		/* Clear I/O TLB of any possible entries */
++		WRITE_REG(ioc->ibase | (get_iovp_order(ioc->iov_size) + iovp_shift), ioc->ioc_hpa + IOC_PCOM);
++		READ_REG(ioc->ioc_hpa + IOC_PCOM);
++
++		ioc = ioc->next;
++	}
++}
++#endif
++
+ static void __init
+ ioc_resource_init(struct ioc *ioc)
+ {
+--- x/arch/ia64/kernel/Makefile
++++ x/arch/ia64/kernel/Makefile
+@@ -28,6 +28,7 @@ obj-$(CONFIG_IA64_CYCLONE)	+= cyclone.o
+ obj-$(CONFIG_CPU_FREQ)		+= cpufreq/
+ obj-$(CONFIG_IA64_MCA_RECOVERY)	+= mca_recovery.o
+ obj-$(CONFIG_KPROBES)		+= kprobes.o jprobes.o
++obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o crash.o
+ obj-$(CONFIG_IA64_UNCACHED_ALLOCATOR)	+= uncached.o
+ mca_recovery-y			+= mca_drv.o mca_drv_asm.o
+ 
+--- x//dev/null
++++ x/arch/ia64/kernel/crash.c
+@@ -0,0 +1,43 @@
++/*
++ * arch/ia64/kernel/crash.c
++ *
++ * Architecture specific (ia64) functions for kexec based crash dumps.
++ *
++ * Created by: Khalid Aziz <khalid.aziz@hp.com>
++ *
++ * Copyright (C) 2005 Hewlett-Packard Development Company, L.P.
++ *
++ */
++#include <linux/init.h>
++#include <linux/types.h>
++#include <linux/kernel.h>
++#include <linux/smp.h>
++#include <linux/irq.h>
++#include <linux/reboot.h>
++#include <linux/kexec.h>
++#include <linux/irq.h>
++#include <linux/delay.h>
++#include <linux/elf.h>
++#include <linux/elfcore.h>
++#include <linux/device.h>
++
++void
++machine_crash_shutdown(struct pt_regs *pt)
++{
++	/* This function is only called after the system
++	 * has paniced or is otherwise in a critical state.
++	 * The minimum amount of code to allow a kexec'd kernel
++	 * to run successfully needs to happen here.
++	 *
++	 * In practice this means shooting down the other cpus in
++	 * an SMP system.
++	 */
++	if (in_interrupt())
++		ia64_eoi();
++#ifdef CONFIG_SMP
++	smp_send_stop();
++#endif
++#ifdef CONFIG_IA64_HP_ZX1
++	ioc_iova_disable();
++#endif
++}
+--- x/arch/ia64/kernel/entry.S
++++ x/arch/ia64/kernel/entry.S
+@@ -1590,7 +1590,7 @@ sys_call_table:
+ 	data8 sys_mq_timedreceive		// 1265
+ 	data8 sys_mq_notify
+ 	data8 sys_mq_getsetattr
+-	data8 sys_ni_syscall			// reserved for kexec_load
++	data8 sys_kexec_load
+ 	data8 sys_ni_syscall			// reserved for vserver
+ 	data8 sys_waitid			// 1270
+ 	data8 sys_add_key
+--- x//dev/null
++++ x/arch/ia64/kernel/machine_kexec.c
+@@ -0,0 +1,140 @@
++/*
++ * arch/ia64/kernel/machine_kexec.c
++ *
++ * Handle transition of Linux booting another kernel
++ * Copyright (C) 2005 Hewlett-Packard Development Comapny, L.P.
++ * Copyright (C) 2005 Khalid Aziz <khalid.aziz@hp.com>
++ * Copyright (C) 2006 Intel Corp, Zou Nan hai <nanhai.zou@intel.com>
++ *
++ * This source code is licensed under the GNU General Public License,
++ * Version 2.  See the file COPYING for more details.
++ */
++
++#include <linux/kernel.h>
++#include <linux/config.h>
++#include <linux/mm.h>
++#include <linux/kexec.h>
++#include <linux/pci.h>
++#include <linux/cpu.h>
++#include <asm/mmu_context.h>
++#include <asm/setup.h>
++#include <asm/mca.h>
++#include <asm/page.h>
++#include <asm/bitops.h>
++#include <asm/tlbflush.h>
++#include <asm/delay.h>
++#include <asm/meminit.h>
++
++extern unsigned long ia64_iobase;
++
++typedef void (*relocate_new_kernel_t)( unsigned long, unsigned long,
++		struct ia64_boot_param *, unsigned long);
++
++/*
++ * Do what every setup is needed on image and the
++ * reboot code buffer to allow us to avoid allocations
++ * later.
++ */
++int machine_kexec_prepare(struct kimage *image)
++{
++	void *control_code_buffer;
++	const unsigned long *func;
++
++	func = (unsigned long *)&relocate_new_kernel;
++	/* Pre-load control code buffer to minimize work in kexec path */
++	control_code_buffer = page_address(image->control_code_page);
++	memcpy((void *)control_code_buffer, (const void *)func[0],
++			relocate_new_kernel_size);
++	flush_icache_range((unsigned long)control_code_buffer,
++			(unsigned long)control_code_buffer + relocate_new_kernel_size);
++
++	return 0;
++}
++
++void machine_kexec_cleanup(struct kimage *image)
++{
++}
++
++void machine_shutdown(void)
++{
++#ifdef CONFIG_PCI
++	struct pci_dev *dev = NULL;
++	irq_desc_t *idesc;
++	cpumask_t mask = CPU_MASK_NONE;
++
++	/* Disable all PCI devices */
++	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
++		if (!(dev->is_enabled))
++			continue;
++		idesc = irq_descp(dev->irq);
++		if (!idesc)
++			continue;
++		cpu_set(0, mask);
++		disable_irq_nosync(dev->irq);
++		idesc->handler->end(dev->irq);
++		idesc->handler->set_affinity(dev->irq, mask);
++		idesc->action = NULL;
++		pci_disable_device(dev);
++	}
++#endif
++
++#ifdef CONFIG_HOTPLUG_CPU
++	{
++		int cpu;
++
++		for_each_online_cpu(cpu) {
++			if (cpu != smp_processor_id())
++				cpu_down(cpu);
++		}
++	}
++#elif defined(CONFIG_SMP)
++	smp_call_function(kexec_stop_this_cpu, (void *)image->start, 0, 0);
++#endif
++
++	ia64_set_itv(1<<16);
++
++#ifdef CONFIG_IA64_HP_ZX1
++	ioc_iova_disable();
++#endif
++}
++
++/*
++ * Do not allocate memory (or fail in any way) in machine_kexec().
++ * We are past the point of no return, committed to rebooting now.
++ */
++void machine_kexec(struct kimage *image)
++{
++	unsigned long indirection_page;
++	relocate_new_kernel_t rnk;
++	unsigned long pta, impl_va_bits;
++	void *pal_addr = efi_get_pal_addr();
++	unsigned long code_addr = (unsigned long)page_address(image->control_code_page);
++
++	/* Interrupts aren't acceptable while we reboot */
++	local_irq_disable();
++
++	/* Disable VHPT */
++	impl_va_bits = ffz(~(local_cpu_data->unimpl_va_mask | (7UL << 61)));
++	pta = POW2(61) - POW2(vmlpt_bits);
++	ia64_set_pta(pta | (0 << 8) | (vmlpt_bits << 2) | 0);
++
++	/* now execute the control code.
++	 * We will start by executing the control code linked into the
++	 * kernel as opposed to the code we copied in control code buffer		 * page. When this code switches to physical mode, we will start
++	 * executing the code in control code buffer page. Reason for
++	 * doing this is we start code execution in virtual address space.
++	 * If we were to try to execute the newly copied code in virtual
++	 * address space, we will need to make an ITLB entry to avoid ITLB
++	 * miss. By executing the code linked into kernel, we take advantage
++	 * of the ITLB entry already in place for kernel and avoid making
++	 * a new entry.
++	 */
++	indirection_page = image->head & PAGE_MASK;
++
++	rnk = (relocate_new_kernel_t)&code_addr;
++	(*rnk)(indirection_page, image->start, ia64_boot_param,
++		     GRANULEROUNDDOWN((unsigned long) pal_addr));
++	BUG();
++	for (;;)
++		;
++}
+--- x//dev/null
++++ x/arch/ia64/kernel/relocate_kernel.S
+@@ -0,0 +1,359 @@
++/*
++ * arch/ia64/kernel/relocate_kernel.S
++ *
++ * Relocate kexec'able kernel and start it
++ *
++ * Copyright (C) 2005 Hewlett-Packard Development Company, L.P.
++ * Copyright (C) 2005 Khalid Aziz  <khalid.aziz@hp.com>
++ * Copyright (C) 2005 Intel Corp,  Zou Nan hai <nanhai.zou@intel.com>
++ *
++ * This source code is licensed under the GNU General Public License,
++ * Version 2.  See the file COPYING for more details.
++ */
++#include <linux/config.h>
++#include <asm/asmmacro.h>
++#include <asm/kregs.h>
++#include <asm/page.h>
++#include <asm/pgtable.h>
++#include <asm/mca_asm.h>
++
++       /* Must be relocatable PIC code callable as a C function, that once
++        * it starts can not use the previous processes stack.
++        *
++        */
++GLOBAL_ENTRY(relocate_new_kernel)
++	.prologue
++	alloc r31=ar.pfs,4,0,0,0
++        .body
++.reloc_entry:
++{
++	rsm psr.i| psr.ic
++	mov r2=ip
++}
++	;;
++{
++        flushrs                         // must be first insn in group
++        srlz.i
++}
++	;;
++
++	//first switch to physical mode
++	add r3=1f-.reloc_entry, r2
++	movl r16 = IA64_PSR_AC|IA64_PSR_BN|IA64_PSR_IC|IA64_PSR_MFL
++	mov ar.rsc=0	          	// put RSE in enforced lazy mode
++	;;
++	add r2=(memory_stack-.reloc_entry), r2
++	;;
++	add sp=(memory_stack_end - .reloc_entry),r2
++	add r8=(register_stack - .reloc_entry),r2
++	;;
++	tpa sp=sp
++	tpa r3=r3
++	;;
++	loadrs
++	;;
++	mov r18=ar.rnat
++	mov ar.bspstore=r8
++	;;
++        mov cr.ipsr=r16
++        mov cr.iip=r3
++        mov cr.ifs=r0
++	srlz.i
++	;;
++	mov ar.rnat=r18
++	rfi
++	;;
++1:
++	//physical mode code begin
++	mov b6=in1
++	tpa r28=in2			// tpa must before TLB purge
++
++	// purge all TC entries
++#define O(member)       IA64_CPUINFO_##member##_OFFSET
++        GET_THIS_PADDR(r2, cpu_info)    // load phys addr of cpu_info into r2
++        ;;
++        addl r17=O(PTCE_STRIDE),r2
++        addl r2=O(PTCE_BASE),r2
++        ;;
++        ld8 r18=[r2],(O(PTCE_COUNT)-O(PTCE_BASE));;    	// r18=ptce_base
++        ld4 r19=[r2],4                                  // r19=ptce_count[0]
++        ld4 r21=[r17],4                                 // r21=ptce_stride[0]
++        ;;
++        ld4 r20=[r2]                                    // r20=ptce_count[1]
++        ld4 r22=[r17]                                   // r22=ptce_stride[1]
++        mov r24=r0
++        ;;
++        adds r20=-1,r20
++        ;;
++#undef O
++2:
++        cmp.ltu p6,p7=r24,r19
++(p7)    br.cond.dpnt.few 4f
++        mov ar.lc=r20
++3:
++        ptc.e r18
++        ;;
++        add r18=r22,r18
++        br.cloop.sptk.few 3b
++        ;;
++        add r18=r21,r18
++        add r24=1,r24
++        ;;
++        br.sptk.few 2b
++4:
++        srlz.i
++        ;;
++	//purge TR entry for kernel text and data
++        movl r16=KERNEL_START
++        mov r18=KERNEL_TR_PAGE_SHIFT<<2
++        ;;
++        ptr.i r16, r18
++        ptr.d r16, r18
++        ;;
++        srlz.i
++        ;;
++
++	// purge TR entry for percpu data
++        movl r16=PERCPU_ADDR
++        mov r18=PERCPU_PAGE_SHIFT<<2
++        ;;
++        ptr.d r16,r18
++        ;;
++        srlz.d
++	;;
++
++        // purge TR entry for pal code
++        mov r16=in3
++        mov r18=IA64_GRANULE_SHIFT<<2
++        ;;
++        ptr.i r16,r18
++        ;;
++        srlz.i
++	;;
++
++        // purge TR entry for stack
++        mov r16=IA64_KR(CURRENT_STACK)
++        ;;
++        shl r16=r16,IA64_GRANULE_SHIFT
++        movl r19=PAGE_OFFSET
++        ;;
++        add r16=r19,r16
++        mov r18=IA64_GRANULE_SHIFT<<2
++        ;;
++        ptr.d r16,r18
++        ;;
++        srlz.i
++	;;
++
++	// copy kexec kernel segments
++	movl r16=PAGE_MASK
++	ld8  r30=[in0],8;;			// in0 is page_list
++	br.sptk.few .dest_page
++	;;
++.loop:
++	ld8  r30=[in0], 8;;
++.dest_page:
++	tbit.z p0, p6=r30, 0;;    	// 0x1 dest page
++(p6)	and r17=r30, r16
++(p6)	br.cond.sptk.few .loop;;
++
++	tbit.z p0, p6=r30, 1;;		// 0x2 indirect page
++(p6)	and in0=r30, r16
++(p6)	br.cond.sptk.few .loop;;
++
++	tbit.z p0, p6=r30, 2;;		// 0x4 end flag
++(p6)	br.cond.sptk.few .end_loop;;
++
++	tbit.z p6, p0=r30, 3;;		// 0x8 source page
++(p6)	br.cond.sptk.few .loop
++
++	and r18=r30, r16
++
++	// simple copy page, may optimize later
++	movl r14=PAGE_SIZE/8 - 1;;
++	mov ar.lc=r14;;
++1:
++	ld8 r14=[r18], 8;;
++	st8 [r17]=r14, 8;;
++	fc.i r17
++	br.ctop.sptk.few 1b
++	br.sptk.few .loop
++	;;
++
++.end_loop:
++	sync.i			// for fc.i
++	;;
++	srlz.i
++	;;
++	srlz.d
++	;;
++	br.call.sptk.many b0=b6;;
++memory_stack:
++	.fill           8192, 1, 0
++memory_stack_end:
++register_stack:
++	.fill           8192, 1, 0
++register_stack_end:
++relocate_new_kernel_end:
++END(relocate_new_kernel)
++
++GLOBAL_ENTRY(kexec_fake_sal_rendez)
++	.prologue
++	alloc r31=ar.pfs,3,0,0,0
++	.body
++.rendez_entry:
++	rsm	psr.i | psr.ic
++	mov r25=ip
++	;;
++	{
++		flushrs
++		srlz.i
++	}
++	;;
++       /* See where I am running, and compute gp */
++	{
++		mov     ar.rsc = 0      /* Put RSE in enforce lacy, LE mode */
++		mov     gp = ip         /* gp == relocate_new_kernel */
++	}
++
++	movl r8=0x00000100000000
++	;;
++	mov cr.iva=r8
++	/* Transition from virtual to physical mode */
++	srlz.i
++	;;
++	add	r17=5f-.rendez_entry, r25
++	movl	r16=(IA64_PSR_AC | IA64_PSR_BN | IA64_PSR_IC | IA64_PSR_MFL)
++	;;
++	tpa	r17=r17
++	mov	cr.ipsr=r16
++	;;
++	mov	cr.iip=r17
++	mov	cr.ifs=r0
++	;;
++	rfi
++	;;
++5:
++	mov     b6=in0			/* _start addr */
++	mov	r8=in1			/* ap_wakeup_vector */
++	mov	r26=in2			/* PAL addr */
++	;;
++	/* Purge kernel TRs */
++	movl	r16=KERNEL_START
++	mov	r18=KERNEL_TR_PAGE_SHIFT<<2
++	;;
++	ptr.i	r16,r18
++	ptr.d	r16,r18
++	;;
++	srlz.i
++	;;
++	srlz.d
++	;;
++	/* Purge percpu TR */
++	movl	r16=PERCPU_ADDR
++	mov	r18=PERCPU_PAGE_SHIFT<<2
++	;;
++	ptr.d	r16,r18
++	;;
++	srlz.d
++	;;
++	/* Purge PAL TR */
++	mov	r18=IA64_GRANULE_SHIFT<<2
++	;;
++	ptr.i	r26,r18
++	;;
++	srlz.i
++	;;
++	/* Purge stack TR */
++	mov	r16=IA64_KR(CURRENT_STACK)
++	;;
++	shl	r16=r16,IA64_GRANULE_SHIFT
++	movl	r19=PAGE_OFFSET
++	;;
++	add	r16=r19,r16
++	mov	r18=IA64_GRANULE_SHIFT<<2
++	;;
++	ptr.d	r16,r18
++	;;
++	srlz.i
++	;;
++
++	/* Ensure we can read and clear external interrupts */
++	mov	cr.tpr=r0
++	srlz.d
++
++	shr.u	r9=r8,6			/* which irr */
++	;;
++	and	r8=63,r8		/* bit offset into irr */
++	;;
++	mov	r10=1;;
++	;;
++	shl	r10=r10,r8		/* bit mask off irr we want */
++	cmp.eq	p6,p0=0,r9
++	;;
++(p6)	br.cond.sptk.few        check_irr0
++	cmp.eq	p7,p0=1,r9
++	;;
++(p7)	br.cond.sptk.few        check_irr1
++	cmp.eq	p8,p0=2,r9
++	;;
++(p8)	br.cond.sptk.few        check_irr2
++	cmp.eq	p9,p0=3,r9
++	;;
++(p9)	br.cond.sptk.few        check_irr3
++
++check_irr0:
++	mov	r8=cr.irr0
++	;;
++	and	r8=r8,r10
++	;;
++	cmp.eq	p6,p0=0,r8
++(p6)	br.cond.sptk.few	check_irr0
++	br.few	call_start
++
++check_irr1:
++	mov	r8=cr.irr1
++	;;
++	and	r8=r8,r10
++	;;
++	cmp.eq	p6,p0=0,r8
++(p6)	br.cond.sptk.few	check_irr1
++	br.few	call_start
++
++check_irr2:
++	mov	r8=cr.irr2
++	;;
++	and	r8=r8,r10
++	;;
++	cmp.eq	p6,p0=0,r8
++(p6)	br.cond.sptk.few	check_irr2
++	br.few	call_start
++
++check_irr3:
++	mov	r8=cr.irr3
++	;;
++	and	r8=r8,r10
++	;;
++	cmp.eq	p6,p0=0,r8
++(p6)	br.cond.sptk.few	check_irr3
++	br.few	call_start
++
++call_start:
++	mov	cr.eoi=r0
++	;;
++	srlz.d
++	;;
++	mov	r8=cr.ivr
++	;;
++	srlz.d
++	;;
++	cmp.eq	p0,p6=15,r8
++(p6)	br.cond.sptk.few	call_start
++	br.sptk.few		b6
++kexec_fake_sal_rendez_end:
++END(kexec_fake_sal_rendez)
++
++	.global relocate_new_kernel_size
++relocate_new_kernel_size:
++	data8	kexec_fake_sal_rendez_end - relocate_new_kernel
++
+--- x/arch/ia64/kernel/smp.c
++++ x/arch/ia64/kernel/smp.c
+@@ -30,6 +30,7 @@
+ #include <linux/delay.h>
+ #include <linux/efi.h>
+ #include <linux/bitops.h>
++#include <linux/kexec.h>
+ 
+ #include <asm/atomic.h>
+ #include <asm/current.h>
+@@ -84,6 +85,34 @@ unlock_ipi_calllock(void)
+ 	spin_unlock_irq(&call_lock);
+ }
+ 
++#ifdef CONFIG_KEXEC
++/*
++ * Stop the CPU and put it in fake SAL rendezvous. This allows CPU to wake
++ * up with IPI from boot processor
++ */
++void
++kexec_stop_this_cpu (void *func)
++{
++	unsigned long pta, impl_va_bits, pal_base;
++
++	/*
++	 * Remove this CPU by putting it into fake SAL rendezvous
++	 */
++	cpu_clear(smp_processor_id(), cpu_online_map);
++	max_xtp();
++	ia64_eoi();
++
++	/* Disable VHPT */
++	impl_va_bits = ffz(~(local_cpu_data->unimpl_va_mask | (7UL << 61)));
++	pta = POW2(61) - POW2(vmlpt_bits);
++	ia64_set_pta(pta | (0 << 8) | (vmlpt_bits << 2) | 0);
++
++	local_irq_disable();
++	pal_base = __get_cpu_var(ia64_mca_pal_base);
++	kexec_fake_sal_rendez(func, ap_wakeup_vector, pal_base);
++}
++#endif
++
+ static void
+ stop_this_cpu (void)
+ {
+--- x//dev/null
++++ x/include/asm-ia64/kexec.h
+@@ -0,0 +1,36 @@
++#ifndef _ASM_IA64_KEXEC_H
++#define _ASM_IA64_KEXEC_H
++
++
++/* Maximum physical address we can use pages from */
++#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
++/* Maximum address we can reach in physical address mode */
++#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
++/* Maximum address we can use for the control code buffer */
++#define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
++
++#define KEXEC_CONTROL_CODE_SIZE (8192 + 8192 + 4096)
++
++/* The native architecture */
++#define KEXEC_ARCH KEXEC_ARCH_IA_64
++
++#define MAX_NOTE_BYTES 1024
++
++#define pte_bits	3
++#define vmlpt_bits	(impl_va_bits - PAGE_SHIFT + pte_bits)
++#define POW2(n)		(1ULL << (n))
++
++DECLARE_PER_CPU(u64, ia64_mca_pal_base);
++
++const extern unsigned int relocate_new_kernel_size;
++volatile extern long kexec_rendez;
++extern void relocate_new_kernel(unsigned long, unsigned long,
++		struct ia64_boot_param *, unsigned long);
++extern void kexec_fake_sal_rendez(void *start, unsigned long wake_up,
++		unsigned long pal_base);
++
++static inline void
++crash_setup_regs(struct pt_regs *newregs, struct pt_regs *oldregs)
++{
++}
++#endif /* _ASM_IA64_KEXEC_H */
+--- x/include/asm-ia64/machvec_hpzx1.h
++++ x/include/asm-ia64/machvec_hpzx1.h
+@@ -34,4 +34,6 @@ extern ia64_mv_dma_mapping_error	sba_dma
+ #define platform_dma_supported			sba_dma_supported
+ #define platform_dma_mapping_error		sba_dma_mapping_error
+ 
++extern void ioc_iova_disable(void);
++
+ #endif /* _ASM_IA64_MACHVEC_HPZX1_h */
+--- x/include/asm-ia64/smp.h
++++ x/include/asm-ia64/smp.h
+@@ -129,6 +129,9 @@ extern void smp_send_reschedule (int cpu
+ extern void lock_ipi_calllock(void);
+ extern void unlock_ipi_calllock(void);
+ extern void identify_siblings (struct cpuinfo_ia64 *);
++#ifdef CONFIG_KEXEC
++extern void kexec_stop_this_cpu(void *);
++#endif
+ 
+ #else
+ 
--- /dev/null
+++ x/patches/linux-2.6.16.13/kexec-ia64-2.patch
@@ -0,0 +1,596 @@
+commit beada884dd437b509c26b39f1a0b0c6b31e6f340
+tree ad7608f34ca8aa9e292e2a863484b3e13250107d
+parent b373e385743597f576b67c423807bbdfe3b862e7
+author Zou Nan hai <nanhai.zou@intel.com> 1150320804 -0700
+committer Tony Luck <tony.luck@intel.com> 1150320804 -0700
+
+[IA64] Miscellaneous updates for kexec/kdump
+
+Signed-off-by: Zou Nan hai <nanhai.zou@intel.com>
+
+This patch is from Tony Luck's ia64 test git tree circa 2.6.17.
+It has been rediffed and trivially backported to xen 2.6.16.3
+
+Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>
+Signed-Off-By: Horms <horms@verge.net.au>
+
+ arch/ia64/Kconfig                  |    6 +
+ arch/ia64/kernel/crash.c           |  113 +++++++++++++++++++++++++++++++++++-
+ arch/ia64/kernel/efi.c             |   17 ++++-
+ arch/ia64/kernel/machine_kexec.c   |   43 ++-----------
+ arch/ia64/kernel/relocate_kernel.S |   38 +++++-------
+ arch/ia64/kernel/setup.c           |   38 ++++++++++++
+ include/asm-ia64/kexec.h           |    4 -
+ include/asm-ia64/meminit.h         |    3 
+ include/linux/irq.h                |    1 
+ kernel/irq/manage.c                |   19 ++++++
+ 10 files changed, 217 insertions(+), 65 deletions(-)
+
+--- x/arch/ia64/Kconfig
++++ x/arch/ia64/Kconfig
+@@ -437,6 +437,12 @@ config KEXEC
+ 	  support.  As of this writing the exact hardware interface is
+ 	  strongly in flux, so no good recommendation can be made.
+ 
++config CRASH_DUMP
++	bool "kernel crash dumps (EXPERIMENTAL)"
++	depends on EXPERIMENTAL
++	help
++	  Generate crash dump after being started by kexec.
++
+ source "net/Kconfig"
+ 
+ source "drivers/Kconfig"
+--- x/arch/ia64/kernel/crash.c
++++ x/arch/ia64/kernel/crash.c
+@@ -4,8 +4,8 @@
+  * Architecture specific (ia64) functions for kexec based crash dumps.
+  *
+  * Created by: Khalid Aziz <khalid.aziz@hp.com>
+- *
+  * Copyright (C) 2005 Hewlett-Packard Development Company, L.P.
++ * Copyright (C) 2005 Intel Corp	Zou Nan hai <nanhai.zou@intel.com>
+  *
+  */
+ #include <linux/init.h>
+@@ -13,6 +13,7 @@
+ #include <linux/kernel.h>
+ #include <linux/smp.h>
+ #include <linux/irq.h>
++#include <linux/pci.h>
+ #include <linux/reboot.h>
+ #include <linux/kexec.h>
+ #include <linux/irq.h>
+@@ -20,6 +21,111 @@
+ #include <linux/elf.h>
+ #include <linux/elfcore.h>
+ #include <linux/device.h>
++#include <asm/uaccess.h>
++
++size_t copy_oldmem_page(unsigned long pfn, char *buf,
++                               size_t csize, unsigned long offset, int userbuf)
++{
++        void  *vaddr;
++
++        if (!csize)
++                return 0;
++        vaddr = page_address(pfn_to_page(pfn));
++
++        if (userbuf) {
++                if (copy_to_user(buf, (vaddr + offset), csize)) {
++                        return -EFAULT;
++                }
++        } else
++                memcpy(buf, (vaddr + offset), csize);
++        return csize;
++}
++
++static void device_shootdown(void)
++{
++       struct pci_dev *dev;
++       irq_desc_t *desc;
++       u16 pci_command;
++
++       list_for_each_entry(dev, &pci_devices, global_list) {
++               desc = irq_descp(dev->irq);
++               if (!desc->action)
++                       continue;
++               pci_read_config_word(dev, PCI_COMMAND, &pci_command);
++               if (pci_command & PCI_COMMAND_MASTER) {
++                       pci_command &= ~PCI_COMMAND_MASTER;
++                       pci_write_config_word(dev, PCI_COMMAND, pci_command);
++               }
++               disable_irq_nosync(dev->irq);
++               desc->handler->end(dev->irq);
++       }
++}
++
++static Elf64_Word
++*append_elf_note(Elf64_Word *buf, char *name, unsigned type, void *data,
++		size_t data_len)
++{
++	struct elf_note *note = (struct elf_note *)buf;
++	note->n_namesz = strlen(name) + 1;
++	note->n_descsz = data_len;
++	note->n_type   = type;
++	buf += (sizeof(*note) + 3)/4;
++	memcpy(buf, name, note->n_namesz);
++	buf += (note->n_namesz + 3)/4;
++	memcpy(buf, data, data_len);
++	buf += (data_len + 3)/4;
++	return buf;
++}
++
++static void
++final_note(void *buf)
++{
++	memset(buf, 0, sizeof(struct elf_note));
++}
++
++static void
++crash_save_this_cpu(void)
++{
++	void *buf;
++	struct elf_prstatus prstatus;
++	int cpu = smp_processor_id();
++	elf_greg_t *dst = (elf_greg_t *)&prstatus.pr_reg;
++
++	memset(&prstatus, 0, sizeof(prstatus));
++	prstatus.pr_pid = current->pid;
++
++    	dst[1] = ia64_getreg(_IA64_REG_GP);
++    	dst[12] = ia64_getreg(_IA64_REG_SP);
++    	dst[13] = ia64_getreg(_IA64_REG_TP);
++
++    	dst[42] = ia64_getreg(_IA64_REG_IP);
++    	dst[45] = ia64_getreg(_IA64_REG_AR_RSC);
++
++	ia64_setreg(_IA64_REG_AR_RSC, 0);
++	ia64_srlz_i();
++
++    	dst[46] = ia64_getreg(_IA64_REG_AR_BSP);
++    	dst[47] = ia64_getreg(_IA64_REG_AR_BSPSTORE);
++
++    	dst[48] = ia64_getreg(_IA64_REG_AR_RNAT);
++    	dst[49] = ia64_getreg(_IA64_REG_AR_CCV);
++    	dst[50] = ia64_getreg(_IA64_REG_AR_UNAT);
++
++    	dst[51] = ia64_getreg(_IA64_REG_AR_FPSR);
++    	dst[52] = ia64_getreg(_IA64_REG_AR_PFS);
++    	dst[53] = ia64_getreg(_IA64_REG_AR_LC);
++
++    	dst[54] = ia64_getreg(_IA64_REG_AR_LC);
++    	dst[55] = ia64_getreg(_IA64_REG_AR_CSD);
++    	dst[56] = ia64_getreg(_IA64_REG_AR_SSD);
++
++        buf = (u64 *) per_cpu_ptr(crash_notes, cpu);
++	if (!buf)
++		return;
++	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
++		sizeof(prstatus));
++	final_note(buf);
++}
+ 
+ void
+ machine_crash_shutdown(struct pt_regs *pt)
+@@ -32,8 +138,11 @@ machine_crash_shutdown(struct pt_regs *p
+ 	 * In practice this means shooting down the other cpus in
+ 	 * an SMP system.
+ 	 */
+-	if (in_interrupt())
++	if (in_interrupt()) {
+ 		ia64_eoi();
++	}
++	crash_save_this_cpu();
++	device_shootdown();
+ #ifdef CONFIG_SMP
+ 	smp_send_stop();
+ #endif
+--- x/arch/ia64/kernel/efi.c
++++ x/arch/ia64/kernel/efi.c
+@@ -25,6 +25,7 @@
+ #include <linux/types.h>
+ #include <linux/time.h>
+ #include <linux/efi.h>
++#include <linux/kexec.h>
+ 
+ #include <asm/io.h>
+ #include <asm/kregs.h>
+@@ -40,7 +41,7 @@ extern efi_status_t efi_call_phys (void 
+ struct efi efi;
+ EXPORT_SYMBOL(efi);
+ static efi_runtime_services_t *runtime;
+-static unsigned long mem_limit = ~0UL, max_addr = ~0UL;
++static unsigned long mem_limit = ~0UL, max_addr = ~0UL, min_addr = 0UL;
+ 
+ #define efi_call_virt(f, args...)	(*(f))(args)
+ 
+@@ -420,6 +421,8 @@ efi_init (void)
+ 			mem_limit = memparse(cp + 4, &cp);
+ 		} else if (memcmp(cp, "max_addr=", 9) == 0) {
+ 			max_addr = GRANULEROUNDDOWN(memparse(cp + 9, &cp));
++		} else if (memcmp(cp, "min_addr=", 9) == 0) {
++			min_addr = GRANULEROUNDDOWN(memparse(cp + 9, &cp));
+ 		} else {
+ 			while (*cp != ' ' && *cp)
+ 				++cp;
+@@ -427,6 +430,8 @@ efi_init (void)
+ 				++cp;
+ 		}
+ 	}
++	if (min_addr != 0UL)
++		printk(KERN_INFO "Ignoring memory below %luMB\n", min_addr >> 20);
+ 	if (max_addr != ~0UL)
+ 		printk(KERN_INFO "Ignoring memory above %luMB\n", max_addr >> 20);
+ 
+@@ -839,7 +844,8 @@ find_memmap_space (void)
+ 		as = max(contig_low, md->phys_addr);
+ 		ae = min(contig_high, efi_md_end(md));
+ 
+-		/* keep within max_addr= command line arg */
++		/* keep within max_addr= and min_addr= command line arg */
++		as = max(as, min_addr);
+ 		ae = min(ae, max_addr);
+ 		if (ae <= as)
+ 			continue;
+@@ -949,7 +955,8 @@ efi_memmap_init(unsigned long *s, unsign
+ 		} else
+ 			ae = efi_md_end(md);
+ 
+-		/* keep within max_addr= command line arg */
++		/* keep within max_addr= and min_addr= command line arg */
++		as = max(as, min_addr);
+ 		ae = min(ae, max_addr);
+ 		if (ae <= as)
+ 			continue;
+@@ -1061,6 +1068,10 @@ efi_initialize_iomem_resources(struct re
+ 			 */
+ 			insert_resource(res, code_resource);
+ 			insert_resource(res, data_resource);
++#ifdef CONFIG_KEXEC
++			if (crashk_res.end > crashk_res.start)
++				insert_resource(res, &crashk_res);
++#endif
+ 		}
+ 	}
+ }
+--- x/arch/ia64/kernel/machine_kexec.c
++++ x/arch/ia64/kernel/machine_kexec.c
+@@ -1,5 +1,5 @@
+ /*
+- * arch/ia64/kernel/machine_kexec.c
++ * arch/ia64/kernel/machine_kexec.c 
+  *
+  * Handle transition of Linux booting another kernel
+  * Copyright (C) 2005 Hewlett-Packard Development Comapny, L.P.
+@@ -25,9 +25,7 @@
+ #include <asm/delay.h>
+ #include <asm/meminit.h>
+ 
+-extern unsigned long ia64_iobase;
+-
+-typedef void (*relocate_new_kernel_t)( unsigned long, unsigned long,
++typedef void (*relocate_new_kernel_t)(unsigned long, unsigned long,
+ 		struct ia64_boot_param *, unsigned long);
+ 
+ /*
+@@ -43,9 +41,9 @@ int machine_kexec_prepare(struct kimage 
+ 	func = (unsigned long *)&relocate_new_kernel;
+ 	/* Pre-load control code buffer to minimize work in kexec path */
+ 	control_code_buffer = page_address(image->control_code_page);
+-	memcpy((void *)control_code_buffer, (const void *)func[0],
++	memcpy((void *)control_code_buffer, (const void *)func[0], 
+ 			relocate_new_kernel_size);
+-	flush_icache_range((unsigned long)control_code_buffer,
++	flush_icache_range((unsigned long)control_code_buffer, 
+ 			(unsigned long)control_code_buffer + relocate_new_kernel_size);
+ 
+ 	return 0;
+@@ -61,7 +59,6 @@ void machine_shutdown(void)
+ 	struct pci_dev *dev = NULL;
+ 	irq_desc_t *idesc;
+ 	cpumask_t mask = CPU_MASK_NONE;
+-
+ 	/* Disable all PCI devices */
+ 	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+ 		if (!(dev->is_enabled))
+@@ -91,7 +88,6 @@ void machine_shutdown(void)
+ 	smp_call_function(kexec_stop_this_cpu, (void *)image->start, 0, 0);
+ #endif
+ 
+-	ia64_set_itv(1<<16);
+ 
+ #ifdef CONFIG_IA64_HP_ZX1
+ 	ioc_iova_disable();
+@@ -100,41 +96,20 @@ void machine_shutdown(void)
+ 
+ /*
+  * Do not allocate memory (or fail in any way) in machine_kexec().
+- * We are past the point of no return, committed to rebooting now.
++ * We are past the point of no return, committed to rebooting now. 
+  */
++extern void *efi_get_pal_addr(void);
+ void machine_kexec(struct kimage *image)
+ {
+-	unsigned long indirection_page;
+ 	relocate_new_kernel_t rnk;
+-	unsigned long pta, impl_va_bits;
+ 	void *pal_addr = efi_get_pal_addr();
+ 	unsigned long code_addr = (unsigned long)page_address(image->control_code_page);
+-
+ 	/* Interrupts aren't acceptable while we reboot */
++	ia64_set_itv(1<<16);
+ 	local_irq_disable();
+-
+-	/* Disable VHPT */
+-	impl_va_bits = ffz(~(local_cpu_data->unimpl_va_mask | (7UL << 61)));
+-	pta = POW2(61) - POW2(vmlpt_bits);
+-	ia64_set_pta(pta | (0 << 8) | (vmlpt_bits << 2) | 0);
+-
+-	/* now execute the control code.
+-	 * We will start by executing the control code linked into the
+-	 * kernel as opposed to the code we copied in control code buffer		 * page. When this code switches to physical mode, we will start
+-	 * executing the code in control code buffer page. Reason for
+-	 * doing this is we start code execution in virtual address space.
+-	 * If we were to try to execute the newly copied code in virtual
+-	 * address space, we will need to make an ITLB entry to avoid ITLB
+-	 * miss. By executing the code linked into kernel, we take advantage
+-	 * of the ITLB entry already in place for kernel and avoid making
+-	 * a new entry.
+-	 */
+-	indirection_page = image->head & PAGE_MASK;
+-
+ 	rnk = (relocate_new_kernel_t)&code_addr;
+-	(*rnk)(indirection_page, image->start, ia64_boot_param,
++	(*rnk)(image->head, image->start, ia64_boot_param,
+ 		     GRANULEROUNDDOWN((unsigned long) pal_addr));
+ 	BUG();
+-	for (;;)
+-		;
++	for (;;);
+ }
+--- x/arch/ia64/kernel/relocate_kernel.S
++++ x/arch/ia64/kernel/relocate_kernel.S
+@@ -1,5 +1,5 @@
+ /*
+- * arch/ia64/kernel/relocate_kernel.S
++ * arch/ia64/kernel/relocate_kernel.S 
+  *
+  * Relocate kexec'able kernel and start it
+  *
+@@ -17,9 +17,7 @@
+ #include <asm/pgtable.h>
+ #include <asm/mca_asm.h>
+ 
+-       /* Must be relocatable PIC code callable as a C function, that once
+-        * it starts can not use the previous processes stack.
+-        *
++       /* Must be relocatable PIC code callable as a C function
+         */
+ GLOBAL_ENTRY(relocate_new_kernel)
+ 	.prologue
+@@ -36,22 +34,16 @@ GLOBAL_ENTRY(relocate_new_kernel)
+         srlz.i
+ }
+ 	;;
+-
++	dep r2=0,r2,61,3		//to physical address
++	;;
+ 	//first switch to physical mode
+ 	add r3=1f-.reloc_entry, r2
+-	movl r16 = IA64_PSR_AC|IA64_PSR_BN|IA64_PSR_IC|IA64_PSR_MFL
++	movl r16 = IA64_PSR_AC|IA64_PSR_BN|IA64_PSR_IC
+ 	mov ar.rsc=0	          	// put RSE in enforced lazy mode
+ 	;;
+-	add r2=(memory_stack-.reloc_entry), r2
+-	;;
+-	add sp=(memory_stack_end - .reloc_entry),r2
++	add sp=(memory_stack_end - 16 - .reloc_entry),r2
+ 	add r8=(register_stack - .reloc_entry),r2
+ 	;;
+-	tpa sp=sp
+-	tpa r3=r3
+-	;;
+-	loadrs
+-	;;
+ 	mov r18=ar.rnat
+ 	mov ar.bspstore=r8
+ 	;;
+@@ -66,7 +58,7 @@ GLOBAL_ENTRY(relocate_new_kernel)
+ 1:
+ 	//physical mode code begin
+ 	mov b6=in1
+-	tpa r28=in2			// tpa must before TLB purge
++	dep r28=0,in2,61,3	//to physical address
+ 
+ 	// purge all TC entries
+ #define O(member)       IA64_CPUINFO_##member##_OFFSET
+@@ -145,10 +137,10 @@ GLOBAL_ENTRY(relocate_new_kernel)
+         srlz.i
+ 	;;
+ 
+-	// copy kexec kernel segments
++	//copy segments
+ 	movl r16=PAGE_MASK
+-	ld8  r30=[in0],8;;			// in0 is page_list
+-	br.sptk.few .dest_page
++        mov  r30=in0                    // in0 is page_list
++        br.sptk.few .dest_page
+ 	;;
+ .loop:
+ 	ld8  r30=[in0], 8;;
+@@ -188,6 +180,8 @@ GLOBAL_ENTRY(relocate_new_kernel)
+ 	srlz.d
+ 	;;
+ 	br.call.sptk.many b0=b6;;
++
++.align  32
+ memory_stack:
+ 	.fill           8192, 1, 0
+ memory_stack_end:
+@@ -310,7 +304,7 @@ check_irr0:
+ 	cmp.eq	p6,p0=0,r8
+ (p6)	br.cond.sptk.few	check_irr0
+ 	br.few	call_start
+-
++	
+ check_irr1:
+ 	mov	r8=cr.irr1
+ 	;;
+@@ -319,7 +313,7 @@ check_irr1:
+ 	cmp.eq	p6,p0=0,r8
+ (p6)	br.cond.sptk.few	check_irr1
+ 	br.few	call_start
+-
++	
+ check_irr2:
+ 	mov	r8=cr.irr2
+ 	;;
+@@ -328,7 +322,7 @@ check_irr2:
+ 	cmp.eq	p6,p0=0,r8
+ (p6)	br.cond.sptk.few	check_irr2
+ 	br.few	call_start
+-
++	
+ check_irr3:
+ 	mov	r8=cr.irr3
+ 	;;
+@@ -337,7 +331,7 @@ check_irr3:
+ 	cmp.eq	p6,p0=0,r8
+ (p6)	br.cond.sptk.few	check_irr3
+ 	br.few	call_start
+-
++	
+ call_start:
+ 	mov	cr.eoi=r0
+ 	;;
+--- x/arch/ia64/kernel/setup.c
++++ x/arch/ia64/kernel/setup.c
+@@ -44,6 +44,8 @@
+ #include <linux/platform.h>
+ #include <linux/pm.h>
+ #include <linux/cpufreq.h>
++#include <linux/kexec.h>
++#include <linux/crash_dump.h>
+ 
+ #include <asm/ia32.h>
+ #include <asm/machvec.h>
+@@ -251,6 +253,32 @@ reserve_memory (void)
+ 	}
+ #endif
+ 
++#ifdef CONFIG_KEXEC
++	/* crashkernel=size@addr specifies the location to reserve for
++	 * a crash kernel.  By reserving this memory we guarantee
++	 * that linux never sets it up as a DMA target.
++	 * Useful for holding code to do something appropriate
++	 * after a kernel panic.
++	 */
++	{
++		char *from = strstr(saved_command_line, "crashkernel=");
++		if (from) {
++			unsigned long size, base;
++			size = memparse(from + 12, &from);
++			if (*from == '@') {
++				base = memparse(from + 1, &from);
++				rsvd_region[n].start =
++					(unsigned long)__va(base);
++				rsvd_region[n].end =
++					(unsigned long)__va(base + size);
++				crashk_res.start = base;
++				crashk_res.end = base + size - 1;
++				n++;
++			}
++		}
++	}
++#endif
++
+ 	efi_memmap_init(&rsvd_region[n].start, &rsvd_region[n].end);
+ 	n++;
+ 
+@@ -496,6 +524,16 @@ setup_arch (char **cmdline_p)
+ 	if (!strstr(saved_command_line, "nomca"))
+ 		ia64_mca_init();
+ 
++#ifdef CONFIG_CRASH_DUMP
++	{
++		char *from = strstr(saved_command_line, "elfcorehdr=");
++
++		if (from)
++			elfcorehdr_addr = memparse(from+11, &from);
++		saved_max_pfn = (unsigned long) -1;
++	}
++#endif
++
+ 	platform_setup(cmdline_p);
+ 	paging_init();
+ }
+--- x/include/asm-ia64/kexec.h
++++ x/include/asm-ia64/kexec.h
+@@ -21,14 +21,12 @@
+ #define POW2(n)		(1ULL << (n))
+ 
+ DECLARE_PER_CPU(u64, ia64_mca_pal_base);
+-
+ const extern unsigned int relocate_new_kernel_size;
+ volatile extern long kexec_rendez;
+-extern void relocate_new_kernel(unsigned long, unsigned long,
++extern void relocate_new_kernel(unsigned long, unsigned long, 
+ 		struct ia64_boot_param *, unsigned long);
+ extern void kexec_fake_sal_rendez(void *start, unsigned long wake_up,
+ 		unsigned long pal_base);
+-
+ static inline void
+ crash_setup_regs(struct pt_regs *newregs, struct pt_regs *oldregs)
+ {
+--- x/include/asm-ia64/meminit.h
++++ x/include/asm-ia64/meminit.h
+@@ -16,11 +16,12 @@
+  * 	- initrd (optional)
+  * 	- command line string
+  * 	- kernel code & data
++ * 	- crash dumping code reserved region
+  * 	- Kernel memory map built from EFI memory map
+  *
+  * More could be added if necessary
+  */
+-#define IA64_MAX_RSVD_REGIONS 6
++#define IA64_MAX_RSVD_REGIONS 7
+ 
+ struct rsvd_region {
+ 	unsigned long start;	/* virtual address of beginning of element */
+--- x/include/linux/irq.h
++++ x/include/linux/irq.h
+@@ -94,6 +94,7 @@ irq_descp (int irq)
+ #include <asm/hw_irq.h> /* the arch dependent stuff */
+ 
+ extern int setup_irq(unsigned int irq, struct irqaction * new);
++extern void terminate_irqs(void);
+ 
+ #ifdef CONFIG_GENERIC_HARDIRQS
+ extern cpumask_t irq_affinity[NR_IRQS];
+--- x/kernel/irq/manage.c
++++ x/kernel/irq/manage.c
+@@ -377,3 +377,22 @@ int request_irq(unsigned int irq,
+ 
+ EXPORT_SYMBOL(request_irq);
+ 
++/*
++ * Terminate any outstanding interrupts
++ */
++void terminate_irqs(void)
++{
++	struct irqaction * action;
++	irq_desc_t *idesc;
++	int i;
++
++	for (i=0; i < NR_IRQS; i++) {
++		idesc = irq_descp(i);
++		action = idesc->action;
++		if (!action)
++			continue;
++		if (idesc->handler->end)
++			idesc->handler->end(i);
++	}
++}
++
--- /dev/null
+++ x/patches/linux-2.6.16.13/kexec-ia64-xen.patch
@@ -0,0 +1,266 @@
+kexec: ia64
+
+This is the kernel patch to make kexec work for xen
+
+Signed-Off-By: Magnus Damm <magnus@valinux.co.jp>
+Signed-Off-By: Horms <horms@verge.net.au>
+
+ arch/ia64/kernel/crash.c           |    8 ++++++
+ arch/ia64/kernel/machine_kexec.c   |   36 +++++++++++++++++++++++++++
+ arch/ia64/kernel/relocate_kernel.S |   47 +++++++++++++++++++++++++++++++++++-
+ include/asm-ia64/kexec-xen.h       |    7 +++++
+ include/asm-ia64/kexec.h           |    2 +
+ 5 files changed, 99 insertions(+), 1 deletion(-)
+
+--- x/include/asm-ia64/kexec-xen.h
++++ x/include/asm-ia64/kexec-xen.h
+@@ -18,10 +18,17 @@ static inline void crash_translate_regs(
+ /* Kexec needs to know about the actual physical addresss.
+  * But in xen, on some architectures, a physical address is a
+  * pseudo-physical addresss. */
++#ifdef CONFIG_XEN
++#define kexec_page_to_pfn(page)  pfn_to_mfn_for_dma(page_to_pfn(page))
++#define kexec_pfn_to_page(pfn)   pfn_to_page(pfn_to_mfn_for_dma(pfn))
++#define kexec_virt_to_phys(addr) phys_to_machine_for_dma(__pa(addr))
++#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys_for_dma(addr))
++#else
+ #define kexec_page_to_pfn(page)  page_to_pfn(page)
+ #define kexec_pfn_to_page(pfn)   pfn_to_page(pfn)
+ #define kexec_virt_to_phys(addr) virt_to_phys(addr)
+ #define kexec_phys_to_virt(addr) phys_to_virt(addr)
++#endif
+ 
+ #endif /* _IA64_KEXEC_XEN_H */
+ 
+--- x/arch/ia64/kernel/crash.c
++++ x/arch/ia64/kernel/crash.c
+@@ -8,6 +8,11 @@
+  * Copyright (C) 2005 Intel Corp	Zou Nan hai <nanhai.zou@intel.com>
+  *
+  */
++#ifdef CONFIG_XEN
++#include <linux/mm.h>
++#include <linux/string.h>
++#include <asm/uaccess.h>
++#else /* !CONFIG_XEN */
+ #include <linux/init.h>
+ #include <linux/types.h>
+ #include <linux/kernel.h>
+@@ -22,6 +27,7 @@
+ #include <linux/elfcore.h>
+ #include <linux/device.h>
+ #include <asm/uaccess.h>
++#endif /* !CONFIG_XEN */
+ 
+ size_t copy_oldmem_page(unsigned long pfn, char *buf,
+                                size_t csize, unsigned long offset, int userbuf)
+@@ -41,6 +47,7 @@ size_t copy_oldmem_page(unsigned long pf
+         return csize;
+ }
+ 
++#ifndef CONFIG_XEN
+ static void device_shootdown(void)
+ {
+        struct pci_dev *dev;
+@@ -150,3 +157,4 @@ machine_crash_shutdown(struct pt_regs *p
+ 	ioc_iova_disable();
+ #endif
+ }
++#endif /* !CONFIG_XEN */
+--- x/arch/ia64/kernel/machine_kexec.c
++++ x/arch/ia64/kernel/machine_kexec.c
+@@ -24,6 +24,10 @@
+ #include <asm/tlbflush.h>
+ #include <asm/delay.h>
+ #include <asm/meminit.h>
++#ifdef CONFIG_XEN
++#include <xen/interface/kexec.h>
++#include <asm/kexec-xen.h>
++#endif
+ 
+ typedef void (*relocate_new_kernel_t)(unsigned long, unsigned long,
+ 		struct ia64_boot_param *, unsigned long);
+@@ -53,6 +57,7 @@ void machine_kexec_cleanup(struct kimage
+ {
+ }
+ 
++#ifndef CONFIG_XEN
+ void machine_shutdown(void)
+ {
+ #ifdef CONFIG_PCI
+@@ -113,3 +118,34 @@ void machine_kexec(struct kimage *image)
+ 	BUG();
+ 	for (;;);
+ }
++#else /* !CONFIG_XEN */
++void machine_shutdown(void){ }
++
++void machine_kexec_setup_load_arg(xen_kexec_image_t *xki,struct kimage *image)
++{
++	const extern unsigned int kexec_fake_sal_rendez_offset;
++
++	unsigned long page_addr;
++	unsigned long offset;
++
++	page_addr = (unsigned long)page_address(image->control_code_page);
++	offset = kexec_fake_sal_rendez_offset;
++
++	/* This is a very nasty work around the fact that
++	 * kexec_fake_sal_rendez may be on a separate page
++	 * to relocate_new_kernel, or it may not depending
++	 * on the page size. 
++	 *
++	 * Fortunately kexec_fake_sal_rendez shouldn't cross a page boundary.
++	 *
++	 * XXX: what about the stack going over a page boundary???
++	 */
++	while (offset > PAGE_SIZE) {
++		offset -= PAGE_SIZE;
++		page_addr += PAGE_SIZE;
++	}
++
++	xki->fake_sal_rendez = kexec_virt_to_phys(page_addr) | offset;
++}
++#endif /* CONFIG_XEN */
++
+--- x/arch/ia64/kernel/relocate_kernel.S
++++ x/arch/ia64/kernel/relocate_kernel.S
+@@ -21,7 +21,11 @@
+         */
+ GLOBAL_ENTRY(relocate_new_kernel)
+ 	.prologue
++#ifdef CONFIG_XEN
++	alloc r31=ar.pfs,8,0,0,0
++#else
+ 	alloc r31=ar.pfs,4,0,0,0
++#endif
+         .body
+ .reloc_entry:
+ {
+@@ -34,7 +38,11 @@ GLOBAL_ENTRY(relocate_new_kernel)
+         srlz.i
+ }
+ 	;;
++#ifdef CONFIG_XEN
++	dep r2=0,r2,60,4		//to physical address
++#else
+ 	dep r2=0,r2,61,3		//to physical address
++#endif
+ 	;;
+ 	//first switch to physical mode
+ 	add r3=1f-.reloc_entry, r2
+@@ -58,11 +66,19 @@ GLOBAL_ENTRY(relocate_new_kernel)
+ 1:
+ 	//physical mode code begin
+ 	mov b6=in1
++#ifdef CONFIG_XEN
++	dep r28=0,in2,60,4	//to physical address
++#else
+ 	dep r28=0,in2,61,3	//to physical address
++#endif
+ 
+ 	// purge all TC entries
+ #define O(member)       IA64_CPUINFO_##member##_OFFSET
++#ifdef CONFIG_XEN
++        mov r2=in4			// load phys addr of cpu_info into r2
++#else
+         GET_THIS_PADDR(r2, cpu_info)    // load phys addr of cpu_info into r2
++#endif
+         ;;
+         addl r17=O(PTCE_STRIDE),r2
+         addl r2=O(PTCE_BASE),r2
+@@ -73,7 +89,11 @@ GLOBAL_ENTRY(relocate_new_kernel)
+         ;;
+         ld4 r20=[r2]                                    // r20=ptce_count[1]
+         ld4 r22=[r17]                                   // r22=ptce_stride[1]
++#ifdef CONFIG_XEN
++        mov r24=0					// From xen's mca_asm.S
++#else
+         mov r24=r0
++#endif
+         ;;
+         adds r20=-1,r20
+         ;;
+@@ -96,7 +116,11 @@ GLOBAL_ENTRY(relocate_new_kernel)
+         srlz.i
+         ;;
+ 	//purge TR entry for kernel text and data
++#ifdef CONFIG_XEN
++        mov r16=in5
++#else
+         movl r16=KERNEL_START
++#endif
+         mov r18=KERNEL_TR_PAGE_SHIFT<<2
+         ;;
+         ptr.i r16, r18
+@@ -104,6 +128,10 @@ GLOBAL_ENTRY(relocate_new_kernel)
+         ;;
+         srlz.i
+         ;;
++#ifdef CONFIG_XEN					// From xen's mca_asm.S
++	srlz.d
++	;;
++#endif
+ 
+ 	// purge TR entry for percpu data
+         movl r16=PERCPU_ADDR
+@@ -127,7 +155,11 @@ GLOBAL_ENTRY(relocate_new_kernel)
+         mov r16=IA64_KR(CURRENT_STACK)
+         ;;
+         shl r16=r16,IA64_GRANULE_SHIFT
++#ifdef CONFIG_XEN
++	mov r19=in6
++#else
+         movl r19=PAGE_OFFSET
++#endif
+         ;;
+         add r16=r19,r16
+         mov r18=IA64_GRANULE_SHIFT<<2
+@@ -183,10 +215,18 @@ GLOBAL_ENTRY(relocate_new_kernel)
+ 
+ .align  32
+ memory_stack:
++#ifdef CONFIG_XEN
++	.fill           4096, 1, 0
++#else
+ 	.fill           8192, 1, 0
++#endif
+ memory_stack_end:
+ register_stack:
++#ifdef CONFIG_XEN
++	.fill           4096, 1, 0
++#else
+ 	.fill           8192, 1, 0
++#endif
+ register_stack_end:
+ relocate_new_kernel_end:
+ END(relocate_new_kernel)
+@@ -262,7 +302,7 @@ GLOBAL_ENTRY(kexec_fake_sal_rendez)
+ 	mov	r16=IA64_KR(CURRENT_STACK)
+ 	;;
+ 	shl	r16=r16,IA64_GRANULE_SHIFT
+-	movl	r19=PAGE_OFFSET
++	movl	r19=PAGE_OFFSET		// XXX: This will not work in XEN
+ 	;;
+ 	add	r16=r19,r16
+ 	mov	r18=IA64_GRANULE_SHIFT<<2
+@@ -351,3 +391,8 @@ END(kexec_fake_sal_rendez)
+ relocate_new_kernel_size:
+ 	data8	kexec_fake_sal_rendez_end - relocate_new_kernel
+ 
++#ifdef CONFIG_XEN
++	.global kexec_fake_sal_rendez_offset
++kexec_fake_sal_rendez_offset:
++	data8   kexec_fake_sal_rendez - relocate_new_kernel     
++#endif
+--- x/include/asm-ia64/kexec.h
++++ x/include/asm-ia64/kexec.h
+@@ -16,6 +16,8 @@
+ 
+ #define MAX_NOTE_BYTES 1024
+ 
++struct kimage_arch {};
++
+ #define pte_bits	3
+ #define vmlpt_bits	(impl_va_bits - PAGE_SHIFT + pte_bits)
+ #define POW2(n)		(1ULL << (n))

[-- Attachment #8: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] kexec: framework and i386 (Take XIV)
  2006-08-31  7:43                                                           ` [PATCH] kexec: framework and i386 (Take XIV) Horms
@ 2006-08-31  8:55                                                             ` Akio Takebe
  2006-09-01  2:56                                                               ` Horms
  2006-09-05 11:43                                                             ` [Xen-devel] " Kazuo Moriwaka
  1 sibling, 1 reply; 68+ messages in thread
From: Akio Takebe @ 2006-08-31  8:55 UTC (permalink / raw)
  To: Horms, Keir Fraser
  Cc: Ian Pratt, Kazuo Moriwaka, xen-devel, Zou, Nanhai, Isaku Yamahata,
	Magnus Damm, xen-ia64-devel

Hi, Horms and Magnus

Good work. :-)
I have one comment.

I believe crash_kexec should be called directly
when an unknown NMI occurs.
In your patch, crash_kexec is called as follows.
  1. An unknown NMI occurs (e.g. by pushing the NMI button).
  2. xen receives the NMI and calls do_nmi.
  3. xen reports it to dom0 by using raise_softirq(NMI_SOFTIRQ).
  4. dom0 calls crash_kexec of dom0.
  5. crash_kexec of dom0 calls crash_kexec of xen.

Am I correct?
If I'm correct, the above process is not reliable.
So I believe crash_kexec of xen should be called directly, as in the
following patch.

diff -r 9611a5c9e1a1 xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c      Thu Aug 31 13:12:26 2006 +0900
+++ b/xen/arch/x86/traps.c      Thu Aug 31 17:40:19 2006 +0900
@@ -1612,6 +1612,7 @@ asmlinkage void do_nmi(struct cpu_user_r
         else if ( reason & 0x40 )
             io_check_error(regs);
         else if ( !nmi_watchdog )
+            crash_kexec(NULL);
             unknown_nmi_error((unsigned char)(reason&0xff));
     }
 }

What do you think about it?

Best Regards,

Akio Takebe

>Hi,
>
>here is an update of the kexec/kdump patchset.
>
>Summary:
>
>* Up port to xen-unstable.hg-11296 (45f6ee334fcc)
>  - kexec hypercall number fragment is now in xen-unstable
>* Make kexec_page_to_pfn and friends architecture specific
>  - this abstraction is needed to support ia64
>* Use kexec_page_to_pfn in machine_kexec_setup_load_arg()
>  - this abstraction is needed to support ia64
>* Rename do_kexec to do_kexec_op to make it consistent with other
>  hypercalls
>* Add ppc stubs
>* Add ia64 support
>
>Architectures:
>
>x86_32: 
>
>Seems to be working fine
>
>x86_64:
>
>Probably working fine, but I can't test this as dom0 refuses to boot for
>me on xen-unstable-11388 (50aea0ec406b).  That is, even without the
>kexec patches. I'm not sure what the problem is and I've decided to
>get these patches out and investigate later.
>
>ia64:
>
>This patchset also, for the first time, includes ia64 code.
>Please note that this currently does _not_ work. I am actually
>struggling to work out why, and would really appreciate it
>if someone could cast an eye over it.
>
>One possible area of concern is that relocate_kernel wipes out TLB
>entries. However many of the entries instated in
>arch/ia64/xen/xenasm.S:ia64_new_rr7() are not wiped. In particular,
>VHPT_ADDR, Shared info, and Map mapped_reg are not handled by
>relocate_kernel(), and the handling of current seems to be different.
>
>There are also problems with constants inside kexec_fake_sal_rendez.
>However this function probably also suffers the same problems as
>relocate_kernel. And it is easy not to run kexec_fake_sal_rendez
>by booting xen with maxcpus=1, thus avoiding calling
>kexec_fake_sal_rendez, which is used in cpu shutdown.
>
>ppc:
>
>stubs only
>
>Patches
>
>   1. 51.1-kexec-generic-upstream.patch
>      * Common code for all architectures,
>        the basic plumbing for kexec/kdump
>
>   2. 51.1.1-kexec-trigger_crash_dump.patch
>      * xen-console trigger crash_dump
>      * Depends on 1
>
>   3. 51.2.1-kexec-x86-upstream.patch
>      * Glue between 1, and 4 and 5.
>      * Depends on 1
>
>   4. 51.2.1.1-kexec-x86_32-upstream.patch
>      * Kexec/kdump for x86_32
>      * Depends on 3 (and 1)
>
>   5. 51.2.31.2-kexec-x86_64-upstream.patch
>      * Kexec/kdump for x86_64
>      * Depends on 3 (and 1)
>
>   6. 51.2.2-kexec-ia64-upstream.patch
>      * Kexec/kdump for ia64
>      * Depends on 1
>
>Discussion:
>
>Email is always good. Also my partner in crime, Magnus Damm,
>will be at Xen Summit.
>
>-- 
>Horms
>  H: http://www.vergenet.net/~horms/
>  W: http://www.valinux.co.jp/en/

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] kexec: framework and i386 (Take XIV)
  2006-08-31  8:55                                                             ` Akio Takebe
@ 2006-09-01  2:56                                                               ` Horms
  2006-09-01  8:41                                                                 ` Akio Takebe
  0 siblings, 1 reply; 68+ messages in thread
From: Horms @ 2006-09-01  2:56 UTC (permalink / raw)
  To: Akio Takebe
  Cc: Ian Pratt, Kazuo Moriwaka, xen-devel, Zou, Nanhai, Magnus Damm,
	Isaku Yamahata, Mark Williamson, xen-ia64-devel

On Thu, Aug 31, 2006 at 05:55:52PM +0900, Akio Takebe wrote:
> Hi, Horms and Magnus
> 
> Good work. :-)
> I have one comment.
> 
> I believe crash_kexec should be called directly
> when an unknown NMI occurs.
> In your patch, crash_kexec is called as follows.
>   1. An unknown NMI occurs (e.g. by pushing the NMI button).
>   2. xen receives the NMI and calls do_nmi.
>   3. xen reports it to dom0 by using raise_softirq(NMI_SOFTIRQ).
>   4. dom0 calls crash_kexec of dom0.
>   5. crash_kexec of dom0 calls crash_kexec of xen.
> 
> Am I correct?
> If I'm correct, the above process is not reliable.
> So I believe crash_kexec of xen should be called directly, as in the
> following patch.
> 
> diff -r 9611a5c9e1a1 xen/arch/x86/traps.c
> --- a/xen/arch/x86/traps.c      Thu Aug 31 13:12:26 2006 +0900
> +++ b/xen/arch/x86/traps.c      Thu Aug 31 17:40:19 2006 +0900
> @@ -1612,6 +1612,7 @@ asmlinkage void do_nmi(struct cpu_user_r
>          else if ( reason & 0x40 )
>              io_check_error(regs);
>          else if ( !nmi_watchdog )
> +            crash_kexec(NULL);
>              unknown_nmi_error((unsigned char)(reason&0xff));
>      }
>  }
> 
> What do you think about it?

That seems like a good idea to me. Though I think you are missing { }.
Can you test to see if this works?

--- a/xen/arch/x86/traps.c	2006-09-01 11:53:44.000000000 +0900
+++ b/xen/arch/x86/traps.c	2006-09-01 11:53:56.000000000 +0900
@@ -1611,8 +1611,10 @@
             mem_parity_error(regs);
         else if ( reason & 0x40 )
             io_check_error(regs);
-        else if ( !nmi_watchdog )
+        else if ( !nmi_watchdog ) {
+	    crash_kexec(NULL);
             unknown_nmi_error((unsigned char)(reason&0xff));
+	}
     }
 }

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] kexec: framework and i386 (Take XIV)
  2006-09-01  2:56                                                               ` Horms
@ 2006-09-01  8:41                                                                 ` Akio Takebe
  2006-09-01  8:45                                                                   ` Akio Takebe
  0 siblings, 1 reply; 68+ messages in thread
From: Akio Takebe @ 2006-09-01  8:41 UTC (permalink / raw)
  To: Horms
  Cc: Ian Pratt, Kazuo Moriwaka, xen-devel, Zou, Nanhai, Magnus Damm,
	Isaku Yamahata, Keir Fraser, xen-ia64-devel

Hi, Horms

>That seems like a good idea to me. Though I think you are missing { }.
>Can you test to see if this works?
Oops, you're right. But I think unknown_nmi_error() will never be called,
because crash_kexec() is called before it.

Yes, I'll test it. :-)

>
>--- a/xen/arch/x86/traps.c	2006-09-01 11:53:44.000000000 +0900
>+++ b/xen/arch/x86/traps.c	2006-09-01 11:53:56.000000000 +0900
>@@ -1611,8 +1611,10 @@
>             mem_parity_error(regs);
>         else if ( reason & 0x40 )
>             io_check_error(regs);
>-        else if ( !nmi_watchdog )
>+        else if ( !nmi_watchdog ) {
>+	    crash_kexec(NULL);
>             unknown_nmi_error((unsigned char)(reason&0xff));
>+	}
>     }
> }
> 

Best Regards,

Akio Takebe

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Re: [PATCH] kexec: framework and i386 (Take XIV)
  2006-09-01  8:41                                                                 ` Akio Takebe
@ 2006-09-01  8:45                                                                   ` Akio Takebe
  2006-09-01 10:21                                                                     ` Horms
  0 siblings, 1 reply; 68+ messages in thread
From: Akio Takebe @ 2006-09-01  8:45 UTC (permalink / raw)
  To: Horms
  Cc: Ian Pratt, Kazuo Moriwaka, xen-devel, Zou, Nanhai, Akio Takebe,
	Isaku Yamahata, Magnus Damm, Mark Williamson, xen-ia64-devel

>Hi, Horms
>
>>That seems like a good idea to me. Though I think you are missing { }.
>>Can you test to see if this works?
>Oops, You're right. But I think unknown_nmi_error() is not called,
>because crash_kexec() is called before that.
Sorry.
The above is right only in the case of CONFIG_KEXEC=y.

Best Regards,

Akio Takebe

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Re: [PATCH] kexec: framework and i386 (Take XIV)
  2006-09-01  8:45                                                                   ` Akio Takebe
@ 2006-09-01 10:21                                                                     ` Horms
  2006-09-04 21:45                                                                       ` Akio Takebe
  0 siblings, 1 reply; 68+ messages in thread
From: Horms @ 2006-09-01 10:21 UTC (permalink / raw)
  To: Akio Takebe
  Cc: Ian Pratt, Kazuo Moriwaka, xen-devel, Zou, Nanhai, Isaku Yamahata,
	Magnus Damm, Mark Williamson, xen-ia64-devel

On Fri, Sep 01, 2006 at 05:45:59PM +0900, Akio Takebe wrote:
> >Hi, Horms
> >
> >>That seems like a good idea to me. Though I think you are missing { }.
> >>Can you test to see if this works?
> >Oops, You're right. But I think unknown_nmi_error() is not called,
> >because crash_kexec() is called before that.
> Sorry.
> In the only case of CONFIG_KEXEC=y, the above is right.

Yes, I think that is the case. I will put your patch into the kexec
series, as I think that it is a worthy addition.

-- 
Horms
  H: http://www.vergenet.net/~horms/
  W: http://www.valinux.co.jp/en/

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Re: [PATCH] kexec: framework and i386 (Take XIV)
  2006-09-01 10:21                                                                     ` Horms
@ 2006-09-04 21:45                                                                       ` Akio Takebe
  2007-05-28  5:28                                                                         ` Horms
  0 siblings, 1 reply; 68+ messages in thread
From: Akio Takebe @ 2006-09-04 21:45 UTC (permalink / raw)
  To: Horms
  Cc: Ian Pratt, Kazuo Moriwaka, xen-devel, Zou, Nanhai, Akio Takebe,
	Isaku Yamahata, Magnus Damm, Mark Williamson, xen-ia64-devel

Hi, Horms

I tested the following patch together with Horms' kexec patch.

My test was:
  push the NMI button after loading the kdump kernel.
  
The result was:
  OK, I could get a vmcore.

diff -r b688d4a68a3e xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c      Tue Aug 22 14:59:16 2006 +0100
+++ b/xen/arch/x86/traps.c      Tue Sep 05 06:37:49 2006 +0900
@@ -105,6 +105,8 @@ static int debug_stack_lines = 20;
 static int debug_stack_lines = 20;
 integer_param("debug_stack_lines", debug_stack_lines);
 
+extern void crash_kexec(struct cpu_user_regs *regs);
+
 #ifdef CONFIG_X86_32
 #define stack_words_per_line 8
 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
@@ -1611,8 +1613,10 @@ asmlinkage void do_nmi(struct cpu_user_r
             mem_parity_error(regs);
         else if ( reason & 0x40 )
             io_check_error(regs);
-        else if ( !nmi_watchdog )
+        else if ( !nmi_watchdog ){
+            crash_kexec(NULL);
             unknown_nmi_error((unsigned char)(reason&0xff));
+        }
     }
 }
 


Best Regards,

Akio Takebe

>On Fri, Sep 01, 2006 at 05:45:59PM +0900, Akio Takebe wrote:
>> >Hi, Horms
>> >
>> >>That seems like a good idea to me. Though I think you are missing { }.
>> >>Can you test to see if this works?
>> >Oops, You're right. But I think unknown_nmi_error() is not called,
>> >because crash_kexec() is called before that.
>> Sorry.
>> In the only case of CONFIG_KEXEC=y, the above is right.
>
>Yes, I think that is the case. I will put your patch into the kexec
>series, as I think that it is a worthy addition.
>
>-- 
>Horms
>  H: http://www.vergenet.net/~horms/
>  W: http://www.valinux.co.jp/en/
>
>
>_______________________________________________
>Xen-devel mailing list
>Xen-devel@lists.xensource.com
>http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [Xen-devel] [PATCH] kexec: framework and i386 (Take XIV)
  2006-08-31  7:43                                                           ` [PATCH] kexec: framework and i386 (Take XIV) Horms
  2006-08-31  8:55                                                             ` Akio Takebe
@ 2006-09-05 11:43                                                             ` Kazuo Moriwaka
  2006-09-05 13:06                                                               ` Horms
  1 sibling, 1 reply; 68+ messages in thread
From: Kazuo Moriwaka @ 2006-09-05 11:43 UTC (permalink / raw)
  To: Horms
  Cc: Ian Pratt, xen-devel, Zou, Nanhai, Magnus Damm, Isaku Yamahata,
	Keir Fraser, xen-ia64-devel

On 8/31/06, Horms <horms@verge.net.au> wrote:

> x86_64:
>
> Probably working fine, but I can't test this as dom0 refuses to boot for
> me on xen-unstable-11388 (50aea0ec406b).  That is, even without the
> kexec patches. I'm not sure what the problem is and I've devicided to
> get these patches out rather and investigate later.

I tried several versions of Xen with the kdump patches on x86_64;
the results are below.
I'm sorry that this wasn't done systematically.

changeset  result
11414       doesn't boot
11251       doesn't boot
11134       doesn't boot
11076        boot

-- 
Kazuo Moriwaka

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH] kexec: framework and i386 (Take XIV)
  2006-09-05 11:43                                                             ` [Xen-devel] " Kazuo Moriwaka
@ 2006-09-05 13:06                                                               ` Horms
  0 siblings, 0 replies; 68+ messages in thread
From: Horms @ 2006-09-05 13:06 UTC (permalink / raw)
  To: Kazuo Moriwaka
  Cc: Ian Pratt, xen-devel, Zou, Nanhai, Akio Takebe, Magnus Damm,
	Isaku Yamahata, Mark Williamson, xen-ia64-devel

On Tue, Sep 05, 2006 at 08:43:44PM +0900, Kazuo Moriwaka wrote:
> On 8/31/06, Horms <horms@verge.net.au> wrote:
> 
> >x86_64:
> >
> >Probably working fine, but I can't test this as dom0 refuses to boot for
> >me on xen-unstable-11388 (50aea0ec406b).  That is, even without the
> >kexec patches. I'm not sure what the problem is and I've devicided to
> >get these patches out rather and investigate later.
> 
> I tried some versions of xen with kdump patches on x86_64,
> following is the result.
> I'm sorry for it wasn't done in systematic style.
> 
> chengeset  result
> 11414       doesn't boot
> 11251       doesn't boot
> 11134       doesn't boot
> 11076        boot

Thanks, that is valuable information.
I am guessing that doing a bisection between 11134 and 11076
would help shed some light on what has gone astray.

-- 
Horms
  H: http://www.vergenet.net/~horms/
  W: http://www.valinux.co.jp/en/

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Re: [PATCH] kexec: framework and i386 (Take XIV)
  2006-09-04 21:45                                                                       ` Akio Takebe
@ 2007-05-28  5:28                                                                         ` Horms
  2007-05-28  6:25                                                                           ` [Xen-devel] " Akio Takebe
  2007-05-29  9:04                                                                           ` Ian Campbell
  0 siblings, 2 replies; 68+ messages in thread
From: Horms @ 2007-05-28  5:28 UTC (permalink / raw)
  To: Akio Takebe
  Cc: Ian Pratt, Kazuo Moriwaka, xen-devel, Ian Campbell, Zou, Nanhai,
	Isaku Yamahata, Magnus Damm, Mark Williamson, xen-ia64-devel

[ Ian Campbell added to CC list ]

On Tue, Sep 05, 2006 at 06:45:35AM +0900, Akio Takebe wrote:
> Hi, Horms
> 
> I tested the following patch with Horms kexec patch.
> 
> My tests is:
>   push NMI bottun after loading kdump kernel.
>   
> The results is:
>   OK, I could get vmcore


Hi Takebe-san,

this patch seems ok to me, but it seems that it never went into the
tree. Ian, what are your thoughts on it?

> diff -r b688d4a68a3e xen/arch/x86/traps.c
> --- a/xen/arch/x86/traps.c      Tue Aug 22 14:59:16 2006 +0100
> +++ b/xen/arch/x86/traps.c      Tue Sep 05 06:37:49 2006 +0900
> @@ -105,6 +105,8 @@ static int debug_stack_lines = 20;
>  static int debug_stack_lines = 20;
>  integer_param("debug_stack_lines", debug_stack_lines);
>  
> +extern void crash_kexec(struct cpu_user_regs *regs);
> +
>  #ifdef CONFIG_X86_32
>  #define stack_words_per_line 8
>  #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
> @@ -1611,8 +1613,10 @@ asmlinkage void do_nmi(struct cpu_user_r
>              mem_parity_error(regs);
>          else if ( reason & 0x40 )
>              io_check_error(regs);
> -        else if ( !nmi_watchdog )
> +        else if ( !nmi_watchdog ){
> +            crash_kexec(NULL);
>              unknown_nmi_error((unsigned char)(reason&0xff));
> +        }
>      }
>  }
>  
> 
> 
> Best Regards,
> 
> Akio Takebe
> 
> >On Fri, Sep 01, 2006 at 05:45:59PM +0900, Akio Takebe wrote:
> >> >Hi, Horms
> >> >
> >> >>That seems like a good idea to me. Though I think you are missing { }.
> >> >>Can you test to see if this works?
> >> >Oops, You're right. But I think unknown_nmi_error() is not called,
> >> >because crash_kexec() is called before that.
> >> Sorry.
> >> In the only case of CONFIG_KEXEC=y, the above is right.
> >
> >Yes, I think that is the case. I will put your patch into the kexec
> >series, as I think that it is a worthy addition.
> >
> >-- 
> >Horms
> >  H: http://www.vergenet.net/~horms/
> >  W: http://www.valinux.co.jp/en/
> >
> >
> >_______________________________________________
> >Xen-devel mailing list
> >Xen-devel@lists.xensource.com
> >http://lists.xensource.com/xen-devel

-- 
Horms
  H: http://www.vergenet.net/~horms/
  W: http://www.valinux.co.jp/en/

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [Xen-devel] Re: [PATCH] kexec: framework and i386 (Take XIV)
  2007-05-28  5:28                                                                         ` Horms
@ 2007-05-28  6:25                                                                           ` Akio Takebe
  2007-05-29  1:05                                                                             ` Horms
  2007-05-29  9:04                                                                           ` Ian Campbell
  1 sibling, 1 reply; 68+ messages in thread
From: Akio Takebe @ 2007-05-28  6:25 UTC (permalink / raw)
  To: Horms
  Cc: Ian Pratt, Kazuo Moriwaka, xen-devel, Ian Campbell, Zou, Nanhai,
	Isaku Yamahata, Magnus Damm, xen-ia64-devel

Hi, Horms and Ian

Thank you for your reply, Horms.
I forgot to add the Signed-off-by lines to the patch.

Signed-off-by: Horms <horms@verge.net.au>
Signed-off-by: Akio Takebe <takebe_akio@jp.fujitsu.com>

Is the Signed-off-by OK, Horms?

Best Regards,

Akio Takebe

>[ Ian Campbell added to CC list ]
>
>On Tue, Sep 05, 2006 at 06:45:35AM +0900, Akio Takebe wrote:
>> Hi, Horms
>> 
>> I tested the following patch with Horms kexec patch.
>> 
>> My tests is:
>>   push NMI bottun after loading kdump kernel.
>>   
>> The results is:
>>   OK, I could get vmcore
>
>
>Hi Takebe-san,
>
>this patch seems ok to me, but it seems that it never went into the
>tree. Ian, what are your thoughts on it?
>
>> diff -r b688d4a68a3e xen/arch/x86/traps.c
>> --- a/xen/arch/x86/traps.c      Tue Aug 22 14:59:16 2006 +0100
>> +++ b/xen/arch/x86/traps.c      Tue Sep 05 06:37:49 2006 +0900
>> @@ -105,6 +105,8 @@ static int debug_stack_lines = 20;
>>  static int debug_stack_lines = 20;
>>  integer_param("debug_stack_lines", debug_stack_lines);
>>  
>> +extern void crash_kexec(struct cpu_user_regs *regs);
>> +
>>  #ifdef CONFIG_X86_32
>>  #define stack_words_per_line 8
>>  #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)&regs->esp)
>> @@ -1611,8 +1613,10 @@ asmlinkage void do_nmi(struct cpu_user_r
>>              mem_parity_error(regs);
>>          else if ( reason & 0x40 )
>>              io_check_error(regs);
>> -        else if ( !nmi_watchdog )
>> +        else if ( !nmi_watchdog ){
>> +            crash_kexec(NULL);
>>              unknown_nmi_error((unsigned char)(reason&0xff));
>> +        }
>>      }
>>  }
>>  
>> 
>> 
>> Best Regards,
>> 
>> Akio Takebe

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Re: [PATCH] kexec: framework and i386 (Take XIV)
  2007-05-28  6:25                                                                           ` [Xen-devel] " Akio Takebe
@ 2007-05-29  1:05                                                                             ` Horms
  0 siblings, 0 replies; 68+ messages in thread
From: Horms @ 2007-05-29  1:05 UTC (permalink / raw)
  To: Akio Takebe
  Cc: Ian Pratt, Kazuo Moriwaka, xen-devel, Ian Campbell, Zou, Nanhai,
	Isaku Yamahata, Magnus Damm, Mark Williamson, xen-ia64-devel

On Mon, May 28, 2007 at 03:25:04PM +0900, Akio Takebe wrote:
> Hi, Horms and Ian
> 
> Thank you for your reply, Horms.
> I forgot Signed-off-by of the patch.
> 
> Signed-off-by: Horms <horms@verge.net.au>
> Signed-off-by: Akio Takebe <takebe_akio@jp.fujitsu.com>
> 
> Is the Signed-off-by OK, Horms?

Actually, I think this might be better:

Acked-by: Simon Horman <horms@verge.net.au>

-- 
Horms
  H: http://www.vergenet.net/~horms/
  W: http://www.valinux.co.jp/en/

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Re: [PATCH] kexec: framework and i386 (Take XIV)
  2007-05-28  5:28                                                                         ` Horms
  2007-05-28  6:25                                                                           ` [Xen-devel] " Akio Takebe
@ 2007-05-29  9:04                                                                           ` Ian Campbell
  2007-05-31 10:43                                                                             ` [Xen-devel] " Akio Takebe
  1 sibling, 1 reply; 68+ messages in thread
From: Ian Campbell @ 2007-05-29  9:04 UTC (permalink / raw)
  To: Horms
  Cc: Ian Pratt, Kazuo Moriwaka, xen-devel, Zou, Nanhai, Akio Takebe,
	Isaku Yamahata, Magnus Damm, Mark Williamson, xen-ia64-devel

On Mon, 2007-05-28 at 14:28 +0900, Horms wrote:
> [ Ian Campbell added to CC list ]
> 
> On Tue, Sep 05, 2006 at 06:45:35AM +0900, Akio Takebe wrote:
> > Hi, Horms
> > 
> > I tested the following patch with Horms kexec patch.
> > 
> > My tests is:
> >   push NMI bottun after loading kdump kernel.
> >   
> > The results is:
> >   OK, I could get vmcore
> 
> 
> Hi Takebe-san,
> 
> this patch seems ok to me, but it seems that it never went into the
> tree. Ian, what are your thoughts on it?

The default in non-debug builds is to forward the crash to domain 0 so
we'd never get here, although I'd expect domain 0 probably does a kdump
itself nowadays when an NMI is received.

For debug builds I guess it does make sense. Assuming crash_kexec
gracefully returns if no crash kernel has been loaded, so that the old
behaviour is preserved, then the behaviour would be fine with me.
Alternatively "nmi=kdump" on the command line might be nice.

> > +extern void crash_kexec(struct cpu_user_regs *regs);

I can't find crash_kexec in xen-unstable. Is it now kexec_crash, with no
parameters? Whatever the function is now called, it should probably be
declared in a header somewhere, so no local prototype should be required.

> > +        else if ( !nmi_watchdog ){

Needs a space between ) and {.

Ian.

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [Xen-devel] Re: [PATCH] kexec: framework and i386 (Take XIV)
  2007-05-29  9:04                                                                           ` Ian Campbell
@ 2007-05-31 10:43                                                                             ` Akio Takebe
  2007-05-31 10:49                                                                               ` Keir Fraser
  0 siblings, 1 reply; 68+ messages in thread
From: Akio Takebe @ 2007-05-31 10:43 UTC (permalink / raw)
  To: Ian Campbell, Horms
  Cc: Ian Pratt, Kazuo Moriwaka, xen-devel, Zou, Nanhai, Isaku Yamahata,
	Magnus Damm, xen-ia64-devel

Hi, Ian and Horms

I have added the nmi=kdump option as Ian suggested.
What do you think about it?

Signed-off-by: Akio Takebe <takebe_akio@jp.fujitsu.com>
---

diff -r 089696e0c603 xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c	Thu May 17 11:42:46 2007 +0100
+++ b/xen/arch/x86/traps.c	Thu May 31 02:25:02 2007 +0900
@@ -1897,6 +1897,7 @@ asmlinkage void io_check_error(struct cp
     {
     case 'd': /* 'dom0' */
         nmi_dom0_report(_XEN_NMIREASON_io_error);
+    case 'k': /* 'kdump' */
     case 'i': /* 'ignore' */
         break;
     default:  /* 'fatal' */
@@ -1916,6 +1917,8 @@ static void unknown_nmi_error(unsigned c
     {
     case 'd': /* 'dom0' */
         nmi_dom0_report(_XEN_NMIREASON_unknown);
+    case 'k': /* 'kdump' */
+        kexec_crash();
     case 'i': /* 'ignore' */
         break;
     default:  /* 'fatal' */


Best Regards,

Akio Takebe

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: Re: [PATCH] kexec: framework and i386 (Take XIV)
  2007-05-31 10:43                                                                             ` [Xen-devel] " Akio Takebe
@ 2007-05-31 10:49                                                                               ` Keir Fraser
  2007-05-31 11:07                                                                                 ` [Xen-devel] " Akio Takebe
  0 siblings, 1 reply; 68+ messages in thread
From: Keir Fraser @ 2007-05-31 10:49 UTC (permalink / raw)
  To: Akio Takebe, Ian Campbell, Horms
  Cc: Ian Pratt, Kazuo Moriwaka, xen-devel, Zou, Nanhai, Isaku Yamahata,
	Magnus Damm, Mark Williamson, xen-ia64-devel

On 31/5/07 11:43, "Akio Takebe" <takebe_akio@jp.fujitsu.com> wrote:

> Hi, Ian and Horms
> 
> I add the nmi=kdump option as Ian suggested.
> What do you think about it?

Won't the default fatal_trap() behaviour cause you to drop into kdump code
anyway? fatal_trap -> panic -> kexec_crash.

 -- Keir

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [Xen-devel] Re: [PATCH] kexec: framework and i386 (Take XIV)
  2007-05-31 10:49                                                                               ` Keir Fraser
@ 2007-05-31 11:07                                                                                 ` Akio Takebe
  2007-05-31 11:17                                                                                   ` Akio Takebe
  0 siblings, 1 reply; 68+ messages in thread
From: Akio Takebe @ 2007-05-31 11:07 UTC (permalink / raw)
  To: Keir Fraser, Ian Campbell, Horms
  Cc: Ian Pratt, Kazuo Moriwaka, xen-devel, Zou, Nanhai, Isaku Yamahata,
	Magnus Damm, xen-ia64-devel

Hi, Keir

>On 31/5/07 11:43, "Akio Takebe" <takebe_akio@jp.fujitsu.com> wrote:
>
>> Hi, Ian and Horms
>> 
>> I add the nmi=kdump option as Ian suggested.
>> What do you think about it?
>
>Won't the default fatal_trap() behaviour cause you to drop into kdump code
>anyway? fatal_trap -> panic -> kexec_crash.
>
Oops, you're right.
All we do is just setting nmi=kdump.

Best Regards,

Akio Takebe

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [Xen-devel] Re: [PATCH] kexec: framework and i386 (Take XIV)
  2007-05-31 11:07                                                                                 ` [Xen-devel] " Akio Takebe
@ 2007-05-31 11:17                                                                                   ` Akio Takebe
  0 siblings, 0 replies; 68+ messages in thread
From: Akio Takebe @ 2007-05-31 11:17 UTC (permalink / raw)
  To: Akio Takebe, Keir Fraser, Ian Campbell, Horms
  Cc: Ian Pratt, Kazuo Moriwaka, xen-devel, Zou, Nanhai, Isaku Yamahata,
	xen-ia64-devel

Hi, Keir

>Hi, Keir
>
>>On 31/5/07 11:43, "Akio Takebe" <takebe_akio@jp.fujitsu.com> wrote:
>>
>>> Hi, Ian and Horms
>>> 
>>> I add the nmi=kdump option as Ian suggested.
>>> What do you think about it?
>>
>>Won't the default fatal_trap() behaviour cause you to drop into kdump code
>>anyway? fatal_trap -> panic -> kexec_crash.
>>
>Oops, you're right.
>All we do is just setting nmi=kdump.
>
Sorry, please ignore the previous mail.
Yes, as Keir said, fatal_trap() should call panic().
All we need to do is set nmi=fatal.

Best Regards,

Akio Takebe

^ permalink raw reply	[flat|nested] 68+ messages in thread

end of thread, other threads:[~2007-05-31 11:17 UTC | newest]

Thread overview: 68+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-04-07  7:42 [PATCH]: kexec: framework and i386 Horms
2006-04-07 15:00 ` Don Zickus
2006-04-10  5:09   ` Hirokazu Takahashi
2006-04-10 15:38     ` Don Zickus
2006-04-11  1:44       ` Horms
2006-04-12 15:56         ` Don Zickus
2006-04-07 15:09 ` Gerd Hoffmann
2006-04-08  4:39   ` Horms
2006-04-12  9:12 ` Horms
2006-04-17  6:06   ` Horms
2006-04-21  1:28     ` [PATCH]: kexec: framework and i386 (Take IV) Horms
2006-04-21  6:10     ` Re: [PATCH]: kexec: framework and i386 Akio Takebe
2006-04-21  6:55       ` horms-home
2006-04-21  7:53         ` Akio Takebe
2006-04-23 14:45       ` Mark Williamson
2006-04-24  1:10         ` Akio Takebe
2006-04-24  1:53           ` Hypercall number assignment convension (was Re: Re: [PATCH]: kexec: framework and i386) Isaku Yamahata
2006-04-24  7:32             ` Keir Fraser
2006-04-24 11:20               ` Muli Ben-Yehuda
2006-04-25  0:11               ` Horms
2006-04-25  9:57                 ` Keir Fraser
2006-04-26  6:08                   ` [PATCH]: kexec: framework and i386 Take V Horms
2006-05-02  8:17                     ` [PATCH]: kexec: framework and i386 (Take VI) Simon Horman [Horms]
2006-05-03  7:16                       ` Akio Takebe
2006-05-05  1:03                         ` horms
2006-05-06  8:46                           ` Akio Takebe
2006-05-07  4:46                             ` Horms
2006-05-07  9:45                               ` Akio Takebe
2006-05-08  9:02                                 ` Ian Campbell
2006-05-11 11:35                                   ` horms
2006-05-15  8:29                         ` Akio Takebe
2006-05-06  8:44                       ` Akio Takebe
2006-05-07  4:45                         ` Horms
2006-05-09  4:16                           ` [PATCH]: kexec: framework and i386 (Take VII) Horms
2006-05-09  9:18                             ` [PATCH]: kexec: framework and i386 (Take VIII) Horms
2006-05-09 13:28                               ` Akio Takebe
2006-05-16 10:43                                 ` Akio Takebe
2006-05-16 10:44                                   ` Keir Fraser
2006-05-16 11:03                                     ` Akio Takebe
2006-05-16 12:39                                       ` Keir Fraser
2006-05-17  2:44                                         ` Horms
2006-05-17  4:53                                           ` Horms
2006-05-17  9:52                                             ` Re: [PATCH]: kexec: framework and i386 (Take IX) Horms
2006-05-17 10:10                                               ` Keir Fraser
2006-05-18  3:37                                                 ` Horms
2006-05-25  7:20                                                   ` [PATCH] kexec: framework and i386 (Take X) Horms
2006-06-05  2:53                                                     ` Akio Takebe
2006-06-15  7:29                                                     ` [PATCH] kexec: framework and i386 (Take XI) Horms
2006-07-11  3:39                                                       ` [PATCH] kexec: framework and i386 (Take XII) Horms
2006-08-11  7:48                                                         ` [PATCH] kexec: framework and i386 (Take XIII) Horms
2006-08-31  7:43                                                           ` [PATCH] kexec: framework and i386 (Take XIV) Horms
2006-08-31  8:55                                                             ` Akio Takebe
2006-09-01  2:56                                                               ` Horms
2006-09-01  8:41                                                                 ` Akio Takebe
2006-09-01  8:45                                                                   ` Akio Takebe
2006-09-01 10:21                                                                     ` Horms
2006-09-04 21:45                                                                       ` Akio Takebe
2007-05-28  5:28                                                                         ` Horms
2007-05-28  6:25                                                                           ` [Xen-devel] " Akio Takebe
2007-05-29  1:05                                                                             ` Horms
2007-05-29  9:04                                                                           ` Ian Campbell
2007-05-31 10:43                                                                             ` [Xen-devel] " Akio Takebe
2007-05-31 10:49                                                                               ` Keir Fraser
2007-05-31 11:07                                                                                 ` [Xen-devel] " Akio Takebe
2007-05-31 11:17                                                                                   ` Akio Takebe
2006-09-05 11:43                                                             ` [Xen-devel] " Kazuo Moriwaka
2006-09-05 13:06                                                               ` Horms
2006-04-26  2:09               ` Hypercall number assignment convension (was Re: Re: [PATCH]: kexec: framework and i386) Isaku Yamahata

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.