All of lore.kernel.org
 help / color / mirror / Atom feed
* hvm save restore support
@ 2006-12-13 15:45 Zhai, Edwin
  2006-12-13 17:04 ` Keir Fraser
  2006-12-13 20:09 ` Anthony Liguori
  0 siblings, 2 replies; 10+ messages in thread
From: Zhai, Edwin @ 2006-12-13 15:45 UTC (permalink / raw)
  To: Keir, Ian Pratt; +Cc: xen-devel, Zhai, Edwin

[-- Attachment #1: Type: text/plain, Size: 370 bytes --]

ian/keir,
this is the new hvm save/restore patch on top of r12898.
although not tested intensively across all platform combinations, general 
linux/windows guest works fine W/O break anything.

because recent changes in unstable make rebase hard, we submit this patch now 
with hope that check in first and fix bug over time if possible.

thanks,


-- 
best rgds,
edwin

[-- Attachment #2: hvm_save_restore_r12898.patch --]
[-- Type: text/plain, Size: 105089 bytes --]

Signed-off-by: Zhai Edwin <edwin.zhai@intel.com>
Signed-off-by: Nakajima Jun <jun.nakajima@intel.com>

diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/ioemu/hw/cirrus_vga.c
--- a/tools/ioemu/hw/cirrus_vga.c	Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/ioemu/hw/cirrus_vga.c	Wed Dec 13 22:52:02 2006 +0800
@@ -3010,11 +3010,44 @@ static CPUWriteMemoryFunc *cirrus_mmio_w
     cirrus_mmio_writel,
 };
 
+void cirrus_stop_acc(CirrusVGAState *s)
+{
+    if (s->map_addr){
+        int error;
+        s->map_addr = 0;
+        error = unset_vram_mapping(s->cirrus_lfb_addr,
+                s->cirrus_lfb_end);
+        fprintf(stderr, "cirrus_stop_acc:unset_vram_mapping.\n");
+
+        munmap(s->vram_ptr, VGA_RAM_SIZE);
+    }
+}
+
+void cirrus_restart_acc(CirrusVGAState *s)
+{
+    if (s->cirrus_lfb_addr && s->cirrus_lfb_end) {
+        void *vram_pointer, *old_vram;
+        fprintf(stderr, "cirrus_vga_load:re-enable vga acc.lfb_addr=0x%lx, lfb_end=0x%lx.\n",
+                s->cirrus_lfb_addr, s->cirrus_lfb_end);
+        vram_pointer = set_vram_mapping(s->cirrus_lfb_addr ,s->cirrus_lfb_end);
+        if (!vram_pointer){
+            fprintf(stderr, "cirrus_vga_load:NULL vram_pointer\n");
+        } else {
+            old_vram = vga_update_vram((VGAState *)s, vram_pointer,
+                    VGA_RAM_SIZE);
+            qemu_free(old_vram);
+            s->map_addr = s->cirrus_lfb_addr;
+            s->map_end = s->cirrus_lfb_end;
+        }
+    }
+}
+
 /* load/save state */
 
 static void cirrus_vga_save(QEMUFile *f, void *opaque)
 {
     CirrusVGAState *s = opaque;
+    uint8_t vga_acc;
 
     qemu_put_be32s(f, &s->latch);
     qemu_put_8s(f, &s->sr_index);
@@ -3049,11 +3082,20 @@ static void cirrus_vga_save(QEMUFile *f,
     qemu_put_be32s(f, &s->hw_cursor_y);
     /* XXX: we do not save the bitblt state - we assume we do not save
        the state when the blitter is active */
+
+    vga_acc = (!!s->map_addr);
+    qemu_put_8s(f, &vga_acc);
+    qemu_put_be64s(f, (uint64_t*)&s->cirrus_lfb_addr);
+    qemu_put_be64s(f, (uint64_t*)&s->cirrus_lfb_end);
+    qemu_put_buffer(f, s->vram_ptr, VGA_RAM_SIZE); 
+    if (vga_acc)
+        cirrus_stop_acc(s);
 }
 
 static int cirrus_vga_load(QEMUFile *f, void *opaque, int version_id)
 {
     CirrusVGAState *s = opaque;
+    uint8_t vga_acc = 0;
 
     if (version_id != 1)
         return -EINVAL;
@@ -3091,6 +3133,14 @@ static int cirrus_vga_load(QEMUFile *f, 
 
     qemu_get_be32s(f, &s->hw_cursor_x);
     qemu_get_be32s(f, &s->hw_cursor_y);
+
+    qemu_get_8s(f, &vga_acc);
+    qemu_get_be64s(f, (uint64_t*)&s->cirrus_lfb_addr);
+    qemu_get_be64s(f, (uint64_t*)&s->cirrus_lfb_end);
+    qemu_get_buffer(f, s->vram_ptr, VGA_RAM_SIZE); 
+    if (vga_acc){
+        cirrus_restart_acc(s);
+    }
 
     /* force refresh */
     s->graphic_mode = -1;
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/ioemu/target-i386-dm/helper2.c
--- a/tools/ioemu/target-i386-dm/helper2.c	Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/ioemu/target-i386-dm/helper2.c	Wed Dec 13 22:52:02 2006 +0800
@@ -525,6 +525,7 @@ int main_loop(void)
 {
     extern int vm_running;
     extern int shutdown_requested;
+    extern int suspend_requested;
     CPUState *env = cpu_single_env;
     int evtchn_fd = xc_evtchn_fd(xce_handle);
 
@@ -542,12 +543,24 @@ int main_loop(void)
                 qemu_system_reset();
                 reset_requested = 0;
             }
+            if (suspend_requested) {
+                fprintf(logfile, "device model received suspend signal!\n");
+                break;
+            }
         }
 
         /* Wait up to 10 msec. */
         main_loop_wait(10);
     }
-    destroy_hvm_domain();
+    if (!suspend_requested)
+        destroy_hvm_domain();
+    else {
+        char qemu_file[20];
+        sprintf(qemu_file, "/tmp/xen.qemu-dm.%d", domid);
+        if (qemu_savevm(qemu_file) < 0)
+            fprintf(stderr, "qemu save fail.\n");
+    }
+
     return 0;
 }
 
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/ioemu/target-i386-dm/piix_pci-dm.c
--- a/tools/ioemu/target-i386-dm/piix_pci-dm.c	Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/ioemu/target-i386-dm/piix_pci-dm.c	Wed Dec 13 22:52:02 2006 +0800
@@ -83,6 +83,11 @@ PCIBus *i440fx_init(void)
 /* PIIX3 PCI to ISA bridge */
 
 static PCIDevice *piix3_dev;
+static int pci_slot_get_pirq(PCIDevice *pci_dev, int irq_num)
+{
+    /* This is the barber's pole mapping used by Xen. */
+    return (irq_num + (pci_dev->devfn >> 3)) & 3;
+}
 
 static void piix3_write_config(PCIDevice *d, 
                                uint32_t address, uint32_t val, int len)
@@ -150,3 +155,227 @@ int piix3_init(PCIBus *bus)
 }
 
 void pci_bios_init(void) {}
+
+/***********************************************************/
+/* XXX: the following should be moved to the PC BIOS */
+
+static __attribute__((unused)) uint32_t isa_inb(uint32_t addr)
+{
+    return cpu_inb(NULL, addr);
+}
+
+static void isa_outb(uint32_t val, uint32_t addr)
+{
+    cpu_outb(NULL, addr, val);
+}
+
+static __attribute__((unused)) uint32_t isa_inw(uint32_t addr)
+{
+    return cpu_inw(NULL, addr);
+}
+
+static __attribute__((unused)) void isa_outw(uint32_t val, uint32_t addr)
+{
+    cpu_outw(NULL, addr, val);
+}
+
+static __attribute__((unused)) uint32_t isa_inl(uint32_t addr)
+{
+    return cpu_inl(NULL, addr);
+}
+
+static __attribute__((unused)) void isa_outl(uint32_t val, uint32_t addr)
+{
+    cpu_outl(NULL, addr, val);
+}
+
+static uint32_t pci_bios_io_addr;
+static uint32_t pci_bios_mem_addr;
+/* host irqs corresponding to PCI irqs A-D */
+static uint8_t pci_irqs[4] = { 5, 6, 10, 11 };
+
+static void pci_config_writel(PCIDevice *d, uint32_t addr, uint32_t val)
+{
+    PCIBus *s = d->bus;
+    addr |= (pci_bus_num(s) << 16) | (d->devfn << 8);
+    pci_data_write(s, addr, val, 4);
+}
+
+static void pci_config_writew(PCIDevice *d, uint32_t addr, uint32_t val)
+{
+    PCIBus *s = d->bus;
+    addr |= (pci_bus_num(s) << 16) | (d->devfn << 8);
+    pci_data_write(s, addr, val, 2);
+}
+
+static void pci_config_writeb(PCIDevice *d, uint32_t addr, uint32_t val)
+{
+    PCIBus *s = d->bus;
+    addr |= (pci_bus_num(s) << 16) | (d->devfn << 8);
+    pci_data_write(s, addr, val, 1);
+}
+
+static __attribute__((unused)) uint32_t pci_config_readl(PCIDevice *d, uint32_t addr)
+{
+    PCIBus *s = d->bus;
+    addr |= (pci_bus_num(s) << 16) | (d->devfn << 8);
+    return pci_data_read(s, addr, 4);
+}
+
+static uint32_t pci_config_readw(PCIDevice *d, uint32_t addr)
+{
+    PCIBus *s = d->bus;
+    addr |= (pci_bus_num(s) << 16) | (d->devfn << 8);
+    return pci_data_read(s, addr, 2);
+}
+
+static uint32_t pci_config_readb(PCIDevice *d, uint32_t addr)
+{
+    PCIBus *s = d->bus;
+    addr |= (pci_bus_num(s) << 16) | (d->devfn << 8);
+    return pci_data_read(s, addr, 1);
+}
+
+static void pci_set_io_region_addr(PCIDevice *d, int region_num, uint32_t addr)
+{
+    PCIIORegion *r;
+    uint16_t cmd;
+    uint32_t ofs;
+
+    if ( region_num == PCI_ROM_SLOT ) {
+        ofs = 0x30;
+    }else{
+        ofs = 0x10 + region_num * 4;
+    }
+
+    pci_config_writel(d, ofs, addr);
+    r = &d->io_regions[region_num];
+
+    /* enable memory mappings */
+    cmd = pci_config_readw(d, PCI_COMMAND);
+    if ( region_num == PCI_ROM_SLOT )
+        cmd |= 2;
+    else if (r->type & PCI_ADDRESS_SPACE_IO)
+        cmd |= 1;
+    else
+        cmd |= 2;
+    pci_config_writew(d, PCI_COMMAND, cmd);
+}
+
+static void pci_bios_init_device(PCIDevice *d)
+{
+    int class;
+    PCIIORegion *r;
+    uint32_t *paddr;
+    int i, pin, pic_irq, vendor_id, device_id;
+
+    class = pci_config_readw(d, PCI_CLASS_DEVICE);
+    vendor_id = pci_config_readw(d, PCI_VENDOR_ID);
+    device_id = pci_config_readw(d, PCI_DEVICE_ID);
+    switch(class) {
+    case 0x0101:
+        if (vendor_id == 0x8086 && device_id == 0x7010) {
+            /* PIIX3 IDE */
+            pci_config_writew(d, 0x40, 0x8000); // enable IDE0
+            pci_config_writew(d, 0x42, 0x8000); // enable IDE1
+            goto default_map;
+        } else {
+            /* IDE: we map it as in ISA mode */
+            pci_set_io_region_addr(d, 0, 0x1f0);
+            pci_set_io_region_addr(d, 1, 0x3f4);
+            pci_set_io_region_addr(d, 2, 0x170);
+            pci_set_io_region_addr(d, 3, 0x374);
+        }
+        break;
+    case 0x0680:
+        if (vendor_id == 0x8086 && device_id == 0x7113) {
+            /*
+             * PIIX4 ACPI PM.
+             * Special device with special PCI config space. No ordinary BARs.
+             */
+            pci_config_writew(d, 0x20, 0x0000); // No smb bus IO enable
+            pci_config_writew(d, 0x22, 0x0000);
+            pci_config_writew(d, 0x3c, 0x0009); // Hardcoded IRQ9
+            pci_config_writew(d, 0x3d, 0x0001);
+        }
+        break;
+    case 0x0300:
+        if (vendor_id != 0x1234)
+            goto default_map;
+        /* VGA: map frame buffer to default Bochs VBE address */
+        pci_set_io_region_addr(d, 0, 0xE0000000);
+        break;
+    case 0x0800:
+        /* PIC */
+        vendor_id = pci_config_readw(d, PCI_VENDOR_ID);
+        device_id = pci_config_readw(d, PCI_DEVICE_ID);
+        if (vendor_id == 0x1014) {
+            /* IBM */
+            if (device_id == 0x0046 || device_id == 0xFFFF) {
+                /* MPIC & MPIC2 */
+                pci_set_io_region_addr(d, 0, 0x80800000 + 0x00040000);
+            }
+        }
+        break;
+    case 0xff00:
+        if (vendor_id == 0x0106b &&
+            (device_id == 0x0017 || device_id == 0x0022)) {
+            /* macio bridge */
+            pci_set_io_region_addr(d, 0, 0x80800000);
+        }
+        break;
+    default:
+    default_map:
+        /* default memory mappings */
+        for(i = 0; i < PCI_NUM_REGIONS; i++) {
+            r = &d->io_regions[i];
+            if (r->size) {
+                if (r->type & PCI_ADDRESS_SPACE_IO)
+                    paddr = &pci_bios_io_addr;
+                else
+                    paddr = &pci_bios_mem_addr;
+                *paddr = (*paddr + r->size - 1) & ~(r->size - 1);
+                pci_set_io_region_addr(d, i, *paddr);
+                *paddr += r->size;
+            }
+        }
+        break;
+    }
+
+    /* map the interrupt */
+    pin = pci_config_readb(d, PCI_INTERRUPT_PIN);
+    if (pin != 0) {
+        pin = pci_slot_get_pirq(d, pin - 1);
+        pic_irq = pci_irqs[pin];
+        pci_config_writeb(d, PCI_INTERRUPT_LINE, pic_irq);
+    }
+}
+
+/*
+ * This function initializes the PCI devices as a normal PCI BIOS
+ * would do. It is provided just in case the BIOS has no support for
+ * PCI.
+ */
+void pci_setup(void)
+{
+    int i, irq;
+    uint8_t elcr[2];
+
+    pci_bios_io_addr = 0xc000;
+    pci_bios_mem_addr = HVM_BELOW_4G_MMIO_START;
+
+    /* activate IRQ mappings */
+    elcr[0] = 0x00;
+    elcr[1] = 0x00;
+    for(i = 0; i < 4; i++) {
+        irq = pci_irqs[i];
+        /* set to trigger level */
+        elcr[irq >> 3] |= (1 << (irq & 7));
+        /* activate irq remapping in PIIX */
+        pci_config_writeb(piix3_dev, 0x60 + i, irq);
+    }
+    isa_outb(elcr[0], 0x4d0);
+    isa_outb(elcr[1], 0x4d1);
+
+    pci_for_each_device(pci_bios_init_device);
+}
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/ioemu/vl.c
--- a/tools/ioemu/vl.c	Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/ioemu/vl.c	Wed Dec 13 22:52:02 2006 +0800
@@ -4441,6 +4441,11 @@ int qemu_loadvm(const char *filename)
         qemu_fseek(f, cur_pos + record_len, SEEK_SET);
     }
     fclose(f);
+
+    /* del tmp file */
+    if (unlink(filename) == -1)
+        fprintf(stderr, "delete tmp qemu state file failed.\n");
+
     ret = 0;
  the_end:
     if (saved_vm_running)
@@ -5027,6 +5032,7 @@ static QEMUResetEntry *first_reset_entry
 static QEMUResetEntry *first_reset_entry;
 int reset_requested;
 int shutdown_requested;
+int suspend_requested;
 static int powerdown_requested;
 
 void qemu_register_reset(QEMUResetHandler *func, void *opaque)
@@ -5806,6 +5812,14 @@ int set_mm_mapping(int xc_handle, uint32
     }
 
     return 0;
+}
+
+void suspend(int sig)
+{
+   fprintf(logfile, "suspend sig handler called with requested=%d!\n", suspend_requested);
+    if (sig != SIGUSR1)
+        fprintf(logfile, "suspend signal dismatch, get sig=%d!\n", sig);
+    suspend_requested = 1;
 }
 
 #if defined(__i386__) || defined(__x86_64__)
@@ -6709,8 +6723,12 @@ int main(int argc, char **argv)
         }
     } else 
 #endif
-    if (loadvm)
+    if (loadvm) {
+        /*XXX: ugly, since pci_bios_init are moved to hvmloader*/
+        extern void pci_setup(void);
+        pci_setup();
         qemu_loadvm(loadvm);
+    }
 
     {
         /* XXX: simplify init */
@@ -6719,6 +6737,26 @@ int main(int argc, char **argv)
             vm_start();
         }
     }
+
+    /* register signal for the suspend request when save */
+    {
+        struct sigaction act;
+        sigset_t set;
+        act.sa_handler = suspend;
+        act.sa_flags = SA_RESTART;
+        sigemptyset(&act.sa_mask);
+
+        sigaction(SIGUSR1, &act, NULL);
+
+        /* control panel mask some signals when spawn qemu, need unmask here*/
+        sigemptyset(&set);
+        sigaddset(&set, SIGUSR1);
+        sigaddset(&set, SIGTERM);
+        if (sigprocmask(SIG_UNBLOCK, &set, NULL) == -1)
+            fprintf(stderr, "unblock signal fail, possible issue for HVM save!\n");
+
+    }
+
     main_loop();
     quit_timers();
     return 0;
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/Makefile
--- a/tools/libxc/Makefile	Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/libxc/Makefile	Wed Dec 13 22:52:02 2006 +0800
@@ -27,7 +27,7 @@ GUEST_SRCS-$(CONFIG_X86) += xc_linux_bui
 GUEST_SRCS-$(CONFIG_X86) += xc_linux_build.c
 GUEST_SRCS-$(CONFIG_IA64) += xc_linux_build.c
 GUEST_SRCS-$(CONFIG_MIGRATE) += xc_linux_restore.c xc_linux_save.c
-GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c
+GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c xc_hvm_restore.c xc_hvm_save.c
 
 -include $(XEN_TARGET_ARCH)/Makefile
 
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xc_domain.c
--- a/tools/libxc/xc_domain.c	Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/libxc/xc_domain.c	Wed Dec 13 22:52:02 2006 +0800
@@ -233,6 +233,50 @@ int xc_domain_getinfolist(int xc_handle,
     unlock_pages(info, max_domains*sizeof(xc_domaininfo_t));
 
     return ret;
+}
+
+/* get info from hvm guest for save */
+int xc_domain_hvm_getcontext(int xc_handle,
+                             uint32_t domid,
+                             hvm_domain_context_t *hvm_ctxt)
+{
+    int rc;
+    DECLARE_DOMCTL;
+
+    domctl.cmd = XEN_DOMCTL_gethvmcontext;
+    domctl.domain = (domid_t)domid;
+    set_xen_guest_handle(domctl.u.hvmcontext.ctxt, hvm_ctxt);
+
+    if ( (rc = mlock(hvm_ctxt, sizeof(*hvm_ctxt))) != 0 )
+        return rc;
+
+    rc = do_domctl(xc_handle, &domctl);
+
+    safe_munlock(hvm_ctxt, sizeof(*hvm_ctxt));
+
+    return rc;
+}
+
+/* set info to hvm guest for restore */
+int xc_domain_hvm_setcontext(int xc_handle,
+                             uint32_t domid,
+                             hvm_domain_context_t *hvm_ctxt)
+{
+    int rc;
+    DECLARE_DOMCTL;
+
+    domctl.cmd = XEN_DOMCTL_sethvmcontext;
+    domctl.domain = domid;
+    set_xen_guest_handle(domctl.u.hvmcontext.ctxt, hvm_ctxt);
+
+    if ( (rc = mlock(hvm_ctxt, sizeof(*hvm_ctxt))) != 0 )
+        return rc;
+
+    rc = do_domctl(xc_handle, &domctl);
+
+    safe_munlock(hvm_ctxt, sizeof(*hvm_ctxt));
+
+    return rc;
 }
 
 int xc_vcpu_getcontext(int xc_handle,
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xc_hvm_build.c
--- a/tools/libxc/xc_hvm_build.c	Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/libxc/xc_hvm_build.c	Wed Dec 13 22:52:02 2006 +0800
@@ -86,7 +86,7 @@ static void build_e820map(void *e820_pag
 
     /* 0x0-0x9F000: Ordinary RAM. */
     e820entry[nr_map].addr = 0x0;
-    e820entry[nr_map].size = 0x9F000;
+    e820entry[nr_map].size = 0x90000;
     e820entry[nr_map].type = E820_RAM;
     nr_map++;
 
@@ -96,7 +96,7 @@ static void build_e820map(void *e820_pag
      * TODO: SMBIOS tables should be moved higher (>=0xE0000).
      *       They are unusually low in our memory map: could cause problems?
      */
-    e820entry[nr_map].addr = 0x9F000;
+    e820entry[nr_map].addr = 0x90000;
     e820entry[nr_map].size = 0x1000;
     e820entry[nr_map].type = E820_RESERVED;
     nr_map++;
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xc_linux_save.c
--- a/tools/libxc/xc_linux_save.c	Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/libxc/xc_linux_save.c	Wed Dec 13 22:52:02 2006 +0800
@@ -261,15 +261,6 @@ static int ratewrite(int io_fd, void *bu
 #endif
 
 
-static inline ssize_t write_exact(int fd, void *buf, size_t count)
-{
-    if(write(fd, buf, count) != count)
-        return 0;
-    return 1;
-}
-
-
-
 static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
                        xc_shadow_op_stats_t *stats, int print)
 {
@@ -356,7 +347,7 @@ static int analysis_phase(int xc_handle,
 }
 
 
-static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
+int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
                              int dom, xc_dominfo_t *info,
                              vcpu_guest_context_t *ctxt)
 {
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h	Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/libxc/xenctrl.h	Wed Dec 13 22:52:02 2006 +0800
@@ -313,6 +313,30 @@ int xc_domain_getinfolist(int xc_handle,
                           xc_domaininfo_t *info);
 
 /**
+ * This function returns information about the context of a hvm domain
+ * @parm xc_handle a handle to an open hypervisor interface
+ * @parm domid the domain to get information from
+ * @parm hvm_ctxt a pointer to a structure to store the execution context of the
+ *            hvm domain
+ * @return 0 on success, -1 on failure
+ */
+int xc_domain_hvm_getcontext(int xc_handle,
+                             uint32_t domid,
+                             hvm_domain_context_t *hvm_ctxt);
+
+/**
+ * This function will set the context for hvm domain
+ *
+ * @parm xc_handle a handle to an open hypervisor interface
+ * @parm domid the domain to set the hvm domain context for
+ * @parm hvm_ctxt pointer to the the hvm context with the values to set
+ * @return 0 on success, -1 on failure
+ */
+int xc_domain_hvm_setcontext(int xc_handle,
+                             uint32_t domid,
+                             hvm_domain_context_t *hvm_ctxt);
+
+/**
  * This function returns information about the execution context of a
  * particular vcpu of a domain.
  *
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xenguest.h
--- a/tools/libxc/xenguest.h	Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/libxc/xenguest.h	Wed Dec 13 22:52:02 2006 +0800
@@ -11,6 +11,7 @@
 
 #define XCFLAGS_LIVE      1
 #define XCFLAGS_DEBUG     2
+#define XCFLAGS_HVM       4
 
 
 /**
@@ -25,6 +26,13 @@ int xc_linux_save(int xc_handle, int io_
                   uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
                   int (*suspend)(int domid));
 
+/**
+ * This function will save a hvm domain running unmodified guest.
+ * @return 0 on success, -1 on failure
+ */
+int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
+                  uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
+                  int (*suspend)(int domid));
 
 /**
  * This function will restore a saved domain running Linux.
@@ -41,6 +49,18 @@ int xc_linux_restore(int xc_handle, int 
                      unsigned long nr_pfns, unsigned int store_evtchn,
                      unsigned long *store_mfn, unsigned int console_evtchn,
                      unsigned long *console_mfn);
+
+/**
+ * This function will restore a saved hvm domain running unmodified guest.
+ *
+ * @parm store_mfn pass mem size & returned with the mfn of the store page
+ * @return 0 on success, -1 on failure
+ */
+int xc_hvm_restore(int xc_handle, int io_fd, uint32_t dom,
+                      unsigned long nr_pfns, unsigned int store_evtchn,
+                      unsigned long *store_mfn, unsigned int console_evtchn,
+                      unsigned long *console_mfn,
+                      unsigned int pae, unsigned int apic);
 
 /**
  * This function will create a domain for a paravirtualized Linux
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xg_save_restore.h
--- a/tools/libxc/xg_save_restore.h	Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/libxc/xg_save_restore.h	Wed Dec 13 22:52:02 2006 +0800
@@ -65,6 +65,16 @@ static int get_platform_info(int xc_hand
     return 1;
 }
 
+static inline ssize_t write_exact(int fd, void *buf, size_t count)
+{
+    if(write(fd, buf, count) != count)
+        return 0;
+    return 1;
+}
+
+extern int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
+                             int dom, xc_dominfo_t *info,
+                             vcpu_guest_context_t *ctxt);
 
 /*
 ** Save/restore deal with the mfn_to_pfn (M2P) and pfn_to_mfn (P2M) tables.
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c	Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/python/xen/lowlevel/xc/xc.c	Wed Dec 13 22:52:02 2006 +0800
@@ -158,6 +158,20 @@ static PyObject *pyxc_domain_destroy(XcO
 static PyObject *pyxc_domain_destroy(XcObject *self, PyObject *args)
 {
     return dom_op(self, args, xc_domain_destroy);
+}
+
+static PyObject *pyxc_domain_shutdown(XcObject *self, PyObject *args)
+{
+    uint32_t dom, reason;
+
+    if (!PyArg_ParseTuple(args, "ii", &dom, &reason))
+      return NULL;
+
+    if (xc_domain_shutdown(self->xc_handle, dom, reason) != 0)
+        return pyxc_error_to_exception();
+    
+    Py_INCREF(zero);
+    return zero;
 }
 
 
@@ -969,6 +983,14 @@ static PyMethodDef pyxc_methods[] = {
       METH_VARARGS, "\n"
       "Destroy a domain.\n"
       " dom [int]:    Identifier of domain to be destroyed.\n\n"
+      "Returns: [int] 0 on success; -1 on error.\n" },
+
+    { "domain_shutdown", 
+      (PyCFunction)pyxc_domain_shutdown,
+      METH_VARARGS, "\n"
+      "Shutdown a domain.\n"
+      " dom       [int, 0]:      Domain identifier to use.\n"
+      " reason     [int, 0]:      Reason for shutdown.\n"
       "Returns: [int] 0 on success; -1 on error.\n" },
 
     { "vcpu_setaffinity", 
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/python/xen/xend/XendCheckpoint.py
--- a/tools/python/xen/xend/XendCheckpoint.py	Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/python/xen/xend/XendCheckpoint.py	Wed Dec 13 22:52:02 2006 +0800
@@ -22,11 +22,14 @@ from xen.xend.XendConstants import *
 from xen.xend.XendConstants import *
 
 SIGNATURE = "LinuxGuestRecord"
+QEMU_SIGNATURE = "QemuDeviceModelRecord"
+dm_batch = 512
 XC_SAVE = "xc_save"
 XC_RESTORE = "xc_restore"
 
 
 sizeof_int = calcsize("i")
+sizeof_unsigned_int = calcsize("I")
 sizeof_unsigned_long = calcsize("L")
 
 
@@ -69,6 +72,11 @@ def save(fd, dominfo, network, live, dst
                     "could not write guest state file: config len")
         write_exact(fd, config, "could not write guest state file: config")
 
+        image_cfg = dominfo.info.get('image', {})
+        hvm = image_cfg.has_key('hvm')
+
+        if hvm:
+            log.info("save hvm domain")
         # xc_save takes three customization parameters: maxit, max_f, and
         # flags the last controls whether or not save is 'live', while the
         # first two further customize behaviour when 'live' save is
@@ -76,7 +84,7 @@ def save(fd, dominfo, network, live, dst
         # libxenguest; see the comments and/or code in xc_linux_save() for
         # more information.
         cmd = [xen.util.auxbin.pathTo(XC_SAVE), str(fd),
-               str(dominfo.getDomid()), "0", "0", str(int(live)) ]
+               str(dominfo.getDomid()), "0", "0", str(int(live) | (int(hvm) << 2)) ]
         log.debug("[xc_save]: %s", string.join(cmd))
 
         def saveInputHandler(line, tochild):
@@ -90,11 +98,28 @@ def save(fd, dominfo, network, live, dst
                 log.info("Domain %d suspended.", dominfo.getDomid())
                 dominfo.migrateDevices(network, dst, DEV_MIGRATE_STEP3,
                                        domain_name)
+                #send signal to device model for save
+                if hvm:
+                    log.info("release_devices for hvm domain")
+                    dominfo._releaseDevices(True)
                 tochild.write("done\n")
                 tochild.flush()
                 log.debug('Written done')
 
         forkHelper(cmd, fd, saveInputHandler, False)
+
+        # put qemu device model state
+        if hvm:
+            write_exact(fd, QEMU_SIGNATURE, "could not write qemu signature")
+            qemu_fd = os.open("/tmp/xen.qemu-dm.%d" % dominfo.getDomid(), os.O_RDONLY)
+            while True:
+                buf = os.read(qemu_fd, dm_batch)
+                if len(buf):
+                    write_exact(fd, buf, "could not write device model state")
+                else:
+                    break
+            os.close(qemu_fd)
+            os.remove("/tmp/xen.qemu-dm.%d" % dominfo.getDomid())
 
         dominfo.destroyDomain()
         try:
@@ -147,19 +172,38 @@ def restore(xd, fd, dominfo = None, paus
     assert store_port
     assert console_port
 
+    #if hvm, pass mem size to calculate the store_mfn
+    hvm = 0
+    apic = 0
+    pae = 0
+    image_cfg = dominfo.info.get('image', {})
+    hvm = image_cfg.has_key('hvm')
+    if hvm:
+        #the 'memory' in config has been removed
+        hvm  = dominfo.info['memory_static_min']
+        apic = dominfo.info['image']['hvm'].get('apic', 0)
+        pae  = dominfo.info['image']['hvm'].get('pae',  0)
+        log.info("restore hvm domain %d, mem=%d, apic=%d, pae=%d", dominfo.domid, hvm, apic, pae)
+
     try:
-        l = read_exact(fd, sizeof_unsigned_long,
-                       "not a valid guest state file: pfn count read")
-        nr_pfns = unpack("L", l)[0]    # native sizeof long
+        if hvm:
+            l = read_exact(fd, sizeof_unsigned_int,
+                    "not a valid hvm guest state file: pfn count read")
+            nr_pfns = unpack("I", l)[0]    # native sizeof int
+        else:
+            l = read_exact(fd, sizeof_unsigned_long,
+                           "not a valid guest state file: pfn count read")
+            nr_pfns = unpack("L", l)[0]    # native sizeof long
         if nr_pfns > 16*1024*1024:     # XXX 
             raise XendError(
                 "not a valid guest state file: pfn count out of range")
 
         balloon.free(xc.pages_to_kib(nr_pfns))
+        log.info("HVM restore:balloon free 0x%x pages.", nr_pfns)
 
         cmd = map(str, [xen.util.auxbin.pathTo(XC_RESTORE),
                         fd, dominfo.getDomid(), nr_pfns,
-                        store_port, console_port])
+                        store_port, console_port, hvm, pae, apic])
         log.debug("[xc_restore]: %s", string.join(cmd))
 
         handler = RestoreInputHandler()
@@ -169,10 +213,29 @@ def restore(xd, fd, dominfo = None, paus
         if handler.store_mfn is None or handler.console_mfn is None:
             raise XendError('Could not read store/console MFN')
 
-        os.read(fd, 1)           # Wait for source to close connection
         dominfo.waitForDevices() # Wait for backends to set up
         if not paused:
             dominfo.unpause()
+
+         # get qemu state and create a tmp file for dm restore
+        if hvm:
+            qemu_signature = read_exact(fd, len(QEMU_SIGNATURE),
+                    "not a valid device model state: signature read")
+            if qemu_signature != QEMU_SIGNATURE:
+                raise XendError("not a valid device model state: found '%s'" %
+                        qemu_signature)
+            qemu_fd = os.open("/tmp/xen.qemu-dm.%d" % dominfo.getDomid(),
+                    os.O_WRONLY | os.O_CREAT | os.O_TRUNC)
+            while True:
+                buf = os.read(fd, dm_batch)
+                if len(buf):
+                    write_exact(qemu_fd, buf, "could not write dm state to tmp file")
+                else:
+                    break
+            os.close(qemu_fd)
+
+        os.read(fd, 1)           # Wait for source to close connection
+
         
         dominfo.completeRestore(handler.store_mfn, handler.console_mfn)
         
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py	Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/python/xen/xend/XendDomainInfo.py	Wed Dec 13 22:52:02 2006 +0800
@@ -488,6 +488,16 @@ class XendDomainInfo:
         self._removeVm('xend/previous_restart_time')
         self.storeDom("control/shutdown", reason)
 
+        ## shutdown hypercall for hvm domain desides xenstore write
+        image_cfg = self.info.get('image', {})
+        hvm = image_cfg.has_key('hvm')
+        if hvm:
+            for code in DOMAIN_SHUTDOWN_REASONS.keys():
+                if DOMAIN_SHUTDOWN_REASONS[code] == reason:
+                    break
+            xc.domain_shutdown(self.domid, code)
+
+
     def pause(self):
         """Pause domain
         
@@ -1203,8 +1213,11 @@ class XendDomainInfo:
         if self.image:
             self.image.createDeviceModel()
 
-    def _releaseDevices(self):
+    def _releaseDevices(self, suspend = False):
         """Release all domain's devices.  Nothrow guarantee."""
+        if suspend and self.image:
+            self.image.destroy(suspend)
+            return
 
         while True:
             t = xstransact("%s/device" % self.dompath)
@@ -1473,6 +1486,16 @@ class XendDomainInfo:
         self.console_mfn = console_mfn
 
         self._introduceDomain()
+        image_cfg = self.info.get('image', {})
+        hvm = image_cfg.has_key('hvm')
+        if hvm:
+            self.image = image.create(self,
+                    self.info,
+                    self.info['image'],
+                    self.info['devices'])
+            if self.image:
+                self.image.createDeviceModel(True)
+                self.image.register_shutdown_watch()
         self._storeDomDetails()
         self._registerWatches()
         self.refreshShutdown()
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/python/xen/xend/image.py
--- a/tools/python/xen/xend/image.py	Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/python/xen/xend/image.py	Wed Dec 13 22:52:02 2006 +0800
@@ -157,7 +157,7 @@ class ImageHandler:
         """Build the domain. Define in subclass."""
         raise NotImplementedError()
 
-    def createDeviceModel(self):
+    def createDeviceModel(self, restore = False):
         """Create device model for the domain (define in subclass if needed)."""
         pass
     
@@ -405,7 +405,7 @@ class HVMImageHandler(ImageHandler):
 
         return ret
 
-    def createDeviceModel(self):
+    def createDeviceModel(self, restore = False):
         if self.pid:
             return
         # Execute device model.
@@ -414,6 +414,8 @@ class HVMImageHandler(ImageHandler):
         args = args + ([ "-d",  "%d" % self.vm.getDomid(),
                   "-m", "%s" % (self.getRequiredInitialReservation() / 1024)])
         args = args + self.dmargs
+        if restore:
+            args = args + ([ "-loadvm", "/tmp/xen.qemu-dm.%d" % self.vm.getDomid() ])
         env = dict(os.environ)
         if self.display:
             env['DISPLAY'] = self.display
@@ -432,12 +434,16 @@ class HVMImageHandler(ImageHandler):
         self.register_reboot_feature_watch()
         self.pid = self.vm.gatherDom(('image/device-model-pid', int))
 
-    def destroy(self):
+    def destroy(self, suspend = False):
         self.unregister_shutdown_watch()
         self.unregister_reboot_feature_watch();
         if self.pid:
             try:
-                os.kill(self.pid, signal.SIGKILL)
+                sig = signal.SIGKILL
+                if suspend:
+                    log.info("use sigusr1 to signal qemu %d", self.pid)
+                    sig = signal.SIGUSR1
+                os.kill(self.pid, sig)
             except OSError, exn:
                 log.exception(exn)
             try:
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/xcutils/xc_restore.c
--- a/tools/xcutils/xc_restore.c	Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/xcutils/xc_restore.c	Wed Dec 13 22:52:02 2006 +0800
@@ -19,12 +19,13 @@ main(int argc, char **argv)
 main(int argc, char **argv)
 {
     unsigned int xc_fd, io_fd, domid, nr_pfns, store_evtchn, console_evtchn;
+    unsigned int hvm, pae, apic;
     int ret;
     unsigned long store_mfn, console_mfn;
 
-    if (argc != 6)
+    if (argc != 9)
 	errx(1,
-	     "usage: %s iofd domid nr_pfns store_evtchn console_evtchn",
+	     "usage: %s iofd domid nr_pfns store_evtchn console_evtchn hvm pae apic",
 	     argv[0]);
 
     xc_fd = xc_interface_open();
@@ -36,9 +37,19 @@ main(int argc, char **argv)
     nr_pfns = atoi(argv[3]);
     store_evtchn = atoi(argv[4]);
     console_evtchn = atoi(argv[5]);
+    hvm  = atoi(argv[6]);
+    pae  = atoi(argv[7]);
+    apic = atoi(argv[8]);
 
-    ret = xc_linux_restore(xc_fd, io_fd, domid, nr_pfns, store_evtchn,
-			   &store_mfn, console_evtchn, &console_mfn);
+    if (hvm) {
+         /* pass the memsize to xc_hvm_restore to find the store_mfn */
+        store_mfn = hvm;
+        ret = xc_hvm_restore(xc_fd, io_fd, domid, nr_pfns, store_evtchn,
+                &store_mfn, console_evtchn, &console_mfn, pae, apic);
+    } else 
+        ret = xc_linux_restore(xc_fd, io_fd, domid, nr_pfns, store_evtchn,
+                &store_mfn, console_evtchn, &console_mfn);
+
     if (ret == 0) {
 	printf("store-mfn %li\n", store_mfn);
 	printf("console-mfn %li\n", console_mfn);
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/xcutils/xc_save.c
--- a/tools/xcutils/xc_save.c	Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/xcutils/xc_save.c	Wed Dec 13 22:52:02 2006 +0800
@@ -51,7 +51,10 @@ main(int argc, char **argv)
     max_f = atoi(argv[4]);
     flags = atoi(argv[5]);
 
-    ret = xc_linux_save(xc_fd, io_fd, domid, maxit, max_f, flags, &suspend);
+    if (flags & XCFLAGS_HVM)
+        ret = xc_hvm_save(xc_fd, io_fd, domid, maxit, max_f, flags, &suspend);
+    else 
+        ret = xc_linux_save(xc_fd, io_fd, domid, maxit, max_f, flags, &suspend);
 
     xc_interface_close(xc_fd);
 
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c	Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/arch/x86/domain.c	Wed Dec 13 22:52:02 2006 +0800
@@ -330,6 +330,7 @@ int arch_set_info_guest(
     else
     {
         hvm_load_cpu_guest_regs(v, &v->arch.guest_context.user_regs);
+        hvm_load_cpu_context(v, &v->arch.guest_context.hvmcpu_ctxt);
     }
 
     if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/domctl.c
--- a/xen/arch/x86/domctl.c	Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/arch/x86/domctl.c	Wed Dec 13 22:52:02 2006 +0800
@@ -297,6 +297,7 @@ void arch_getdomaininfo_ctxt(
     if ( is_hvm_vcpu(v) )
     {
         hvm_store_cpu_guest_regs(v, &c->user_regs, c->ctrlreg);
+        hvm_save_cpu_context(v, &c->hvmcpu_ctxt);
     }
     else
     {
@@ -314,6 +315,22 @@ void arch_getdomaininfo_ctxt(
     c->ctrlreg[3] = xen_pfn_to_cr3(pagetable_get_pfn(v->arch.guest_table));
 
     c->vm_assist = v->domain->vm_assist;
+}
+
+int arch_gethvm_ctxt(
+    struct vcpu *v, struct hvm_domain_context *c)
+{
+    if ( !is_hvm_vcpu(v) )
+        return -1;
+
+    return hvm_save(v, c);
+
+}
+
+int arch_sethvm_ctxt(
+        struct vcpu *v, struct hvm_domain_context *c)
+{
+    return hvm_load(v, c);
 }
 
 /*
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c	Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/arch/x86/hvm/hvm.c	Wed Dec 13 22:52:02 2006 +0800
@@ -182,9 +182,18 @@ int hvm_domain_initialise(struct domain 
 
 void hvm_domain_destroy(struct domain *d)
 {
+    HVMStateEntry *se, *dse;
     kill_timer(&d->arch.hvm_domain.pl_time.periodic_tm.timer);
     rtc_deinit(d);
     pmtimer_deinit(d);
+
+    se = d->arch.hvm_domain.first_se;
+    while (se) {
+        dse = se;
+        se = se->next;
+        xfree(dse);
+    }
+ 
 
     if ( d->arch.hvm_domain.shared_page_va )
         unmap_domain_page_global(
@@ -225,6 +234,9 @@ int hvm_vcpu_initialise(struct vcpu *v)
     pit_init(v, cpu_khz);
     rtc_init(v, RTC_PORT(0), RTC_IRQ);
     pmtimer_init(v, ACPI_PM_TMR_BLK_ADDRESS);
+
+    /* init hvm sharepage */
+    shpage_init(v->domain, get_sp(v->domain));
 
     /* Init guest TSC to start from zero. */
     hvm_set_guest_time(v, 0);
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/hvm/i8254.c
--- a/xen/arch/x86/hvm/i8254.c	Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/arch/x86/hvm/i8254.c	Wed Dec 13 22:52:02 2006 +0800
@@ -203,11 +203,11 @@ static inline void pit_load_count(PITCha
     switch (s->mode) {
         case 2:
             /* create periodic time */
-            s->pt = create_periodic_time (period, 0, 0, pit_time_fired, s);
+            s->pt = create_periodic_time (current->domain, period, 0, 0, pit_time_fired, s);
             break;
         case 1:
             /* create one shot time */
-            s->pt = create_periodic_time (period, 0, 1, pit_time_fired, s);
+            s->pt = create_periodic_time (current->domain, period, 0, 1, pit_time_fired, s);
 #ifdef DEBUG_PIT
             printk("HVM_PIT: create one shot time.\n");
 #endif
@@ -345,6 +345,152 @@ static uint32_t pit_ioport_read(void *op
     return ret;
 }
 
+#ifdef HVM_DEBUG_SUSPEND
+static void pit_info(PITState *pit)
+{
+    PITChannelState *s;
+    int i;
+
+    for(i = 0; i < 3; i++) {
+        printk("*****pit channel %d's state:*****\n", i);
+        s = &pit->channels[i];
+        printk("pit 0x%x.\n", s->count);
+        printk("pit 0x%x.\n", s->latched_count);
+        printk("pit 0x%x.\n", s->count_latched);
+        printk("pit 0x%x.\n", s->status_latched);
+        printk("pit 0x%x.\n", s->status);
+        printk("pit 0x%x.\n", s->read_state);
+        printk("pit 0x%x.\n", s->write_state);
+        printk("pit 0x%x.\n", s->write_latch);
+        printk("pit 0x%x.\n", s->rw_mode);
+        printk("pit 0x%x.\n", s->mode);
+        printk("pit 0x%x.\n", s->bcd);
+        printk("pit 0x%x.\n", s->gate);
+        printk("pit %"PRId64"\n", s->count_load_time);
+
+        if (s->pt) {
+            struct periodic_time *pt = s->pt;
+            printk("pit channel %d has a periodic timer:\n", i);
+            printk("pt %d.\n", pt->enabled);
+            printk("pt %d.\n", pt->one_shot);
+            printk("pt %d.\n", pt->irq);
+            printk("pt %d.\n", pt->first_injected);
+
+            printk("pt %d.\n", pt->pending_intr_nr);
+            printk("pt %d.\n", pt->period);
+            printk("pt %"PRId64"\n", pt->period_cycles);
+            printk("pt %"PRId64"\n", pt->last_plt_gtime);
+        }
+    }
+
+}
+#else
+static void pit_info(PITState *pit)
+{
+}
+#endif
+
+static void pit_save(hvm_domain_context_t *h, void *opaque)
+{
+    struct domain *d = opaque;
+    PITState *pit = &d->arch.hvm_domain.pl_time.vpit;
+    PITChannelState *s;
+    struct periodic_time *pt;
+    int i, pti = -1;
+    
+    pit_info(pit);
+
+    for(i = 0; i < 3; i++) {
+        s = &pit->channels[i];
+        hvm_put_32u(h, s->count);
+        hvm_put_16u(h, s->latched_count);
+        hvm_put_8u(h, s->count_latched);
+        hvm_put_8u(h, s->status_latched);
+        hvm_put_8u(h, s->status);
+        hvm_put_8u(h, s->read_state);
+        hvm_put_8u(h, s->write_state);
+        hvm_put_8u(h, s->write_latch);
+        hvm_put_8u(h, s->rw_mode);
+        hvm_put_8u(h, s->mode);
+        hvm_put_8u(h, s->bcd);
+        hvm_put_8u(h, s->gate);
+        hvm_put_64u(h, s->count_load_time);
+
+        if (s->pt && pti == -1)
+            pti = i;
+    }
+
+    /* save guest time */
+    pt = pit->channels[pti].pt;
+    hvm_put_8u(h, pti);
+    hvm_put_8u(h, pt->first_injected);
+    hvm_put_32u(h, pt->pending_intr_nr);
+    hvm_put_64u(h, pt->last_plt_gtime);
+
+}
+
+static int pit_load(hvm_domain_context_t *h, void *opaque, int version_id)
+{
+    struct domain *d = opaque;
+    PITState *pit = &d->arch.hvm_domain.pl_time.vpit;
+    PITChannelState *s;
+    int i, pti;
+    u32 period;
+
+    if (version_id != 1)
+        return -EINVAL;
+
+    for(i = 0; i < 3; i++) {
+        s = &pit->channels[i];
+        s->count = hvm_get_32u(h);
+        s->latched_count = hvm_get_16u(h);
+        s->count_latched = hvm_get_8u(h);
+        s->status_latched = hvm_get_8u(h);
+        s->status = hvm_get_8u(h);
+        s->read_state = hvm_get_8u(h);
+        s->write_state = hvm_get_8u(h);
+        s->write_latch = hvm_get_8u(h);
+        s->rw_mode = hvm_get_8u(h);
+        s->mode = hvm_get_8u(h);
+        s->bcd = hvm_get_8u(h);
+        s->gate = hvm_get_8u(h);
+        s->count_load_time = hvm_get_64u(h);
+    }
+
+    pti = hvm_get_8u(h);
+    if ( pti < 0 || pti > 2) {
+        printk("pit load get a wrong channel %d when HVM resume.\n", pti);
+        return -EINVAL;
+    }
+
+    s = &pit->channels[pti];
+    period = DIV_ROUND((s->count * 1000000000ULL), PIT_FREQ);
+
+    printk("recreate periodic timer %d in mode %d, freq=%d.\n", pti, s->mode, period);
+    switch (s->mode) {
+        case 2:
+            /* create periodic time */
+            s->pt = create_periodic_time (d, period, 0, 0, pit_time_fired, s);
+            s->pt->first_injected = hvm_get_8u(h);
+            s->pt->pending_intr_nr = hvm_get_32u(h);
+            s->pt->last_plt_gtime = hvm_get_64u(h);
+            break;
+        case 1:
+            /* create one shot time */
+            s->pt = create_periodic_time (d, period, 0, 1, pit_time_fired, s);
+            break;
+        default:
+            printk("pit mode %"PRId8" should not use periodic timer!\n", s->mode);
+            return -EINVAL;
+    }
+
+    /*XXX: need set_guest_time here or do this when post_inject? */
+ 
+    pit_info(pit);
+
+    return 0;
+}
+
 static void pit_reset(void *opaque)
 {
     PITState *pit = opaque;
@@ -373,6 +519,8 @@ void pit_init(struct vcpu *v, unsigned l
     s->vcpu = v;
     s++; s->vcpu = v;
     s++; s->vcpu = v;
+
+    hvm_register_savevm(v->domain, "xen_hvm_i8254", PIT_BASE, 1, pit_save, pit_load, v->domain);
 
     register_portio_handler(v->domain, PIT_BASE, 4, handle_pit_io);
     /* register the speaker port */
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/hvm/intercept.c
--- a/xen/arch/x86/hvm/intercept.c	Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/arch/x86/hvm/intercept.c	Wed Dec 13 22:52:02 2006 +0800
@@ -29,6 +29,8 @@
 #include <asm/current.h>
 #include <io_ports.h>
 #include <xen/event.h>
+#include <xen/compile.h>
+#include <public/version.h>
 
 
 extern struct hvm_mmio_handler vlapic_mmio_handler;
@@ -314,13 +316,14 @@ void pickup_deactive_ticks(struct period
  * period: fire frequency in ns.
  */
 struct periodic_time * create_periodic_time(
+        struct domain *d,
         u32 period, 
         char irq,
         char one_shot,
         time_cb *cb,
         void *data)
 {
-    struct periodic_time *pt = &(current->domain->arch.hvm_domain.pl_time.periodic_tm);
+    struct periodic_time *pt = &(d->arch.hvm_domain.pl_time.periodic_tm);
     if ( pt->enabled ) {
         stop_timer (&pt->timer);
         pt->enabled = 0;
@@ -353,6 +356,278 @@ void destroy_periodic_time(struct period
         stop_timer(&pt->timer);
         pt->enabled = 0;
     }
+}
+
+/* save/restore support */
+#define HVM_FILE_MAGIC   0x54381286
+#define HVM_FILE_VERSION 0x00000001
+
+int hvm_register_savevm(struct domain *d,
+                    const char *idstr,
+                    int instance_id,
+                    int version_id,
+                    SaveStateHandler *save_state,
+                    LoadStateHandler *load_state,
+                    void *opaque)
+{
+    HVMStateEntry *se, **pse;
+
+    if ( (se = xmalloc(struct HVMStateEntry)) == NULL ){
+        printk("allocat hvmstate entry fail.\n");
+        return -1;
+    }
+
+    strncpy(se->idstr, idstr, HVM_SE_IDSTR_LEN);
+
+    se->instance_id = instance_id;
+    se->version_id = version_id;
+    se->save_state = save_state;
+    se->load_state = load_state;
+    se->opaque = opaque;
+    se->next = NULL;
+
+    /* add at the end of list */
+    pse = &d->arch.hvm_domain.first_se;
+    while (*pse != NULL)
+        pse = &(*pse)->next;
+    *pse = se;
+    return 0;
+}
+
+int hvm_save(struct vcpu *v, hvm_domain_context_t *h)
+{
+    uint32_t len, len_pos, cur_pos;
+    uint32_t eax, ebx, ecx, edx;
+    HVMStateEntry *se;
+    char *chgset;
+
+    if (!is_hvm_vcpu(v)) {
+        printk("hvm_save only for hvm guest!\n");
+        return -1;
+    }
+
+    memset(h, 0, sizeof(hvm_domain_context_t));
+    hvm_put_32u(h, HVM_FILE_MAGIC);
+    hvm_put_32u(h, HVM_FILE_VERSION);
+
+    /* save xen changeset */
+    chgset = strrchr(XEN_CHANGESET, ' ') + 1;
+
+    len = strlen(chgset);
+    hvm_put_8u(h, len);
+    hvm_put_buffer(h, chgset, len);
+
+    /* save cpuid */
+    cpuid(1, &eax, &ebx, &ecx, &edx);
+    hvm_put_32u(h, eax);
+
+    for(se = v->domain->arch.hvm_domain.first_se; se != NULL; se = se->next) {
+        /* ID string */
+        len = strnlen(se->idstr, HVM_SE_IDSTR_LEN);
+        hvm_put_8u(h, len);
+        hvm_put_buffer(h, se->idstr, len);
+
+        hvm_put_32u(h, se->instance_id);
+        hvm_put_32u(h, se->version_id);
+
+        /* record size */
+        len_pos = hvm_ctxt_tell(h);
+        hvm_put_32u(h, 0);
+
+        se->save_state(h, se->opaque);
+
+        cur_pos = hvm_ctxt_tell(h);
+        len = cur_pos - len_pos - 4;
+        hvm_ctxt_seek(h, len_pos);
+        hvm_put_32u(h, len);
+        hvm_ctxt_seek(h, cur_pos);
+
+    }
+
+    h->size = hvm_ctxt_tell(h);
+    hvm_ctxt_seek(h, 0);
+
+    if (h->size >= HVM_CTXT_SIZE) {
+        printk("hvm_domain_context overflow when hvm_save! need %"PRId32" bytes for use.\n", h->size);
+        return -1;
+    }
+
+    return 0;
+
+}
+
+static HVMStateEntry *find_se(struct domain *d, const char *idstr, int instance_id)
+{
+    HVMStateEntry *se;
+
+    for(se = d->arch.hvm_domain.first_se; se != NULL; se = se->next) {
+        if (!strncmp(se->idstr, idstr, HVM_SE_IDSTR_LEN) &&
+            instance_id == se->instance_id){
+            return se;
+        }
+    }
+    return NULL;
+}
+
+int hvm_load(struct vcpu *v, hvm_domain_context_t *h)
+{
+    uint32_t len, rec_len, rec_pos, magic, instance_id, version_id;
+    uint32_t eax, ebx, ecx, edx;
+    HVMStateEntry *se;
+    char idstr[HVM_SE_IDSTR_LEN];
+    xen_changeset_info_t chgset;
+    char *cur_chgset;
+    int ret;
+
+    if (!is_hvm_vcpu(v)) {
+        printk("hvm_load only for hvm guest!\n");
+        return -1;
+    }
+
+    if (h->size >= HVM_CTXT_SIZE) {
+        printk("hvm_load fail! seems hvm_domain_context overflow when hvm_save! need %"PRId32" bytes.\n", h->size);
+        return -1;
+    }
+
+    hvm_ctxt_seek(h, 0);
+
+    magic = hvm_get_32u(h);
+    if (magic != HVM_FILE_MAGIC) {
+        printk("HVM restore magic dismatch!\n");
+        return -1;
+    }
+
+    magic = hvm_get_32u(h);
+    if (magic != HVM_FILE_VERSION) {
+        printk("HVM restore version dismatch!\n");
+        return -1;
+    }
+
+    /* check xen change set */
+    cur_chgset = strrchr(XEN_CHANGESET, ' ') + 1;
+
+    len = hvm_get_8u(h);
+    if (len > 20) { /*typical length is 18 -- "revision number:changeset id" */
+        printk("wrong change set length %d when hvm restore!\n", len);
+        return -1;
+    }
+
+    hvm_get_buffer(h, chgset, len);
+    chgset[len] = '\0';
+    if (strncmp(cur_chgset, chgset, len + 1))
+        printk("warnings: try to restore hvm guest(%s) on a different changeset %s.\n",
+                chgset, cur_chgset);
+
+    /* check cpuid */
+    cpuid(1, &eax, &ebx, &ecx, &edx);
+    ebx = hvm_get_32u(h);
+    /*TODO: need difine how big difference is acceptable */
+    if (ebx != eax)
+        printk("warnings: try to restore hvm guest(0x%"PRIx32") "
+               "on a different type processor(0x%"PRIx32").\n",
+                ebx,
+                eax);
+
+    while(1) {
+        if (hvm_ctxt_end(h)) {
+            break;
+        }
+
+        /* ID string */
+        len = hvm_get_8u(h);
+        if (len > HVM_SE_IDSTR_LEN) {
+            printk("wrong HVM save entry idstr len %d!", len);
+            return -1;
+        }
+
+        hvm_get_buffer(h, idstr, len);
+        idstr[len] = '\0';
+
+        instance_id = hvm_get_32u(h);
+        version_id = hvm_get_32u(h);
+
+        rec_len = hvm_get_32u(h);
+        rec_pos = hvm_ctxt_tell(h);
+
+        se = find_se(v->domain, idstr, instance_id);
+        if (se == NULL) {
+            printk("warnings: hvm load can't find device %s's instance %d!\n",
+                    idstr, instance_id);
+        } else {
+            ret = se->load_state(h, se->opaque, version_id);
+            if (ret < 0)
+                printk("warnings: loading state fail for device %s instance %d!\n",
+                        idstr, instance_id);
+        }
+                    
+
+        /* make sure to jump end of record */
+        if ( hvm_ctxt_tell(h) - rec_pos != rec_len) {
+            printk("wrong hvm record size, maybe some dismatch between save&restore handler!\n");
+        }
+        hvm_ctxt_seek(h, rec_pos + rec_len);
+    }
+
+    return 0;
+}
+
+#ifdef HVM_DEBUG_SUSPEND
+static void shpage_info(shared_iopage_t *sh)
+{
+
+    vcpu_iodata_t *p = &sh->vcpu_iodata[0];
+    ioreq_t *req = &p->vp_ioreq;
+    printk("*****sharepage_info******!\n");
+    printk("vp_eport=%d\n", p->vp_eport);
+    printk("io packet: "
+                     "state:%x, pvalid: %x, dir:%x, port: %"PRIx64", "
+                     "data: %"PRIx64", count: %"PRIx64", size: %"PRIx64"\n",
+                     req->state, req->data_is_ptr, req->dir, req->addr,
+                     req->data, req->count, req->size);
+}
+#else
+static void shpage_info(shared_iopage_t *sh)
+{
+}
+#endif
+
+static void shpage_save(hvm_domain_context_t *h, void *opaque)
+{
+    /* XXX:no action required for shpage save/restore, since it's in guest memory
+     * keep it for debug purpose only */
+
+#if 0
+    struct shared_iopage *s = opaque;
+    /* XXX:smp */
+    struct ioreq *req = &s->vcpu_iodata[0].vp_ioreq;
+    
+    shpage_info(s);
+
+    hvm_put_buffer(h, (char*)req, sizeof(struct ioreq));
+#endif
+}
+
+static int shpage_load(hvm_domain_context_t *h, void *opaque, int version_id)
+{
+    struct shared_iopage *s = opaque;
+#if 0
+    /* XXX:smp */
+    struct ioreq *req = &s->vcpu_iodata[0].vp_ioreq;
+
+    if (version_id != 1)
+        return -EINVAL;
+
+    hvm_get_buffer(h, (char*)req, sizeof(struct ioreq));
+
+
+#endif
+    shpage_info(s);
+    return 0;
+}
+
+void shpage_init(struct domain *d, shared_iopage_t *sp)
+{
+    hvm_register_savevm(d, "xen_hvm_shpage", 0x10, 1, shpage_save, shpage_load, sp);
 }
 
 /*
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/hvm/vioapic.c
--- a/xen/arch/x86/hvm/vioapic.c	Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/arch/x86/hvm/vioapic.c	Wed Dec 13 22:52:02 2006 +0800
@@ -466,10 +466,138 @@ void vioapic_update_EOI(struct domain *d
     spin_unlock(&hvm_irq->lock);
 }
 
+#ifdef HVM_DEBUG_SUSPEND
+static void ioapic_info(struct vioapic *s)
+{
+    int i;
+    printk("*****ioapic state:*****\n");
+    printk("ioapic 0x%x.\n", s->ioregsel);
+    printk("ioapic 0x%x.\n", s->id);
+    printk("ioapic 0x%lx.\n", s->base_address);
+    for (i = 0; i < VIOAPIC_NUM_PINS; i++) {
+        printk("ioapic redirtbl[%d]:0x%"PRIx64"\n", i, s->redirtbl[i].bits);
+    }
+
+}
+static void hvmirq_info(struct hvm_irq *hvm_irq)
+{
+    int i;
+    printk("*****hvmirq state:*****\n");
+    for (i = 0; i < BITS_TO_LONGS(32*4); i++)
+        printk("hvmirq pci_intx[%d]:0x%lx.\n", i, hvm_irq->pci_intx[i]);
+
+    for (i = 0; i < BITS_TO_LONGS(16); i++)
+        printk("hvmirq isa_irq[%d]:0x%lx.\n", i, hvm_irq->isa_irq[i]);
+
+    for (i = 0; i < BITS_TO_LONGS(1); i++)
+        printk("hvmirq callback_irq_wire[%d]:0x%lx.\n", i, hvm_irq->callback_irq_wire[i]);
+
+    printk("hvmirq callback_gsi:0x%x.\n", hvm_irq->callback_gsi);
+
+    for (i = 0; i < 4; i++)
+        printk("hvmirq pci_link_route[%d]:0x%"PRIx8".\n", i, hvm_irq->pci_link_route[i]);
+
+    for (i = 0; i < 4; i++)
+        printk("hvmirq pci_link_assert_count[%d]:0x%"PRIx8".\n", i, hvm_irq->pci_link_assert_count[i]);
+
+    for (i = 0; i < 4; i++)
+        printk("hvmirq gsi_assert_count[%d]:0x%"PRIx8".\n", i, hvm_irq->gsi_assert_count[i]);
+
+    printk("hvmirq round_robin_prev_vcpu:0x%"PRIx8".\n", hvm_irq->round_robin_prev_vcpu);
+}
+#else
+static void ioapic_info(struct vioapic *s)
+{
+}
+static void hvmirq_info(struct hvm_irq *hvm_irq)
+{
+}
+#endif
+
+static void ioapic_save(hvm_domain_context_t *h, void *opaque)
+{
+    int i;
+    struct domain *d = opaque;
+    struct vioapic *s = domain_vioapic(d);
+    struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq;
+
+    ioapic_info(s);
+    hvmirq_info(hvm_irq);
+
+    /* save iopaic state*/
+    hvm_put_32u(h, s->ioregsel);
+    hvm_put_32u(h, s->id);
+    hvm_put_64u(h, s->base_address);
+    for (i = 0; i < VIOAPIC_NUM_PINS; i++) {
+        hvm_put_64u(h, s->redirtbl[i].bits);
+    }
+
+    /* save hvm irq state */
+    hvm_put_buffer(h, (char*)hvm_irq->pci_intx, 16);
+    hvm_put_buffer(h, (char*)hvm_irq->isa_irq, 2);
+    hvm_put_buffer(h, (char*)hvm_irq->callback_irq_wire, 1);
+    hvm_put_32u(h, hvm_irq->callback_gsi);
+
+    for (i = 0; i < 4; i++)
+        hvm_put_8u(h, hvm_irq->pci_link_route[i]);
+
+    for (i = 0; i < 4; i++)
+        hvm_put_8u(h, hvm_irq->pci_link_assert_count[i]);
+
+    for (i = 0; i < VIOAPIC_NUM_PINS; i++)
+        hvm_put_8u(h, hvm_irq->gsi_assert_count[i]);
+
+    hvm_put_8u(h, hvm_irq->round_robin_prev_vcpu);
+
+}
+
+static int ioapic_load(hvm_domain_context_t *h, void *opaque, int version_id)
+{
+    int i;
+    struct domain *d = opaque;
+    struct vioapic *s = domain_vioapic(d);
+    struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq;
+    
+    if (version_id != 1)
+        return -EINVAL;
+
+    /* restore ioapic state */
+    s->ioregsel = hvm_get_32u(h);
+    s->id = hvm_get_32u(h);
+    s->base_address = hvm_get_64u(h);
+    for (i = 0; i < VIOAPIC_NUM_PINS; i++) {
+        s->redirtbl[i].bits = hvm_get_64u(h);
+    }
+
+    /* restore irq state */
+    hvm_get_buffer(h, (char*)hvm_irq->pci_intx, 16);
+    hvm_get_buffer(h, (char*)hvm_irq->isa_irq, 2);
+    hvm_get_buffer(h, (char*)hvm_irq->callback_irq_wire, 1);
+    hvm_irq->callback_gsi = hvm_get_32u(h);
+
+    for (i = 0; i < 4; i++)
+        hvm_irq->pci_link_route[i] = hvm_get_8u(h);
+
+    for (i = 0; i < 4; i++)
+        hvm_irq->pci_link_assert_count[i] = hvm_get_8u(h);
+
+    for (i = 0; i < VIOAPIC_NUM_PINS; i++)
+        hvm_irq->gsi_assert_count[i] = hvm_get_8u(h);
+
+    hvm_irq->round_robin_prev_vcpu = hvm_get_8u(h);
+
+    ioapic_info(s);
+    hvmirq_info(hvm_irq);
+
+    return 0;
+}
+
 void vioapic_init(struct domain *d)
 {
     struct vioapic *vioapic = domain_vioapic(d);
     int i;
+
+    hvm_register_savevm(d, "xen_hvm_ioapic", 0, 1, ioapic_save, ioapic_load, d);
 
     memset(vioapic, 0, sizeof(*vioapic));
     for ( i = 0; i < VIOAPIC_NUM_PINS; i++ )
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/hvm/vlapic.c
--- a/xen/arch/x86/hvm/vlapic.c	Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/arch/x86/hvm/vlapic.c	Wed Dec 13 22:52:02 2006 +0800
@@ -921,6 +921,82 @@ static int vlapic_reset(struct vlapic *v
     return 1;
 }
 
+#ifdef HVM_DEBUG_SUSPEND
+static void lapic_info(struct vlapic *s)
+{
+    printk("*****lapic state:*****\n");
+    printk("lapic 0x%"PRIx64".\n", s->apic_base_msr);
+    printk("lapic 0x%x.\n", s->disabled);
+    printk("lapic 0x%x.\n", s->timer_divisor);
+    printk("lapic 0x%x.\n", s->timer_pending_count);
+}
+#else
+static void lapic_info(struct vlapic *s)
+{
+}
+#endif
+
+static void lapic_save(hvm_domain_context_t *h, void *opaque)
+{
+    struct vlapic *s = opaque;
+
+    lapic_info(s);
+
+    hvm_put_64u(h, s->apic_base_msr);
+    hvm_put_32u(h, s->disabled);
+    hvm_put_32u(h, s->timer_divisor);
+
+    /*XXX: need this?*/
+    hvm_put_32u(h, s->timer_pending_count);
+
+    hvm_put_buffer(h, (char*)s->regs, 0x3f0);
+
+}
+
+static int lapic_load(hvm_domain_context_t *h, void *opaque, int version_id)
+{
+    struct vlapic *s = opaque;
+    uint32_t tmict;
+
+    if (version_id != 1)
+        return -EINVAL;
+
+    s->apic_base_msr = hvm_get_64u(h);
+    s->disabled = hvm_get_32u(h);
+    s->timer_divisor = hvm_get_32u(h);
+
+    /*XXX: need this?*/
+    s->timer_pending_count = hvm_get_32u(h);
+
+    hvm_get_buffer(h, (char*)s->regs, 0x3f0);
+
+    /* rearm the actiemr if needed */
+    tmict = vlapic_get_reg(s, APIC_TMICT);
+    if (tmict > 0) {
+        s_time_t now = NOW(), offset;
+        stop_timer(&s->vlapic_timer);
+        vlapic_set_reg(s, APIC_TMCCT, tmict);
+        s->timer_last_update = now;
+
+        offset = APIC_BUS_CYCLE_NS * s->timer_divisor * tmict;
+
+        set_timer(&s->vlapic_timer, now + offset);
+
+        printk("lapic_load to rearm the actimer:"
+                    "bus cycle is %"PRId64"ns, now 0x%016"PRIx64", "
+                    "timer initial count 0x%x, offset 0x%016"PRIx64", "
+                    "expire @ 0x%016"PRIx64".",
+                    APIC_BUS_CYCLE_NS, now,
+                    vlapic_get_reg(s, APIC_TMICT),
+                    offset, now + offset);
+    }
+
+
+    lapic_info(s);
+
+    return 0;
+}
+
 int vlapic_init(struct vcpu *v)
 {
     struct vlapic *vlapic = vcpu_vlapic(v);
@@ -939,6 +1015,7 @@ int vlapic_init(struct vcpu *v)
     vlapic->regs = map_domain_page_global(page_to_mfn(vlapic->regs_page));
     memset(vlapic->regs, 0, PAGE_SIZE);
 
+    hvm_register_savevm(v->domain, "xen_hvm_lapic", v->vcpu_id, 1, lapic_save, lapic_load, vlapic);
     vlapic_reset(vlapic);
 
     vlapic->apic_base_msr = MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c	Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/arch/x86/hvm/vmx/vmx.c	Wed Dec 13 22:52:02 2006 +0800
@@ -426,6 +426,319 @@ static void vmx_store_cpu_guest_regs(
     vmx_vmcs_exit(v);
 }
 
+static int __get_instruction_length(void);
+int vmx_vmcs_save(struct vcpu *v, struct vmcs_data *c)
+{
+    unsigned long inst_len;
+
+    inst_len = __get_instruction_length();
+    c->eip = __vmread(GUEST_RIP);
+
+#ifdef HVM_DEBUG_SUSPEND
+    printk("vmx_vmcs_save: inst_len=0x%lx, eip=0x%"PRIx64".\n", 
+            inst_len, c->eip);
+#endif
+
+    c->esp = __vmread(GUEST_RSP);
+    c->eflags = __vmread(GUEST_RFLAGS);
+
+    c->cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
+    c->cr3 = v->arch.hvm_vmx.cpu_cr3;
+    c->cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
+
+#ifdef HVM_DEBUG_SUSPEND
+    printk("vmx_vmcs_save: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
+            c->cr3,
+            c->cr0,
+            c->cr4);
+#endif
+
+    c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
+    c->idtr_base = __vmread(GUEST_IDTR_BASE);
+
+    c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
+    c->gdtr_base = __vmread(GUEST_GDTR_BASE);
+
+    c->cs_sel = __vmread(GUEST_CS_SELECTOR);
+    c->cs_limit = __vmread(GUEST_CS_LIMIT);
+    c->cs_base = __vmread(GUEST_CS_BASE);
+    c->cs_arbytes = __vmread(GUEST_CS_AR_BYTES);
+
+    c->ds_sel = __vmread(GUEST_DS_SELECTOR);
+    c->ds_limit = __vmread(GUEST_DS_LIMIT);
+    c->ds_base = __vmread(GUEST_DS_BASE);
+    c->ds_arbytes = __vmread(GUEST_DS_AR_BYTES);
+
+    c->es_sel = __vmread(GUEST_ES_SELECTOR);
+    c->es_limit = __vmread(GUEST_ES_LIMIT);
+    c->es_base = __vmread(GUEST_ES_BASE);
+    c->es_arbytes = __vmread(GUEST_ES_AR_BYTES);
+
+    c->ss_sel = __vmread(GUEST_SS_SELECTOR);
+    c->ss_limit = __vmread(GUEST_SS_LIMIT);
+    c->ss_base = __vmread(GUEST_SS_BASE);
+    c->ss_arbytes = __vmread(GUEST_SS_AR_BYTES);
+
+    c->fs_sel = __vmread(GUEST_FS_SELECTOR);
+    c->fs_limit = __vmread(GUEST_FS_LIMIT);
+    c->fs_base = __vmread(GUEST_FS_BASE);
+    c->fs_arbytes = __vmread(GUEST_FS_AR_BYTES);
+
+    c->gs_sel = __vmread(GUEST_GS_SELECTOR);
+    c->gs_limit = __vmread(GUEST_GS_LIMIT);
+    c->gs_base = __vmread(GUEST_GS_BASE);
+    c->gs_arbytes = __vmread(GUEST_GS_AR_BYTES);
+
+    c->tr_sel = __vmread(GUEST_TR_SELECTOR);
+    c->tr_limit = __vmread(GUEST_TR_LIMIT);
+    c->tr_base = __vmread(GUEST_TR_BASE);
+    c->tr_arbytes = __vmread(GUEST_TR_AR_BYTES);
+
+    c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
+    c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
+    c->ldtr_base = __vmread(GUEST_LDTR_BASE);
+    c->ldtr_arbytes = __vmread(GUEST_LDTR_AR_BYTES);
+
+    c->sysenter_cs = __vmread(GUEST_SYSENTER_CS);
+    c->sysenter_esp = __vmread(GUEST_SYSENTER_ESP);
+    c->sysenter_eip = __vmread(GUEST_SYSENTER_EIP);
+
+    return 1;
+}
+
+int vmx_vmcs_restore(struct vcpu *v, struct vmcs_data *c)
+{
+    unsigned long mfn, old_cr4, old_base_mfn;
+    int error = 0;
+
+    __vmwrite(GUEST_RIP, c->eip);
+    __vmwrite(GUEST_RSP, c->esp);
+    __vmwrite(GUEST_RFLAGS, c->eflags);
+
+    v->arch.hvm_vmx.cpu_shadow_cr0 = c->cr0;
+    __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
+
+    old_cr4 = __vmread(CR4_READ_SHADOW);
+    __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
+
+    v->arch.hvm_vmx.cpu_shadow_cr4 = c->cr4;
+    __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
+
+#ifdef HVM_DEBUG_SUSPEND
+    printk("vmx_vmcs_restore: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n",
+            c->cr3,
+            c->cr0,
+            c->cr4);
+#endif
+
+    if (!vmx_paging_enabled(v)) {
+        HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
+        __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
+        goto skip_cr3;
+    }
+
+    if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) {
+        /*
+         * This is simple TLB flush, implying the guest has
+         * removed some translation or changed page attributes.
+         * We simply invalidate the shadow.
+         */
+        mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
+        if (mfn != pagetable_get_pfn(v->arch.guest_table)) {
+            printk("Invalid CR3 value=%"PRIx64"", c->cr3);
+            domain_crash(v->domain);
+            return 0;
+        }
+    } else {
+        /*
+         * If different, make a shadow. Check if the PDBR is valid
+         * first.
+         */
+        HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %"PRIx64"", c->cr3);
+        if ((c->cr3 >> PAGE_SHIFT) > v->domain->max_pages) {
+            printk("Invalid CR3 value=%"PRIx64"", c->cr3);
+            domain_crash(v->domain);
+            return 0;
+        }
+
+        /* current!=vcpu as not called by arch_vmx_do_launch */
+        mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
+        if(!get_page(mfn_to_page(mfn), v->domain)) {
+            struct page_info *page = mfn_to_page(mfn);
+            printk("get_page for mfn failed. CR3 value=%"PRIx64", count_info=0x%"PRIx32", type_info=0x%lx, owner=%d.\n", c->cr3,
+                    page->count_info,
+                    page->u.inuse.type_info,
+                    page->u.inuse._domain);
+            return 0;
+        }
+        old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
+        v->arch.guest_table = pagetable_from_pfn(mfn);
+        if (old_base_mfn)
+             put_page(mfn_to_page(old_base_mfn));
+        /*
+         * arch.shadow_table should now hold the next CR3 for shadow
+         */
+        v->arch.hvm_vmx.cpu_cr3 = c->cr3;
+    }
+
+ skip_cr3:
+#if defined(__x86_64__)
+    if (vmx_long_mode_enabled(v)) {
+        unsigned long vm_entry_value;
+        vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
+        vm_entry_value |= VM_ENTRY_IA32E_MODE;
+        __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
+    }
+#endif
+
+    shadow_update_paging_modes(v);
+    __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
+
+    __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
+    __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
+
+    __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
+    __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
+
+    __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
+    __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
+    __vmwrite(GUEST_CS_BASE, c->cs_base);
+    __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes);
+
+    __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
+    __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
+    __vmwrite(GUEST_DS_BASE, c->ds_base);
+    __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes);
+
+    __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
+    __vmwrite(GUEST_ES_LIMIT, c->es_limit);
+    __vmwrite(GUEST_ES_BASE, c->es_base);
+    __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes);
+
+    __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
+    __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
+    __vmwrite(GUEST_SS_BASE, c->ss_base);
+    __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes);
+
+    __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
+    __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
+    __vmwrite(GUEST_FS_BASE, c->fs_base);
+    __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes);
+
+    __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
+    __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
+    __vmwrite(GUEST_GS_BASE, c->gs_base);
+    __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes);
+
+    __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
+    __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
+    __vmwrite(GUEST_TR_BASE, c->tr_base);
+    __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes);
+
+    __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
+    __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
+    __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
+    __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes);
+
+    __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs);
+    __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp);
+    __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip);
+
+    return !error;
+}
+
+#ifdef HVM_DEBUG_SUSPEND
+static void dump_msr_state(struct vmx_msr_state *m)
+{
+    int i = 0;
+    printk("**** msr state ****\n");
+    printk("shadow_gs=0x%lx, flags=0x%lx, msr_items:", m->shadow_gs, m->flags);
+    for (i = 0; i < VMX_MSR_COUNT; i++)
+        printk("0x%lx,", m->msrs[i]);
+    printk("\n");
+}
+#else
+static void dump_msr_state(struct vmx_msr_state *m)
+{
+}
+#endif
+        
+void vmx_save_cpu_state(struct vcpu *v, struct hvmcpu_context *ctxt)
+{
+    struct vmcs_data *data = &ctxt->data;
+    struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
+    unsigned long guest_flags = guest_state->flags;
+    int i = 0;
+
+    data->shadow_gs = guest_state->shadow_gs;
+    data->vmxassist_enabled = v->arch.hvm_vmx.vmxassist_enabled;
+    /* save msrs */
+    data->flags = guest_flags;
+    for (i = 0; i < VMX_MSR_COUNT; i++)
+        data->msr_items[i] = guest_state->msrs[i];
+
+    dump_msr_state(guest_state);
+}
+
+void vmx_load_cpu_state(struct vcpu *v, struct hvmcpu_context *ctxt)
+{
+    int i = 0;
+    struct vmcs_data *data = &ctxt->data;
+    struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
+
+    /* restore msrs */
+    guest_state->flags = data->flags;
+    for (i = 0; i < VMX_MSR_COUNT; i++)
+        guest_state->msrs[i] = data->msr_items[i];
+
+    guest_state->shadow_gs = data->shadow_gs;
+
+    /*XXX:no need to restore msrs, current!=vcpu as not called by arch_vmx_do_launch */
+/*    vmx_restore_guest_msrs(v);*/
+
+    v->arch.hvm_vmx.vmxassist_enabled = data->vmxassist_enabled;
+
+    dump_msr_state(guest_state);
+}
+
+void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvmcpu_context *ctxt)
+{
+    struct vmcs_data *data = &ctxt->data;
+
+    /* set valid flag to recover whole vmcs when restore */
+    ctxt->valid = 1;
+
+    vmx_save_cpu_state(v, ctxt);
+
+    vmx_vmcs_enter(v);
+
+    vmx_vmcs_save(v, data);
+
+    vmx_vmcs_exit(v);
+
+}
+
+void vmx_load_vmcs_ctxt(struct vcpu *v, struct hvmcpu_context *ctxt)
+{
+    if (!ctxt->valid)
+        return;
+
+    vmx_load_cpu_state(v, ctxt);
+
+    vmx_vmcs_enter(v);
+
+    if (!vmx_vmcs_restore(v, &ctxt->data)) {
+        printk("vmx_vmcs restore failed!\n");
+        domain_crash(v->domain);
+    }
+
+    /* only load vmcs once */
+    ctxt->valid = 0;
+
+    vmx_vmcs_exit(v);
+
+}
+
 /*
  * The VMX spec (section 4.3.1.2, Checks on Guest Segment
  * Registers) says that virtual-8086 mode guests' segment
@@ -737,6 +1050,9 @@ static void vmx_setup_hvm_funcs(void)
 
     hvm_funcs.store_cpu_guest_regs = vmx_store_cpu_guest_regs;
     hvm_funcs.load_cpu_guest_regs = vmx_load_cpu_guest_regs;
+
+    hvm_funcs.save_cpu_ctxt = vmx_save_vmcs_ctxt;
+    hvm_funcs.load_cpu_ctxt = vmx_load_vmcs_ctxt;
 
     hvm_funcs.paging_enabled = vmx_paging_enabled;
     hvm_funcs.long_mode_enabled = vmx_long_mode_enabled;
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/hvm/vpic.c
--- a/xen/arch/x86/hvm/vpic.c	Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/arch/x86/hvm/vpic.c	Wed Dec 13 22:52:02 2006 +0800
@@ -378,6 +378,87 @@ static int vpic_intercept_elcr_io(ioreq_
     return 1;
 }
 
+#ifdef HVM_DEBUG_SUSPEND
+static void vpic_info(struct vpic *s)
+{
+    printk("*****pic state:*****\n");
+    printk("pic 0x%x.\n", s->irr);
+    printk("pic 0x%x.\n", s->imr);
+    printk("pic 0x%x.\n", s->isr);
+    printk("pic 0x%x.\n", s->irq_base);
+    printk("pic 0x%x.\n", s->init_state);
+    printk("pic 0x%x.\n", s->priority_add);
+    printk("pic 0x%x.\n", s->readsel_isr);
+    printk("pic 0x%x.\n", s->poll);
+    printk("pic 0x%x.\n", s->auto_eoi);
+    printk("pic 0x%x.\n", s->rotate_on_auto_eoi);
+    printk("pic 0x%x.\n", s->special_fully_nested_mode);
+    printk("pic 0x%x.\n", s->special_mask_mode);
+    printk("pic 0x%x.\n", s->elcr);
+    printk("pic 0x%x.\n", s->int_output);
+    printk("pic 0x%x.\n", s->is_master);
+}
+#else
+static void vpic_info(struct vpic *s)
+{
+}
+#endif
+
+static void vpic_save(hvm_domain_context_t *h, void *opaque)
+{
+    struct vpic *s = opaque;
+    
+    vpic_info(s);
+
+    hvm_put_8u(h, s->irr);
+    hvm_put_8u(h, s->imr);
+    hvm_put_8u(h, s->isr);
+    hvm_put_8u(h, s->irq_base);
+    hvm_put_8u(h, s->init_state);
+    hvm_put_8u(h, s->priority_add);
+    hvm_put_8u(h, s->readsel_isr);
+
+    hvm_put_8u(h, s->poll);
+    hvm_put_8u(h, s->auto_eoi);
+
+    hvm_put_8u(h, s->rotate_on_auto_eoi);
+    hvm_put_8u(h, s->special_fully_nested_mode);
+    hvm_put_8u(h, s->special_mask_mode);
+
+    hvm_put_8u(h, s->elcr);
+    hvm_put_8u(h, s->int_output);
+}
+
+static int vpic_load(hvm_domain_context_t *h, void *opaque, int version_id)
+{
+    struct vpic *s = opaque;
+    
+    if (version_id != 1)
+        return -EINVAL;
+
+    s->irr = hvm_get_8u(h);
+    s->imr = hvm_get_8u(h);
+    s->isr = hvm_get_8u(h);
+    s->irq_base = hvm_get_8u(h);
+    s->init_state = hvm_get_8u(h);
+    s->priority_add = hvm_get_8u(h);
+    s->readsel_isr = hvm_get_8u(h);
+
+    s->poll = hvm_get_8u(h);
+    s->auto_eoi = hvm_get_8u(h);
+
+    s->rotate_on_auto_eoi = hvm_get_8u(h);
+    s->special_fully_nested_mode = hvm_get_8u(h);
+    s->special_mask_mode = hvm_get_8u(h);
+
+    s->elcr = hvm_get_8u(h);
+    s->int_output = hvm_get_8u(h);
+
+    vpic_info(s);
+
+    return 0;
+}
+
 void vpic_init(struct domain *d)
 {
     struct vpic *vpic;
@@ -387,12 +468,14 @@ void vpic_init(struct domain *d)
     memset(vpic, 0, sizeof(*vpic));
     vpic->is_master = 1;
     vpic->elcr      = 1 << 2;
+    hvm_register_savevm(d, "xen_hvm_i8259", 0x20, 1, vpic_save, vpic_load, vpic);
     register_portio_handler(d, 0x20, 2, vpic_intercept_pic_io);
     register_portio_handler(d, 0x4d0, 1, vpic_intercept_elcr_io);
 
     /* Slave PIC. */
     vpic++;
     memset(vpic, 0, sizeof(*vpic));
+    hvm_register_savevm(d, "xen_hvm_i8259", 0xa0, 1, vpic_save, vpic_load, vpic);
     register_portio_handler(d, 0xa0, 2, vpic_intercept_pic_io);
     register_portio_handler(d, 0x4d1, 1, vpic_intercept_elcr_io);
 }
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/mm/shadow/common.c
--- a/xen/arch/x86/mm/shadow/common.c	Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/arch/x86/mm/shadow/common.c	Wed Dec 13 22:52:02 2006 +0800
@@ -2145,7 +2145,7 @@ int shadow_remove_all_mappings(struct vc
         /* Don't complain if we're in HVM and there's one extra mapping: 
          * The qemu helper process has an untyped mapping of this dom's RAM */
         if ( !(shadow_mode_external(v->domain)
-               && (page->count_info & PGC_count_mask) <= 2
+               && (page->count_info & PGC_count_mask) <= 3 /* vmx restore add one extra mapping*/
                && (page->u.inuse.type_info & PGT_count_mask) == 0) )
         {
             SHADOW_ERROR("can't find all mappings of mfn %lx: "
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c	Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/arch/x86/mm/shadow/multi.c	Wed Dec 13 22:52:02 2006 +0800
@@ -1613,6 +1613,14 @@ sh_make_shadow(struct vcpu *v, mfn_t gmf
         }
     }
     
+    {
+        struct page_info *page = mfn_to_page(gmfn);
+        /* XXX: add it to emulate a touched page */
+        if ((page->u.inuse.type_info & PGT_type_mask) == PGT_none){
+            page->u.inuse.type_info |= (PGT_writable_page | PGT_validated);
+        }
+    }
+
     shadow_promote(v, gmfn, shadow_type);
     set_shadow_status(v, gmfn, shadow_type, smfn);
 
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/common/domain.c
--- a/xen/common/domain.c	Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/common/domain.c	Wed Dec 13 22:52:02 2006 +0800
@@ -24,6 +24,7 @@
 #include <xen/percpu.h>
 #include <xen/multicall.h>
 #include <asm/debugger.h>
+#include <asm/hvm/support.h>
 #include <public/sched.h>
 #include <public/vcpu.h>
 
@@ -454,8 +455,14 @@ int set_info_guest(struct domain *d,
     domain_pause(d);
 
     rc = -EFAULT;
-    if ( copy_from_guest(c, vcpucontext->ctxt, 1) == 0 )
+    if ( copy_from_guest(c, vcpucontext->ctxt, 1) == 0 ) {
         rc = arch_set_info_guest(v, c);
+        if ( v->vcpu_id != 0 &&
+                is_hvm_vcpu(v) &&
+                test_and_clear_bit(_VCPUF_down, &v->vcpu_flags) ) {
+            vcpu_wake(v);
+        }
+    }
 
     domain_unpause(d);
 
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/common/domctl.c
--- a/xen/common/domctl.c	Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/common/domctl.c	Wed Dec 13 22:52:02 2006 +0800
@@ -26,6 +26,10 @@ extern long arch_do_domctl(
     struct xen_domctl *op, XEN_GUEST_HANDLE(xen_domctl_t) u_domctl);
 extern void arch_getdomaininfo_ctxt(
     struct vcpu *, struct vcpu_guest_context *);
+extern int arch_gethvm_ctxt(
+    struct vcpu *, struct hvm_domain_context *);
+extern int arch_sethvm_ctxt(
+        struct vcpu *, struct hvm_domain_context *);
 
 void cpumask_to_xenctl_cpumap(
     struct xenctl_cpumap *xenctl_cpumap, cpumask_t *cpumask)
@@ -205,6 +209,37 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
     }
     break;
 
+    case XEN_DOMCTL_sethvmcontext:
+    { 
+        struct hvm_domain_context *c;
+        struct domain             *d;
+        struct vcpu               *v;
+
+        ret = -ESRCH;
+        if ( (d = find_domain_by_id(op->domain)) == NULL )
+            break;
+
+        ret = -ENOMEM;
+        if ( (c = xmalloc(struct hvm_domain_context)) == NULL )
+            goto sethvmcontext_out;
+
+        /*XXX: need check input vcpu when smp */
+        v = d->vcpu[0];
+        
+        ret = -EFAULT;
+        if ( copy_from_guest(c, op->u.hvmcontext.ctxt, 1) != 0 )
+            goto sethvmcontext_out;
+
+        ret = arch_sethvm_ctxt(v, c);
+
+        xfree(c);
+
+    sethvmcontext_out:
+        put_domain(d);
+
+    }
+    break;
+
     case XEN_DOMCTL_pausedomain:
     {
         struct domain *d = find_domain_by_id(op->domain);
@@ -489,6 +524,44 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
 
     getvcpucontext_out:
         put_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_gethvmcontext:
+    { 
+        struct hvm_domain_context *c;
+        struct domain             *d;
+        struct vcpu               *v;
+
+        ret = -ESRCH;
+        if ( (d = find_domain_by_id(op->domain)) == NULL )
+            break;
+
+        ret = -ENOMEM;
+        if ( (c = xmalloc(struct hvm_domain_context)) == NULL )
+            goto gethvmcontext_out;
+
+        v = d->vcpu[0];
+
+        ret = -ENODATA;
+        if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
+            goto gethvmcontext_out;
+        
+        ret = 0;
+        if (arch_gethvm_ctxt(v, c) == -1)
+            ret = -EFAULT;
+
+        if ( copy_to_guest(op->u.hvmcontext.ctxt, c, 1) )
+            ret = -EFAULT;
+
+        xfree(c);
+
+        if ( copy_to_guest(u_domctl, op, 1) )
+            ret = -EFAULT;
+
+    gethvmcontext_out:
+        put_domain(d);
+
     }
     break;
 
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/asm-x86/hvm/domain.h
--- a/xen/include/asm-x86/hvm/domain.h	Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/include/asm-x86/hvm/domain.h	Wed Dec 13 22:52:02 2006 +0800
@@ -27,6 +27,20 @@
 #include <asm/hvm/io.h>
 #include <public/hvm/params.h>
 
+typedef void SaveStateHandler(hvm_domain_context_t *h, void *opaque);
+typedef int LoadStateHandler(hvm_domain_context_t *h, void *opaque, int version_id);
+
+#define HVM_SE_IDSTR_LEN 32
+typedef struct HVMStateEntry {
+    char idstr[HVM_SE_IDSTR_LEN];
+    int instance_id;
+    int version_id;
+    SaveStateHandler *save_state;
+    LoadStateHandler *load_state;
+    void *opaque;
+    struct HVMStateEntry *next;
+} HVMStateEntry;
+
 struct hvm_domain {
     unsigned long          shared_page_va;
     unsigned long          buffered_io_va;
@@ -44,6 +58,9 @@ struct hvm_domain {
     spinlock_t             pbuf_lock;
 
     uint64_t               params[HVM_NR_PARAMS];
+
+    struct hvm_domain_context *hvm_ctxt;
+    HVMStateEntry *first_se;
 };
 
 #endif /* __ASM_X86_HVM_DOMAIN_H__ */
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h	Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/include/asm-x86/hvm/hvm.h	Wed Dec 13 22:52:02 2006 +0800
@@ -79,6 +79,13 @@ struct hvm_function_table {
         struct vcpu *v, struct cpu_user_regs *r, unsigned long *crs);
     void (*load_cpu_guest_regs)(
         struct vcpu *v, struct cpu_user_regs *r);
+
+    /* save and load hvm guest cpu context for save/restore */
+    void (*save_cpu_ctxt)(
+        struct vcpu *v, struct hvmcpu_context *ctxt);
+    void (*load_cpu_ctxt)(
+        struct vcpu *v, struct hvmcpu_context *ctxt);
+
     /*
      * Examine specifics of the guest state:
      * 1) determine whether paging is enabled,
@@ -152,6 +159,20 @@ hvm_load_cpu_guest_regs(struct vcpu *v, 
     hvm_funcs.load_cpu_guest_regs(v, r);
 }
 
+static inline void
+hvm_save_cpu_context(
+        struct vcpu *v, struct hvmcpu_context *ctxt)
+{
+    hvm_funcs.save_cpu_ctxt(v, ctxt);
+}
+
+static inline void
+hvm_load_cpu_context(
+        struct vcpu *v, struct hvmcpu_context *ctxt)
+{
+    hvm_funcs.load_cpu_ctxt(v, ctxt);
+}
+
 static inline int
 hvm_paging_enabled(struct vcpu *v)
 {
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/asm-x86/hvm/support.h
--- a/xen/include/asm-x86/hvm/support.h	Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/include/asm-x86/hvm/support.h	Wed Dec 13 22:52:02 2006 +0800
@@ -121,6 +121,130 @@ extern unsigned int opt_hvm_debug_level;
 #define TRACE_VMEXIT(index, value)                              \
     current->arch.hvm_vcpu.hvm_trace_values[index] = (value)
 
+/* save/restore support */
+
+//#define HVM_DEBUG_SUSPEND
+
+extern int hvm_register_savevm(struct domain *d,
+                    const char *idstr,
+                    int instance_id,
+                    int version_id,
+                    SaveStateHandler *save_state,
+                    LoadStateHandler *load_state,
+                    void *opaque);
+
+static inline void hvm_ctxt_seek(hvm_domain_context_t *h, unsigned int pos)
+{
+    h->cur = pos;
+}
+
+static inline uint32_t hvm_ctxt_tell(hvm_domain_context_t *h)
+{
+    return h->cur;
+}
+
+static inline int hvm_ctxt_end(hvm_domain_context_t *h)
+{
+    return (h->cur >= h->size || h->cur >= HVM_CTXT_SIZE);
+}
+
+static inline void hvm_put_byte(hvm_domain_context_t *h, unsigned int i)
+{
+    if (h->cur >= HVM_CTXT_SIZE) {
+        h->cur++;
+        return;
+    }
+    h->data[h->cur++] = (char)i;
+}
+
+static inline void hvm_put_8u(hvm_domain_context_t *h, uint8_t b)
+{
+    hvm_put_byte(h, b);
+}
+
+static inline void hvm_put_16u(hvm_domain_context_t *h, uint16_t b)
+{
+    hvm_put_8u(h, b >> 8);
+    hvm_put_8u(h, b);
+}
+
+static inline void hvm_put_32u(hvm_domain_context_t *h, uint32_t b)
+{
+    hvm_put_16u(h, b >> 16);
+    hvm_put_16u(h, b);
+}
+
+static inline void hvm_put_64u(hvm_domain_context_t *h, uint64_t b)
+{
+    hvm_put_32u(h, b >> 32);
+    hvm_put_32u(h, b);
+}
+
+static inline void hvm_put_buffer(hvm_domain_context_t *h, const char *buf, int len)
+{
+    memcpy(&h->data[h->cur], buf, len);
+    h->cur += len;
+}
+
+
+static inline char hvm_get_byte(hvm_domain_context_t *h)
+{
+    if (h->cur >= HVM_CTXT_SIZE) {
+        printk("hvm_get_byte overflow.\n");
+        return -1;
+    }
+
+    if (h->cur >= h->size) {
+        printk("hvm_get_byte exceed data area.\n");
+        return -1;
+    }
+
+    return h->data[h->cur++];
+}
+
+static inline uint8_t hvm_get_8u(hvm_domain_context_t *h)
+{
+    return hvm_get_byte(h);
+}
+
+static inline uint16_t hvm_get_16u(hvm_domain_context_t *h)
+{
+    uint16_t v;
+    v =  hvm_get_8u(h) << 8;
+    v |= hvm_get_8u(h);
+
+    return v;
+}
+
+static inline uint32_t hvm_get_32u(hvm_domain_context_t *h)
+{
+    uint32_t v;
+    v =  hvm_get_16u(h) << 16;
+    v |= hvm_get_16u(h);
+
+    return v;
+}
+
+static inline uint64_t hvm_get_64u(hvm_domain_context_t *h)
+{
+    uint64_t v;
+    v =  (uint64_t)hvm_get_32u(h) << 32;
+    v |= hvm_get_32u(h);
+
+    return v;
+}
+
+static inline void hvm_get_buffer(hvm_domain_context_t *h, char *buf, int len)
+{
+    memcpy(buf, &h->data[h->cur], len);
+    h->cur += len;
+}
+
+extern int hvm_save(struct vcpu*, hvm_domain_context_t *h);
+extern int hvm_load(struct vcpu*, hvm_domain_context_t *h);
+
+extern void shpage_init(struct domain *d, shared_iopage_t *sp);
+
 extern int hvm_enabled;
 
 int hvm_copy_to_guest_phys(paddr_t paddr, void *buf, int size);
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/asm-x86/hvm/vpt.h
--- a/xen/include/asm-x86/hvm/vpt.h	Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/include/asm-x86/hvm/vpt.h	Wed Dec 13 22:52:02 2006 +0800
@@ -123,7 +123,7 @@ extern void hvm_hooks_assist(struct vcpu
 extern void hvm_hooks_assist(struct vcpu *v);
 extern void pickup_deactive_ticks(struct periodic_time *vpit);
 extern struct periodic_time *create_periodic_time(
-    u32 period, char irq, char one_shot, time_cb *cb, void *data);
+    struct domain* d, u32 period, char irq, char one_shot, time_cb *cb, void *data);
 extern void destroy_periodic_time(struct periodic_time *pt);
 void pit_init(struct vcpu *v, unsigned long cpu_khz);
 void rtc_init(struct vcpu *v, int base, int irq);
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/public/arch-x86_32.h
--- a/xen/include/public/arch-x86_32.h	Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/include/public/arch-x86_32.h	Wed Dec 13 22:52:02 2006 +0800
@@ -181,6 +181,13 @@ DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t)
 DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t);
 
 typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
+
+#include "vmcs_data.h"
+
+struct hvmcpu_context {
+    uint32_t valid;
+    struct vmcs_data data;
+};
 
 /*
  * The following is all CPU context. Note that the fpu_ctxt block is filled 
@@ -210,6 +217,7 @@ struct vcpu_guest_context {
     unsigned long failsafe_callback_cs;     /* CS:EIP of failsafe callback  */
     unsigned long failsafe_callback_eip;
     unsigned long vm_assist;                /* VMASST_TYPE_* bitmap */
+    struct hvmcpu_context hvmcpu_ctxt;          /* whole vmcs region */
 };
 typedef struct vcpu_guest_context vcpu_guest_context_t;
 DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/public/arch-x86_64.h
--- a/xen/include/public/arch-x86_64.h	Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/include/public/arch-x86_64.h	Wed Dec 13 22:52:02 2006 +0800
@@ -255,6 +255,13 @@ DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t)
 
 typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
 
+#include "vmcs_data.h"
+
+struct hvmcpu_context {
+    uint32_t valid;
+    struct vmcs_data data;
+};
+
 /*
  * The following is all CPU context. Note that the fpu_ctxt block is filled 
  * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
@@ -288,6 +295,7 @@ struct vcpu_guest_context {
     uint64_t      fs_base;
     uint64_t      gs_base_kernel;
     uint64_t      gs_base_user;
+    struct hvmcpu_context hvmcpu_ctxt;          /* whole vmcs region */
 };
 typedef struct vcpu_guest_context vcpu_guest_context_t;
 DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/public/domctl.h
--- a/xen/include/public/domctl.h	Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/include/public/domctl.h	Wed Dec 13 22:52:02 2006 +0800
@@ -384,6 +384,21 @@ struct xen_domctl_settimeoffset {
 };
 typedef struct xen_domctl_settimeoffset xen_domctl_settimeoffset_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_settimeoffset_t);
+ 
+#define HVM_CTXT_SIZE        6144
+typedef struct hvm_domain_context {
+    uint32_t cur;
+    uint32_t size;
+    uint8_t data[HVM_CTXT_SIZE];
+} hvm_domain_context_t;
+DEFINE_XEN_GUEST_HANDLE(hvm_domain_context_t);
+
+#define XEN_DOMCTL_gethvmcontext   33
+#define XEN_DOMCTL_sethvmcontext   34
+typedef struct xen_domctl_hvmcontext {
+    XEN_GUEST_HANDLE(hvm_domain_context_t) ctxt;  /* IN/OUT */
+} xen_domctl_hvmcontext_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_hvmcontext_t);
 
 struct xen_domctl {
     uint32_t cmd;
@@ -410,6 +425,7 @@ struct xen_domctl {
         struct xen_domctl_hypercall_init    hypercall_init;
         struct xen_domctl_arch_setup        arch_setup;
         struct xen_domctl_settimeoffset     settimeoffset;
+        struct xen_domctl_hvmcontext        hvmcontext;
         uint8_t                             pad[128];
     } u;
 };
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xc_hvm_restore.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_hvm_restore.c	Wed Dec 13 22:52:02 2006 +0800
@@ -0,0 +1,280 @@
+/******************************************************************************
+ * xc_hvm_restore.c
+ *
+ * Restore the state of a HVM guest.
+ *
+ * Copyright (c) 2003, K A Fraser.
+ * Copyright (c) 2006 Intel Corperation
+ * rewriten for hvm guest by Zhai Edwin <edwin.zhai@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "xg_private.h"
+#include "xg_save_restore.h"
+
+#include <xen/hvm/ioreq.h>
+#include <xen/hvm/params.h>
+#include <xen/hvm/e820.h>
+
+/* max mfn of the whole machine */
+static unsigned long max_mfn;
+
+/* virtual starting address of the hypervisor */
+static unsigned long hvirt_start;
+
+/* #levels of page tables used by the currrent guest */
+static unsigned int pt_levels;
+
+/* total number of pages used by the current guest */
+static unsigned long max_pfn;
+
+/* A table mapping each PFN to its new MFN. */
+static xen_pfn_t *p2m = NULL;
+
+static ssize_t
+read_exact(int fd, void *buf, size_t count)
+{
+    int r = 0, s;
+    unsigned char *b = buf;
+
+    while (r < count) {
+        s = read(fd, &b[r], count - r);
+        if ((s == -1) && (errno == EINTR))
+            continue;
+        if (s <= 0) {
+            break;
+        }
+        r += s;
+    }
+
+    return (r == count) ? 1 : 0;
+}
+
+int xc_hvm_restore(int xc_handle, int io_fd,
+                     uint32_t dom, unsigned long nr_pfns,
+                     unsigned int store_evtchn, unsigned long *store_mfn,
+                     unsigned int console_evtchn, unsigned long *console_mfn,
+                     unsigned int pae, unsigned int apic)
+{
+    DECLARE_DOMCTL;
+
+    /* The new domain's shared-info frame number. */
+    unsigned long shared_info_frame;
+
+    /* A copy of the CPU context of the guest. */
+    vcpu_guest_context_t ctxt;
+
+    char *region_base;
+
+    xc_mmu_t *mmu = NULL;
+
+    xc_dominfo_t info;
+    unsigned int rc = 1, i;
+    uint32_t rec_len, nr_vcpus;
+    hvm_domain_context_t hvm_ctxt;
+    unsigned long long v_end, memsize;
+    unsigned long shared_page_nr;
+
+    /* hvm guest mem size (Mb) */
+    memsize = (unsigned long long)*store_mfn;
+    v_end = memsize << 20;
+
+    DPRINTF("xc_hvm_restore:dom=%d, nr_pfns=0x%lx, store_evtchn=%d, *store_mfn=%ld, console_evtchn=%d, *console_mfn=%ld, pae=%u, apic=%u.\n", 
+            dom, nr_pfns, store_evtchn, *store_mfn, console_evtchn, *console_mfn, pae, apic);
+
+
+
+    /*XXX: caculate the VGA hole, it's better derived from memsize*/
+    max_pfn = nr_pfns + 0x20;
+
+    if(!get_platform_info(xc_handle, dom,
+                          &max_mfn, &hvirt_start, &pt_levels)) {
+        ERROR("Unable to get platform info.");
+        return 1;
+    }
+
+    DPRINTF("xc_hvm_restore start: max_pfn = %lx, max_mfn = %lx, hvirt_start=%lx, pt_levels=%d\n",
+            max_pfn,
+            max_mfn,
+            hvirt_start,
+            pt_levels);
+
+    if (mlock(&ctxt, sizeof(ctxt))) {
+        /* needed for build dom0 op, but might as well do early */
+        ERROR("Unable to mlock ctxt");
+        return 1;
+    }
+
+
+    p2m        = malloc(max_pfn * sizeof(xen_pfn_t));
+
+    if (p2m == NULL) {
+        ERROR("memory alloc failed");
+        errno = ENOMEM;
+        goto out;
+    }
+
+    /* Get the domain's shared-info frame. */
+    domctl.cmd = XEN_DOMCTL_getdomaininfo;
+    domctl.domain = (domid_t)dom;
+    if (xc_domctl(xc_handle, &domctl) < 0) {
+        ERROR("Could not get information on new domain");
+        goto out;
+    }
+    shared_info_frame = domctl.u.getdomaininfo.shared_info_frame;
+
+    if(xc_domain_setmaxmem(xc_handle, dom, PFN_TO_KB(max_pfn)) != 0) {
+        errno = ENOMEM;
+        goto out;
+    }
+
+    for ( i = 0; i < max_pfn; i++ )
+        p2m[i] = i;
+    for ( i = HVM_BELOW_4G_RAM_END >> PAGE_SHIFT; i < max_pfn; i++ )
+        p2m[i] += HVM_BELOW_4G_MMIO_LENGTH >> PAGE_SHIFT;
+
+    /* Allocate memory for HVM guest, skipping VGA hole 0xA0000-0xC0000. */
+    rc = xc_domain_memory_populate_physmap(
+        xc_handle, dom, (max_pfn > 0xa0) ? 0xa0 : max_pfn,
+        0, 0, &p2m[0x00]);
+    if ( (rc == 0) && (max_pfn > 0xc0) )
+        rc = xc_domain_memory_populate_physmap(
+            xc_handle, dom, max_pfn - 0xc0, 0, 0, &p2m[0xc0]);
+    if ( rc != 0 )
+    {
+        PERROR("Could not allocate memory for HVM guest.\n");
+        goto out;
+    }
+
+
+    /**********XXXXXXXXXXXXXXXX******************/
+    if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
+        ERROR("Could not get domain info");
+        return 1;
+    }
+
+    domctl.cmd = XEN_DOMCTL_getdomaininfo;
+    domctl.domain = (domid_t)dom;
+    if (xc_domctl(xc_handle, &domctl) < 0) {
+        ERROR("Could not get information on new domain");
+        goto out;
+    }
+
+    for ( i = 0; i < max_pfn; i++)
+        p2m[i] = i;
+
+    /* resotre memory */
+    if ( (region_base = xc_map_foreign_batch(xc_handle, dom, PROT_READ | PROT_WRITE, p2m, max_pfn) ) == 0) {
+        ERROR("HVM:map page_array failed!\n");
+        goto out;
+    }
+
+    for (i = 0; i < max_pfn; i++) {
+        void *zpage = region_base + i * PAGE_SIZE;
+        if ( p2m[i] == (~0UL)) { /*invalid mfn*/
+            continue;
+        }
+        if (i >= 0xa0 && i < 0xc0) {
+            continue;
+        }
+
+        if (!read_exact(io_fd, zpage, PAGE_SIZE)) {
+            ERROR("HVM:read page %d failed!\n", i);
+            goto out;
+        }
+    }
+
+    (void)munmap(region_base, max_pfn*PAGE_SIZE);
+
+
+/*    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_APIC_ENABLED, apic);*/
+    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_PAE_ENABLED, pae);
+
+    if ( v_end > HVM_BELOW_4G_RAM_END )
+        shared_page_nr = (HVM_BELOW_4G_RAM_END >> PAGE_SHIFT) - 1;
+    else
+        shared_page_nr = (v_end >> PAGE_SHIFT) - 1;
+
+    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN, shared_page_nr-2);
+    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN, shared_page_nr);
+
+    /* caculate the store_mfn , wrong val cause hang when introduceDomain */
+    *store_mfn = p2m[(v_end >> PAGE_SHIFT) - 2];
+    DPRINTF("hvm restore:calculate new store_mfn=0x%lx,v_end=0x%llx..\n", *store_mfn, v_end);
+
+    /* restore hvm context including pic/pit/shpage */
+    if (!read_exact(io_fd, &rec_len, sizeof(uint32_t))) {
+        ERROR("error read hvm context size!\n");
+        goto out;
+    }
+    if (rec_len != sizeof(hvm_ctxt)) {
+        ERROR("hvm context size dismatch!\n");
+        goto out;
+    }
+
+    if (!read_exact(io_fd, &hvm_ctxt, sizeof(hvm_ctxt))) {
+        ERROR("error read hvm context!\n");
+        goto out;
+    }
+
+    if (( rc = xc_domain_hvm_setcontext(xc_handle, dom, &hvm_ctxt))) {
+        ERROR("error set hvm context!\n");
+        goto out;
+    }
+
+    if (!read_exact(io_fd, &nr_vcpus, sizeof(uint32_t))) {
+        ERROR("error read nr vcpu !\n");
+        goto out;
+    }
+    DPRINTF("hvm restore:get nr_vcpus=%d.\n", nr_vcpus);
+
+    for (i =0; i < nr_vcpus; i++) {
+        if (!read_exact(io_fd, &rec_len, sizeof(uint32_t))) {
+            ERROR("error read vcpu context size!\n");
+            goto out;
+        }
+        if (rec_len != sizeof(ctxt)) {
+            ERROR("vcpu context size dismatch!\n");
+            goto out;
+        }
+
+        if (!read_exact(io_fd, &(ctxt), sizeof(ctxt))) {
+            ERROR("error read vcpu context.\n");
+            goto out;
+        }
+
+        if ( (rc = xc_vcpu_setcontext(xc_handle, dom, i, &ctxt)) ) {
+            ERROR("Could not set vcpu context, rc=%d", rc);
+            goto out;
+        }
+    }
+
+    rc = 0;
+    goto out;
+
+ out:
+    if ( (rc != 0) && (dom != 0) )
+        xc_domain_destroy(xc_handle, dom);
+    free(mmu);
+    free(p2m);
+
+    DPRINTF("Restore exit with rc=%d\n", rc);
+
+    return rc;
+}
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xc_hvm_save.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_hvm_save.c	Wed Dec 13 22:52:02 2006 +0800
@@ -0,0 +1,248 @@
+/******************************************************************************
+ * xc_hvm_save.c
+ *
+ * Save the state of a running HVM guest.
+ *
+ * Copyright (c) 2003, K A Fraser.
+ * Copyright (c) 2006 Intel Corperation
+ * rewriten for hvm guest by Zhai Edwin <edwin.zhai@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <inttypes.h>
+#include <time.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/time.h>
+
+#include "xc_private.h"
+#include "xg_private.h"
+#include "xg_save_restore.h"
+
+#define DEF_MAX_ITERS    (4 - 1)	/* limit us to 4 times round loop  */
+#define DEF_MAX_FACTOR   3		/* never send more than 3x nr_pfns */
+
+/* max mfn of the whole machine */
+static unsigned long max_mfn;
+
+/* virtual starting address of the hypervisor */
+static unsigned long hvirt_start;
+
+/* #levels of page tables used by the currrent guest */
+static unsigned int pt_levels;
+
+/* total number of pages used by the current guest */
+static unsigned long max_pfn;
+
+#define ratewrite(_io_fd, _buf, _n) write((_io_fd), (_buf), (_n))
+
+int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
+                  uint32_t max_factor, uint32_t flags, int (*suspend)(int))
+{
+    xc_dominfo_t info;
+
+    int rc = 1, i;
+    int live  = (flags & XCFLAGS_LIVE);
+    int debug = (flags & XCFLAGS_DEBUG);
+
+    /* The new domain's shared-info frame number. */
+    unsigned long shared_info_frame;
+
+    /* A copy of the CPU context of the guest. */
+    vcpu_guest_context_t ctxt;
+
+    /* A copy of hvm domain context */
+    hvm_domain_context_t hvm_ctxt;
+
+    /* Live mapping of shared info structure */
+    shared_info_t *live_shinfo = NULL;
+
+    /* base of the region in which domain memory is mapped */
+    unsigned char *region_base = NULL;
+
+    uint32_t nr_pfns, max_pfns, rec_size, nr_vcpus;
+    unsigned long *page_array;
+
+    DPRINTF("xc_hvm_save:dom=%d, max_iters=%d, max_factor=%d, flags=0x%x.\n",
+            dom, max_iters, max_factor, flags);
+
+    /* If no explicit control parameters given, use defaults */
+    if(!max_iters)
+        max_iters = DEF_MAX_ITERS;
+    if(!max_factor)
+        max_factor = DEF_MAX_FACTOR;
+
+/*    initialize_mbit_rate();*/
+
+    if(!get_platform_info(xc_handle, dom,
+                          &max_mfn, &hvirt_start, &pt_levels)) {
+        ERROR("HVM:Unable to get platform info.");
+        return 1;
+    }
+
+    if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
+        ERROR("HVM:Could not get domain info");
+        return 1;
+    }
+    nr_vcpus = info.nr_online_vcpus;
+
+    if (mlock(&ctxt, sizeof(ctxt))) {
+        ERROR("HVM:Unable to mlock ctxt");
+        return 1;
+    }
+
+    /* Only have to worry about vcpu 0 even for SMP */
+    if (xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt)) {
+        ERROR("HVM:Could not get vcpu context");
+        goto out;
+    }
+    shared_info_frame = info.shared_info_frame;
+
+    /* A cheesy test to see whether the domain contains valid state. */
+    if (ctxt.ctrlreg[3] == 0)
+    {
+        ERROR("Domain is not in a valid HVM guest state");
+        goto out;
+    }
+
+   /* cheesy sanity check */
+    if ((info.max_memkb >> (PAGE_SHIFT - 10)) > max_mfn) {
+        ERROR("Invalid HVM state record -- pfn count out of range: %lu",
+            (info.max_memkb >> (PAGE_SHIFT - 10)));
+        goto out;
+    }
+
+    /* Map the shared info frame */
+    if(!(live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
+                                            PROT_READ, shared_info_frame))) {
+        ERROR("HVM:Couldn't map live_shinfo");
+        goto out;
+    }
+
+    max_pfn = live_shinfo->arch.max_pfn;
+
+    DPRINTF("saved hvm domain info:max_memkb=0x%lx, max_mfn=0x%lx, nr_pages=0x%lx\n", info.max_memkb, max_mfn, info.nr_pages); 
+
+    if (live) {
+        ERROR("hvm domain doesn't support live migration now.\n");
+        if (debug)
+            ERROR("hvm domain debug on.\n");
+        goto out;
+    }
+
+    /* suspend hvm domain */
+    if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt)) {
+        ERROR("HVM Domain appears not to have suspended");
+        goto out;
+    }
+
+    nr_pfns = info.nr_pages;
+    DPRINTF("after suspend hvm domain nr_pages=0x%x, max_memkb=0x%lx.\n", nr_pfns, info.max_memkb);
+
+    /*XXX: caculate the VGA hole*/
+    max_pfns = nr_pfns + 0x20;
+
+    /* get all the HVM domain pfns */
+    if ( (page_array = (unsigned long *) malloc (sizeof(unsigned long) * max_pfns)) == NULL) {
+        ERROR("HVM:malloc fail!\n");
+        goto out;
+    }
+
+    for ( i = 0; i < max_pfns; i++)
+        page_array[i] = i;
+
+    if ( (region_base = xc_map_foreign_batch(xc_handle, dom, PROT_READ | PROT_WRITE, page_array, max_pfns) ) == 0) {
+        ERROR("HVM domain map pages failed!\n");
+        goto out;
+    }
+
+
+    /* Start writing out the saved-domain record. begin with mem */
+    if (!write_exact(io_fd, &nr_pfns, sizeof(unsigned int))) {
+        ERROR("write: nr_pfns");
+        goto out;
+    }
+
+    for (i = 0; i < max_pfns; i++) {
+        int ret;
+        void *zpage = region_base + i * PAGE_SIZE;
+        if ( page_array[i] == (~0UL)) {
+            continue;
+        }
+        if (i >= 0xa0 && i < 0xc0) {
+            continue;
+        }
+            
+        if ((ret = ratewrite(io_fd, zpage, PAGE_SIZE)) != PAGE_SIZE) {
+            ERROR("HVM:read page %d failed, mfn=0x%lx.\n", i, page_array[i]);
+            goto out;
+        }
+    }
+
+    /* save hvm hypervisor state including pic/pit/shpage */
+    if (mlock(&hvm_ctxt, sizeof(hvm_ctxt))) {
+        ERROR("Unable to mlock ctxt");
+        return 1;
+    }
+
+    if (xc_domain_hvm_getcontext(xc_handle, dom, &hvm_ctxt)){
+        ERROR("HVM:Could not get hvm context");
+        goto out;
+    }
+
+    rec_size = sizeof(hvm_ctxt);
+    if (!write_exact(io_fd, &rec_size, sizeof(uint32_t))) {
+        ERROR("error write hvm ctxt size");
+        goto out;
+    }
+
+    if ( !write_exact(io_fd, &hvm_ctxt, sizeof(hvm_ctxt)) ) {
+        ERROR("write HVM info failed!\n");
+    }
+
+    /* save vcpu/vmcs context */
+    if (!write_exact(io_fd, &nr_vcpus, sizeof(uint32_t))) {
+        ERROR("error write nr vcpus");
+        goto out;
+    }
+
+    /*XXX: need a online map to exclude down cpu */
+    for (i = 0; i < nr_vcpus; i++) {
+
+        if (xc_vcpu_getcontext(xc_handle, dom, i, &ctxt)) {
+            ERROR("HVM:Could not get vcpu context");
+            goto out;
+        }
+
+        rec_size = sizeof(ctxt);
+        DPRINTF("write %d vcpucontext of total %d.\n", i, nr_vcpus); 
+        if (!write_exact(io_fd, &rec_size, sizeof(uint32_t))) {
+            ERROR("error write vcpu ctxt size");
+            goto out;
+        }
+
+        if (!write_exact(io_fd, &(ctxt), sizeof(ctxt)) ) {
+            ERROR("write vmcs failed!\n");
+            goto out;
+        }
+    }
+ 
+    /* Success! */
+    rc = 0;
+
+ out:
+    return !!rc;
+}
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/public/vmcs_data.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/public/vmcs_data.h	Wed Dec 13 22:52:02 2006 +0800
@@ -0,0 +1,68 @@
+/******************************************************************************
+ * vmcs_data.h
+ * 
+ * Copyright (c) 2006 Intel Corperation
+ * 
+ */
+
+#ifndef __XEN_PUBLIC_VMCS_DATA_H__
+#define __XEN_PUBLIC_VMCS_DATA_H__
+
+/*
+ * World vmcs state
+ */
+struct vmcs_data {
+    uint64_t  eip;        /* execution pointer */
+    uint64_t  esp;        /* stack pointer */
+    uint64_t  eflags;     /* flags register */
+    uint64_t  cr0;
+    uint64_t  cr3;        /* page table directory */
+    uint64_t  cr4;
+    uint32_t  idtr_limit; /* idt */
+    uint64_t  idtr_base;
+    uint32_t  gdtr_limit; /* gdt */
+    uint64_t  gdtr_base;
+    uint32_t  cs_sel;     /* cs selector */
+    uint32_t  cs_limit;
+    uint64_t  cs_base;
+    uint32_t  cs_arbytes;
+    uint32_t  ds_sel;     /* ds selector */
+    uint32_t  ds_limit;
+    uint64_t  ds_base;
+    uint32_t  ds_arbytes;
+    uint32_t  es_sel;     /* es selector */
+    uint32_t  es_limit;
+    uint64_t  es_base;
+    uint32_t  es_arbytes;
+    uint32_t  ss_sel;     /* ss selector */
+    uint32_t  ss_limit;
+    uint64_t  ss_base;
+    uint32_t  ss_arbytes;
+    uint32_t  fs_sel;     /* fs selector */
+    uint32_t  fs_limit;
+    uint64_t  fs_base;
+    uint32_t  fs_arbytes;
+    uint32_t  gs_sel;     /* gs selector */
+    uint32_t  gs_limit;
+    uint64_t  gs_base;
+    uint32_t  gs_arbytes;
+    uint32_t  tr_sel;     /* task selector */
+    uint32_t  tr_limit;
+    uint64_t  tr_base;
+    uint32_t  tr_arbytes;
+    uint32_t  ldtr_sel;   /* ldtr selector */
+    uint32_t  ldtr_limit;
+    uint64_t  ldtr_base;
+    uint32_t  ldtr_arbytes;
+    uint32_t  sysenter_cs;
+    uint64_t  sysenter_esp;
+    uint64_t  sysenter_eip;
+    /* msr for em64t */
+    uint64_t shadow_gs;
+    uint64_t flags;
+    /* same size as VMX_MSR_COUNT */
+    uint64_t msr_items[6];
+    uint64_t vmxassist_enabled;
+};
+typedef struct vmcs_data vmcs_data_t;
+#endif

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2006-12-20 12:34 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-12-13 15:45 hvm save restore support Zhai, Edwin
2006-12-13 17:04 ` Keir Fraser
2006-12-14  2:46   ` Zhai, Edwin
2006-12-14  8:02     ` Keir Fraser
2006-12-15  8:25       ` Zhai, Edwin
2006-12-15  8:37         ` Keir Fraser
2006-12-20 12:34           ` Zhai, Edwin
2006-12-13 20:09 ` Anthony Liguori
2006-12-14  3:23   ` Zhai, Edwin
2006-12-14  8:34   ` Keir Fraser

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.