public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
From: Rusty Russell <rusty-8n+1lVoiYb80n/F98K4Iww@public.gmane.org>
To: kvm-devel <kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org>
Subject: [PATCH 4/5] kvm-lite: "The Unbearable Liteness"
Date: Thu, 06 Sep 2007 01:42:53 +1000	[thread overview]
Message-ID: <1189006973.10802.140.camel@localhost.localdomain> (raw)
In-Reply-To: <1189005692.10802.132.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>

This patch is not for serious application, but makes interesting
reading.  Requires Anthony's new hypercall patches.

Comments:

1) asm-offsets are required for lite's switcher/trampoline (lite_switcher.S)
   and for guest's assembler code.
2) Includes proposed "platform_type" extension for boot protocol.  May not be
   necessary.
3) kvm/ now contains guest code, so needs to be obj-y.
4) Debugging flags just to prove code isn't ready
5) Changes emulator not to try to restore regs after inject_gp.  May refactor
   (we need to deal with this for inject_pf anyway).
6) Fixes "first_cpu()" macro.
7) Temporary hcalls, deliberately sucky.

diff -r 039995825488 arch/i386/kernel/asm-offsets.c
--- a/arch/i386/kernel/asm-offsets.c	Fri Aug 31 15:38:42 2007 +1000
+++ b/arch/i386/kernel/asm-offsets.c	Sun Sep 02 10:13:37 2007 +1000
@@ -22,6 +22,11 @@
 #ifdef CONFIG_LGUEST_GUEST
 #include <linux/lguest.h>
 #include "../../../drivers/lguest/lg.h"
+#endif
+
+#if defined(CONFIG_KVM_LITE) || defined(CONFIG_KVM_LITE_MODULE)
+#include <linux/kvm_lite.h>
+#include "../../../drivers/kvm/lite.h"
 #endif
 
 #define DEFINE(sym, val) \
@@ -144,4 +149,19 @@ void foo(void)
 	OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
 	OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
 #endif
+
+#if defined(CONFIG_KVM_LITE) || defined(CONFIG_KVM_LITE_MODULE)
+	BLANK();
+	OFFSET(LITE_DATA_irq_enabled, kvm_lite_data, irq_enabled);
+	OFFSET(LITE_PAGES_host_gdt_desc, lite_pages, state.host_gdt_desc);
+	OFFSET(LITE_PAGES_host_idt_desc, lite_pages, state.host_idt_desc);
+	OFFSET(LITE_PAGES_host_cr3, lite_pages, state.host_cr3);
+	OFFSET(LITE_PAGES_host_sp, lite_pages, state.host_sp);
+	OFFSET(LITE_PAGES_guest_gdt_desc, lite_pages,state.guest_gdt_desc);
+	OFFSET(LITE_PAGES_guest_idt_desc, lite_pages,state.guest_idt_desc);
+	OFFSET(LITE_PAGES_guest_gdt, lite_pages, state.guest_gdt);
+	OFFSET(LITE_PAGES_regs_trapnum, lite_pages, regs.trapnum);
+	OFFSET(LITE_PAGES_regs_errcode, lite_pages, regs.errcode);
+	OFFSET(LITE_PAGES_regs, lite_pages, regs);
+#endif
 }
diff -r 039995825488 arch/i386/kernel/head.S
--- a/arch/i386/kernel/head.S	Fri Aug 31 15:38:42 2007 +1000
+++ b/arch/i386/kernel/head.S	Fri Aug 31 15:42:48 2007 +1000
@@ -70,7 +70,14 @@ INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + 
  */
 .section .text.head,"ax",@progbits
 ENTRY(startup_32)
-
+/*
+ * Check if we're some special platform type.
+ * FIXME: check header version first!
+ */
+#ifdef CONFIG_KVM_LITE_GUEST
+	cmpl	$2, 0x23c(%esi)
+	je	kvm_lite_init_asm
+#endif
 /*
  * Set segments to known values.
  */
diff -r 039995825488 drivers/Makefile
--- a/drivers/Makefile	Fri Aug 31 15:38:42 2007 +1000
+++ b/drivers/Makefile	Fri Aug 31 15:42:48 2007 +1000
@@ -48,7 +48,7 @@ obj-$(CONFIG_PCCARD)		+= pcmcia/
 obj-$(CONFIG_PCCARD)		+= pcmcia/
 obj-$(CONFIG_DIO)		+= dio/
 obj-$(CONFIG_SBUS)		+= sbus/
-obj-$(CONFIG_KVM)		+= kvm/
+obj-y				+= kvm/
 obj-$(CONFIG_ZORRO)		+= zorro/
 obj-$(CONFIG_MAC)		+= macintosh/
 obj-$(CONFIG_ATA_OVER_ETH)	+= block/aoe/
diff -r 039995825488 drivers/kvm/Kconfig
--- a/drivers/kvm/Kconfig	Fri Aug 31 15:38:42 2007 +1000
+++ b/drivers/kvm/Kconfig	Fri Aug 31 15:42:48 2007 +1000
@@ -41,4 +41,22 @@ config KVM_AMD
 	  Provides support for KVM on AMD processors equipped with the AMD-V
 	  (SVM) extensions.
 
+config KVM_LITE
+	tristate "KVM-lite for unsupported processors"
+	depends on KVM
+	select X86_PAE
+	---help---
+	  Provides minimal support for booting paravirtualized guests.
+	  Does not require Intel VT or AMD AMD-V extensions, but does require
+	  a processor which support PAE (Pentium Pro and better).  If you say
+	  Y or M here, your kernel will not boot on a processor without PAE
+	  support.
+
+config KVM_LITE_GUEST
+	bool "KVM-lite guest support"
+	depends on PARAVIRT && EXPERIMENTAL
+	select HVC_DRIVER
+	---help---
+	  Allows this kernel to run under kvm-lite.
+
 endif # VIRTUALIZATION
diff -r 039995825488 drivers/kvm/Makefile
--- a/drivers/kvm/Makefile	Fri Aug 31 15:38:42 2007 +1000
+++ b/drivers/kvm/Makefile	Fri Aug 31 15:42:48 2007 +1000
@@ -8,3 +8,11 @@ obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
 kvm-amd-objs = svm.o
 obj-$(CONFIG_KVM_AMD) += kvm-amd.o
+kvm-lite-objs = lite.o lite_switcher.o
+obj-$(CONFIG_KVM_LITE) += kvm-lite.o
+obj-$(CONFIG_KVM_LITE_GUEST) += lite_guest.o lite_guest_asm.o
+CFLAGS_lite.o += -O0 -g
+CFLAGS_lite_guest.o += -O0 -g
+CFLAGS_mmu.o += -O0 -g
+CFLAGS_kvm_main.o += -O0 -g
+CFLAGS_x86_emulate.o += -O0 -g
diff -r 039995825488 drivers/kvm/kvm.h
--- a/drivers/kvm/kvm.h	Fri Aug 31 15:38:42 2007 +1000
+++ b/drivers/kvm/kvm.h	Fri Aug 31 15:42:48 2007 +1000
@@ -153,6 +153,7 @@ struct kvm_mmu {
 	hpa_t root_hpa;
 	int root_level;
 	int shadow_root_level;
+	int nailed;
 
 	u64 *pae_root;
 };
@@ -491,10 +492,14 @@ void kvm_exit_arch(void);
 
 int kvm_mmu_module_init(void);
 void kvm_mmu_module_exit(void);
+int kvm_mmu_init_nailed_mapping(int cpu, struct page *page[], unsigned num);
+void kvm_mmu_free_nailed_mapping(int cpu);
 
 void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
 int kvm_mmu_create(struct kvm_vcpu *vcpu);
 int kvm_mmu_setup(struct kvm_vcpu *vcpu);
+void kvm_use_nailed_mappings(struct kvm_vcpu *vcpu);
+void kvm_remove_nailed_mappings(struct kvm_vcpu *vcpu);
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
diff -r 039995825488 drivers/kvm/kvm_main.c
--- a/drivers/kvm/kvm_main.c	Fri Aug 31 15:38:42 2007 +1000
+++ b/drivers/kvm/kvm_main.c	Tue Sep 04 10:30:56 2007 +1000
@@ -530,6 +530,7 @@ void set_cr4(struct kvm_vcpu *vcpu, unsi
 		return;
 	}
 	kvm_arch_ops->set_cr4(vcpu, cr4);
+	vcpu->cr4 = cr4;
 	mutex_lock(&vcpu->kvm->lock);
 	kvm_mmu_reset_context(vcpu);
 	mutex_unlock(&vcpu->kvm->lock);
@@ -918,6 +919,7 @@ void mark_page_dirty(struct kvm *kvm, gf
 			set_bit(rel_gfn, memslot->dirty_bitmap);
 	}
 }
+EXPORT_SYMBOL_GPL(mark_page_dirty);
 
 int emulator_read_std(unsigned long addr,
 			     void *val,
@@ -1146,10 +1148,8 @@ int emulate_invlpg(struct kvm_vcpu *vcpu
 
 int emulate_clts(struct kvm_vcpu *vcpu)
 {
-	unsigned long cr0;
-
-	cr0 = vcpu->cr0 & ~X86_CR0_TS;
-	kvm_arch_ops->set_cr0(vcpu, cr0);
+	vcpu->cr0 &= ~X86_CR0_TS;
+	kvm_arch_ops->set_cr0(vcpu, vcpu->cr0);
 	return X86EMUL_CONTINUE;
 }
 
@@ -2033,6 +2033,7 @@ static int kvm_vcpu_ioctl_set_sregs(stru
 	kvm_arch_ops->decache_cr4_guest_bits(vcpu);
 
 	mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
+	vcpu->cr0 = sregs->cr0;
 	kvm_arch_ops->set_cr0(vcpu, sregs->cr0);
 
 	mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
diff -r 039995825488 drivers/kvm/lite.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/drivers/kvm/lite.c	Wed Sep 05 10:20:33 2007 +1000
@@ -0,0 +1,1716 @@
+/*
+ * Kernel-based Virtual Machine driver for old CPUs (paravirt OS only)
+ * Copyright 2007 Rusty Russell <rusty-8n+1lVoiYb80n/F98K4Iww@public.gmane.org>, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/kvm_lite.h>
+#include <asm/desc.h>
+
+#include "lite.h"
+#include "x86_emulate.h"
+
+/* Found in lite_switcher.S */
+extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
+extern unsigned long default_idt_entries[];
+
+/* Every guest maps the core switcher code. */
+#define SHARED_SWITCHER_PAGES \
+	DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE)
+/* Pages for switcher itself, then two pages per cpu */
+#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS)
+
+/* We map at -2M for ease of mapping into the guest (one PTE page). */
+#define SWITCHER_ADDR 0xFFE00000
+
+/* Guest runs in ring 1. */
+#define GUEST_PL 1
+
+static struct vm_struct *switcher_vma;
+static struct page **switcher_page;
+static int host_has_pge;
+
+/* Offset from where switcher.S was compiled to where we've copied it */
+static unsigned long switcher_offset(void)
+{
+	return SWITCHER_ADDR - (unsigned long)start_switcher_text;
+}
+
+static struct {
+	unsigned long offset;
+	unsigned short segment;
+} lite_entry;
+
+struct vcpu_lite
+{
+	/* If guest behaves badly, we refuse to run it. */
+	bool dead;
+
+	/* Should we inject a page fault? */
+	bool wants_page_fault;
+	unsigned long pf_addr;
+	unsigned int pf_errcode;
+
+	/* cpu we are running on. */
+	struct lite_pages *curr_pages;
+
+	struct kvm_vcpu vcpu;
+
+	/* GDT & IDT supplied by guest. */
+	struct descriptor_table gdt_desc, idt_desc;
+
+	/* The GDT entries copied into lite_ro_state when running. */
+	struct segment_descriptor gdt[GDT_ENTRIES];
+
+	/* The IDT entries: some copied into lite_ro_state when running. */
+	struct desc_struct idt[KVM_NR_INTERRUPTS];
+
+	/* We ignore all but the TS bit here. */
+	unsigned long cr0;
+
+	/* Head of the (shadow) page tables. */
+	unsigned long pgtable;
+
+	/* rflags of guest */
+	unsigned long rflags;
+
+	/* Guest kernel stack. */
+	unsigned long kstack;
+	u16 kstack_ss;
+
+	/* Guest physical address of lite_data */
+	/* FIXME: Permanently map page? */
+	gpa_t lite_data;
+
+	/* The segment registers. */
+	u32 sregs[6];
+};
+
+#define kill_guest(lite, fmt, ...)				\
+do {								\
+	if (!(lite)->dead) {					\
+		(lite)->dead = true;				\
+		if (printk_ratelimit())				\
+			printk(fmt"\n" , ## __VA_ARGS__);	\
+	}							\
+} while(0)
+
+#define get_lite_data(lite, member)					\
+({									\
+	typeof(((struct kvm_lite_data *)0)->member) __v;		\
+									\
+	if (emulator_read_std((lite)->lite_data				\
+			      + offsetof(struct kvm_lite_data, member),	\
+			      &__v, sizeof(__v), &(lite)->vcpu)		\
+	    != X86EMUL_CONTINUE)					\
+		kill_guest(lite, "Reading " #member);			\
+	__v;								\
+})
+
+#define set_lite_data(lite, member, val)				\
+({									\
+	typeof(((struct kvm_lite_data *)0)->member) __v = (val);	\
+	if (emulator_write_emulated((lite)->lite_data			\
+				    + offsetof(struct kvm_lite_data, member), \
+				    &(__v), sizeof(__v), &(lite)->vcpu)	\
+	    != X86EMUL_CONTINUE)					\
+		kill_guest(lite, "Writing " #member);			\
+})
+
+static void dump_regs(const char *str, int trap, const struct vcpu_lite *lite)
+{
+	if (trap >= 0)
+		printk("Trap %i: ", trap);
+#if 1
+	printk("%s@%#lx\n"
+	       " eax: %08lx   ebx: %08lx   ecx: %08lx   edx: %08lx\n"
+	       " esi: %08lx   edi: %08lx   ebp: %08lx   esp: %08lx\n",
+	       str, lite->vcpu.rip,
+	       lite->vcpu.regs[VCPU_REGS_RAX],
+	       lite->vcpu.regs[VCPU_REGS_RBX],
+	       lite->vcpu.regs[VCPU_REGS_RCX],
+	       lite->vcpu.regs[VCPU_REGS_RDX],
+	       lite->vcpu.regs[VCPU_REGS_RSI],
+	       lite->vcpu.regs[VCPU_REGS_RDI],
+	       lite->vcpu.regs[VCPU_REGS_RBP],
+	       lite->vcpu.regs[VCPU_REGS_RSP]);
+#else
+	printk("%s@%#lx (%lu)\n",
+	       str, lite->vcpu.rip, lite->total_exits);
+#endif
+}
+
+static inline struct vcpu_lite *to_lite(struct kvm_vcpu *vcpu)
+{
+	return container_of(vcpu, struct vcpu_lite, vcpu);
+}
+
+/* This cpu's struct lite_pages. */
+static struct lite_pages *lite_pages(unsigned int cpu)
+{
+	return &(((struct lite_pages *)
+		  (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
+}
+
+static __init int lite_is_always_supported(void)
+{
+	return 1;
+}
+
+static __init int you_cant_stop_the_lite(void)
+{
+	return 0;
+}
+
+static __init void hardware_enable(void *dummy)
+{
+	if (host_has_pge) {
+		/* FIXME: Do this only before running. */
+		write_cr4(read_cr4() & ~X86_CR4_PGE);
+
+		/* Turn off the feature in the global feature set. */
+		clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+	}
+}
+
+static __init void hardware_disable(void *dummy)
+{
+	if (host_has_pge) {
+		write_cr4(read_cr4() | X86_CR4_PGE);
+		/* FIXME: Do this only after all cpus done. */
+		set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+	}
+}
+
+static void check_processor_compatibility(void *rtn)
+{
+	*(int *)rtn = 0;
+}
+
+/* This routine is called at boot or modprobe time for each CPU to set up the
+ * "constant" GDT entries for Guests running on that CPU. */
+static void setup_default_gdt_entries(struct lite_ro_state *state)
+{
+	/* FIXME: Use segment_descriptor */
+	struct desc_struct *gdt = (struct desc_struct *)state->guest_gdt;
+	unsigned long tss = (unsigned long)&state->guest_tss;
+
+	/* The hypervisor segments are full 0-4G segments, privilege level 0 */
+	gdt[GDT_ENTRY_LITE_CS] = FULL_EXEC_SEGMENT;
+	gdt[GDT_ENTRY_LITE_DS] = FULL_SEGMENT;
+
+	/* The TSS segment refers to the TSS entry for this CPU, so we cannot
+	 * copy it from the Guest.  Forgive the magic flags */
+	gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16);
+	gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000)
+		| ((tss >> 16) & 0x000000FF);
+}
+
+/* The default entry for each interrupt points into the Switcher routines which
+ * simply return to the Host.  handle_exit() will then bounce it back into the
+ * Guest. */
+static void default_idt_entry(struct desc_struct *idt,
+			      int trap,
+			      const unsigned long handler)
+{
+	/* A present interrupt gate. */
+	u32 flags = 0x8e00;
+
+	/* Set the privilege level on the entry for the hypercall: this allows
+	 * the Guest to use the "int" instruction to trigger it. */
+	if (trap == KVM_LITE_HCALL_TRAP)
+		flags |= (GUEST_PL << 13);
+#if 1 /* FIXME: When we do direct traps, this is not necessary. */
+	if (trap == SYSCALL_VECTOR)
+		flags |= (3 << 13);
+#endif
+	/* Now pack it into the IDT entry in its weird format. */
+	idt->a = (LITE_CS<<16) | (handler&0x0000FFFF);
+	idt->b = (handler&0xFFFF0000) | flags;
+}
+
+/* When the Guest first starts, we put default entries into the IDT. */
+static void setup_default_idt_entries(struct lite_ro_state *state,
+				      const unsigned long *def)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(state->guest_idt); i++)
+		default_idt_entry(&state->guest_idt[i], i, def[i]);
+}
+
+static int setup_nailed_mappings(void)
+{
+	int i, j, err;
+	struct page *pages[TOTAL_SWITCHER_PAGES];
+
+	/* Every CPU has the hypervisor text (read only). */
+	for (i = 0; i < SHARED_SWITCHER_PAGES; i++) {
+		pages[i] = switcher_page[i];
+		pages[i]->private = 0;
+	}
+
+	for_each_possible_cpu(i) {
+		memset(pages+SHARED_SWITCHER_PAGES, 0,
+		       sizeof(pages) - SHARED_SWITCHER_PAGES*sizeof(pages[0]));
+		pages[SHARED_SWITCHER_PAGES + i*2]
+			= switcher_page[SHARED_SWITCHER_PAGES + i*2];
+		pages[SHARED_SWITCHER_PAGES + i*2 + 1]
+			= switcher_page[SHARED_SWITCHER_PAGES + i*2 + 1];
+		/* First page is writable, second isn't */
+		pages[SHARED_SWITCHER_PAGES + i*2]->private = 1;
+		pages[SHARED_SWITCHER_PAGES + i*2 + 1]->private = 0;
+		err = kvm_mmu_init_nailed_mapping(i, pages, ARRAY_SIZE(pages));
+		if (err)
+			goto undo;
+	}
+	return 0;
+
+undo:
+	for_each_possible_cpu(j) {
+		if (j == i)
+			break;
+		kvm_mmu_free_nailed_mapping(j);
+	}
+	return err;
+}
+
+static __init int map_switcher(void)
+{
+	int i, err;
+	struct page **pagep;
+
+	/*
+	 * Map the Switcher in to high memory.
+	 *
+	 * It turns out that if we choose the address 0xFFE00000 (2MB under the
+	 * top virtual address), it makes setting up the page tables really
+	 * easy.
+	 */
+
+	/* This is the "lite under lite" case.  Just Say No. */
+	if (__FIXADDR_TOP <= SWITCHER_ADDR) {
+		printk(KERN_INFO "kvm-lite: top of memory already reserved\n");
+		err = -EEXIST;
+		goto out;
+	}
+
+	/* We allocate an array of "struct page"s.  map_vm_area() wants the
+	 * pages in this form, rather than just an array of pointers. */
+	switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES,
+				GFP_KERNEL);
+	if (!switcher_page) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	/* Now we actually allocate the pages.  The Guest will see these pages,
+	 * so we make sure they're zeroed. */
+	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
+		unsigned long addr = get_zeroed_page(GFP_KERNEL);
+		if (!addr) {
+			err = -ENOMEM;
+			goto free_some_pages;
+		}
+		switcher_page[i] = virt_to_page(addr);
+	}
+
+	/* Now we reserve the "virtual memory area" we want: 0xFFE00000
+	 * (SWITCHER_ADDR).  We might not get it in theory, but in practice
+	 * it's worked so far. */
+	switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
+				     VM_ALLOC, SWITCHER_ADDR, VMALLOC_END);
+	if (!switcher_vma) {
+		err = -ENOMEM;
+		printk("lite: could not map switcher pages high\n");
+		goto free_pages;
+	}
+
+	err = setup_nailed_mappings();
+	if (err) {
+		printk("lite: failed to set up nailed mappings: %i\n", err);
+		goto free_vma;
+	}
+
+	/* This code actually sets up the pages we've allocated to appear at
+	 * SWITCHER_ADDR.  map_vm_area() takes the vma we allocated above, the
+	 * kind of pages we're mapping (kernel pages), and a pointer to our
+	 * array of struct pages.  It increments that pointer, but we don't
+	 * care. */
+	pagep = switcher_page;
+	err = map_vm_area(switcher_vma, __pgprot(_PAGE_KERNEL_EXEC), &pagep);
+	if (err) {
+		printk("lite: map_vm_area failed: %i\n", err);
+		goto free_nailed_mappings;
+	}
+
+	/* Now the switcher is mapped at the right address, we can't fail!
+	 * Copy in the compiled-in Switcher code (from switcher.S). */
+	memcpy(switcher_vma->addr, start_switcher_text,
+	       end_switcher_text - start_switcher_text);
+
+	/* Most of the switcher.S doesn't care that it's been moved; on Intel,
+	 * jumps are relative, and it doesn't access any references to external
+	 * code or data.
+	 *
+	 * The only exception is the interrupt handlers in switcher.S: their
+	 * addresses are placed in a table (default_idt_entries), so we need to
+	 * update the table with the new addresses.  switcher_offset() is a
+	 * convenience function which returns the distance between the builtin
+	 * switcher code and the high-mapped copy we just made. */
+	for (i = 0; i < IDT_ENTRIES; i++)
+		default_idt_entries[i] += switcher_offset();
+
+	/*
+	 * Set up the Switcher's per-cpu areas.
+	 *
+	 * Each CPU gets two pages of its own within the high-mapped region
+	 * (aka. "struct lite_pages").  Much of this can be initialized now,
+	 * but some depends on what Guest we are running (which is set up in
+	 * copy_in_guest_info()).
+	 */
+	for_each_possible_cpu(i) {
+		/* lite_pages() returns this CPU's two pages. */
+		struct lite_pages *pages = lite_pages(i);
+		/* This is a convenience pointer to make the code fit one
+		 * statement to a line. */
+		struct lite_ro_state *state = &pages->state;
+
+		/* The Global Descriptor Table: the Host has a different one
+		 * for each CPU.  We keep a descriptor for the GDT which says
+		 * where it is and how big it is (the limit is the last
+		 * byte, not the size, hence the "-1"). */
+		state->host_gdt_desc.limit = GDT_SIZE-1;
+		state->host_gdt_desc.base = (long)get_cpu_gdt_table(i);
+
+		/* All CPUs on the Host use the same Interrupt Descriptor
+		 * Table, so we just use get_idt(), which gets this CPU's IDT
+		 * descriptor. */
+		get_idt(&state->host_idt_desc);
+
+		/* The descriptors for the Guest's GDT and IDT can be filled
+		 * out now, too.  We copy the GDT & IDT into ->guest_gdt and
+		 * ->guest_idt before actually running the Guest. */
+		state->guest_idt_desc.limit = sizeof(state->guest_idt)-1;
+		state->guest_idt_desc.base = (long)&state->guest_idt;
+		state->guest_gdt_desc.limit = sizeof(state->guest_gdt)-1;
+		state->guest_gdt_desc.base = (long)&state->guest_gdt;
+
+		/* We know where we want the stack to be when the Guest enters
+		 * the switcher: in pages->regs.  The stack grows upwards, so
+		 * we start it at the end of that structure. */
+		state->guest_tss.esp0 = (long)(&pages->regs + 1);
+		/* And this is the GDT entry to use for the stack: we keep a
+		 * couple of special LITE entries. */
+		state->guest_tss.ss0 = LITE_DS;
+
+		/* x86 can have a finegrained bitmap which indicates what I/O
+		 * ports the process can use.  We set it to the end of our
+		 * structure, meaning "none". */
+		state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);
+
+		/* Some GDT entries are the same across all Guests, so we can
+		 * set them up now. */
+		setup_default_gdt_entries(state);
+		/* Most IDT entries are the same for all Guests, too.*/
+		setup_default_idt_entries(state, default_idt_entries);
+
+		/* The Host needs to be able to use the LITE segments on this
+		 * CPU, too, so put them in the Host GDT. */
+		get_cpu_gdt_table(i)[GDT_ENTRY_LITE_CS] = FULL_EXEC_SEGMENT;
+		get_cpu_gdt_table(i)[GDT_ENTRY_LITE_DS] = FULL_SEGMENT;
+	}
+
+	/* In the Switcher, we want the %cs segment register to use the
+	 * LITE_CS GDT entry: we've put that in the Host and Guest GDTs, so
+	 * it will be undisturbed when we switch.  To change %cs and jump we
+	 * need this structure to feed to Intel's "lcall" instruction. */
+	lite_entry.offset = (long)switch_to_guest + switcher_offset();
+	lite_entry.segment = LITE_CS;
+
+	printk(KERN_INFO "kvm-lite: mapped switcher at %p\n",
+	       switcher_vma->addr);
+
+	/* Save whether PGE should be disabled & re-enabled. */
+	host_has_pge = cpu_has_pge;
+
+	/* And we succeeded... */
+	return 0;
+
+free_nailed_mappings:
+	for_each_possible_cpu(i)
+		kvm_mmu_free_nailed_mapping(i);
+free_vma:
+	vunmap(switcher_vma->addr);
+free_pages:
+	i = TOTAL_SWITCHER_PAGES;
+free_some_pages:
+	for (--i; i >= 0; i--)
+		__free_pages(switcher_page[i], 0);
+	kfree(switcher_page);
+out:
+	return err;
+}
+
+/* Cleaning up the mapping when the module is unloaded is almost...
+ * too easy. */
+static void unmap_switcher(void)
+{
+	unsigned int i;
+
+	/* vunmap() undoes *both* map_vm_area() and __get_vm_area(). */
+	vunmap(switcher_vma->addr);
+	/* Now we just need to free the pages we copied the switcher into */
+	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
+		__free_pages(switcher_page[i], 0);
+	for_each_possible_cpu(i)
+		kvm_mmu_free_nailed_mapping(i);
+}
+
+static struct kvm_vcpu *lite_vcpu_create(struct kvm *kvm, unsigned id)
+{
+	struct vcpu_lite *lite;
+	int err;
+
+	lite = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+	if (!lite) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = kvm_vcpu_init(&lite->vcpu, kvm, id);
+	if (err)
+		goto free_lite;
+
+	fx_init(&lite->vcpu);
+	lite->vcpu.fpu_active = 1;
+	lite->vcpu.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
+	if (lite->vcpu.vcpu_id == 0)
+		lite->vcpu.apic_base |= MSR_IA32_APICBASE_BSP;
+
+	return &lite->vcpu;
+
+free_lite:
+	kmem_cache_free(kvm_vcpu_cache, lite);
+out:
+	return ERR_PTR(err);
+}
+
+static void lite_vcpu_free(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_lite *lite = to_lite(vcpu);
+
+	kvm_vcpu_uninit(vcpu);
+	kmem_cache_free(kvm_vcpu_cache, lite);
+}
+
+/* FIXME: Allow direct traps. */
+static int direct_trap(const struct vcpu_lite *lite,
+		       const struct desc_struct *trap,
+		       unsigned int num)
+{
+	return 0;
+}
+
+/* We don't use the IDT entries in the "struct vcpu_lite" directly, instead we
+ * copy them into the IDT which we've set up for Guests on this CPU, just
+ * before we run the Guest.  This routine does that copy. */
+static void copy_traps(const struct vcpu_lite *lite, struct desc_struct *idt,
+		       const unsigned long *def)
+{
+	unsigned int i;
+
+	/* We can simply copy the direct traps, otherwise we use the default
+	 * ones in the Switcher: they will return to the Host. */
+	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) {
+		if (direct_trap(lite, &lite->idt[i], i))
+			idt[i] = lite->idt[i];
+		else
+			default_idt_entry(&idt[i], i, def[i]);
+	}
+
+	/* Don't forget the system call trap!  The IDT entries for other
+	 * interrupts never change, so no need to copy them. */
+	i = SYSCALL_VECTOR;
+	if (direct_trap(lite, &lite->idt[i], i))
+		idt[i] = lite->idt[i];
+	else
+		default_idt_entry(&idt[i], i, def[i]);
+}
+
+/* There are several entries we don't let the Guest set.  The TSS entry is the
+ * "Task State Segment" which controls all kinds of delicate things.  The
+ * LITE_CS and LITE_DS entries are reserved for the Switcher, and
+ * the Guest can't be trusted to deal with double faults. */
+static int ignored_gdt(unsigned int num)
+{
+	return (num == GDT_ENTRY_TSS
+		|| num == GDT_ENTRY_LITE_CS
+		|| num == GDT_ENTRY_LITE_DS
+		|| num == GDT_ENTRY_DOUBLEFAULT_TSS);
+}
+
+/* Like the IDT, we never simply use the GDT the Guest gives us.  We set up the
+ * GDTs for each CPU, then we copy across the entries each time we want to run
+ * a different Guest on that CPU. */
+static void copy_gdt(const struct vcpu_lite *lite,
+		     struct segment_descriptor *gdt)
+{
+	unsigned int i;
+
+	/* The default entries from setup_default_gdt_entries() are not
+	 * replaced.  See ignored_gdt() above. */
+	for (i = 0; i < GDT_ENTRIES; i++)
+		if (!ignored_gdt(i))
+			gdt[i] = lite->gdt[i];
+}
+
+/* Copies state into curr_pages, ready to run Guest. */
+static void copy_to_curr_pages(const struct vcpu_lite *lite)
+{
+	/* Set up the two "TSS" members which tell the CPU what stack to use
+	 * for traps which do directly into the Guest (ie. traps at privilege
+	 * level 1). */
+	lite->curr_pages->state.guest_tss.esp1 = lite->kstack;
+	lite->curr_pages->state.guest_tss.ss1 = lite->kstack_ss;
+
+	/* Copy direct-to-Guest trap entries. */
+	copy_traps(lite, lite->curr_pages->state.guest_idt,
+		   default_idt_entries);
+
+	/* Copy all GDT entries which the Guest can change. */
+	copy_gdt(lite, lite->curr_pages->state.guest_gdt);
+
+	/* Copy the registers in.  FIXME: Re-order to memcpy. */
+	lite->curr_pages->regs.eax = lite->vcpu.regs[VCPU_REGS_RAX];
+	lite->curr_pages->regs.ecx = lite->vcpu.regs[VCPU_REGS_RCX];
+	lite->curr_pages->regs.edx = lite->vcpu.regs[VCPU_REGS_RDX];
+	lite->curr_pages->regs.ebx = lite->vcpu.regs[VCPU_REGS_RBX];
+	lite->curr_pages->regs.ebp = lite->vcpu.regs[VCPU_REGS_RBP];
+	lite->curr_pages->regs.esi = lite->vcpu.regs[VCPU_REGS_RSI];
+	lite->curr_pages->regs.edi = lite->vcpu.regs[VCPU_REGS_RDI];
+	lite->curr_pages->regs.esp = lite->vcpu.regs[VCPU_REGS_RSP];
+	lite->curr_pages->regs.eip = lite->vcpu.rip;
+
+	lite->curr_pages->regs.cs = lite->sregs[VCPU_SREG_CS];
+	lite->curr_pages->regs.ds = lite->sregs[VCPU_SREG_DS];
+	lite->curr_pages->regs.es = lite->sregs[VCPU_SREG_ES];
+	lite->curr_pages->regs.fs = lite->sregs[VCPU_SREG_FS];
+	lite->curr_pages->regs.gs = lite->sregs[VCPU_SREG_GS];
+	lite->curr_pages->regs.ss = lite->sregs[VCPU_SREG_SS];
+
+	lite->curr_pages->regs.eflags = lite->rflags;
+}
+
+/* Copies state from curr_pages, after Guest has run. */
+static void copy_from_curr_pages(struct vcpu_lite *lite)
+{
+	/* Copy the registers in.  FIXME: Re-order to memcpy. */
+	lite->vcpu.regs[VCPU_REGS_RAX] = lite->curr_pages->regs.eax;
+	lite->vcpu.regs[VCPU_REGS_RCX] = lite->curr_pages->regs.ecx;
+	lite->vcpu.regs[VCPU_REGS_RDX] = lite->curr_pages->regs.edx;
+	lite->vcpu.regs[VCPU_REGS_RBX] = lite->curr_pages->regs.ebx;
+	lite->vcpu.regs[VCPU_REGS_RBP] = lite->curr_pages->regs.ebp;
+	lite->vcpu.regs[VCPU_REGS_RSI] = lite->curr_pages->regs.esi;
+	lite->vcpu.regs[VCPU_REGS_RDI] = lite->curr_pages->regs.edi;
+	lite->vcpu.regs[VCPU_REGS_RSP] = lite->curr_pages->regs.esp;
+	lite->vcpu.rip = lite->curr_pages->regs.eip;
+	lite->sregs[VCPU_SREG_CS] = lite->curr_pages->regs.cs;
+	lite->sregs[VCPU_SREG_DS] = lite->curr_pages->regs.ds;
+	lite->sregs[VCPU_SREG_ES] = lite->curr_pages->regs.es;
+	lite->sregs[VCPU_SREG_FS] = lite->curr_pages->regs.fs;
+	lite->sregs[VCPU_SREG_GS] = lite->curr_pages->regs.gs;
+	lite->sregs[VCPU_SREG_SS] = lite->curr_pages->regs.ss;
+	lite->rflags = lite->curr_pages->regs.eflags;
+}
+
+static void pre_lite_run(struct vcpu_lite *lite)
+{
+	BUG_ON(lite->curr_pages);
+
+	/* FIXME: Do lazy */
+	kvm_load_guest_fpu(&lite->vcpu);
+	if (lite->cr0 & X86_CR0_TS)
+		stts();
+	else
+		clts();
+	lite->curr_pages = lite_pages(smp_processor_id());
+	copy_to_curr_pages(lite);
+}
+
+static void post_lite_run(struct vcpu_lite *lite)
+{
+	BUG_ON(!lite->curr_pages);
+	copy_from_curr_pages(lite);
+	lite->curr_pages = NULL;
+	kvm_put_guest_fpu(&lite->vcpu);
+}
+
+/* FIXME: Somehow do intelligent caching here. */
+static void lite_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+}
+
+static void lite_vcpu_put(struct kvm_vcpu *vcpu)
+{
+}
+
+static void lite_cache_regs(struct kvm_vcpu *vcpu)
+{
+}
+
+static void lite_decache_regs(struct kvm_vcpu *vcpu)
+{
+}
+
+static void lite_vcpu_decache(struct kvm_vcpu *vcpu)
+{
+}
+
+static int lite_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
+{
+	return -EOPNOTSUPP;
+}
+
+static u32 *segreg(struct vcpu_lite *lite, int seg)
+{
+	BUG_ON(seg >= ARRAY_SIZE(lite->sregs));
+	return &lite->sregs[seg];
+}
+
+static u64 segbase(const struct segment_descriptor *d)
+{
+	return d->base_low
+		| ((unsigned long)d->base_mid << 16)
+		| ((unsigned long)d->base_high << 24);
+}
+
+static u64 lite_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+	struct vcpu_lite *lite = to_lite(vcpu);
+	struct segment_descriptor *d;
+	u32 selector;
+
+	if (seg == VCPU_SREG_TR || seg == VCPU_SREG_LDTR) {
+		pr_unimpl(vcpu, "lite: get_segment_base(%i)\n", seg);
+		return 0;
+	}
+
+	selector = *segreg(lite, seg) >> 3;
+
+#ifdef CONFIG_X86_64
+#error "FIXME: X86-64 handles some segments strangely: see segment_base()"
+#endif
+	/* This works correctly for 0 segments. */
+	d = &lite->gdt[selector];
+	return segbase(d);
+}
+
+static void lite_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var,
+			     int seg)
+{
+	struct vcpu_lite *lite = to_lite(vcpu);
+	struct segment_descriptor *d;
+	u32 selector;
+
+	if (seg == VCPU_SREG_TR || seg == VCPU_SREG_LDTR) {
+		pr_unimpl(vcpu, "lite: get_segment(%i)\n", seg);
+		memset(var, 0, sizeof(*var));
+		return;
+	}
+
+	selector = *segreg(lite, seg) >> 3;
+	d = &lite->gdt[selector];
+	var->base = segbase(d);
+	var->limit = d->limit_low | ((unsigned long)d->limit_high << 8);
+	var->selector = selector;
+	var->type = d->type;
+	var->s = d->system;
+	var->dpl = d->dpl;
+	var->present = d->present;
+	var->avl = d->avl;
+	var->l = d->long_mode;
+	var->db = d->default_op;
+	var->g = d->granularity;
+	var->unusable = (seg != 0);
+}
+
+static void write_timestamp(struct vcpu_lite *lite)
+{
+	struct timespec ts;
+
+	ktime_get_real_ts(&ts);
+	set_lite_data(lite, time, ts);
+}
+
+static void load_gdt_entry(struct vcpu_lite *lite, unsigned int num,
+			   const struct segment_descriptor *desc)
+{
+	/* We use the base of the 0th GDT entry as ptr to lite_data. */
+	if (num == 0 && !lite->lite_data) {
+		lite->lite_data = segbase(desc);
+
+		if (lite->lite_data) {
+			/* Set up the initial fields for guest. */
+			set_lite_data(lite, reserve_mem, -SWITCHER_ADDR);
+			write_timestamp(lite);
+		}
+	}
+
+	/* We never copy these ones to real GDT, so don't care what they say */
+	if (ignored_gdt(num))
+		return;
+
+	lite->gdt[num] = *desc;
+	/* Segment descriptors contain a privilege level: the Guest is
+	 * sometimes careless and leaves this as 0, even though it's running at
+	 * privilege level 1.  If so, we fix it here. */
+	if (lite->gdt[num].dpl == 0)
+		lite->gdt[num].dpl = GUEST_PL;
+
+	/* Each descriptor has an "accessed" bit.  If we don't set it now, the
+	 * CPU will try to set it when the Guest first loads that entry into a
+	 * segment register.  But the GDT isn't writable by the Guest, so bad
+	 * things can happen. */
+	lite->gdt[num].type |= 1;
+}
+
+static void lite_set_segment(struct kvm_vcpu *vcpu,
+			     struct kvm_segment *var, int seg)
+{
+	struct vcpu_lite *lite = to_lite(vcpu);
+	struct segment_descriptor d;
+	u32 *reg;
+
+	if (seg == VCPU_SREG_TR || seg == VCPU_SREG_LDTR) {
+		pr_unimpl(vcpu, "lite: set_segment(%i)\n", seg);
+		return;
+	}
+
+	if (var->selector >= sizeof(lite->gdt)) {
+		kill_guest(lite, "large segment selector %i\n", var->selector);
+		return;
+	}
+
+	if (var->selector & 4) {
+		pr_unimpl(vcpu, "lite: ldt selector %i\n", var->selector);
+		return;
+	}
+
+	reg = segreg(lite, seg);
+	*reg = var->selector;
+
+	/* We re-encode the cached kvm_segment into the GDT, since
+	 * kvm-lite guests must keep their GDT and live segment
+	 * registers in sync. */
+	d.limit_low = (var->limit & 0xFFFF);
+	d.limit_high = (var->limit >> 16);
+	d.base_low = (var->base & 0xFFFF);
+	d.base_mid = (var->base >> 16);
+	d.base_high = (var->base >> 24);
+	d.type = var->type;
+	d.system = var->s;
+	d.dpl = var->dpl;
+	d.present = var->present;
+	d.avl = var->avl;
+	d.long_mode = var->l;
+	d.default_op = var->db;
+	d.granularity = var->g;
+	printk("lite: setting segment %s: %#x type=%x\n",
+	       seg == VCPU_SREG_SS ? "ss"
+	       : seg == VCPU_SREG_GS ? "gs"
+	       : seg == VCPU_SREG_DS ? "ds"
+	       : seg == VCPU_SREG_ES ? "es"
+	       : seg == VCPU_SREG_FS ? "fs"
+	       : seg == VCPU_SREG_CS ? "cs" : "???",
+	       var->selector, d.type);
+	load_gdt_entry(lite, var->selector >> 3, &d);
+}
+
+static void lite_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
+{
+}
+
+static void lite_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+	struct vcpu_lite *lite = to_lite(vcpu);
+
+	lite->cr0 = cr0;
+}
+
+static void lite_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
+{
+	struct vcpu_lite *lite = to_lite(vcpu);
+
+	lite->pgtable = root;
+}
+
+static void lite_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+}
+
+#ifdef CONFIG_X86_64
+#error need set_efer
+#endif
+
+static void lite_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+{
+	struct vcpu_lite *lite = to_lite(vcpu);
+
+	*dt = lite->idt_desc;
+}
+
+#ifdef CONFIG_X86_64
+#error need IDT decoding
+#endif
+
+/* The address of the interrupt handler is split into two halves: the low
+ * word of "lo" and the high word of "hi". */
+static unsigned long idt_address(u32 lo, u32 hi)
+{
+	return (lo & 0x0000FFFF) | (hi & 0xFFFF0000);
+}
+
+/* The "type" of the interrupt handler is a 4 bit field: we only support a
+ * couple of types. */
+static int idt_type(u32 lo, u32 hi)
+{
+	return (hi >> 8) & 0xF;
+}
+
+/* An IDT entry can't be used unless the "present" bit is set. */
+static int idt_present(u32 lo, u32 hi)
+{
+	return (hi & 0x8000);
+}
+
+/* This is the routine which actually checks the Guest's IDT entry and
+ * transfers it into our entry in "struct lite": */
+static void set_trap(struct vcpu_lite *lite, struct desc_struct *trap,
+		     unsigned int num, u32 lo, u32 hi)
+{
+	u8 type = idt_type(lo, hi);
+
+	/* We zero-out a not-present entry */
+	if (!idt_present(lo, hi)) {
+		trap->a = trap->b = 0;
+		return;
+	}
+
+	/* We only support interrupt and trap gates. */
+	if (type != 0xE && type != 0xF)
+		kill_guest(lite, "bad IDT type %i", type);
+
+	/* We only copy the handler address, present bit, privilege level and
+	 * type.  The privilege level controls where the trap can be triggered
+	 * manually with an "int" instruction.  This is usually GUEST_PL,
+	 * except for system calls which userspace can use. */
+	trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF);
+	trap->b = (hi&0xFFFFEF00);
+}
+
+static void load_idt_entry(struct vcpu_lite *lite, unsigned int num,
+			   u32 lo, u32 hi)
+{
+	/* Guest never handles: NMI, doublefault, spurious interrupt or
+	 * hypercall.  We ignore when it tries to set them. */
+	if (num == 2 || num == 8 || num == 15 || num == KVM_LITE_HCALL_TRAP)
+		return;
+
+	if (num == 0x80)
+		printk("Setting IDT entry 0x80 to %08x %08x (DPL=%u)\n",
+		       lo, hi, (hi >> 13) & 0x3);
+
+	set_trap(lite, &lite->idt[num], num, lo, hi);
+}
+
+static void lite_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+{
+	struct vcpu_lite *lite = to_lite(vcpu);
+	unsigned int i, num;
+
+	/* QEMU sets up huge IDT and GDTs but we ignore them. */
+	num = (dt->limit + 1) / sizeof(lite->idt[0]);
+	if (num > ARRAY_SIZE(lite->idt))
+		num = ARRAY_SIZE(lite->idt);
+
+	lite->idt_desc = *dt;
+	for (i = 0; i < num; i++) {
+		struct desc_struct e;
+		if (emulator_read_std(lite->idt_desc.base
+				      + i * sizeof(lite->idt[0]),
+				      &e, sizeof(e), vcpu)
+		    != X86EMUL_CONTINUE) {
+			kill_guest(lite, "Failed reading IDT %i", i);
+			return;
+		}
+		load_idt_entry(lite, i, e.a, e.b);
+	}
+}
+
+static void lite_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+{
+	struct vcpu_lite *lite = to_lite(vcpu);
+
+	*dt = lite->gdt_desc;
+}
+
+static void lite_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+{
+	struct vcpu_lite *lite = to_lite(vcpu);
+	unsigned int i, num;
+
+	/* QEMU sets up huge IDT and GDTs but we ignore them. */
+	num = (dt->limit + 1) / sizeof(lite->gdt[0]);
+	if (num > ARRAY_SIZE(lite->gdt))
+		num = ARRAY_SIZE(lite->gdt);
+
+	lite->gdt_desc = *dt;
+	for (i = 0; i < num; i++) {
+		struct segment_descriptor d;
+		if (emulator_read_std(lite->gdt_desc.base
+				      + i * sizeof(lite->gdt[0]),
+				      &d, sizeof(d), vcpu)
+		    != X86EMUL_CONTINUE) {
+			kill_guest(lite, "Failed reading GDT");
+			return;
+		}
+		load_gdt_entry(lite, i, &d);
+	}
+}
+
+static unsigned long lite_get_dr(struct kvm_vcpu *vcpu, int dr)
+{
+	pr_unimpl(vcpu, "lite_get_dr %i\n", dr);
+	return 0;
+}
+
+static void lite_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
+			int *exception)
+{
+	pr_unimpl(vcpu, "lite_set_dr %i\n", dr);
+}
+
+/* Get IF bit of guest's interrupt flags. */
+static int get_guest_eflags_if(struct vcpu_lite *lite)
+{
+	/* They don't enable before setting up GDT. */
+	if (!lite->lite_data)
+		return 0;
+
+	return get_lite_data(lite, irq_enabled) & X86_EFLAGS_IF;
+}
+
+static void set_guest_eflags_if(struct vcpu_lite *lite, int eflags)
+{
+	if (!lite->lite_data)
+		return;
+
+	printk("Setting EFLAGS to %08x\n", eflags);
+	set_lite_data(lite, irq_enabled, eflags);
+}
+
+static unsigned long lite_get_rflags(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_lite *lite = to_lite(vcpu);
+	unsigned long rflags;
+
+	/* Interrupts are always really enabled: read that bit from guest */
+	rflags = (lite->rflags & ~(unsigned long)X86_EFLAGS_IF);
+	return rflags | get_guest_eflags_if(lite);
+}
+
+static void lite_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+	struct vcpu_lite *lite = to_lite(vcpu);
+
+	/* Interrupts are always really enabled, and "2" must be set. */
+	lite->rflags = rflags | X86_EFLAGS_IF | 0x2;
+	set_guest_eflags_if(lite, rflags & X86_EFLAGS_IF);
+}
+
+static void lite_flush_tlb(struct kvm_vcpu *vcpu)
+{
+}
+
+static void push_guest_stack(struct vcpu_lite *lite,
+			     unsigned long *gstack,
+			     unsigned long val)
+{
+	/* Stack grows downwards: move stack pointer down, then write value. */
+	*gstack -= sizeof(long);
+	if (emulator_write_emulated(*gstack, &val, sizeof(val), &lite->vcpu)
+	    != X86EMUL_CONTINUE)
+		kill_guest(lite, "Stack write to %#lx failed", *gstack);
+}
+
+/* FIXME: In-kernel apic might supply a generic version of this. */
+static void set_guest_interrupt(struct vcpu_lite *lite,
+				u32 lo, u32 hi, bool has_err,
+				unsigned long errcode)
+{
+	unsigned long gstack;
+	u32 eflags, ss;
+
+	/* There are two cases for interrupts: one where the Guest is already
+	 * in the kernel, and a more complex one where the Guest is in
+	 * userspace.  We check the privilege level to find out. */
+	if ((lite->sregs[VCPU_SREG_SS]&0x3) != GUEST_PL) {
+		/* The Guest told us their kernel stack with the SET_STACK
+		 * hypercall: both the virtual address and the segment */
+		gstack = lite->kstack;
+		ss = lite->kstack_ss;
+		/* We push the old stack segment and pointer onto the new
+		 * stack: when the Guest does an "iret" back from the interrupt
+		 * handler the CPU will notice they're dropping privilege
+		 * levels and expect these here. */
+		push_guest_stack(lite, &gstack, lite->sregs[VCPU_SREG_SS]);
+		push_guest_stack(lite, &gstack,lite->vcpu.regs[VCPU_REGS_RSP]);
+		printk("IRQ from different PL\n");
+	} else {
+		/* We're staying on the same Guest (kernel) stack. */
+		gstack = lite->vcpu.regs[VCPU_REGS_RSP];
+		ss = lite->sregs[VCPU_SREG_SS];
+	}
+
+	/* Remember that we never let the Guest actually disable interrupts, so
+	 * the "Interrupt Flag" bit is always set.  We copy that bit from the
+	 * Guest's "irq_enabled" field into the eflags word: the Guest copies
+	 * it back in "lite_iret". */
+	eflags = lite->rflags;
+	if (get_guest_eflags_if(lite) == 0)
+		eflags &= ~X86_EFLAGS_IF;
+
+	/* An interrupt is expected to push three things on the stack: the old
+	 * "eflags" word, the old code segment and the old instruction pointer,
+	 * plus an error code for those traps which supply one. */
+	push_guest_stack(lite, &gstack, eflags);
+	push_guest_stack(lite, &gstack, lite->sregs[VCPU_SREG_CS]);
+	push_guest_stack(lite, &gstack, lite->vcpu.rip);
+	if (has_err)
+		push_guest_stack(lite, &gstack, errcode);
+
+	/* Now we've pushed all the old state, we change the stack, the code
+	 * segment and the address to execute. */
+	lite->sregs[VCPU_SREG_SS] = ss;
+	lite->vcpu.regs[VCPU_REGS_RSP] = gstack;
+	lite->sregs[VCPU_SREG_CS] = (__KERNEL_CS|GUEST_PL);
+	lite->vcpu.rip = idt_address(lo, hi);
+
+	/* There are two kinds of interrupt handlers: 0xE is an "interrupt
+	 * gate" which expects interrupts to be disabled on entry. */
+	if (idt_type(lo, hi) == 0xE)
+		set_guest_eflags_if(lite, 0);
+
+	/* Every time we deliver an interrupt, we update the timestamp in the
+	 * Guest's kvm_lite_data struct.  It would be better for the Guest if
+	 * we did this more often, but it can actually be quite slow: doing it
+	 * here is a compromise which means at least it gets updated every
+	 * timer interrupt. */
+	write_timestamp(lite);
+}
+
+static void set_page_fault(struct vcpu_lite *lite)
+{
+	struct desc_struct *pf_idt;
+
+	if (!lite->lite_data) {
+		kill_guest(lite, "Early page fault");
+		return;
+	}
+
+	pf_idt = &lite->idt[PF_VECTOR];
+	if (!idt_present(pf_idt->a, pf_idt->b)) {
+		kill_guest(lite, "No handler for #PF@%#lx %#lx",
+			   lite->vcpu.rip, lite->pf_addr);
+		return;
+	}
+
+	set_lite_data(lite, cr2, lite->pf_addr);
+	set_guest_interrupt(lite, pf_idt->a, pf_idt->b, true,
+			    lite->pf_errcode);
+	lite->wants_page_fault = false;
+}
+
+/* We can't simply inject the page fault now, since the emulator will
+ * set registers and overwrite it. */
+static void lite_inject_page_fault(struct kvm_vcpu *vcpu,
+				   unsigned long addr, u32 err_code)
+{
+	struct vcpu_lite *lite = to_lite(vcpu);
+
+	/* This can happen when set_page_fault writes to stack. */
+	if (lite->wants_page_fault) {
+		kill_guest(lite, "Recursive page fault");
+		return;
+	}
+
+	printk("pf inject for %#lx\n", addr);
+	lite->wants_page_fault = true;
+	lite->pf_addr = addr;
+	lite->pf_errcode = err_code;
+	if (!addr)
+		kill_guest(lite, "Page fault\n");
+}
+
+static void lite_inject_gp(struct kvm_vcpu *vcpu, u32 err_code)
+{
+	struct vcpu_lite *lite = to_lite(vcpu);
+
+	if (!lite->lite_data) {
+		kill_guest(lite, "Early #GP");
+		return;
+	}
+
+	printk("GP to %#lx %#lx\n",
+	       lite->idt[GP_VECTOR].a, lite->idt[GP_VECTOR].b);
+	set_guest_interrupt(lite, lite->idt[GP_VECTOR].a,
+			    lite->idt[GP_VECTOR].b, true, err_code);
+}
+
+static int dm_request_for_irq_injection(struct vcpu_lite *lite,
+					struct kvm_run *kvm_run)
+{
+	return (!lite->vcpu.irq_summary &&
+		kvm_run->request_interrupt_window &&
+		lite->vcpu.interrupt_window_open &&
+		get_guest_eflags_if(lite));
+}
+
+/* 0 or -errno means we stop running and return the userspace.  Positive means
+ * it's all good. */
+static int handle_page_fault(struct vcpu_lite *lite, struct kvm_run *run,
+			     unsigned long cr2, u32 errcode)
+{
+	int r;
+	unsigned long old_eip = lite->vcpu.rip;
+
+	if (cr2 < 0xc0000000 && cr2 > 0x8000000)
+		printk("Page fault at %#lx\n", cr2);
+
+	mutex_lock(&lite->vcpu.kvm->lock);
+	r = kvm_mmu_page_fault(&lite->vcpu, cr2, errcode);
+	if (old_eip != lite->vcpu.rip) {
+		printk("Woah!  Trap at %#lx (%#lx) moved to %#lx\n",
+		       old_eip, cr2, lite->vcpu.rip);
+	}
+
+	if (r > 0) {
+		/* FIXME: This is horrible.  Guest page table needs to be
+		 * updated: we use emulator */
+		enum emulation_result er;
+		er = emulate_instruction(&lite->vcpu, run, cr2, errcode);
+		mutex_unlock(&lite->vcpu.kvm->lock);
+
+//		printk("Emulate after pf says %u\n", er);
+		switch (er) {
+		case EMULATE_DONE:
+			return 1;
+		case EMULATE_DO_MMIO:
+			++lite->vcpu.stat.mmio_exits;
+			return 0;
+		case EMULATE_FAIL:
+			vcpu_printf(&lite->vcpu, "%s: emulate fail\n",
+				    __FUNCTION__);
+			break;
+		default:
+			BUG();
+		}
+		return -EIO;
+	} else
+		mutex_unlock(&lite->vcpu.kvm->lock);
+
+	if (r == 0)
+		return 1;
+
+	return r;
+}
+
+static int has_err(unsigned int trap)
+{
+	return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17);
+}
+
+/* FIXME: Put in x86_emulate.c.  This just skips over instructions. */
+static int skip_io(struct vcpu_lite *lite)
+{
+	u8 insn;
+	unsigned int insnlen = 0, in = 0, shift = 0;
+
+	/* Decoding x86 instructions is icky. */
+	if (emulator_read_std(lite->vcpu.rip, &insn, 1, &lite->vcpu)
+	    != X86EMUL_CONTINUE)
+		return 0;
+
+	/* 0x66 is an "operand prefix".  It means the in/out uses a 16-bit
+	   operand (%ax) rather than the full %eax register. */
+	if (insn == 0x66) {
+		shift = 16;
+		/* The instruction is 1 byte so far, read the next byte. */
+		insnlen = 1;
+		if (emulator_read_std(lite->vcpu.rip+insnlen,
+				      &insn, 1, &lite->vcpu)
+		    != X86EMUL_CONTINUE)
+			return 0;
+	}
+
+	/* We can ignore the lower bit for the moment and decode the 4 opcodes
+	 * we need to emulate. */
+	switch (insn & 0xFE) {
+	case 0xE4: /* in     <next byte>,%al */
+		insnlen += 2;
+		in = 1;
+		break;
+	case 0xEC: /* in     (%dx),%al */
+		insnlen += 1;
+		in = 1;
+		break;
+	case 0xE6: /* out    %al,<next byte> */
+		insnlen += 2;
+		break;
+	case 0xEE: /* out    %al,(%dx) */
+		insnlen += 1;
+		break;
+	default:
+		/* OK, we don't know what this is, can't emulate. */
+		return 0;
+	}
+
+	/* If it was an "IN" instruction, they expect the result to be read
+	 * into %eax, so we change %eax.  We always return all-ones, which
+	 * traditionally means "there's nothing there". */
+	if (in) {
+		/* Lower bit tells us whether it's a 16 or 32 bit access */
+		if (insn & 0x1)
+			lite->vcpu.regs[VCPU_REGS_RAX] = 0xFFFFFFFF;
+		else
+			lite->vcpu.regs[VCPU_REGS_RAX] |= (0xFFFF << shift);
+	}
+	/* Finally, we've "done" the instruction, so move past it. */
+	lite->vcpu.rip += insnlen;
+	/* Success! */
+	return 1;
+}
+
+/* FIXME: emulate_instruction() faults and doesn't update EIP. */
+static bool skip_invlpg(struct vcpu_lite *lite)
+{
+	const u8 invlpg[] = { 0x0f, 0x01, 0x38 };
+	u8 insn;
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(invlpg); i++) {
+		if (emulator_read_std(lite->vcpu.rip+i, &insn, 1, &lite->vcpu)
+		    != X86EMUL_CONTINUE)
+			return false;
+		if (insn != invlpg[i])
+			return false;
+	}
+	printk("invlpg %#lx!\n", lite->vcpu.rip);
+#if 0
+	lite->vcpu.rip += i;
+//	kvm_mmu_reset_context(&lite->vcpu);
+#endif
+	return true;
+}
+
+static enum emulation_result emulate(struct vcpu_lite *lite,
+				     struct kvm_run *run,
+				     unsigned long cr2, u16 errcode)
+{
+	/* For safety, we do not emulate non-kernel commands */
+	if ((lite->sregs[VCPU_SREG_CS] & SEGMENT_RPL_MASK) != GUEST_PL)
+		return EMULATE_FAIL;
+
+	printk("Emulating @%lx: esp = %#lx\n", lite->vcpu.rip,
+		lite->vcpu.regs[VCPU_REGS_RSP]);
+
+	if (skip_io(lite))
+		return EMULATE_DONE;
+
+	if (skip_invlpg(lite)) {
+		enum emulation_result er;
+		er = emulate_instruction(&lite->vcpu, run, cr2, errcode);
+		printk("EIP after emulate = %#lx\n", lite->vcpu.rip);
+		return er;
+	}
+
+	return emulate_instruction(&lite->vcpu, run, cr2, errcode);
+}
+
+static void lite_set_stack(struct vcpu_lite *lite,
+			   u32 seg, u32 esp, unsigned int pages)
+{
+	/* You are not allowed to have a stack segment with privilege level 0:
+	 * bad Guest! */
+	if ((seg & 0x3) != GUEST_PL)
+		kill_guest(lite, "bad stack segment %i", seg);
+	/* We only expect one or two stack pages. */
+	if (pages > 2)
+		kill_guest(lite, "bad stack pages %u", pages);
+
+	/* Save where the stack is */
+	lite->kstack_ss = seg;
+	lite->kstack = esp;
+
+	/* FIXME: When we want to deliver traps directly, we must ensure that
+	 * the stack is always mapped. */
+#if 0
+	lg->stack_pages = pages;
+	/* Make sure the new stack pages are mapped */
+	pin_stack_pages(lg);
+#endif
+}
+
+static int do_hypercall(struct vcpu_lite *lite, struct kvm_run *run)
+{
+	unsigned long *regs = lite->vcpu.regs;
+
+	if (!lite->lite_data) {
+		kill_guest(lite, "Early hypercall");
+		return 0;
+	}
+
+	switch (regs[VCPU_REGS_RBX]) {
+	case KVM_HCALL_LOAD_IDT_ENTRY:
+		load_idt_entry(lite, regs[VCPU_REGS_RAX],
+			       regs[VCPU_REGS_RCX], regs[VCPU_REGS_RDX]);
+		return 1;
+
+	case KVM_HCALL_LOAD_GDT_ENTRY: {
+		/* FIXME */
+		struct desc_struct d;
+		d.a = regs[VCPU_REGS_RCX];
+		d.b = regs[VCPU_REGS_RDX];
+		load_gdt_entry(lite, regs[VCPU_REGS_RAX],
+			       (struct segment_descriptor *)&d);
+		return 1;
+	}
+	case KVM_HCALL_SET_STACK:
+		lite_set_stack(lite, regs[VCPU_REGS_RAX],
+			       regs[VCPU_REGS_RCX], regs[VCPU_REGS_RDX]);
+		return 1;
+
+	case KVM_HCALL_HALT:
+		/* Re-enable interrupts, then try to halt. */
+		printk("Setting EFLAGS for halt\n");
+		set_lite_data(lite, irq_enabled, X86_EFLAGS_IF);
+		return kvm_emulate_halt(&lite->vcpu);
+
+	case KVM_HCALL_HACK_WRITE:
+		return kvm_emulate_pio(&lite->vcpu, run, 0, 1, 1);
+
+	case KVM_HCALL_SET_CLOCKEVENT:
+		return kvm_emulate_pio(&lite->vcpu, run, 0, 4, 2);
+
+	default:
+		kill_guest(lite, "Hypercall %lu\n", regs[VCPU_REGS_RBX]);
+		return -ENOENT;
+	}
+	return 0;
+}
+
+/* 0 or -errno means we stop running and return the userspace.  Positive means
+ * it's all good. */
+static int handle_exit(struct vcpu_lite *lite, struct kvm_run *run,
+		       unsigned trapnum, unsigned errcode, unsigned long cr2)
+{
+	/* OK, so what happened? */
+	switch (trapnum) {
+	case GP_VECTOR: /* We've intercepted a GPF. */
+		switch (emulate(lite, run, cr2, errcode)) {
+		case EMULATE_DONE:
+			return 1;
+		case EMULATE_DO_MMIO:
+			kill_guest(lite, "Doing MMIO!\n");
+			return 0;
+		default:
+			kill_guest(lite, "Emulate failed for %#lx",
+				   lite->vcpu.rip);
+		}
+		break;
+
+	case PF_VECTOR:
+		/* If this wants page fault set to guest, it does it explicitly
+		 * via ->inject_page_fault. */
+		return handle_page_fault(lite, run, cr2, errcode);
+
+	case NM_VECTOR:
+		/* Always reflect this. */
+		break;
+
+	case KVM_LITE_HCALL_TRAP:
+		return do_hypercall(lite, run);
+
+	case FIRST_EXTERNAL_VECTOR ... KVM_NR_INTERRUPTS-1:
+		/* FIXME: Direct system calls get rid of this. */
+		if (trapnum == SYSCALL_VECTOR) {
+			printk("System call %lu\n",
+			       lite->vcpu.regs[VCPU_REGS_RAX]);
+			break;
+		}
+
+		/* An external interrupt, already delivered. */
+		lite->vcpu.stat.irq_exits++;
+		return 1;
+
+	case 256:
+		kill_guest(lite, "Guest faulted in switcher");
+		return -ENOENT;
+	}
+
+	/* We re-inject fault into guest. */
+	if (!idt_present(lite->idt[trapnum].a, lite->idt[trapnum].b)) {
+		kill_guest(lite, "No handler for trap %u", trapnum);
+		run->exit_reason = KVM_EXIT_EXCEPTION;
+		return -EIO;
+	}
+	set_guest_interrupt(lite, lite->idt[trapnum].a,
+			    lite->idt[trapnum].b, has_err(trapnum), errcode);
+	return 1;
+}
+
+static void maybe_do_interrupt(struct vcpu_lite *lite,
+			       const struct kvm_run *run)
+{
+	struct desc_struct *idt;
+
+	lite->vcpu.interrupt_window_open = get_guest_eflags_if(lite);
+
+	if (!lite->vcpu.interrupt_window_open || !lite->vcpu.irq_summary)
+		return;
+
+	/* Look at the IDT entry the Guest gave us for this interrupt. */
+	idt = &lite->idt[kvm_pop_irq(&lite->vcpu)];
+
+	if (lite->vcpu.rip >= 0xc0103070 && lite->vcpu.rip < 0xc01031a0)
+		printk("Interrupt in __switch_to!\n");
+
+	/* If they don't have a handler (yet?), we just ignore it */
+	if (idt_present(idt->a, idt->b)) {
+		/* set_guest_interrupt() takes the interrupt descriptor and a
+		 * flag to say whether this interrupt pushes an error code onto
+		 * the stack as well: virtual interrupts never do. */
+		set_guest_interrupt(lite, idt->a, idt->b, 0, 0);
+	}
+}
+
+static int lite_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+	struct vcpu_lite *lite = to_lite(vcpu);
+	int r;
+	unsigned long trapnum, errcode, cr2 = 0; /* Damn gcc */
+	/* This is a dummy value we need for GCC's sake. */
+	unsigned int clobber;
+
+again:
+	if (unlikely(lite->dead)) {
+		r = -ENOENT;
+		goto out;
+	}
+
+	r = kvm_mmu_reload(vcpu);
+	if (unlikely(r))
+		goto out;
+
+	/* Set up the Guest's page tables to see this CPU's pages (and no
+	 * other CPU's pages). */
+	kvm_use_nailed_mappings(vcpu);
+
+	if (lite->wants_page_fault)
+		set_page_fault(lite);
+
+	if (!vcpu->mmio_read_completed)
+		maybe_do_interrupt(lite, run);
+
+	kvm_load_guest_fpu(vcpu);
+
+#if 0
+	if (trapnum < KVM_LITE_HCALL_TRAP)
+		dump_regs("Entering guest", -1, lite);
+#endif
+
+	/* OK, now we're ready to jump into the Guest.  First we put up
+	 * the "Do Not Disturb" sign: */
+	local_irq_disable();
+
+	/* SYSENTER is an optimized way of doing system calls.  We
+	 * can't allow it because it always jumps to privilege level 0.
+	 * A normal Guest won't try it because we don't advertise it in
+	 * CPUID, but a malicious Guest (or malicious Guest userspace
+	 * program) could, so we tell the CPU to disable it before
+	 * running the Guest. */
+	/* FIXME: move to vcpu_load */
+	if (boot_cpu_has(X86_FEATURE_SEP))
+		wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);
+
+	pre_lite_run(lite);
+
+	/* Set trap to impossible number so we know if Switcher faulted. */
+	lite->curr_pages->regs.trapnum = 256;
+
+	/* Save the current Host top-level page directory. */
+	lite->curr_pages->state.host_cr3 = __pa(current->mm->pgd);
+
+	/* Now: we push the "eflags" register on the stack, then do an "lcall".
+	 * This is how we change from using the kernel code segment to using
+	 * the dedicated lite code segment, as well as jumping into the
+	 * Switcher.
+	 *
+	 * The lcall also pushes the old code segment (KERNEL_CS) onto the
+	 * stack, then the address of this call.  This stack layout happens to
+	 * exactly match the stack of an interrupt... */
+	asm volatile("pushf; lcall *lite_entry"
+		     /* This is how we tell GCC that %eax ("a") and %ebx ("b")
+		      * are changed by this routine.  The "=" means output. */
+		     : "=a"(clobber), "=b"(clobber)
+		     /* %eax contains the pages pointer.  ("0" refers to the
+		      * 0-th argument above, ie "a").  %ebx contains the
+		      * physical address of the Guest's top-level page
+		      * directory. */
+		     : "0"(lite->curr_pages), "1"(lite->pgtable)
+		     /* We tell gcc that all these registers could change,
+		      * which means we don't have to save and restore them in
+		      * the Switcher. */
+		     : "memory", "%edx", "%ecx", "%edi", "%esi");
+
+	trapnum = lite->curr_pages->regs.trapnum;
+	errcode = lite->curr_pages->regs.errcode;
+
+	/* If the Guest page faulted, then the cr2 register will tell us the
+	 * bad virtual address.  We have to grab this now, because once we
+	 * re-enable interrupts an interrupt could fault and thus overwrite
+	 * cr2, or we could even move off to a different CPU. */
+	if (trapnum == 14)
+		cr2 = read_cr2();
+
+	/* Restore SYSENTER if it's supposed to be on. */
+	if (boot_cpu_has(X86_FEATURE_SEP))
+		wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
+
+	post_lite_run(lite);
+
+	/* Now we're ready to be interrupted or moved to other CPUs */
+	local_irq_enable();
+
+	if (trapnum < KVM_LITE_HCALL_TRAP)
+		dump_regs("Exiting guest", trapnum, lite);
+
+	/* Remove nailed entries from page tables. */
+	kvm_remove_nailed_mappings(vcpu);
+
+	/* Positive means we've handled it, go around again. */
+	r = handle_exit(lite, run, trapnum, errcode, cr2);
+	if (r > 0) {
+		/* Give scheduler a chance to reschedule. */
+		if (signal_pending(current)) {
+			r = -EINTR;
+			run->exit_reason = KVM_EXIT_INTR;
+			++vcpu->stat.signal_exits;
+			goto out;
+		}
+		if (dm_request_for_irq_injection(lite, run)) {
+			r = -EINTR;
+			run->exit_reason = KVM_EXIT_INTR;
+			++vcpu->stat.request_irq_exits;
+			goto out;
+		}
+		kvm_resched(vcpu);
+		goto again;
+	}
+
+out:
+	run->ready_for_interrupt_injection
+		= (vcpu->interrupt_window_open && vcpu->irq_summary == 0);
+	run->if_flag = get_guest_eflags_if(lite);
+#ifdef CONFIG_X86_64
+#error FIXME: cr8 handling
+#endif
+	run->apic_base = vcpu->apic_base;
+	return r;
+}
+
+/* This is called after our "fake" IO, so we don't do anything. */
+static void lite_skip_emulated(struct kvm_vcpu *vcpu)
+{
+#if 0
+	printk("how to skip emulated insn %#lx?", vcpu->rip);
+	WARN_ON(1);
+#endif
+}
+
+static void lite_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *insns)
+{
+	/* "int <lite-trap-entry>". */
+	insns[0] = 0xcd;
+	insns[1] = KVM_LITE_HCALL_TRAP;
+
+	/* GENERIC_NOP1 */
+	insns[2] = 0x90;
+}
+
+static struct kvm_arch_ops lite_arch_ops = {
+	.cpu_has_kvm_support = lite_is_always_supported,
+	.disabled_by_bios = you_cant_stop_the_lite,
+	.hardware_enable = hardware_enable,
+	.hardware_disable = hardware_disable,
+	.check_processor_compatibility = check_processor_compatibility,
+	.hardware_setup = map_switcher,
+	.hardware_unsetup = unmap_switcher,
+	.vcpu_create = lite_vcpu_create,
+	.vcpu_free = lite_vcpu_free,
+	.vcpu_load = lite_vcpu_load,
+	.vcpu_put = lite_vcpu_put,
+	.vcpu_decache = lite_vcpu_decache,
+
+	.set_guest_debug = lite_guest_debug,
+	.get_msr = kvm_get_msr_common,
+	.set_msr = kvm_set_msr_common,
+	.get_segment_base = lite_get_segment_base,
+	.get_segment = lite_get_segment,
+	.set_segment = lite_set_segment,
+	.get_cs_db_l_bits = generic_get_cs_db_l_bits,
+	.decache_cr4_guest_bits = lite_decache_cr4_guest_bits,
+
+	.set_cr0 = lite_set_cr0,
+	.set_cr3 = lite_set_cr3,
+	.set_cr4 = lite_set_cr4,
+	.get_idt = lite_get_idt,
+	.set_idt = lite_set_idt,
+	.get_gdt = lite_get_gdt,
+	.set_gdt = lite_set_gdt,
+	.get_dr = lite_get_dr,
+	.set_dr = lite_set_dr,
+	.cache_regs = lite_cache_regs,
+	.decache_regs = lite_decache_regs,
+	.get_rflags = lite_get_rflags,
+	.set_rflags = lite_set_rflags,
+	.tlb_flush = lite_flush_tlb,
+
+	.inject_page_fault = lite_inject_page_fault,
+	.inject_gp = lite_inject_gp,
+
+	.run = lite_run,
+	.skip_emulated_instruction = lite_skip_emulated,
+	.patch_hypercall = lite_patch_hypercall,
+};
+
+static int __init lite_init(void)
+{
+	return kvm_init_arch(&lite_arch_ops, sizeof(struct vcpu_lite),
+			      THIS_MODULE);
+}
+
+static void __exit lite_exit(void)
+{
+	kvm_exit_arch();
+}
+
+module_init(lite_init);
+module_exit(lite_exit);
+MODULE_LICENSE("GPL");
diff -r 039995825488 drivers/kvm/lite.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/drivers/kvm/lite.h	Sun Sep 02 10:29:54 2007 +1000
@@ -0,0 +1,65 @@
+#ifndef __KVM_LITE_H
+#define __KVM_LITE_H
+
+#define GDT_ENTRY_LITE_CS	10
+#define GDT_ENTRY_LITE_DS	11
+#define LITE_CS			(GDT_ENTRY_LITE_CS * 8)
+#define LITE_DS			(GDT_ENTRY_LITE_DS * 8)
+
+#define KVM_LITE_HCALL_TRAP 0x1F
+
+#ifndef __ASSEMBLY__
+#include "kvm.h"
+#include "segment_descriptor.h"
+
+/* Full 4G segment descriptors, suitable for CS and DS. */
+#define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00})
+#define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300})
+
+struct lite_regs
+{
+	/* Manually saved part. */
+	unsigned long ebx, ecx, edx;
+	unsigned long esi, edi, ebp;
+	unsigned long gs;
+	unsigned long eax;
+	unsigned long fs, ds, es;
+	unsigned long trapnum, errcode;
+	/* Trap pushed part */
+	unsigned long eip;
+	unsigned long cs;
+	unsigned long eflags;
+	unsigned long esp;
+	unsigned long ss;
+};
+
+/* This is a guest-specific page (mapped ro) into the guest. */
+struct lite_ro_state
+{
+	/* Host information we need to restore when we switch back. */
+	u32 host_cr3;
+	struct descriptor_table host_idt_desc;
+	struct descriptor_table host_gdt_desc;
+	u32 host_sp;
+
+	/* Fields which are used when guest is running. */
+	struct descriptor_table guest_idt_desc;
+	struct descriptor_table guest_gdt_desc;
+	struct i386_hw_tss guest_tss;
+	struct desc_struct guest_idt[IDT_ENTRIES];
+	struct segment_descriptor guest_gdt[GDT_ENTRIES];
+};
+
+/* We have two pages shared with guests, per cpu.  */
+struct lite_pages
+{
+	/* This is the stack page mapped rw in guest */
+	char spare[PAGE_SIZE - sizeof(struct lite_regs)];
+	struct lite_regs regs;
+
+	/* This is the host state & guest descriptor page, ro in guest */
+	struct lite_ro_state state;
+} __attribute__((aligned(PAGE_SIZE)));
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __KVM_LITE_H */
diff -r 039995825488 drivers/kvm/lite_guest.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/drivers/kvm/lite_guest.c	Wed Sep 05 11:46:18 2007 +1000
@@ -0,0 +1,527 @@
+#include <linux/init.h>
+#include <linux/kvm_lite.h>
+#include <linux/start_kernel.h>
+#include <linux/console.h>
+#include <linux/irq.h>
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+#include <asm/page.h>
+#include <asm/bootparam.h>
+#include <asm/paravirt.h>
+#include <asm/setup.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/sections.h>
+#include <asm/fixmap.h>
+#include <asm/mce.h>
+#include "../char/hvc_console.h"
+
+/* This is accessed by lite_guest_asm.S */
+struct kvm_lite_data kvm_lite_data;
+
+static cycle_t clock_base;
+#define LITE_CLOCK_MIN_DELTA	100UL
+#define LITE_CLOCK_MAX_DELTA	ULONG_MAX
+
+/* This is inside lite_guest_asm.S. */
+extern void kvm_lite_iret(void);
+
+static unsigned long hcall(unsigned long call,
+			   unsigned long arg1,
+			   unsigned long arg2,
+			   unsigned long arg3)
+{
+	/* FIXME: VMCALL does #GP, but we can't hand all #GP to emulator 8( */
+	asm volatile("int $0x1f; nop" /* Must be 3 bytes */
+		     : "=a"(arg1)
+		     : "b"(call), "a"(arg1), "c"(arg2), "d"(arg3)
+		     : "memory");
+	return arg1;
+}
+
+/* save_flags() is expected to return the processor state (ie. "eflags").  The
+ * eflags word contains all kind of stuff, but in practice Linux only cares
+ * about the interrupt flag.  Our "save_flags()" just returns that. */
+static unsigned long save_fl(void)
+{
+	return kvm_lite_data.irq_enabled;
+}
+
+/* "restore_flags" just sets the flags back to the value given. */
+static void restore_fl(unsigned long flags)
+{
+	kvm_lite_data.irq_enabled = flags;
+}
+
+/* Interrupts go off... */
+static void irq_disable(void)
+{
+	kvm_lite_data.irq_enabled = 0;
+}
+
+/* Interrupts go on... */
+static void irq_enable(void)
+{
+	kvm_lite_data.irq_enabled = X86_EFLAGS_IF;
+}
+
+/* Note that these assume we're writing to the active IDT/GDT. */
+static void lite_write_idt_entry(struct desc_struct *dt,
+				 int entrynum, u32 low, u32 high)
+{
+	/* Keep the local copy up to date. */
+	write_dt_entry(dt, entrynum, low, high);
+	/* Tell Host about this new entry. */
+	hcall(KVM_HCALL_LOAD_IDT_ENTRY, entrynum, low, high);
+}
+
+static void lite_write_gdt_entry(struct desc_struct *dt,
+				 int entrynum, u32 low, u32 high)
+{
+	write_dt_entry(dt, entrynum, low, high);
+	hcall(KVM_HCALL_LOAD_GDT_ENTRY, entrynum, low, high);
+}
+
+/* We always tell host where our kvm_lite_data is using base of entry
+ * 0 in GDT. */
+static void lite_load_gdt(const struct Xgt_desc_struct *desc)
+{
+	struct desc_struct *gdt = (void *)desc->address;
+
+	gdt[0].a = ((unsigned long)&kvm_lite_data << 16);
+	gdt[0].b = (((unsigned long)&kvm_lite_data >> 16) & 0xFF)
+		| ((unsigned long)&kvm_lite_data & 0xFF000000);
+	native_load_gdt(desc);
+}
+
+/* CR2 is the virtual address of the last page fault, which the Guest only ever
+ * reads.  The Host kindly writes this into our "struct kvm_lite_data", so we
+ * just read it out of there. */
+static unsigned long lite_read_cr2(void)
+{
+	return kvm_lite_data.cr2;
+}
+
+static void lite_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+	unsigned int i;
+	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+
+	/* There's one problem which normal hardware doesn't have: the Host
+	 * can't handle us removing entries we're currently using.  So we clear
+	 * the GS register here: if it's needed it'll be reloaded anyway. */
+	loadsegment(gs, 0);
+
+	for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
+		lite_write_gdt_entry(gdt, GDT_ENTRY_TLS_MIN + i,
+				     t->tls_array[i].a, t->tls_array[i].b);
+}
+
+/* FIXME: Host should decide what we report here. */
+static void lite_cpuid(unsigned int *eax, unsigned int *ebx,
+		       unsigned int *ecx, unsigned int *edx)
+{
+	int function = *eax;
+
+	native_cpuid(eax, ebx, ecx, edx);
+	switch (function) {
+	case 1:	/* Basic feature request. */
+		/* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
+		*ecx &= 0x00002201;
+		/* SSE, SSE2, FXSR, MMX, CMOV, PGE, CMPXCHG8B, PAE, PSE, FPU. */
+		*edx &= 0x0780A149;
+		break;
+	case 0x80000000:
+		/* Futureproof this a little: if they ask how much extended
+		 * processor information there is, limit it to known fields. */
+		if (*eax > 0x80000008)
+			*eax = 0x80000008;
+		break;
+	}
+}
+
+static void lite_load_esp0(struct tss_struct *tss,
+				     struct thread_struct *thread)
+{
+	hcall(KVM_HCALL_SET_STACK, __KERNEL_DS|0x1,
+	      thread->esp0, THREAD_SIZE/PAGE_SIZE);
+}
+
+static void lite_safe_halt(void)
+{
+	hcall(KVM_HCALL_HALT, 0, 0, 0);
+}
+
+static void disable_lite_irq(unsigned int irq)
+{
+#if 0 /* FIXME */
+	set_bit(irq, lguest_data.blocked_interrupts);
+#endif
+}
+
+static void enable_lite_irq(unsigned int irq)
+{
+#if 0 /* FIXME */
+	clear_bit(irq, lguest_data.blocked_interrupts);
+#endif
+}
+
+/* This structure describes the IRQ controller. */
+static struct irq_chip lite_irq_controller = {
+	.name		= "kvm-lite",
+	.mask		= disable_lite_irq,
+	.mask_ack	= disable_lite_irq,
+	.unmask		= enable_lite_irq,
+};
+
+/* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
+ * interrupt (except 128, which is used for system calls), and then tells the
+ * Linux infrastructure that each interrupt is controlled by our level-based
+ * interrupt controller. */
+static void __init lite_init_IRQ(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < 256 - FIRST_EXTERNAL_VECTOR; i++) {
+		int vector = FIRST_EXTERNAL_VECTOR + i;
+		if (vector != SYSCALL_VECTOR) {
+			set_intr_gate(vector, interrupt[i]);
+			set_irq_chip_and_handler(i, &lite_irq_controller,
+						 handle_level_irq);
+		}
+	}
+	/* This call is required to set up for 4k stacks, where we have
+	 * separate stacks for hard and soft interrupts. */
+	irq_ctx_init(smp_processor_id());
+}
+
+/*
+ * Time.
+ *
+ * It would be far better for everyone if the Guest had its own clock, but
+ * until then the Host gives us the time on every interrupt.
+ */
+static unsigned long lite_get_wallclock(void)
+{
+	return kvm_lite_data.time.tv_sec;
+}
+
+static cycle_t lite_clock_read(void)
+{
+	unsigned long sec, nsec;
+
+	/* We read the time value written by the Host.  Since it's in two parts
+	 * (seconds and nanoseconds), we risk reading it just as it's changing
+	 * from 99 & 0.999999999 to 100 and 0, and getting 99 and 0.  As Linux
+	 * tends to come apart under the stress of time travel, we must be
+	 * careful: */
+	do {
+		/* First we read the seconds part. */
+		sec = kvm_lite_data.time.tv_sec;
+		/* This read memory barrier tells the compiler and the CPU that
+		 * this can't be reordered: we have to complete the above
+		 * before going on. */
+		rmb();
+		/* Now we read the nanoseconds part. */
+		nsec = kvm_lite_data.time.tv_nsec;
+		/* Make sure we've done that. */
+		rmb();
+		/* Now if the seconds part has changed, try again. */
+	} while (unlikely(kvm_lite_data.time.tv_sec != sec));
+
+	/* Our non-TSC clock is in real nanoseconds. */
+	return sec*1000000000ULL + nsec;
+}
+
+/* This is what we tell the kernel is our clocksource.  */
+static struct clocksource lite_clock = {
+	.name		= "kvm-lite",
+	.rating		= 400,
+	.read		= lite_clock_read,
+	.mask		= CLOCKSOURCE_MASK(64),
+	.mult		= 1 << 22,
+	.shift		= 22,
+};
+
+/* The "scheduler clock" is just our real clock, adjusted to start at zero */
+static unsigned long long lite_sched_clock(void)
+{
+	return cyc2ns(&lite_clock, lite_clock_read() - clock_base);
+}
+
+/* We also need a "struct clock_event_device": Linux asks us to set it to go
+ * off some time in the future.  Actually, James Morris figured all this out, I
+ * just applied the patch. */
+static int lite_clockevent_set_next_event(unsigned long delta,
+					  struct clock_event_device *evt)
+{
+	if (delta < LITE_CLOCK_MIN_DELTA) {
+		if (printk_ratelimit())
+			printk(KERN_DEBUG "%s: small delta %lu ns\n",
+			       __FUNCTION__, delta);
+		return -ETIME;
+	}
+	hcall(KVM_HCALL_SET_CLOCKEVENT, delta, 0, 0);
+	return 0;
+}
+
+static void lite_clockevent_set_mode(enum clock_event_mode mode,
+				     struct clock_event_device *evt)
+{
+	switch (mode) {
+	case CLOCK_EVT_MODE_UNUSED:
+	case CLOCK_EVT_MODE_SHUTDOWN:
+		/* A 0 argument shuts the clock down. */
+		hcall(KVM_HCALL_SET_CLOCKEVENT, 0, 0, 0);
+		break;
+	case CLOCK_EVT_MODE_ONESHOT:
+		/* This is what we expect. */
+		break;
+	case CLOCK_EVT_MODE_PERIODIC:
+		BUG();
+	case CLOCK_EVT_MODE_RESUME:
+		break;
+	}
+}
+
+/* This describes our primitive timer chip. */
+static struct clock_event_device lite_clockevent = {
+	.name                   = "kvm-lite",
+	.features               = CLOCK_EVT_FEAT_ONESHOT,
+	.set_next_event         = lite_clockevent_set_next_event,
+	.set_mode               = lite_clockevent_set_mode,
+	.rating                 = INT_MAX,
+	.mult                   = 1,
+	.shift                  = 0,
+	.min_delta_ns           = LITE_CLOCK_MIN_DELTA,
+	.max_delta_ns           = LITE_CLOCK_MAX_DELTA,
+};
+
+/* This is the Guest timer interrupt handler (hardware interrupt 0).  We just
+ * call the clockevent infrastructure and it does whatever needs doing. */
+static void lite_time_irq(unsigned int irq, struct irq_desc *desc)
+{
+	unsigned long flags;
+
+	/* Don't interrupt us while this is running. */
+	local_irq_save(flags);
+	lite_clockevent.event_handler(&lite_clockevent);
+	local_irq_restore(flags);
+}
+
+/* At some point in the boot process, we get asked to set up our timing
+ * infrastructure.  The kernel doesn't expect timer interrupts before this, but
+ * we cleverly initialized the "blocked_interrupts" field of "struct
+ * kvm_lite_data" so that timer interrupts were blocked until now. */
+static void lite_time_init(void)
+{
+	/* Set up the timer interrupt (0) to go to our simple timer routine */
+	set_irq_handler(0, lite_time_irq);
+
+	clock_base = lite_clock_read();
+	clocksource_register(&lite_clock);
+
+	/* Now we've set up our clock, we can use it as the scheduler clock */
+	paravirt_ops.sched_clock = lite_sched_clock;
+
+	/* We can't set cpumask in the initializer: damn C limitations!  Set it
+	 * here and register our timer device. */
+	lite_clockevent.cpumask = cpumask_of_cpu(0);
+	clockevents_register_device(&lite_clockevent);
+
+	/* Finally, we unblock the timer interrupt. */
+	enable_lite_irq(0);
+}
+
+/* FIXME */
+static unsigned lite_patch(u8 type, u16 clobbers, void *insnbuf,
+			   unsigned long addr, unsigned len)
+{
+	return len;
+}
+
+static int lite_panic(struct notifier_block *nb, unsigned long l, void *p)
+{
+	hcall(9999, __pa(p), 0, 0);
+	/* The hcall won't return, but to keep gcc happy, we're "done". */
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block paniced = {
+	.notifier_call = lite_panic
+};
+
+/* FIXME: we only need this because copy_e820_map disbelieves our 1-element
+ * memory map.  If we had emulated VGA, our mem map would probably pass. */
+static __init char *lite_memory_setup(void)
+{
+	atomic_notifier_chain_register(&panic_notifier_list, &paniced);
+
+	add_memory_region(E820_MAP->addr, E820_MAP->size, E820_MAP->type);
+
+	/* This string is for the boot messages. */
+	return "kvm-lite";
+}
+
+static int put_chars(u32 vtermno, const char *buf, int count)
+{
+	unsigned int i;
+
+	for (i = 0; i < count; i++)
+		hcall(KVM_HCALL_HACK_WRITE, buf[i], 0, 0);
+
+	/* We're expected to return the amount of data we wrote: all of it. */
+	return count;
+}
+
+static int get_chars(u32 vtermno, char *buf, int count)
+{
+	return 0;
+}
+
+static struct hv_ops lite_cons = {
+	.get_chars = get_chars,
+	.put_chars = put_chars,
+};
+
+static int __init cons_init(void)
+{
+	if (strcmp(paravirt_ops.name, "kvm-lite") != 0)
+		return 0;
+
+	return hvc_instantiate(0, 0, &lite_cons);
+}
+console_initcall(cons_init);
+
+/* The standard init function */
+static int __init hvc_lite_init(void)
+{
+	hvc_alloc(0, 0, &lite_cons, 256);
+	return 0;
+}
+module_init(hvc_lite_init);
+
+void lite_trace_on(void)
+{
+	kvm_lite_data.trace = 1;
+}
+
+static void lite_set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+	/* FIXME: This is the UP version, works around a qemu cmpxchg8b bug */
+	*pmdp = pmd;
+}
+
+static void lite_set_pte_atomic(pte_t *ptep, pte_t pteval)
+{
+	/* FIXME: This is the UP version, works around a qemu cmpxchg8b bug */
+	*ptep = pteval;
+}
+
+/* The boot parameters also tell us where the command-line is: save it. */
+static __init void copy_cmdline(void)
+{
+	const char *cmdline;
+
+	/* QEMU uses an old boot protocol version: hardcoded cmdline addr */
+	if (boot_params.hdr.cmd_line_ptr == 0) {
+		u16 *cl_offset = __va(OLD_CL_OFFSET);
+		cmdline = __va(OLD_CL_BASE_ADDR) + *cl_offset;
+	} else
+		cmdline = __va(boot_params.hdr.cmd_line_ptr);
+
+	memcpy(boot_command_line, cmdline, COMMAND_LINE_SIZE);
+}
+
+__init void lite_init(void *boot)
+{
+	extern struct Xgt_desc_struct boot_gdt_descr;
+
+	/* Ensures host knows where our kvm_lite_data is */
+	lite_load_gdt(&boot_gdt_descr);
+
+	/* Copy boot parameters first: the Launcher put the physical location
+	 * in %esi, and lite_guest_asm.S converted that to a virtual address
+	 * and handed it to us. */
+	memcpy(&boot_params, boot, PARAM_SIZE);
+
+	copy_cmdline();
+
+	/* We're under kvm-lite, paravirt is enabled, and we're running at
+	 * privilege level 1, not 0 as normal. */
+	paravirt_ops.name = "kvm-lite";
+	paravirt_ops.paravirt_enabled = 1;
+	paravirt_ops.kernel_rpl = 1;
+
+	paravirt_ops.save_fl = save_fl;
+	paravirt_ops.restore_fl = restore_fl;
+	paravirt_ops.irq_disable = irq_disable;
+	paravirt_ops.irq_enable = irq_enable;
+	paravirt_ops.cpuid = lite_cpuid;
+	paravirt_ops.iret = kvm_lite_iret;
+	paravirt_ops.load_esp0 = lite_load_esp0;
+	paravirt_ops.load_tls = lite_load_tls;
+	paravirt_ops.read_cr2 = lite_read_cr2;
+	paravirt_ops.safe_halt = lite_safe_halt;
+	paravirt_ops.write_gdt_entry = lite_write_gdt_entry;
+	paravirt_ops.write_idt_entry = lite_write_idt_entry;
+	paravirt_ops.iret = kvm_lite_iret;
+	paravirt_ops.memory_setup = lite_memory_setup;
+	paravirt_ops.init_IRQ = lite_init_IRQ;
+	paravirt_ops.get_wallclock = lite_get_wallclock;
+	paravirt_ops.time_init = lite_time_init;
+	paravirt_ops.patch = lite_patch;
+	paravirt_ops.set_pmd = lite_set_pmd;
+	paravirt_ops.set_pte_atomic = lite_set_pte_atomic;
+
+	/* FIXME: If the emulator handled ltr, we wouldn't need this.  But we
+	 * probably want to suppress it and simply write the tr value into the
+	 * kvm_lite_para. */
+	paravirt_ops.load_tr_desc = paravirt_nop;
+	/* FIXME */
+	paravirt_ops.set_ldt = paravirt_nop;
+
+	/* Load the %fs segment register (the per-cpu segment register) with
+	 * the normal data segment to get through booting. */
+	asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
+
+	/* The Host uses the top of the Guest's virtual address space for the
+	 * Host<->Guest Switcher, and it tells us how much it needs in
+	 * lite.reserve_mem, set up by the initial wrmsr. */
+	reserve_top_address(kvm_lite_data.reserve_mem);
+
+	/* If we don't initialize the lock dependency checker now, it crashes
+	 * paravirt_disable_iospace. */
+	lockdep_init();
+
+	/* FIXME: If we want emulated devices, remove this. */
+	/* The IDE code spends about 3 seconds probing for disks: if we reserve
+	 * all the I/O ports up front it can't get them and so doesn't probe.
+	 * Other device drivers are similar (but less severe).  This cuts the
+	 * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. */
+	paravirt_disable_iospace();
+
+	/* This is messy CPU setup stuff which the native boot code does before
+	 * start_kernel, so we have to do, too: */
+	cpu_detect(&new_cpu_data);
+	/* head.S usually sets up the first capability word, so do it here. */
+	new_cpu_data.x86_capability[0] = cpuid_edx(1);
+
+	/* Math is always hard! */
+	new_cpu_data.hard_math = 1;
+
+#ifdef CONFIG_X86_MCE
+	mce_disabled = 1;
+#endif
+#ifdef CONFIG_ACPI
+	acpi_disabled = 1;
+	acpi_ht = 0;
+#endif
+
+	/* We set the preferred console to "hvc" */
+	add_preferred_console("hvc", 0, NULL);
+
+	/* Now we're set up, call start_kernel() in init/main.c and we proceed
+	 * to boot as normal.  It never returns. */
+	start_kernel();
+}
diff -r 039995825488 drivers/kvm/lite_guest_asm.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/drivers/kvm/lite_guest_asm.S	Wed Sep 05 11:46:48 2007 +1000
@@ -0,0 +1,92 @@
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/processor-flags.h>
+
+LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)
+
+#if PTRS_PER_PMD > 1
+PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD
+#else
+PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD)
+#endif
+BOOTBITMAP_SIZE = LOW_PAGES / 8
+ALLOCATOR_SLOP = 4
+
+INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm
+
+/* This is where we begin: head.S notes that the boot header's platform
+ * type field is "2" (kvm-lite), so calls us here.  The boot header is in %esi.
+ *
+ * WARNING: be very careful here!  We're running at addresses equal to physical
+ * addresses (around 0), not above PAGE_OFFSET as most code expects
+ * (eg. 0xC0000000).  Jumps are relative, so they're OK, but we can't touch any
+ * data.
+ *
+ * The .section line puts this code in .init.text so it will be discarded after
+ * boot. */
+.section .init.text, "ax", @progbits
+ENTRY(kvm_lite_init_asm)
+	/* Clear BSS first so that there are no surprises... */
+	xorl %eax,%eax
+	movl $__bss_start - __PAGE_OFFSET,%edi
+	movl $__bss_stop - __PAGE_OFFSET,%ecx
+	subl %edi,%ecx
+	shrl $2,%ecx
+	rep ; stosl
+
+	/* Set up swapper_pg_dir page tables as per head.S */
+page_pde_offset = (__PAGE_OFFSET >> 20);
+
+	movl $(pg0 - __PAGE_OFFSET), %edi
+	movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
+	movl $0x007, %eax			/* 0x007 = PRESENT+RW+USER */
+10:
+	leal 0x007(%edi),%ecx			/* Create PDE entry */
+	movl %ecx,(%edx)			/* Store identity PDE entry */
+	movl %ecx,page_pde_offset(%edx)		/* Store kernel PDE entry */
+	addl $4,%edx
+	movl $1024, %ecx
+11:
+	stosl
+	addl $0x1000,%eax
+	loop 11b
+	/* End condition: we must map up to and including INIT_MAP_BEYOND_END */
+	/* bytes beyond the end of our own page tables; the +0x007 is the attribute bits */
+	leal (INIT_MAP_BEYOND_END+0x007)(%edi),%ebp
+	cmpl %ebp,%eax
+	jb 10b
+	movl %edi,(init_pg_tables_end - __PAGE_OFFSET)
+
+	movl $swapper_pg_dir-__PAGE_OFFSET,%eax
+	movl %eax,%cr3		/* set the page table pointer.. */
+
+	/* Set up the initial stack so we can run C code. */
+ 	movl $(init_thread_union+THREAD_SIZE),%esp
+
+	/* Set up boot information pointer to hand to lite_init(): it wants
+	 * a virtual address. */
+	movl %esi, %eax
+	addl $__PAGE_OFFSET, %eax
+
+	pushl $0		/* fake return address for unwinder */
+
+	/* Jumps are relative, and we're running __PAGE_OFFSET too low at the
+	 * moment. */
+	jmp lite_init+__PAGE_OFFSET
+
+/* FIXME: tell host not to interrupt us between lite_noirq_start & end? */
+.text
+ENTRY(kvm_lite_iret)
+	pushl	%eax
+	movl	12(%esp), %eax
+lite_noirq_start:
+	/* Note the %ss: segment prefix here.  Normal data accesses use the
+	 * "ds" segment, but that will have already been restored for whatever
+	 * we're returning to (such as userspace): we can't trust it.  The %ss:
+	 * prefix makes sure we use the stack segment, which is still valid. */
+	movl	%eax,%ss:kvm_lite_data+LITE_DATA_irq_enabled
+	popl	%eax
+	iret
+lite_noirq_end:
+	
diff -r 039995825488 drivers/kvm/lite_switcher.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/drivers/kvm/lite_switcher.S	Fri Aug 31 15:42:48 2007 +1000
@@ -0,0 +1,309 @@
+// Not all kernel headers work from assembler
+// But these ones are needed: the ENTRY() define
+// And constants extracted from struct offsets
+// To avoid magic numbers and breakage:
+// Should they change the compiler can't save us
+// Down here in the depths of assembler code.
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+#include "lite.h"
+
+// We mark the start of the code to copy
+// It's placed in .text tho it's never run here
+// You'll see the trick macro at the end
+// Which interleaves data and text to effect.
+.text
+ENTRY(start_switcher_text)
+
+// When we reach switch_to_guest we have just left
+// The safe and comforting shores of C code
+// %eax has the "struct lite_pages" that we need
+// Where we save state and still see it from the Guest
+// And %ebx holds the Guest shadow pagetable:
+// Once set we have truly left Host behind.
+ENTRY(switch_to_guest)
+	// We told gcc all its regs could fade,
+	// Clobbered by our journey into the Guest
+	// We could have saved them, if we tried
+	// But time is our master and cycles count.
+
+	// Segment registers must be saved for the Host
+	// We push them on the Host stack for later
+	pushl	%es
+	pushl	%ds
+	pushl	%gs
+	pushl	%fs
+	// But the compiler is fickle, and heeds
+	// No warning of %ebp clobbers
+	// When frame pointers are used.  That register
+	// Must be saved and restored or chaos strikes.
+	pushl	%ebp
+	// The Host's stack is done, now save it away
+	// In our "struct lite_pages" at the offset
+	// Distilled into asm-offsets.h
+	movl	%esp, LITE_PAGES_host_sp(%eax)
+
+	// All saved and there's now five steps before us:
+	// Stack, GDT, IDT, TSS
+	// And last of all the page tables are flipped.
+
+	// Yet beware that our stack pointer must be
+	// Always valid lest an NMI hits
+	// %edx does the duty here as we juggle
+	// %eax is lite_pages and our stack lies within.
+	movl	%eax, %edx
+	addl	$LITE_PAGES_regs, %edx
+	movl	%edx, %esp
+
+	// The Guest's GDT we so carefully
+	// Placed in the "struct lite_pages" earlier
+	lgdt	LITE_PAGES_guest_gdt_desc(%eax)
+
+	// The Guest's IDT we did partially
+	// Copy to the "struct lite_pages" as well.
+	lidt	LITE_PAGES_guest_idt_desc(%eax)
+
+	// The TSS entry which controls traps
+	// Must be loaded up with "ltr" now:
+	// For after we switch over our page tables
+	// It (as the rest) will be writable no more.
+	// (The GDT entry TSS needs
+	// Changes type when we load it: damn Intel!)
+	movl	$(GDT_ENTRY_TSS*8), %edx
+	ltr	%dx
+
+	// Look back now, before we take this last step!
+	// The Host's TSS entry was also marked used;
+	// Let's clear it again, ere we return.
+	// The GDT descriptor of the Host
+	// Points to the table after two "size" bytes
+	movl	(LITE_PAGES_host_gdt_desc+2)(%eax), %edx
+	// Clear the type field of "used" (byte 5, bit 2)
+	andb	$0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx)
+
+	// Once our page table's switched, the Guest is live!
+	// The Host fades as we run this final step.
+	// Our "struct lite_pages" is now half read-only.
+	movl	%ebx, %cr3
+
+	// The Host did put our registers in "regs"
+	// Which waits deep within the "struct lite_pages"
+	// We can simply pop off all Guest regs.
+	popl	%ebx
+	popl	%ecx
+	popl	%edx
+	popl	%esi
+	popl	%edi
+	popl	%ebp
+	popl	%gs
+	popl	%eax
+	popl	%fs
+	popl	%ds
+	popl	%es
+
+	// Near the base of the stack lurk two strange fields
+	// Which we fill as we exit the Guest
+	// These are the trap number and its error
+	// We can simply step past them on our way.
+	addl	$8, %esp
+
+	// The last five stack slots hold return address
+	// And everything needed to change privilege
+	// Into the Guest privilege level of 1,
+	// And the stack where the Guest had last left it.
+	// Interrupts are turned back on: we are Guest.
+	iret
+
+// There are two paths where we switch to the Host
+// So we put the routine in a macro.
+// We are on our way home, back to the Host
+// Interrupted out of the Guest, we come here.
+#define SWITCH_TO_HOST							\
+	/* We save the Guest state: all registers first			\
+	 * Laid out just as "struct lite_regs" has defined */		\
+	pushl	%es;							\
+	pushl	%ds;							\
+	pushl	%fs;							\
+	pushl	%eax;							\
+	pushl	%gs;							\
+	pushl	%ebp;							\
+	pushl	%edi;							\
+	pushl	%esi;							\
+	pushl	%edx;							\
+	pushl	%ecx;							\
+	pushl	%ebx;							\
+	/* Our stack and our code are using segments			\
+	 * Set in the TSS and IDT					\
+	 * Yet if we were to touch data we'd use			\
+	 * Whatever data segment the Guest had.				\
+	 * Load the lite ds segment to begin. */			\
+	movl	$(LITE_DS), %eax;					\
+	movl	%eax, %ds;						\
+	/* So where are we?  Which CPU, which struct?			\
+	 * The stack is our clue: our TSS starts			\
+	 * It at the end of the "struct lite_pages".			\
+	 * Or we may have stumbled while restoring			\
+	 * Our Guest segment regs while in switch_to_guest,		\
+	 * The fault pushed atop that part-unwound stack.		\
+	 * If we round the stack down to the page start			\
+	 * To find the start of our "struct lite_pages". */		\
+	movl	%esp, %eax;						\
+	andl	$(~(1 << PAGE_SHIFT - 1)), %eax;			\
+	/* Save our trap number: the switch will obscure it		\
+	 * (The Guest regs are not mapped here in the Host)		\
+	 * %ebx holds it safe for deliver_to_host */			\
+	movl	LITE_PAGES_regs_trapnum(%eax), %ebx;			\
+	/* The Host GDT, IDT and stack!					\
+	 * All these lie safely hidden from the Guest:			\
+	 * We must return to the Host page tables			\
+	 * (Hence that was saved in our struct lite_pages) */		\
+	movl	LITE_PAGES_host_cr3(%eax), %edx;			\
+	movl	%edx, %cr3;						\
+	/* As before, when we looked back at the Host			\
+	 * As we left and marked TSS unused				\
+	 * So must we now for the Guest left behind. */			\
+	andb	$0xFD, (LITE_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax);	\
+	/* Switch to Host's GDT, IDT. */				\
+	lgdt	LITE_PAGES_host_gdt_desc(%eax);				\
+	lidt	LITE_PAGES_host_idt_desc(%eax);				\
+	/* Restore the Host's stack where it's saved regs lie */	\
+	movl	LITE_PAGES_host_sp(%eax), %esp;				\
+	/* Last the TSS: our Host is complete */			\
+	movl	$(GDT_ENTRY_TSS*8), %edx;				\
+	ltr	%dx;							\
+	/* Restore now the regs saved right at the first. */		\
+	popl	%ebp;							\
+	popl	%fs;							\
+	popl	%gs;							\
+	popl	%ds;							\
+	popl	%es
+
+// Here's where we come when the Guest has just trapped:
+// (Which trap we'll see has been pushed on the stack).
+// We need only switch back, and the Host will decode
+// Why we came home, and what needs to be done.
+return_to_host:
+	SWITCH_TO_HOST
+	iret
+
+// An interrupt, with some cause external
+// Has ajerked us rudely from the Guest's code
+// Again we must return home to the Host
+deliver_to_host:
+	SWITCH_TO_HOST
+	// But now we must go home via that place
+	// Where that interrupt was supposed to go
+	// Had we not been ensconced, running the Guest.
+	// Here we see the cleverness of our stack:
+	// The Host stack is formed like an interrupt
+	// With EIP, CS and EFLAGS layered.
+	// Interrupt handlers end with "iret"
+	// And that will take us home at long long last.
+
+	// But first we must find the handler to call!
+	// The IDT descriptor for the Host
+	// Has two bytes for size, and four for address:
+	// %edx will hold it for us for now.
+	movl	(LITE_PAGES_host_idt_desc+2)(%eax), %edx
+	// We now know the table address we need,
+	// And saved the trap's number inside %ebx.
+	// Yet the pointer to the handler is smeared
+	// Across the bits of the table entry.
+	// What oracle can tell us how to extract
+	// From such a convoluted encoding?
+	// I consulted gcc, and it gave
+	// These instructions, which I gladly credit:
+	leal	(%edx,%ebx,8), %eax
+	movzwl	(%eax),%edx
+	movl	4(%eax), %eax
+	xorw	%ax, %ax
+	orl	%eax, %edx
+	// Now the address of the handler's in %edx
+	// We call it now: its "iret" takes us home.
+	jmp	*%edx
+
+// Every interrupt can come to us here
+// But we must truly tell each apart.
+// They number two hundred and fifty six
+// And each must land in a different spot,
+// Push its number on stack, and join the stream.
+
+// And worse, a mere six of the traps stand apart
+// And push on their stack an addition:
+// An error number, thirty two bits long
+// So we punish the other two fifty
+// And make them push a zero so they match.
+
+// Yet two fifty six entries is long
+// And all will look most the same as the last
+// So we create a macro which can make
+// As many entries as we need to fill.
+
+// Note the change to .data then .text:
+// We plant the address of each entry
+// Into a (data) table for the Host
+// To know where each Guest interrupt should go.
+.macro IRQ_STUB N TARGET
+	.data; .long 1f; .text; 1:
+ // Trap eight, ten through fourteen and seventeen
+ // Supply an error number.  Else zero.
+ .if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17)
+	pushl	$0
+ .endif
+	pushl	$\N
+	jmp	\TARGET
+	ALIGN
+.endm
+
+// This macro creates numerous entries
+// Using GAS macros which out-power C's.
+.macro IRQ_STUBS FIRST LAST TARGET
+ irq=\FIRST
+ .rept \LAST-\FIRST+1
+	IRQ_STUB irq \TARGET
+  irq=irq+1
+ .endr
+.endm
+
+// Here's the marker for our pointer table
+// Laid in the data section just before
+// Each macro places the address of code
+// Forming an array: each one points to text
+// Which handles interrupt in its turn.
+.data
+.global default_idt_entries
+default_idt_entries:
+.text
+	// The first two traps go straight back to the Host
+	IRQ_STUBS 0 1 return_to_host
+	// We'll say nothing, yet, about NMI
+	IRQ_STUB 2 handle_nmi
+	// Other traps also return to the Host
+	IRQ_STUBS 3 31 return_to_host
+	// All interrupts go via their handlers
+	IRQ_STUBS 32 127 deliver_to_host
+	// 'Cept system calls coming from userspace
+	// Are to go to the Guest, never the Host.
+	IRQ_STUB 128 return_to_host
+	IRQ_STUBS 129 255 deliver_to_host
+
+// The NMI, what a fabulous beast
+// Which swoops in and stops us no matter that
+// We're suspended between heaven and hell,
+// (Or more likely between the Host and Guest)
+// When in it comes!  We are dazed and confused
+// So we do the simplest thing which one can.
+// Though we've pushed the trap number and zero
+// We discard them, return, and hope we live.
+handle_nmi:
+	addl	$8, %esp
+	iret
+
+// We are done; all that's left is Mastery
+// And "make Mastery" is a journey long
+// Designed to make your fingers itch to code.
+
+// Here ends the text, the file and poem.
+ENTRY(end_switcher_text)
diff -r 039995825488 drivers/kvm/mmu.c
--- a/drivers/kvm/mmu.c	Fri Aug 31 15:38:42 2007 +1000
+++ b/drivers/kvm/mmu.c	Fri Aug 31 15:42:48 2007 +1000
@@ -26,6 +26,7 @@
 #include <linux/highmem.h>
 #include <linux/module.h>
 
+#include <asm/io.h>
 #include <asm/page.h>
 #include <asm/cmpxchg.h>
 
@@ -132,6 +133,7 @@ static int dbg = 1;
 #define PT32_DIR_BASE_ADDR_MASK \
 	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
 
+#define SHADOW_NAILED_RESERVE ((unsigned long)-(PT64_ENT_PER_PAGE * PAGE_SIZE))
 
 #define PFERR_PRESENT_MASK (1U << 0)
 #define PFERR_WRITE_MASK (1U << 1)
@@ -1120,6 +1122,131 @@ static void mmu_pte_write_new_pte(struct
 	else
 		paging64_update_pte(vcpu, page, spte, new, bytes);
 }
+
+/* FIXME: SMP guests cannot share toplevels with this. */
+struct nailed_mappings
+{
+	/* We might need to fake up numerous levels. */
+#ifdef CONFIG_X86_64
+	struct page *ptes[4];
+#else
+	struct page *ptes[2];
+#endif
+};
+static DEFINE_PER_CPU(struct nailed_mappings, nailed_mappings);
+
+static u64 nailed_pte_of(struct page *page)
+{
+	return ((u64)page_to_pfn(page) << PAGE_SHIFT)
+		| PT_PRESENT_MASK | PT_WRITABLE_MASK
+		| PT_ACCESSED_MASK | PT_DIRTY_MASK;
+}
+
+/* Nailed mappings are 4k page mappings at -2M */
+int kvm_mmu_init_nailed_mapping(int cpu, struct page *page[], unsigned int num)
+{
+	int i;
+	u64 *ptepage, pte;
+	struct nailed_mappings *nm = &per_cpu(nailed_mappings, cpu);
+
+	printk("SHADOW_NAILED_RESERVE = %#lx\n", SHADOW_NAILED_RESERVE);
+
+	printk("Initializing %u nailed mappings for cpu %i\n", num, cpu);
+	for (i = 0; i < ARRAY_SIZE(nm->ptes); i++) {
+		nm->ptes[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
+		if (!nm->ptes[i])
+			goto free;
+	}
+
+	/* Chain last pte entries to point to our PTE page. */
+	for (i = 1; i < ARRAY_SIZE(nm->ptes); i++) {
+		ptepage = page_address(nm->ptes[i]);
+		ptepage[PT64_ENT_PER_PAGE - 1] = nailed_pte_of(nm->ptes[i-1]);
+	}
+
+	/* Bottom page contains pages as given by args. */ 
+	ptepage = page_address(nm->ptes[0]);
+	for (i = 0; i < num; i++) {
+		if (!page[i])
+			continue;
+		pte = (u64)page_to_pfn(page[i]) << PAGE_SHIFT;
+		pte |= (PT_PRESENT_MASK|PT_ACCESSED_MASK|PT_DIRTY_MASK);
+		if (page[i]->private & 1)
+			pte |= PT_WRITABLE_MASK;
+		printk("%p[%i] = %llx\n", ptepage, i, pte);
+		ptepage[i] = pte;
+	}
+	return 0;
+
+free:
+	while (--i >= 0)
+		__free_page(nm->ptes[i]);
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_init_nailed_mapping);
+
+void kvm_mmu_free_nailed_mapping(int cpu)
+{
+	unsigned int i;
+	struct nailed_mappings *nm = &per_cpu(nailed_mappings, cpu);
+
+	for (i = 0; i < ARRAY_SIZE(nm->ptes); i++)
+		__free_page(nm->ptes[i]);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_free_nailed_mapping);
+
+void kvm_use_nailed_mappings(struct kvm_vcpu *vcpu)
+{
+	struct nailed_mappings *nm = &__get_cpu_var(nailed_mappings);
+	hpa_t shadow_page = vcpu->mmu.root_hpa;
+	u64 *shadow_ent = NULL;
+	int level;
+
+	vcpu->mmu.nailed = 1;
+
+	level = vcpu->mmu.shadow_root_level;
+	if (level == PT32E_ROOT_LEVEL) {
+		shadow_page = vcpu->mmu.pae_root[3] & PT64_BASE_ADDR_MASK;
+		level--;
+	}
+
+	for (; level > PT_PAGE_TABLE_LEVEL; level--) {
+		shadow_ent = ((u64 *)__va(shadow_page)) + PT64_ENT_PER_PAGE-1;
+		if (!is_present_pte(*shadow_ent))
+			break;
+		shadow_page = *shadow_ent & PT64_BASE_ADDR_MASK;
+	}
+	*shadow_ent = nailed_pte_of(nm->ptes[level-2]);
+}
+EXPORT_SYMBOL_GPL(kvm_use_nailed_mappings);
+
+/* FIXME: We should be able to handle this in the rest of the code, and only
+ * remove when needed. */
+void kvm_remove_nailed_mappings(struct kvm_vcpu *vcpu)
+{
+	struct nailed_mappings *nm = &__get_cpu_var(nailed_mappings);
+	hpa_t shadow_page = vcpu->mmu.root_hpa;
+	u64 *shadow_ent = NULL;
+	int level;
+
+	vcpu->mmu.nailed = 1;
+
+	level = vcpu->mmu.shadow_root_level;
+	if (level == PT32E_ROOT_LEVEL) {
+		shadow_page = vcpu->mmu.pae_root[3] & PT64_BASE_ADDR_MASK;
+		level--;
+	}
+
+	for (; level > PT_PAGE_TABLE_LEVEL; level--) {
+		shadow_ent = ((u64 *)__va(shadow_page)) + PT64_ENT_PER_PAGE-1;
+		if (*shadow_ent == nailed_pte_of(nm->ptes[level-2]))
+			break;
+		BUG_ON(!is_present_pte(*shadow_ent));
+		shadow_page = *shadow_ent & PT64_BASE_ADDR_MASK;
+	}
+	*shadow_ent = 0;
+}
+EXPORT_SYMBOL_GPL(kvm_remove_nailed_mappings);
 
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		       const u8 *new, int bytes)
diff -r 039995825488 drivers/kvm/paging_tmpl.h
--- a/drivers/kvm/paging_tmpl.h	Fri Aug 31 15:38:42 2007 +1000
+++ b/drivers/kvm/paging_tmpl.h	Fri Aug 31 15:42:48 2007 +1000
@@ -86,6 +86,11 @@ static int FNAME(walk_addr)(struct guest
 	walker->page = NULL;
 	walker->ptep = NULL;
 	root = vcpu->cr3;
+
+	/* Don't let them do anything with the nailed area. */
+	if (vcpu->mmu.nailed && addr >= SHADOW_NAILED_RESERVE)
+		goto not_present;
+
 #if PTTYPE == 64
 	if (!is_long_mode(vcpu)) {
 		walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3];
diff -r 039995825488 drivers/kvm/x86_emulate.c
--- a/drivers/kvm/x86_emulate.c	Fri Aug 31 15:38:42 2007 +1000
+++ b/drivers/kvm/x86_emulate.c	Fri Aug 31 15:42:48 2007 +1000
@@ -1451,7 +1451,9 @@ twobyte_special_insn:
 		rc = kvm_set_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], msr_data);
 		if (rc) {
 			kvm_arch_ops->inject_gp(ctxt->vcpu, 0);
-			_eip = ctxt->vcpu->rip;
+			/* Don't restore regs (inject_gp can change them) */
+			rc = X86EMUL_CONTINUE;
+			goto done;
 		}
 		rc = X86EMUL_CONTINUE;
 		break;
@@ -1460,7 +1462,9 @@ twobyte_special_insn:
 		rc = kvm_get_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], &msr_data);
 		if (rc) {
 			kvm_arch_ops->inject_gp(ctxt->vcpu, 0);
-			_eip = ctxt->vcpu->rip;
+			/* Don't restore regs (inject_gp can change them) */
+			rc = X86EMUL_CONTINUE;
+			goto done;
 		} else {
 			_regs[VCPU_REGS_RAX] = (u32)msr_data;
 			_regs[VCPU_REGS_RDX] = msr_data >> 32;
diff -r 039995825488 include/linux/cpumask.h
--- a/include/linux/cpumask.h	Fri Aug 31 15:38:42 2007 +1000
+++ b/include/linux/cpumask.h	Fri Aug 31 15:42:48 2007 +1000
@@ -218,8 +218,8 @@ int __next_cpu(int n, const cpumask_t *s
 int __next_cpu(int n, const cpumask_t *srcp);
 #define next_cpu(n, src) __next_cpu((n), &(src))
 #else
-#define first_cpu(src)		0
-#define next_cpu(n, src)	1
+#define first_cpu(src)		((src).bits[0]&1?0:NR_CPUS)
+#define next_cpu(n, src)	NR_CPUS
 #endif
 
 #define cpumask_of_cpu(cpu)						\
diff -r 039995825488 include/linux/kvm_lite.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/linux/kvm_lite.h	Sun Sep 02 10:30:28 2007 +1000
@@ -0,0 +1,33 @@
+#ifndef __LINUX_KVM_LITE_H
+#define __LINUX_KVM_LITE_H
+#include <linux/time.h>
+
+#define KVM_HCALL_LOAD_IDT_ENTRY	100
+#define KVM_HCALL_LOAD_GDT_ENTRY	101
+#define KVM_HCALL_SET_STACK		102
+#define KVM_HCALL_HALT			103
+#define KVM_HCALL_HACK_WRITE		104
+#define KVM_HCALL_SET_CLOCKEVENT	105
+
+struct kvm_lite_data
+{
+	/* 512 == enabled (same as eflags in normal hardware).  The Guest
+	 * changes interrupts so often that a hypercall is too slow. */
+	unsigned int irq_enabled;
+
+	/* The Host writes the virtual address of the last page fault here,
+	 * which saves the Guest a hypercall.  CR2 is the native register where
+	 * this address would normally be found. */
+	unsigned long cr2;
+
+	int trace;
+
+/* Fields initialized by the Host at first hypercall: */
+	/* Memory not to try to access */
+	unsigned long reserve_mem;
+	/* Current time. */
+	struct timespec time;
+
+/* Fields initialized by the Guest at boot: */
+};
+#endif /* __LINUX_KVM_LITE_H */
diff -r 039995825488 include/linux/kvm_para.h
--- a/include/linux/kvm_para.h	Fri Aug 31 15:38:42 2007 +1000
+++ b/include/linux/kvm_para.h	Thu Sep 06 01:30:36 2007 +1000
@@ -90,4 +90,11 @@ static inline int kvm_para_has_feature(u
 
 #define KVM_ENOSYS		ENOSYS
 
+#define KVM_HCALL_LOAD_IDT_ENTRY	100
+#define KVM_HCALL_LOAD_GDT_ENTRY	101
+#define KVM_HCALL_SET_STACK		102
+#define KVM_HCALL_HALT			103
+#define KVM_HCALL_HACK_WRITE		104
+#define KVM_HCALL_SET_CLOCKEVENT	105
+
 #endif



-------------------------------------------------------------------------
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >>  http://get.splunk.com/

  parent reply	other threads:[~2007-09-05 15:42 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-09-05 15:19 [PATCH 1/5] Clean up unloved invlpg: remove kvm_arch_ops.invlpg and tweak emulator Rusty Russell
     [not found] ` <1189005567.10802.127.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2007-09-05 15:20   ` [PATCH 2/5] Keep control regs in sync Rusty Russell
     [not found]     ` <1189005638.10802.129.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2007-09-05 15:21       ` [PATCH 3/5] Hoist SVM's get_cs_db_l_bits into core code Rusty Russell
     [not found]         ` <1189005692.10802.132.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2007-09-05 15:42           ` Rusty Russell [this message]
     [not found]             ` <1189006973.10802.140.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2007-09-05 15:44               ` [PATCH 5/5] kvm-lite qemu patch Rusty Russell
     [not found]                 ` <1189007087.10802.144.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2007-09-05 17:02                   ` Avi Kivity
2007-09-05 17:02                   ` Avi Kivity
     [not found]                     ` <46DEE137.5020102-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-09-05 17:36                       ` Rusty Russell
     [not found]                         ` <1189013805.10802.168.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2007-09-05 19:10                           ` Avi Kivity
2007-09-05 15:47               ` [PATCH 4/5] kvm-lite: "The Unbearable Liteness" Anthony Liguori
2007-09-05 16:16                 ` Rusty Russell
     [not found]                   ` <1189008968.10802.154.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2007-09-05 16:21                     ` Anthony Liguori
2007-09-05 17:08                       ` Avi Kivity
2007-09-05 17:02               ` Avi Kivity
2007-09-09 11:22           ` [PATCH 3/5] Hoist SVM's get_cs_db_l_bits into core code Avi Kivity
2007-09-05 15:30   ` [PATCH 1/5] Clean up unloved invlpg: remove kvm_arch_ops.invlpg and tweak emulator Avi Kivity
     [not found]     ` <46DECBA7.7020905-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-09-05 16:22       ` Rusty Russell
     [not found]         ` <1189009359.10802.157.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2007-09-09 11:13           ` Avi Kivity

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1189006973.10802.140.camel@localhost.localdomain \
    --to=rusty-8n+1lvoiyb80n/f98k4iww@public.gmane.org \
    --cc=kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox