* [PATCH 4/5] kvm-lite: "The Unbearable Liteness"
[not found] ` <1189005692.10802.132.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
@ 2007-09-05 15:42 ` Rusty Russell
[not found] ` <1189006973.10802.140.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2007-09-09 11:22 ` [PATCH 3/5] Hoist SVM's get_cs_db_l_bits into core code Avi Kivity
1 sibling, 1 reply; 18+ messages in thread
From: Rusty Russell @ 2007-09-05 15:42 UTC (permalink / raw)
To: kvm-devel
This patch is not for serious application, but makes interesting
reading. Requires Anthony's new hypercall patches.
Comments:
1) asm-offsets are required for lite's switcher/trampoline (lite_switcher.S)
and for guest's assembler code.
2) Includes proposed "platform_type" extension for boot protocol. May not be
necessary.
3) kvm/ now contains guest code, so needs to be obj-y.
4) Debugging flags just to prove code isn't ready
5) Changes emulator not to try to restore regs after inject_gp. May refactor
(we need to deal with this for inject_pf anyway).
6) Fixes "first_cpu()" macro.
7) Temporary hcalls, deliberately sucky.
diff -r 039995825488 arch/i386/kernel/asm-offsets.c
--- a/arch/i386/kernel/asm-offsets.c Fri Aug 31 15:38:42 2007 +1000
+++ b/arch/i386/kernel/asm-offsets.c Sun Sep 02 10:13:37 2007 +1000
@@ -22,6 +22,11 @@
#ifdef CONFIG_LGUEST_GUEST
#include <linux/lguest.h>
#include "../../../drivers/lguest/lg.h"
+#endif
+
+#if defined(CONFIG_KVM_LITE) || defined(CONFIG_KVM_LITE_MODULE)
+#include <linux/kvm_lite.h>
+#include "../../../drivers/kvm/lite.h"
#endif
#define DEFINE(sym, val) \
@@ -144,4 +149,19 @@ void foo(void)
OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
#endif
+
+#if defined(CONFIG_KVM_LITE) || defined(CONFIG_KVM_LITE_MODULE)
+ BLANK();
+ OFFSET(LITE_DATA_irq_enabled, kvm_lite_data, irq_enabled);
+ OFFSET(LITE_PAGES_host_gdt_desc, lite_pages, state.host_gdt_desc);
+ OFFSET(LITE_PAGES_host_idt_desc, lite_pages, state.host_idt_desc);
+ OFFSET(LITE_PAGES_host_cr3, lite_pages, state.host_cr3);
+ OFFSET(LITE_PAGES_host_sp, lite_pages, state.host_sp);
+ OFFSET(LITE_PAGES_guest_gdt_desc, lite_pages,state.guest_gdt_desc);
+ OFFSET(LITE_PAGES_guest_idt_desc, lite_pages,state.guest_idt_desc);
+ OFFSET(LITE_PAGES_guest_gdt, lite_pages, state.guest_gdt);
+ OFFSET(LITE_PAGES_regs_trapnum, lite_pages, regs.trapnum);
+ OFFSET(LITE_PAGES_regs_errcode, lite_pages, regs.errcode);
+ OFFSET(LITE_PAGES_regs, lite_pages, regs);
+#endif
}
diff -r 039995825488 arch/i386/kernel/head.S
--- a/arch/i386/kernel/head.S Fri Aug 31 15:38:42 2007 +1000
+++ b/arch/i386/kernel/head.S Fri Aug 31 15:42:48 2007 +1000
@@ -70,7 +70,14 @@ INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE +
*/
.section .text.head,"ax",@progbits
ENTRY(startup_32)
-
+/*
+ * Check if we're some special platform type.
+ * FIXME: check header version first!
+ */
+#ifdef CONFIG_KVM_LITE_GUEST
+ cmpl $2, 0x23c(%esi)
+ je kvm_lite_init_asm
+#endif
/*
* Set segments to known values.
*/
diff -r 039995825488 drivers/Makefile
--- a/drivers/Makefile Fri Aug 31 15:38:42 2007 +1000
+++ b/drivers/Makefile Fri Aug 31 15:42:48 2007 +1000
@@ -48,7 +48,7 @@ obj-$(CONFIG_PCCARD) += pcmcia/
obj-$(CONFIG_PCCARD) += pcmcia/
obj-$(CONFIG_DIO) += dio/
obj-$(CONFIG_SBUS) += sbus/
-obj-$(CONFIG_KVM) += kvm/
+obj-y += kvm/
obj-$(CONFIG_ZORRO) += zorro/
obj-$(CONFIG_MAC) += macintosh/
obj-$(CONFIG_ATA_OVER_ETH) += block/aoe/
diff -r 039995825488 drivers/kvm/Kconfig
--- a/drivers/kvm/Kconfig Fri Aug 31 15:38:42 2007 +1000
+++ b/drivers/kvm/Kconfig Fri Aug 31 15:42:48 2007 +1000
@@ -41,4 +41,22 @@ config KVM_AMD
Provides support for KVM on AMD processors equipped with the AMD-V
(SVM) extensions.
+config KVM_LITE
+ tristate "KVM-lite for unsupported processors"
+ depends on KVM
+ select X86_PAE
+ ---help---
+ Provides minimal support for booting paravirtualized guests.
+ Does not require Intel VT or AMD AMD-V extensions, but does require
+ a processor which supports PAE (Pentium Pro and better). If you say
+ Y or M here, your kernel will not boot on a processor without PAE
+ support.
+
+config KVM_LITE_GUEST
+ bool "KVM-lite guest support"
+ depends on PARAVIRT && EXPERIMENTAL
+ select HVC_DRIVER
+ ---help---
+ Allows this kernel to run under kvm-lite.
+
endif # VIRTUALIZATION
diff -r 039995825488 drivers/kvm/Makefile
--- a/drivers/kvm/Makefile Fri Aug 31 15:38:42 2007 +1000
+++ b/drivers/kvm/Makefile Fri Aug 31 15:42:48 2007 +1000
@@ -8,3 +8,11 @@ obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
kvm-amd-objs = svm.o
obj-$(CONFIG_KVM_AMD) += kvm-amd.o
+kvm-lite-objs = lite.o lite_switcher.o
+obj-$(CONFIG_KVM_LITE) += kvm-lite.o
+obj-$(CONFIG_KVM_LITE_GUEST) += lite_guest.o lite_guest_asm.o
+CFLAGS_lite.o += -O0 -g
+CFLAGS_lite_guest.o += -O0 -g
+CFLAGS_mmu.o += -O0 -g
+CFLAGS_kvm_main.o += -O0 -g
+CFLAGS_x86_emulate.o += -O0 -g
diff -r 039995825488 drivers/kvm/kvm.h
--- a/drivers/kvm/kvm.h Fri Aug 31 15:38:42 2007 +1000
+++ b/drivers/kvm/kvm.h Fri Aug 31 15:42:48 2007 +1000
@@ -153,6 +153,7 @@ struct kvm_mmu {
hpa_t root_hpa;
int root_level;
int shadow_root_level;
+ int nailed;
u64 *pae_root;
};
@@ -491,10 +492,14 @@ void kvm_exit_arch(void);
int kvm_mmu_module_init(void);
void kvm_mmu_module_exit(void);
+int kvm_mmu_init_nailed_mapping(int cpu, struct page *page[], unsigned num);
+void kvm_mmu_free_nailed_mapping(int cpu);
void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
int kvm_mmu_setup(struct kvm_vcpu *vcpu);
+void kvm_use_nailed_mappings(struct kvm_vcpu *vcpu);
+void kvm_remove_nailed_mappings(struct kvm_vcpu *vcpu);
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
diff -r 039995825488 drivers/kvm/kvm_main.c
--- a/drivers/kvm/kvm_main.c Fri Aug 31 15:38:42 2007 +1000
+++ b/drivers/kvm/kvm_main.c Tue Sep 04 10:30:56 2007 +1000
@@ -530,6 +530,7 @@ void set_cr4(struct kvm_vcpu *vcpu, unsi
return;
}
kvm_arch_ops->set_cr4(vcpu, cr4);
+ vcpu->cr4 = cr4;
mutex_lock(&vcpu->kvm->lock);
kvm_mmu_reset_context(vcpu);
mutex_unlock(&vcpu->kvm->lock);
@@ -918,6 +919,7 @@ void mark_page_dirty(struct kvm *kvm, gf
set_bit(rel_gfn, memslot->dirty_bitmap);
}
}
+EXPORT_SYMBOL_GPL(mark_page_dirty);
int emulator_read_std(unsigned long addr,
void *val,
@@ -1146,10 +1148,8 @@ int emulate_invlpg(struct kvm_vcpu *vcpu
int emulate_clts(struct kvm_vcpu *vcpu)
{
- unsigned long cr0;
-
- cr0 = vcpu->cr0 & ~X86_CR0_TS;
- kvm_arch_ops->set_cr0(vcpu, cr0);
+ vcpu->cr0 &= ~X86_CR0_TS;
+ kvm_arch_ops->set_cr0(vcpu, vcpu->cr0);
return X86EMUL_CONTINUE;
}
@@ -2033,6 +2033,7 @@ static int kvm_vcpu_ioctl_set_sregs(stru
kvm_arch_ops->decache_cr4_guest_bits(vcpu);
mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
+ vcpu->cr0 = sregs->cr0;
kvm_arch_ops->set_cr0(vcpu, sregs->cr0);
mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
diff -r 039995825488 drivers/kvm/lite.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/drivers/kvm/lite.c Wed Sep 05 10:20:33 2007 +1000
@@ -0,0 +1,1716 @@
+/*
+ * Kernel-based Virtual Machine driver for old CPUs (paravirt OS only)
+ * Copyright 2007 Rusty Russell <rusty-8n+1lVoiYb80n/F98K4Iww@public.gmane.org>, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/kvm_lite.h>
+#include <asm/desc.h>
+
+#include "lite.h"
+#include "x86_emulate.h"
+
+/* Found in lite_switcher.S */
+extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
+extern unsigned long default_idt_entries[];
+
+/* Every guest maps the core switcher code. */
+#define SHARED_SWITCHER_PAGES \
+ DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE)
+/* Pages for switcher itself, then two pages per cpu */
+#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS)
+
+/* We map at -2M for ease of mapping into the guest (one PTE page). */
+#define SWITCHER_ADDR 0xFFE00000
+
+/* Guest runs in ring 1. */
+#define GUEST_PL 1
+
+static struct vm_struct *switcher_vma;
+static struct page **switcher_page;
+static int host_has_pge;
+
+/* Offset from where switcher.S was compiled to where we've copied it */
+static unsigned long switcher_offset(void)
+{
+ return SWITCHER_ADDR - (unsigned long)start_switcher_text;
+}
+
+static struct {
+ unsigned long offset;
+ unsigned short segment;
+} lite_entry;
+
+struct vcpu_lite
+{
+ /* If guest behaves badly, we refuse to run it. */
+ bool dead;
+
+ /* Should we inject a page fault? */
+ bool wants_page_fault;
+ unsigned long pf_addr;
+ unsigned int pf_errcode;
+
+ /* cpu we are running on. */
+ struct lite_pages *curr_pages;
+
+ struct kvm_vcpu vcpu;
+
+ /* GDT & IDT supplied by guest. */
+ struct descriptor_table gdt_desc, idt_desc;
+
+ /* The GDT entries copied into lite_ro_state when running. */
+ struct segment_descriptor gdt[GDT_ENTRIES];
+
+ /* The IDT entries: some copied into lite_ro_state when running. */
+ struct desc_struct idt[KVM_NR_INTERRUPTS];
+
+ /* We ignore all but the TS bit here. */
+ unsigned long cr0;
+
+ /* Head of the (shadow) page tables. */
+ unsigned long pgtable;
+
+ /* rflags of guest */
+ unsigned long rflags;
+
+ /* Guest kernel stack. */
+ unsigned long kstack;
+ u16 kstack_ss;
+
+ /* Guest physical address of lite_data */
+ /* FIXME: Permanently map page? */
+ gpa_t lite_data;
+
+ /* The segment registers. */
+ u32 sregs[6];
+};
+
+#define kill_guest(lite, fmt, ...) \
+do { \
+ if (!(lite)->dead) { \
+ (lite)->dead = true; \
+ if (printk_ratelimit()) \
+ printk(fmt"\n" , ## __VA_ARGS__); \
+ } \
+} while(0)
+
+#define get_lite_data(lite, member) \
+({ \
+ typeof(((struct kvm_lite_data *)0)->member) __v; \
+ \
+ if (emulator_read_std((lite)->lite_data \
+ + offsetof(struct kvm_lite_data, member), \
+ &__v, sizeof(__v), &(lite)->vcpu) \
+ != X86EMUL_CONTINUE) \
+ kill_guest(lite, "Reading " #member); \
+ __v; \
+})
+
+#define set_lite_data(lite, member, val) \
+({ \
+ typeof(((struct kvm_lite_data *)0)->member) __v = (val); \
+ if (emulator_write_emulated((lite)->lite_data \
+ + offsetof(struct kvm_lite_data, member), \
+ &(__v), sizeof(__v), &(lite)->vcpu) \
+ != X86EMUL_CONTINUE) \
+ kill_guest(lite, "Writing " #member); \
+})
+
+static void dump_regs(const char *str, int trap, const struct vcpu_lite *lite)
+{
+ if (trap >= 0)
+ printk("Trap %i: ", trap);
+#if 1
+ printk("%s@%#lx\n"
+ " eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n"
+ " esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
+ str, lite->vcpu.rip,
+ lite->vcpu.regs[VCPU_REGS_RAX],
+ lite->vcpu.regs[VCPU_REGS_RBX],
+ lite->vcpu.regs[VCPU_REGS_RCX],
+ lite->vcpu.regs[VCPU_REGS_RDX],
+ lite->vcpu.regs[VCPU_REGS_RSI],
+ lite->vcpu.regs[VCPU_REGS_RDI],
+ lite->vcpu.regs[VCPU_REGS_RBP],
+ lite->vcpu.regs[VCPU_REGS_RSP]);
+#else
+ printk("%s@%#lx (%lu)\n",
+ str, lite->vcpu.rip, lite->total_exits);
+#endif
+}
+
+static inline struct vcpu_lite *to_lite(struct kvm_vcpu *vcpu)
+{
+ return container_of(vcpu, struct vcpu_lite, vcpu);
+}
+
+/* This cpu's struct lite_pages. */
+static struct lite_pages *lite_pages(unsigned int cpu)
+{
+ return &(((struct lite_pages *)
+ (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
+}
+
+static __init int lite_is_always_supported(void)
+{
+ return 1;
+}
+
+static __init int you_cant_stop_the_lite(void)
+{
+ return 0;
+}
+
+static __init void hardware_enable(void *dummy)
+{
+ if (host_has_pge) {
+ /* FIXME: Do this only before running. */
+ write_cr4(read_cr4() & ~X86_CR4_PGE);
+
+ /* Turn off the feature in the global feature set. */
+ clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+ }
+}
+
+static __init void hardware_disable(void *dummy)
+{
+ if (host_has_pge) {
+ write_cr4(read_cr4() | X86_CR4_PGE);
+ /* FIXME: Do this only after all cpus done. */
+ set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+ }
+}
+
+static void check_processor_compatibility(void *rtn)
+{
+ *(int *)rtn = 0;
+}
+
+/* This routine is called at boot or modprobe time for each CPU to set up the
+ * "constant" GDT entries for Guests running on that CPU. */
+static void setup_default_gdt_entries(struct lite_ro_state *state)
+{
+ /* FIXME: Use segment_descriptor */
+ struct desc_struct *gdt = (struct desc_struct *)state->guest_gdt;
+ unsigned long tss = (unsigned long)&state->guest_tss;
+
+ /* The hypervisor segments are full 0-4G segments, privilege level 0 */
+ gdt[GDT_ENTRY_LITE_CS] = FULL_EXEC_SEGMENT;
+ gdt[GDT_ENTRY_LITE_DS] = FULL_SEGMENT;
+
+ /* The TSS segment refers to the TSS entry for this CPU, so we cannot
+ * copy it from the Guest. Forgive the magic flags */
+ gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16);
+ gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000)
+ | ((tss >> 16) & 0x000000FF);
+}
+
+/* The default entry for each interrupt points into the Switcher routines which
+ * simply return to the Host. handle_exit() will then bounce it back into the
+ * Guest. */
+static void default_idt_entry(struct desc_struct *idt,
+ int trap,
+ const unsigned long handler)
+{
+ /* A present interrupt gate. */
+ u32 flags = 0x8e00;
+
+ /* Set the privilege level on the entry for the hypercall: this allows
+ * the Guest to use the "int" instruction to trigger it. */
+ if (trap == KVM_LITE_HCALL_TRAP)
+ flags |= (GUEST_PL << 13);
+#if 1 /* FIXME: When we do direct traps, this is not necessary. */
+ if (trap == SYSCALL_VECTOR)
+ flags |= (3 << 13);
+#endif
+ /* Now pack it into the IDT entry in its weird format. */
+ idt->a = (LITE_CS<<16) | (handler&0x0000FFFF);
+ idt->b = (handler&0xFFFF0000) | flags;
+}
+
+/* When the Guest first starts, we put default entries into the IDT. */
+static void setup_default_idt_entries(struct lite_ro_state *state,
+ const unsigned long *def)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(state->guest_idt); i++)
+ default_idt_entry(&state->guest_idt[i], i, def[i]);
+}
+
+static int setup_nailed_mappings(void)
+{
+ int i, j, err;
+ struct page *pages[TOTAL_SWITCHER_PAGES];
+
+ /* Every CPU has the hypervisor text (read only). */
+ for (i = 0; i < SHARED_SWITCHER_PAGES; i++) {
+ pages[i] = switcher_page[i];
+ pages[i]->private = 0;
+ }
+
+ for_each_possible_cpu(i) {
+ memset(pages+SHARED_SWITCHER_PAGES, 0,
+ sizeof(pages) - SHARED_SWITCHER_PAGES*sizeof(pages[0]));
+ pages[SHARED_SWITCHER_PAGES + i*2]
+ = switcher_page[SHARED_SWITCHER_PAGES + i*2];
+ pages[SHARED_SWITCHER_PAGES + i*2 + 1]
+ = switcher_page[SHARED_SWITCHER_PAGES + i*2 + 1];
+ /* First page is writable, second isn't */
+ pages[SHARED_SWITCHER_PAGES + i*2]->private = 1;
+ pages[SHARED_SWITCHER_PAGES + i*2 + 1]->private = 0;
+ err = kvm_mmu_init_nailed_mapping(i, pages, ARRAY_SIZE(pages));
+ if (err)
+ goto undo;
+ }
+ return 0;
+
+undo:
+ for_each_possible_cpu(j) {
+ if (j == i)
+ break;
+ kvm_mmu_free_nailed_mapping(j);
+ }
+ return err;
+}
+
+static __init int map_switcher(void)
+{
+ int i, err;
+ struct page **pagep;
+
+ /*
+ * Map the Switcher in to high memory.
+ *
+ * It turns out that if we choose the address 0xFFE00000 (2MB under the
+ * top virtual address), it makes setting up the page tables really
+ * easy.
+ */
+
+ /* This is the "lite under lite" case. Just Say No. */
+ if (__FIXADDR_TOP <= SWITCHER_ADDR) {
+ printk(KERN_INFO "kvm-lite: top of memory already reserved\n");
+ err = -EEXIST;
+ goto out;
+ }
+
+ /* We allocate an array of "struct page"s. map_vm_area() wants the
+ * pages in this form, rather than just an array of pointers. */
+ switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES,
+ GFP_KERNEL);
+ if (!switcher_page) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* Now we actually allocate the pages. The Guest will see these pages,
+ * so we make sure they're zeroed. */
+ for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
+ unsigned long addr = get_zeroed_page(GFP_KERNEL);
+ if (!addr) {
+ err = -ENOMEM;
+ goto free_some_pages;
+ }
+ switcher_page[i] = virt_to_page(addr);
+ }
+
+ /* Now we reserve the "virtual memory area" we want: 0xFFE00000
+ * (SWITCHER_ADDR). We might not get it in theory, but in practice
+ * it's worked so far. */
+ switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
+ VM_ALLOC, SWITCHER_ADDR, VMALLOC_END);
+ if (!switcher_vma) {
+ err = -ENOMEM;
+ printk("lite: could not map switcher pages high\n");
+ goto free_pages;
+ }
+
+ err = setup_nailed_mappings();
+ if (err) {
+ printk("lite: failed to set up nailed mappings: %i\n", err);
+ goto free_vma;
+ }
+
+ /* This code actually sets up the pages we've allocated to appear at
+ * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the
+ * kind of pages we're mapping (kernel pages), and a pointer to our
+ * array of struct pages. It increments that pointer, but we don't
+ * care. */
+ pagep = switcher_page;
+ err = map_vm_area(switcher_vma, __pgprot(_PAGE_KERNEL_EXEC), &pagep);
+ if (err) {
+ printk("lite: map_vm_area failed: %i\n", err);
+ goto free_nailed_mappings;
+ }
+
+ /* Now the switcher is mapped at the right address, we can't fail!
+ * Copy in the compiled-in Switcher code (from switcher.S). */
+ memcpy(switcher_vma->addr, start_switcher_text,
+ end_switcher_text - start_switcher_text);
+
+ /* Most of the switcher.S doesn't care that it's been moved; on Intel,
+ * jumps are relative, and it doesn't access any references to external
+ * code or data.
+ *
+ * The only exception is the interrupt handlers in switcher.S: their
+ * addresses are placed in a table (default_idt_entries), so we need to
+ * update the table with the new addresses. switcher_offset() is a
+ * convenience function which returns the distance between the builtin
+ * switcher code and the high-mapped copy we just made. */
+ for (i = 0; i < IDT_ENTRIES; i++)
+ default_idt_entries[i] += switcher_offset();
+
+ /*
+ * Set up the Switcher's per-cpu areas.
+ *
+ * Each CPU gets two pages of its own within the high-mapped region
+ * (aka. "struct lite_pages"). Much of this can be initialized now,
+ * but some depends on what Guest we are running (which is set up in
+ * copy_in_guest_info()).
+ */
+ for_each_possible_cpu(i) {
+ /* lite_pages() returns this CPU's two pages. */
+ struct lite_pages *pages = lite_pages(i);
+ /* This is a convenience pointer to make the code fit one
+ * statement to a line. */
+ struct lite_ro_state *state = &pages->state;
+
+ /* The Global Descriptor Table: the Host has a different one
+ * for each CPU. We keep a descriptor for the GDT which says
+ * where it is and how big it is (the limit is the last
+ * byte, not the size, hence the "-1"). */
+ state->host_gdt_desc.limit = GDT_SIZE-1;
+ state->host_gdt_desc.base = (long)get_cpu_gdt_table(i);
+
+ /* All CPUs on the Host use the same Interrupt Descriptor
+ * Table, so we just use get_idt(), which gets this CPU's IDT
+ * descriptor. */
+ get_idt(&state->host_idt_desc);
+
+ /* The descriptors for the Guest's GDT and IDT can be filled
+ * out now, too. We copy the GDT & IDT into ->guest_gdt and
+ * ->guest_idt before actually running the Guest. */
+ state->guest_idt_desc.limit = sizeof(state->guest_idt)-1;
+ state->guest_idt_desc.base = (long)&state->guest_idt;
+ state->guest_gdt_desc.limit = sizeof(state->guest_gdt)-1;
+ state->guest_gdt_desc.base = (long)&state->guest_gdt;
+
+ /* We know where we want the stack to be when the Guest enters
+ * the switcher: in pages->regs. The stack grows upwards, so
+ * we start it at the end of that structure. */
+ state->guest_tss.esp0 = (long)(&pages->regs + 1);
+ /* And this is the GDT entry to use for the stack: we keep a
+ * couple of special LITE entries. */
+ state->guest_tss.ss0 = LITE_DS;
+
+ /* x86 can have a finegrained bitmap which indicates what I/O
+ * ports the process can use. We set it to the end of our
+ * structure, meaning "none". */
+ state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);
+
+ /* Some GDT entries are the same across all Guests, so we can
+ * set them up now. */
+ setup_default_gdt_entries(state);
+ /* Most IDT entries are the same for all Guests, too.*/
+ setup_default_idt_entries(state, default_idt_entries);
+
+ /* The Host needs to be able to use the LITE segments on this
+ * CPU, too, so put them in the Host GDT. */
+ get_cpu_gdt_table(i)[GDT_ENTRY_LITE_CS] = FULL_EXEC_SEGMENT;
+ get_cpu_gdt_table(i)[GDT_ENTRY_LITE_DS] = FULL_SEGMENT;
+ }
+
+ /* In the Switcher, we want the %cs segment register to use the
+ * LITE_CS GDT entry: we've put that in the Host and Guest GDTs, so
+ * it will be undisturbed when we switch. To change %cs and jump we
+ * need this structure to feed to Intel's "lcall" instruction. */
+ lite_entry.offset = (long)switch_to_guest + switcher_offset();
+ lite_entry.segment = LITE_CS;
+
+ printk(KERN_INFO "kvm-lite: mapped switcher at %p\n",
+ switcher_vma->addr);
+
+ /* Save whether PGE should be disabled & re-enabled. */
+ host_has_pge = cpu_has_pge;
+
+ /* And we succeeded... */
+ return 0;
+
+free_nailed_mappings:
+ for_each_possible_cpu(i)
+ kvm_mmu_free_nailed_mapping(i);
+free_vma:
+ vunmap(switcher_vma->addr);
+free_pages:
+ i = TOTAL_SWITCHER_PAGES;
+free_some_pages:
+ for (--i; i >= 0; i--)
+ __free_pages(switcher_page[i], 0);
+ kfree(switcher_page);
+out:
+ return err;
+}
+
+/* Cleaning up the mapping when the module is unloaded is almost...
+ * too easy. */
+static void unmap_switcher(void)
+{
+ unsigned int i;
+
+ /* vunmap() undoes *both* map_vm_area() and __get_vm_area(). */
+ vunmap(switcher_vma->addr);
+ /* Now we just need to free the pages we copied the switcher into */
+ for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
+ __free_pages(switcher_page[i], 0);
+ for_each_possible_cpu(i)
+ kvm_mmu_free_nailed_mapping(i);
+}
+
+static struct kvm_vcpu *lite_vcpu_create(struct kvm *kvm, unsigned id)
+{
+ struct vcpu_lite *lite;
+ int err;
+
+ lite = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+ if (!lite) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = kvm_vcpu_init(&lite->vcpu, kvm, id);
+ if (err)
+ goto free_lite;
+
+ fx_init(&lite->vcpu);
+ lite->vcpu.fpu_active = 1;
+ lite->vcpu.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
+ if (lite->vcpu.vcpu_id == 0)
+ lite->vcpu.apic_base |= MSR_IA32_APICBASE_BSP;
+
+ return &lite->vcpu;
+
+free_lite:
+ kmem_cache_free(kvm_vcpu_cache, lite);
+out:
+ return ERR_PTR(err);
+}
+
+static void lite_vcpu_free(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_lite *lite = to_lite(vcpu);
+
+ kvm_vcpu_uninit(vcpu);
+ kmem_cache_free(kvm_vcpu_cache, lite);
+}
+
+/* FIXME: Allow direct traps. */
+static int direct_trap(const struct vcpu_lite *lite,
+ const struct desc_struct *trap,
+ unsigned int num)
+{
+ return 0;
+}
+
+/* We don't use the IDT entries in the "struct vcpu_lite" directly, instead we
+ * copy them into the IDT which we've set up for Guests on this CPU, just
+ * before we run the Guest. This routine does that copy. */
+static void copy_traps(const struct vcpu_lite *lite, struct desc_struct *idt,
+ const unsigned long *def)
+{
+ unsigned int i;
+
+ /* We can simply copy the direct traps, otherwise we use the default
+ * ones in the Switcher: they will return to the Host. */
+ for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) {
+ if (direct_trap(lite, &lite->idt[i], i))
+ idt[i] = lite->idt[i];
+ else
+ default_idt_entry(&idt[i], i, def[i]);
+ }
+
+ /* Don't forget the system call trap! The IDT entries for other
+ * interrupts never change, so no need to copy them. */
+ i = SYSCALL_VECTOR;
+ if (direct_trap(lite, &lite->idt[i], i))
+ idt[i] = lite->idt[i];
+ else
+ default_idt_entry(&idt[i], i, def[i]);
+}
+
+/* There are several entries we don't let the Guest set. The TSS entry is the
+ * "Task State Segment" which controls all kinds of delicate things. The
+ * LITE_CS and LITE_DS entries are reserved for the Switcher, and the
+ * Guest can't be trusted to deal with double faults. */
+static int ignored_gdt(unsigned int num)
+{
+ return (num == GDT_ENTRY_TSS
+ || num == GDT_ENTRY_LITE_CS
+ || num == GDT_ENTRY_LITE_DS
+ || num == GDT_ENTRY_DOUBLEFAULT_TSS);
+}
+
+/* Like the IDT, we never simply use the GDT the Guest gives us. We set up the
+ * GDTs for each CPU, then we copy across the entries each time we want to run
+ * a different Guest on that CPU. */
+static void copy_gdt(const struct vcpu_lite *lite,
+ struct segment_descriptor *gdt)
+{
+ unsigned int i;
+
+ /* The default entries from setup_default_gdt_entries() are not
+ * replaced. See ignored_gdt() above. */
+ for (i = 0; i < GDT_ENTRIES; i++)
+ if (!ignored_gdt(i))
+ gdt[i] = lite->gdt[i];
+}
+
+/* Copies state into curr_pages, ready to run Guest. */
+static void copy_to_curr_pages(const struct vcpu_lite *lite)
+{
+ /* Set up the two "TSS" members which tell the CPU what stack to use
+ * for traps which do directly into the Guest (ie. traps at privilege
+ * level 1). */
+ lite->curr_pages->state.guest_tss.esp1 = lite->kstack;
+ lite->curr_pages->state.guest_tss.ss1 = lite->kstack_ss;
+
+ /* Copy direct-to-Guest trap entries. */
+ copy_traps(lite, lite->curr_pages->state.guest_idt,
+ default_idt_entries);
+
+ /* Copy all GDT entries which the Guest can change. */
+ copy_gdt(lite, lite->curr_pages->state.guest_gdt);
+
+ /* Copy the registers in. FIXME: Re-order to memcpy. */
+ lite->curr_pages->regs.eax = lite->vcpu.regs[VCPU_REGS_RAX];
+ lite->curr_pages->regs.ecx = lite->vcpu.regs[VCPU_REGS_RCX];
+ lite->curr_pages->regs.edx = lite->vcpu.regs[VCPU_REGS_RDX];
+ lite->curr_pages->regs.ebx = lite->vcpu.regs[VCPU_REGS_RBX];
+ lite->curr_pages->regs.ebp = lite->vcpu.regs[VCPU_REGS_RBP];
+ lite->curr_pages->regs.esi = lite->vcpu.regs[VCPU_REGS_RSI];
+ lite->curr_pages->regs.edi = lite->vcpu.regs[VCPU_REGS_RDI];
+ lite->curr_pages->regs.esp = lite->vcpu.regs[VCPU_REGS_RSP];
+ lite->curr_pages->regs.eip = lite->vcpu.rip;
+
+ lite->curr_pages->regs.cs = lite->sregs[VCPU_SREG_CS];
+ lite->curr_pages->regs.ds = lite->sregs[VCPU_SREG_DS];
+ lite->curr_pages->regs.es = lite->sregs[VCPU_SREG_ES];
+ lite->curr_pages->regs.fs = lite->sregs[VCPU_SREG_FS];
+ lite->curr_pages->regs.gs = lite->sregs[VCPU_SREG_GS];
+ lite->curr_pages->regs.ss = lite->sregs[VCPU_SREG_SS];
+
+ lite->curr_pages->regs.eflags = lite->rflags;
+}
+
+/* Copies state from curr_pages, after Guest has run. */
+static void copy_from_curr_pages(struct vcpu_lite *lite)
+{
+ /* Copy the registers in. FIXME: Re-order to memcpy. */
+ lite->vcpu.regs[VCPU_REGS_RAX] = lite->curr_pages->regs.eax;
+ lite->vcpu.regs[VCPU_REGS_RCX] = lite->curr_pages->regs.ecx;
+ lite->vcpu.regs[VCPU_REGS_RDX] = lite->curr_pages->regs.edx;
+ lite->vcpu.regs[VCPU_REGS_RBX] = lite->curr_pages->regs.ebx;
+ lite->vcpu.regs[VCPU_REGS_RBP] = lite->curr_pages->regs.ebp;
+ lite->vcpu.regs[VCPU_REGS_RSI] = lite->curr_pages->regs.esi;
+ lite->vcpu.regs[VCPU_REGS_RDI] = lite->curr_pages->regs.edi;
+ lite->vcpu.regs[VCPU_REGS_RSP] = lite->curr_pages->regs.esp;
+ lite->vcpu.rip = lite->curr_pages->regs.eip;
+ lite->sregs[VCPU_SREG_CS] = lite->curr_pages->regs.cs;
+ lite->sregs[VCPU_SREG_DS] = lite->curr_pages->regs.ds;
+ lite->sregs[VCPU_SREG_ES] = lite->curr_pages->regs.es;
+ lite->sregs[VCPU_SREG_FS] = lite->curr_pages->regs.fs;
+ lite->sregs[VCPU_SREG_GS] = lite->curr_pages->regs.gs;
+ lite->sregs[VCPU_SREG_SS] = lite->curr_pages->regs.ss;
+ lite->rflags = lite->curr_pages->regs.eflags;
+}
+
+static void pre_lite_run(struct vcpu_lite *lite)
+{
+ BUG_ON(lite->curr_pages);
+
+ /* FIXME: Do lazy */
+ kvm_load_guest_fpu(&lite->vcpu);
+ if (lite->cr0 & X86_CR0_TS)
+ stts();
+ else
+ clts();
+ lite->curr_pages = lite_pages(smp_processor_id());
+ copy_to_curr_pages(lite);
+}
+
+static void post_lite_run(struct vcpu_lite *lite)
+{
+ BUG_ON(!lite->curr_pages);
+ copy_from_curr_pages(lite);
+ lite->curr_pages = NULL;
+ kvm_put_guest_fpu(&lite->vcpu);
+}
+
+/* FIXME: Somehow do intelligent caching here. */
+static void lite_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+}
+
+static void lite_vcpu_put(struct kvm_vcpu *vcpu)
+{
+}
+
+static void lite_cache_regs(struct kvm_vcpu *vcpu)
+{
+}
+
+static void lite_decache_regs(struct kvm_vcpu *vcpu)
+{
+}
+
+static void lite_vcpu_decache(struct kvm_vcpu *vcpu)
+{
+}
+
+static int lite_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
+{
+ return -EOPNOTSUPP;
+}
+
+static u32 *segreg(struct vcpu_lite *lite, int seg)
+{
+ BUG_ON(seg >= ARRAY_SIZE(lite->sregs));
+ return &lite->sregs[seg];
+}
+
+static u64 segbase(const struct segment_descriptor *d)
+{
+ return d->base_low
+ | ((unsigned long)d->base_mid << 16)
+ | ((unsigned long)d->base_high << 24);
+}
+
+static u64 lite_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+ struct vcpu_lite *lite = to_lite(vcpu);
+ struct segment_descriptor *d;
+ u32 selector;
+
+ if (seg == VCPU_SREG_TR || seg == VCPU_SREG_LDTR) {
+ pr_unimpl(vcpu, "lite: get_segment_base(%i)\n", seg);
+ return 0;
+ }
+
+ selector = *segreg(lite, seg) >> 3;
+
+#ifdef CONFIG_X86_64
+#error "FIXME: X86-64 handles some segments strangely: see segment_base()"
+#endif
+ /* This works correctly for 0 segments. */
+ d = &lite->gdt[selector];
+ return segbase(d);
+}
+
+static void lite_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var,
+ int seg)
+{
+ struct vcpu_lite *lite = to_lite(vcpu);
+ struct segment_descriptor *d;
+ u32 selector;
+
+ if (seg == VCPU_SREG_TR || seg == VCPU_SREG_LDTR) {
+ pr_unimpl(vcpu, "lite: get_segment(%i)\n", seg);
+ memset(var, 0, sizeof(*var));
+ return;
+ }
+
+ selector = *segreg(lite, seg) >> 3;
+ d = &lite->gdt[selector];
+ var->base = segbase(d);
+ var->limit = d->limit_low | ((unsigned long)d->limit_high << 8);
+ var->selector = selector;
+ var->type = d->type;
+ var->s = d->system;
+ var->dpl = d->dpl;
+ var->present = d->present;
+ var->avl = d->avl;
+ var->l = d->long_mode;
+ var->db = d->default_op;
+ var->g = d->granularity;
+ var->unusable = (seg != 0);
+}
+
+static void write_timestamp(struct vcpu_lite *lite)
+{
+ struct timespec ts;
+
+ ktime_get_real_ts(&ts);
+ set_lite_data(lite, time, ts);
+}
+
+static void load_gdt_entry(struct vcpu_lite *lite, unsigned int num,
+ const struct segment_descriptor *desc)
+{
+ /* We use the base of the 0th GDT entry as ptr to lite_data. */
+ if (num == 0 && !lite->lite_data) {
+ lite->lite_data = segbase(desc);
+
+ if (lite->lite_data) {
+ /* Set up the initial fields for guest. */
+ set_lite_data(lite, reserve_mem, -SWITCHER_ADDR);
+ write_timestamp(lite);
+ }
+ }
+
+ /* We never copy these ones to real GDT, so don't care what they say */
+ if (ignored_gdt(num))
+ return;
+
+ lite->gdt[num] = *desc;
+ /* Segment descriptors contain a privilege level: the Guest is
+ * sometimes careless and leaves this as 0, even though it's running at
+ * privilege level 1. If so, we fix it here. */
+ if (lite->gdt[num].dpl == 0)
+ lite->gdt[num].dpl = GUEST_PL;
+
+ /* Each descriptor has an "accessed" bit. If we don't set it now, the
+ * CPU will try to set it when the Guest first loads that entry into a
+ * segment register. But the GDT isn't writable by the Guest, so bad
+ * things can happen. */
+ lite->gdt[num].type |= 1;
+}
+
+/* kvm_arch_ops.set_segment: update one of the guest's segment registers.
+ * TR/LDTR and LDT selectors are not supported; out-of-range selectors kill
+ * the guest.  The cached kvm_segment is re-encoded back into the shadow GDT
+ * so descriptor and live selector stay in sync. */
+static void lite_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ struct vcpu_lite *lite = to_lite(vcpu);
+ struct segment_descriptor d;
+ u32 *reg;
+
+ if (seg == VCPU_SREG_TR || seg == VCPU_SREG_LDTR) {
+ pr_unimpl(vcpu, "lite: set_segment(%i)\n", seg);
+ return;
+ }
+
+ /* Selector (byte offset into GDT) must fall within the shadow GDT. */
+ if (var->selector >= sizeof(lite->gdt)) {
+ kill_guest(lite, "large segment selector %i\n", var->selector);
+ return;
+ }
+
+ /* Bit 2 set means an LDT selector: unsupported. */
+ if (var->selector & 4) {
+ pr_unimpl(vcpu, "lite: ldt selector %i\n", var->selector);
+ return;
+ }
+
+ reg = segreg(lite, seg);
+ *reg = var->selector;
+
+ /* We re-encode the cached kvm_segment into the GDT, since
+ * kvm-lite guests must keep their GDT and live segment
+ * registers in sync. */
+ d.limit_low = (var->limit & 0xFFFF);
+ d.limit_high = (var->limit >> 16);
+ d.base_low = (var->base & 0xFFFF);
+ d.base_mid = (var->base >> 16);
+ d.base_high = (var->base >> 24);
+ d.type = var->type;
+ d.system = var->s;
+ d.dpl = var->dpl;
+ d.present = var->present;
+ d.avl = var->avl;
+ d.long_mode = var->l;
+ d.default_op = var->db;
+ d.granularity = var->g;
+ /* Debugging aid: trace every segment load. */
+ printk("lite: setting segment %s: %#x type=%x\n",
+ seg == VCPU_SREG_SS ? "ss"
+ : seg == VCPU_SREG_GS ? "gs"
+ : seg == VCPU_SREG_DS ? "ds"
+ : seg == VCPU_SREG_ES ? "es"
+ : seg == VCPU_SREG_FS ? "fs"
+ : seg == VCPU_SREG_CS ? "cs" : "???",
+ var->selector, d.type);
+ /* Selector / 8 gives the GDT entry index. */
+ load_gdt_entry(lite, var->selector >> 3, &d);
+}
+
+/* No CR4 bits are cached for lite guests, so this is a no-op. */
+static void lite_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
+{
+}
+
+/* Record the guest's CR0 value; it is not written to the real register. */
+static void lite_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ struct vcpu_lite *lite = to_lite(vcpu);
+
+ lite->cr0 = cr0;
+}
+
+/* Record the guest's top-level page directory (used when entering the
+ * switcher in lite_run). */
+static void lite_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
+{
+ struct vcpu_lite *lite = to_lite(vcpu);
+
+ lite->pgtable = root;
+}
+
+/* Guest CR4 writes are ignored. */
+static void lite_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+}
+
+#ifdef CONFIG_X86_64
+#error need set_efer
+#endif
+
+/* Return the guest's cached IDT descriptor (base/limit). */
+static void lite_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+{
+ struct vcpu_lite *lite = to_lite(vcpu);
+
+ *dt = lite->idt_desc;
+}
+
+#ifdef CONFIG_X86_64
+#error need IDT decoding
+#endif
+
+/* The address of the interrupt handler is split across the two halves of
+ * the IDT entry: low 16 bits in the first word, high 16 bits in the
+ * second.  Reassemble it. */
+static unsigned long idt_address(u32 lo, u32 hi)
+{
+ return (lo & 0x0000FFFF) | (hi & 0xFFFF0000);
+}
+
+/* The "type" of the interrupt handler is a 4 bit field: we only support a
+ * couple of types. */
+static int idt_type(u32 lo, u32 hi)
+{
+ return (hi >> 8) & 0xF;
+}
+
+/* An IDT entry can't be used unless the "present" bit is set. */
+static int idt_present(u32 lo, u32 hi)
+{
+ return (hi & 0x8000);
+}
+
+/* This is the routine which actually checks the Guest's IDT entry and
+ * transfers it into our entry in "struct lite": */
+static void set_trap(struct vcpu_lite *lite, struct desc_struct *trap,
+ unsigned int num, u32 lo, u32 hi)
+{
+ u8 type = idt_type(lo, hi);
+
+ /* We zero-out a not-present entry */
+ if (!idt_present(lo, hi)) {
+ trap->a = trap->b = 0;
+ return;
+ }
+
+ /* We only support interrupt (0xE) and trap (0xF) gates. */
+ if (type != 0xE && type != 0xF)
+ kill_guest(lite, "bad IDT type %i", type);
+
+ /* We only copy the handler address, present bit, privilege level and
+ * type. The privilege level controls where the trap can be triggered
+ * manually with an "int" instruction. This is usually GUEST_PL,
+ * except for system calls which userspace can use. */
+ trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF);
+ trap->b = (hi&0xFFFFEF00);
+}
+
+/* Install one guest IDT entry into the shadow IDT, refusing vectors the
+ * host must keep for itself. */
+static void load_idt_entry(struct vcpu_lite *lite, unsigned int num,
+ u32 lo, u32 hi)
+{
+ /* Guest never handles: NMI, doublefault, spurious interrupt or
+ * hypercall. We ignore when it tries to set them. */
+ if (num == 2 || num == 8 || num == 15 || num == KVM_LITE_HCALL_TRAP)
+ return;
+
+ /* Debugging aid: 0x80 is the system-call vector. */
+ if (num == 0x80)
+ printk("Setting IDT entry 0x80 to %08x %08x (DPL=%u)\n",
+ lo, hi, (hi >> 13) & 0x3);
+
+ set_trap(lite, &lite->idt[num], num, lo, hi);
+}
+
+/* kvm_arch_ops.set_idt: read the whole guest IDT out of guest memory and
+ * load each entry into the shadow IDT. */
+static void lite_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+{
+ struct vcpu_lite *lite = to_lite(vcpu);
+ unsigned int i, num;
+
+ /* QEMU sets up huge IDT and GDTs but we ignore them. */
+ num = (dt->limit + 1) / sizeof(lite->idt[0]);
+ if (num > ARRAY_SIZE(lite->idt))
+ num = ARRAY_SIZE(lite->idt);
+
+ lite->idt_desc = *dt;
+ for (i = 0; i < num; i++) {
+ struct desc_struct e;
+ if (emulator_read_std(lite->idt_desc.base
+ + i * sizeof(lite->idt[0]),
+ &e, sizeof(e), vcpu)
+ != X86EMUL_CONTINUE) {
+ kill_guest(lite, "Failed reading IDT %i", i);
+ return;
+ }
+ load_idt_entry(lite, i, e.a, e.b);
+ }
+}
+
+/* Return the guest's cached GDT descriptor (base/limit). */
+static void lite_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+{
+ struct vcpu_lite *lite = to_lite(vcpu);
+
+ *dt = lite->gdt_desc;
+}
+
+/* kvm_arch_ops.set_gdt: read the whole guest GDT out of guest memory and
+ * load each descriptor into the shadow GDT. */
+static void lite_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+{
+ struct vcpu_lite *lite = to_lite(vcpu);
+ unsigned int i, num;
+
+ /* QEMU sets up huge IDT and GDTs but we ignore them. */
+ num = (dt->limit + 1) / sizeof(lite->gdt[0]);
+ if (num > ARRAY_SIZE(lite->gdt))
+ num = ARRAY_SIZE(lite->gdt);
+
+ lite->gdt_desc = *dt;
+ for (i = 0; i < num; i++) {
+ struct segment_descriptor d;
+ if (emulator_read_std(lite->gdt_desc.base
+ + i * sizeof(lite->gdt[0]),
+ &d, sizeof(d), vcpu)
+ != X86EMUL_CONTINUE) {
+ kill_guest(lite, "Failed reading GDT");
+ return;
+ }
+ load_gdt_entry(lite, i, &d);
+ }
+}
+
+/* Debug registers are not implemented: log and return 0. */
+static unsigned long lite_get_dr(struct kvm_vcpu *vcpu, int dr)
+{
+ pr_unimpl(vcpu, "lite_get_dr %i\n", dr);
+ return 0;
+}
+
+/* Debug registers are not implemented: log and discard the write. */
+static void lite_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
+ int *exception)
+{
+ pr_unimpl(vcpu, "lite_set_dr %i\n", dr);
+}
+
+/* Get IF bit of guest's interrupt flags, read from the shared
+ * kvm_lite_data page rather than a real eflags register. */
+static int get_guest_eflags_if(struct vcpu_lite *lite)
+{
+ /* They don't enable before setting up GDT. */
+ if (!lite->lite_data)
+ return 0;
+
+ return get_lite_data(lite, irq_enabled) & X86_EFLAGS_IF;
+}
+
+/* Set the guest's virtual IF via the shared kvm_lite_data page.  A no-op
+ * before the guest has registered that page (via its GDT entry 0). */
+static void set_guest_eflags_if(struct vcpu_lite *lite, int eflags)
+{
+ if (!lite->lite_data)
+ return;
+
+ printk("Setting EFLAGS to %08x\n", eflags);
+ set_lite_data(lite, irq_enabled, eflags);
+}
+
+/* Compose the guest-visible eflags: the saved flags with IF replaced by
+ * the guest's virtual interrupt-enable state. */
+static unsigned long lite_get_rflags(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_lite *lite = to_lite(vcpu);
+ unsigned long rflags;
+
+ /* Interrupts are always really enabled: read that bit from guest */
+ rflags = (lite->rflags & ~(unsigned long)X86_EFLAGS_IF);
+ return rflags | get_guest_eflags_if(lite);
+}
+
+/* Store new guest eflags: the real copy always has IF and the reserved
+ * bit 1 set; the requested IF goes to the virtual flag instead. */
+static void lite_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ struct vcpu_lite *lite = to_lite(vcpu);
+
+ /* Interrupts are always really enabled, and "2" must be set. */
+ lite->rflags = rflags | X86_EFLAGS_IF | 0x2;
+ set_guest_eflags_if(lite, rflags & X86_EFLAGS_IF);
+}
+
+/* TLB flush is a no-op here (shadow page tables are handled elsewhere). */
+static void lite_flush_tlb(struct kvm_vcpu *vcpu)
+{
+}
+
+/* Push one word onto the Guest's stack, killing the guest on a failed
+ * write.  *gstack is updated to the new stack pointer. */
+static void push_guest_stack(struct vcpu_lite *lite,
+ unsigned long *gstack,
+ unsigned long val)
+{
+ /* Stack grows downwards: move stack pointer down then write value. */
+ *gstack -= sizeof(long);
+ if (emulator_write_emulated(*gstack, &val, sizeof(val), &lite->vcpu)
+ != X86EMUL_CONTINUE)
+ kill_guest(lite, "Stack write to %#lx failed", *gstack);
+}
+
+/* FIXME: In-kernel apic might supply a generic version of this. */
+/* Deliver an interrupt/trap to the Guest by mimicking what the CPU would
+ * do: pick the right stack, push the interrupted state, then redirect
+ * CS:EIP to the handler described by the IDT entry (lo, hi). */
+static void set_guest_interrupt(struct vcpu_lite *lite,
+ u32 lo, u32 hi, bool has_err,
+ unsigned long errcode)
+{
+ unsigned long gstack;
+ u32 eflags, ss;
+
+ /* There are two cases for interrupts: one where the Guest is already
+ * in the kernel, and a more complex one where the Guest is in
+ * userspace. We check the privilege level to find out. */
+ if ((lite->sregs[VCPU_SREG_SS]&0x3) != GUEST_PL) {
+ /* The Guest told us their kernel stack with the SET_STACK
+ * hypercall: both the virtual address and the segment */
+ gstack = lite->kstack;
+ ss = lite->kstack_ss;
+ /* We push the old stack segment and pointer onto the new
+ * stack: when the Guest does an "iret" back from the interrupt
+ * handler the CPU will notice they're dropping privilege
+ * levels and expect these here. */
+ push_guest_stack(lite, &gstack, lite->sregs[VCPU_SREG_SS]);
+ push_guest_stack(lite, &gstack,lite->vcpu.regs[VCPU_REGS_RSP]);
+ printk("IRQ from different PL\n");
+ } else {
+ /* We're staying on the same Guest (kernel) stack. */
+ gstack = lite->vcpu.regs[VCPU_REGS_RSP];
+ ss = lite->sregs[VCPU_SREG_SS];
+ }
+
+ /* Remember that we never let the Guest actually disable interrupts, so
+ * the "Interrupt Flag" bit is always set. We copy that bit from the
+ * Guest's "irq_enabled" field into the eflags word: the Guest copies
+ * it back in "lite_iret". */
+ eflags = lite->rflags;
+ if (get_guest_eflags_if(lite) == 0)
+ eflags &= ~X86_EFLAGS_IF;
+
+ /* An interrupt is expected to push the old "eflags" word, the old
+ * code segment and the old instruction pointer onto the stack, plus
+ * the error code for those traps which have one. */
+ push_guest_stack(lite, &gstack, eflags);
+ push_guest_stack(lite, &gstack, lite->sregs[VCPU_SREG_CS]);
+ push_guest_stack(lite, &gstack, lite->vcpu.rip);
+ if (has_err)
+ push_guest_stack(lite, &gstack, errcode);
+
+ /* Now we've pushed all the old state, we change the stack, the code
+ * segment and the address to execute. */
+ lite->sregs[VCPU_SREG_SS] = ss;
+ lite->vcpu.regs[VCPU_REGS_RSP] = gstack;
+ lite->sregs[VCPU_SREG_CS] = (__KERNEL_CS|GUEST_PL);
+ lite->vcpu.rip = idt_address(lo, hi);
+
+ /* There are two kinds of interrupt handlers: 0xE is an "interrupt
+ * gate" which expects interrupts to be disabled on entry. */
+ if (idt_type(lo, hi) == 0xE)
+ set_guest_eflags_if(lite, 0);
+
+ /* Every time we deliver an interrupt, we update the timestamp in the
+ * Guest's kvm_lite_data struct. It would be better for the Guest if
+ * we did this more often, but it can actually be quite slow: doing it
+ * here is a compromise which means at least it gets updated every
+ * timer interrupt. */
+ write_timestamp(lite);
+}
+
+/* Deliver the page fault recorded by lite_inject_page_fault to the guest:
+ * publish cr2 via the shared data page, then raise the #PF handler. */
+static void set_page_fault(struct vcpu_lite *lite)
+{
+ struct desc_struct *pf_idt;
+
+ /* No shared data page yet means no guest #PF handler can exist. */
+ if (!lite->lite_data) {
+ kill_guest(lite, "Early page fault");
+ return;
+ }
+
+ pf_idt = &lite->idt[PF_VECTOR];
+ if (!idt_present(pf_idt->a, pf_idt->b)) {
+ kill_guest(lite, "No handler for #PF@%#lx %#lx",
+ lite->vcpu.rip, lite->pf_addr);
+ return;
+ }
+
+ /* Guest reads the faulting address from lite_data, not real cr2. */
+ set_lite_data(lite, cr2, lite->pf_addr);
+ set_guest_interrupt(lite, pf_idt->a, pf_idt->b, true,
+ lite->pf_errcode);
+ lite->wants_page_fault = false;
+}
+
+/* We can't simply inject the page fault now, since the emulator will
+ * set registers and overwrite it.  Instead we record it; lite_run
+ * delivers it via set_page_fault() before re-entering the guest. */
+static void lite_inject_page_fault(struct kvm_vcpu *vcpu,
+ unsigned long addr, u32 err_code)
+{
+ struct vcpu_lite *lite = to_lite(vcpu);
+
+ /* This can happen when set_page_fault writes to stack. */
+ if (lite->wants_page_fault) {
+ kill_guest(lite, "Recursive page fault");
+ return;
+ }
+
+ printk("pf inject for %#lx\n", addr);
+ lite->wants_page_fault = true;
+ lite->pf_addr = addr;
+ lite->pf_errcode = err_code;
+ /* Debugging: treat NULL-address faults as fatal for now. */
+ if (!addr)
+ kill_guest(lite, "Page fault\n");
+}
+
+/* Reflect a general-protection fault straight into the guest's #GP
+ * handler (with error code). */
+static void lite_inject_gp(struct kvm_vcpu *vcpu, u32 err_code)
+{
+ struct vcpu_lite *lite = to_lite(vcpu);
+
+ /* No shared data page yet means no guest handlers can exist. */
+ if (!lite->lite_data) {
+ kill_guest(lite, "Early #GP");
+ return;
+ }
+
+ printk("GP to %#lx %#lx\n",
+ lite->idt[GP_VECTOR].a, lite->idt[GP_VECTOR].b);
+ set_guest_interrupt(lite, lite->idt[GP_VECTOR].a,
+ lite->idt[GP_VECTOR].b, true, err_code);
+}
+
+/* True when userspace asked to be told as soon as the guest can accept an
+ * interrupt, there is none pending, and the window is currently open. */
+static int dm_request_for_irq_injection(struct vcpu_lite *lite,
+ struct kvm_run *kvm_run)
+{
+ return (!lite->vcpu.irq_summary &&
+ kvm_run->request_interrupt_window &&
+ lite->vcpu.interrupt_window_open &&
+ get_guest_eflags_if(lite));
+}
+
+/* 0 or -errno means we stop running and return to userspace. Positive means
+ * it's all good: let the shadow MMU service the fault, falling back to the
+ * instruction emulator when the MMU says the guest page table needs help. */
+static int handle_page_fault(struct vcpu_lite *lite, struct kvm_run *run,
+ unsigned long cr2, u32 errcode)
+{
+ int r;
+ unsigned long old_eip = lite->vcpu.rip;
+
+ /* Debugging: trace faults in this address range. */
+ if (cr2 < 0xc0000000 && cr2 > 0x8000000)
+ printk("Page fault at %#lx\n", cr2);
+
+ mutex_lock(&lite->vcpu.kvm->lock);
+ r = kvm_mmu_page_fault(&lite->vcpu, cr2, errcode);
+ /* Debugging: rip should not move during MMU fault handling. */
+ if (old_eip != lite->vcpu.rip) {
+ printk("Woah! Trap at %#lx (%#lx) moved to %#lx\n",
+ old_eip, cr2, lite->vcpu.rip);
+ }
+
+ if (r > 0) {
+ /* FIXME: This is horrible. Guest page table needs to be
+ * updated: we use emulator */
+ enum emulation_result er;
+ er = emulate_instruction(&lite->vcpu, run, cr2, errcode);
+ mutex_unlock(&lite->vcpu.kvm->lock);
+
+// printk("Emulate after pf says %u\n", er);
+ switch (er) {
+ case EMULATE_DONE:
+ return 1;
+ case EMULATE_DO_MMIO:
+ ++lite->vcpu.stat.mmio_exits;
+ return 0;
+ case EMULATE_FAIL:
+ vcpu_printf(&lite->vcpu, "%s: emulate fail\n",
+ __FUNCTION__);
+ break;
+ default:
+ BUG();
+ }
+ return -EIO;
+ } else
+ mutex_unlock(&lite->vcpu.kvm->lock);
+
+ /* r == 0: the MMU fixed it up entirely; resume the guest. */
+ if (r == 0)
+ return 1;
+
+ return r;
+}
+
+/* Which x86 traps push an error code onto the stack. */
+static int has_err(unsigned int trap)
+{
+ return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17);
+}
+
+/* FIXME: Put in x86_emulate.c. This just skips over IN/OUT instructions:
+ * OUT data is discarded and IN returns all-ones ("nothing there").
+ * Returns 1 if the instruction was recognised and skipped, 0 otherwise. */
+static int skip_io(struct vcpu_lite *lite)
+{
+ u8 insn;
+ unsigned int insnlen = 0, in = 0, shift = 0;
+
+ /* Decoding x86 instructions is icky. */
+ if (emulator_read_std(lite->vcpu.rip, &insn, 1, &lite->vcpu)
+ != X86EMUL_CONTINUE)
+ return 0;
+
+ /* 0x66 is an "operand prefix". It means it's using the upper 16 bits
+ of the eax register. */
+ if (insn == 0x66) {
+ shift = 16;
+ /* The instruction is 1 byte so far, read the next byte. */
+ insnlen = 1;
+ if (emulator_read_std(lite->vcpu.rip+insnlen,
+ &insn, 1, &lite->vcpu)
+ != X86EMUL_CONTINUE)
+ return 0;
+ }
+
+ /* We can ignore the lower bit for the moment and decode the 4 opcodes
+ * we need to emulate. */
+ switch (insn & 0xFE) {
+ case 0xE4: /* in <next byte>,%al */
+ insnlen += 2;
+ in = 1;
+ break;
+ case 0xEC: /* in (%dx),%al */
+ insnlen += 1;
+ in = 1;
+ break;
+ case 0xE6: /* out %al,<next byte> */
+ insnlen += 2;
+ break;
+ case 0xEE: /* out %al,(%dx) */
+ insnlen += 1;
+ break;
+ default:
+ /* OK, we don't know what this is, can't emulate. */
+ return 0;
+ }
+
+ /* If it was an "IN" instruction, they expect the result to be read
+ * into %eax, so we change %eax. We always return all-ones, which
+ * traditionally means "there's nothing there". */
+ if (in) {
+ /* Lower bit tells us whether it's a 16 or 32 bit access */
+ if (insn & 0x1)
+ lite->vcpu.regs[VCPU_REGS_RAX] = 0xFFFFFFFF;
+ else
+ lite->vcpu.regs[VCPU_REGS_RAX] |= (0xFFFF << shift);
+ }
+ /* Finally, we've "done" the instruction, so move past it. */
+ lite->vcpu.rip += insnlen;
+ /* Success! */
+ return 1;
+}
+
+/* FIXME: emulate_instruction() faults and doesn't update EIP. */
+/* Detect an "invlpg" instruction (opcode 0f 01 /7, modrm 0x38) at the
+ * current rip.  Currently detection only: the skip is #if 0'd out. */
+static bool skip_invlpg(struct vcpu_lite *lite)
+{
+ const u8 invlpg[] = { 0x0f, 0x01, 0x38 };
+ u8 insn;
+ unsigned int i;
+
+ /* Compare the instruction bytes one at a time against the pattern. */
+ for (i = 0; i < ARRAY_SIZE(invlpg); i++) {
+ if (emulator_read_std(lite->vcpu.rip+i, &insn, 1, &lite->vcpu)
+ != X86EMUL_CONTINUE)
+ return false;
+ if (insn != invlpg[i])
+ return false;
+ }
+ printk("invlpg %#lx!\n", lite->vcpu.rip);
+#if 0
+ lite->vcpu.rip += i;
+// kvm_mmu_reset_context(&lite->vcpu);
+#endif
+ return true;
+}
+
+/* Emulate the faulting instruction at rip: first try the cheap IN/OUT
+ * skipper, then fall back to the full x86 emulator. */
+static enum emulation_result emulate(struct vcpu_lite *lite,
+ struct kvm_run *run,
+ unsigned long cr2, u16 errcode)
+{
+ /* For safety, we do not emulate non-kernel commands */
+ if ((lite->sregs[VCPU_SREG_CS] & SEGMENT_RPL_MASK) != GUEST_PL)
+ return EMULATE_FAIL;
+
+ printk("Emulating @%lx: esp = %#lx\n", lite->vcpu.rip,
+ lite->vcpu.regs[VCPU_REGS_RSP]);
+
+ if (skip_io(lite))
+ return EMULATE_DONE;
+
+ /* invlpg gets extra debug tracing around the emulator call. */
+ if (skip_invlpg(lite)) {
+ enum emulation_result er;
+ er = emulate_instruction(&lite->vcpu, run, cr2, errcode);
+ printk("EIP after emulate = %#lx\n", lite->vcpu.rip);
+ return er;
+ }
+
+ return emulate_instruction(&lite->vcpu, run, cr2, errcode);
+}
+
+/* SET_STACK hypercall: record where the Guest kernel stack lives, so
+ * set_guest_interrupt() can switch to it on traps from userspace. */
+static void lite_set_stack(struct vcpu_lite *lite,
+ u32 seg, u32 esp, unsigned int pages)
+{
+ /* You are not allowed to have a stack segment with privilege level 0:
+ * bad Guest! */
+ if ((seg & 0x3) != GUEST_PL)
+ kill_guest(lite, "bad stack segment %i", seg);
+ /* We only expect one or two stack pages. */
+ if (pages > 2)
+ kill_guest(lite, "bad stack pages %u", pages);
+
+ /* Save where the stack is */
+ lite->kstack_ss = seg;
+ lite->kstack = esp;
+
+ /* FIXME: When we want to deliver traps directly, we must ensure that
+ * the stack is always mapped. */
+#if 0
+ lg->stack_pages = pages;
+ /* Make sure the new stack pages are mapped */
+ pin_stack_pages(lg);
+#endif
+}
+
+/* Dispatch a guest hypercall.  Call number in %ebx, arguments in
+ * %eax/%ecx/%edx (matching hcall() on the guest side).  Return value has
+ * the same meaning as handle_exit: positive continues the guest, 0 or
+ * negative returns to userspace. */
+static int do_hypercall(struct vcpu_lite *lite, struct kvm_run *run)
+{
+ unsigned long *regs = lite->vcpu.regs;
+
+ /* Hypercalls before the shared data page exists are fatal. */
+ if (!lite->lite_data) {
+ kill_guest(lite, "Early hypercall");
+ return 0;
+ }
+
+ switch (regs[VCPU_REGS_RBX]) {
+ case KVM_HCALL_LOAD_IDT_ENTRY:
+ load_idt_entry(lite, regs[VCPU_REGS_RAX],
+ regs[VCPU_REGS_RCX], regs[VCPU_REGS_RDX]);
+ return 1;
+
+ case KVM_HCALL_LOAD_GDT_ENTRY: {
+ /* FIXME */
+ struct desc_struct d;
+ d.a = regs[VCPU_REGS_RCX];
+ d.b = regs[VCPU_REGS_RDX];
+ load_gdt_entry(lite, regs[VCPU_REGS_RAX],
+ (struct segment_descriptor *)&d);
+ return 1;
+ }
+ case KVM_HCALL_SET_STACK:
+ lite_set_stack(lite, regs[VCPU_REGS_RAX],
+ regs[VCPU_REGS_RCX], regs[VCPU_REGS_RDX]);
+ return 1;
+
+ case KVM_HCALL_HALT:
+ /* Re-enable interrupts, then try to halt. */
+ printk("Setting EFLAGS for halt\n");
+ set_lite_data(lite, irq_enabled, X86_EFLAGS_IF);
+ return kvm_emulate_halt(&lite->vcpu);
+
+ case KVM_HCALL_HACK_WRITE:
+ /* Temporary console hack: fake a 1-byte PIO write. */
+ return kvm_emulate_pio(&lite->vcpu, run, 0, 1, 1);
+
+ case KVM_HCALL_SET_CLOCKEVENT:
+ /* Temporary clockevent hack: fake a 4-byte PIO write. */
+ return kvm_emulate_pio(&lite->vcpu, run, 0, 4, 2);
+
+ default:
+ kill_guest(lite, "Hypercall %lu\n", regs[VCPU_REGS_RBX]);
+ return -ENOENT;
+ }
+ return 0;
+}
+
+/* 0 or -errno means we stop running and return to userspace. Positive means
+ * it's all good: decide what to do with the trap the switcher handed back,
+ * reflecting anything unhandled into the Guest's own IDT handler. */
+static int handle_exit(struct vcpu_lite *lite, struct kvm_run *run,
+ unsigned trapnum, unsigned errcode, unsigned long cr2)
+{
+ /* OK, so what happened? */
+ switch (trapnum) {
+ case GP_VECTOR: /* We've intercepted a GPF. */
+ switch (emulate(lite, run, cr2, errcode)) {
+ case EMULATE_DONE:
+ return 1;
+ case EMULATE_DO_MMIO:
+ kill_guest(lite, "Doing MMIO!\n");
+ return 0;
+ default:
+ kill_guest(lite, "Emulate failed for %#lx",
+ lite->vcpu.rip);
+ }
+ break;
+
+ case PF_VECTOR:
+ /* If this wants page fault set to guest, it does it explicitly
+ * via ->inject_page_fault. */
+ return handle_page_fault(lite, run, cr2, errcode);
+
+ case NM_VECTOR:
+ /* Always reflect this. */
+ break;
+
+ case KVM_LITE_HCALL_TRAP:
+ return do_hypercall(lite, run);
+
+ case FIRST_EXTERNAL_VECTOR ... KVM_NR_INTERRUPTS-1:
+ /* FIXME: Direct system calls get rid of this. */
+ if (trapnum == SYSCALL_VECTOR) {
+ printk("System call %lu\n",
+ lite->vcpu.regs[VCPU_REGS_RAX]);
+ break;
+ }
+
+ /* An external interrupt, already delivered. */
+ lite->vcpu.stat.irq_exits++;
+ return 1;
+
+ case 256:
+ /* 256 is the sentinel set before entry (see lite_run). */
+ kill_guest(lite, "Guest faulted in switcher");
+ return -ENOENT;
+ }
+
+ /* We re-inject fault into guest. */
+ if (!idt_present(lite->idt[trapnum].a, lite->idt[trapnum].b)) {
+ kill_guest(lite, "No handler for trap %u", trapnum);
+ run->exit_reason = KVM_EXIT_EXCEPTION;
+ return -EIO;
+ }
+ set_guest_interrupt(lite, lite->idt[trapnum].a,
+ lite->idt[trapnum].b, has_err(trapnum), errcode);
+ return 1;
+}
+
+/* If the guest's interrupt window is open and an irq is pending, pop it
+ * off and deliver it through the guest's IDT. */
+static void maybe_do_interrupt(struct vcpu_lite *lite,
+ const struct kvm_run *run)
+{
+ struct desc_struct *idt;
+
+ lite->vcpu.interrupt_window_open = get_guest_eflags_if(lite);
+
+ if (!lite->vcpu.interrupt_window_open || !lite->vcpu.irq_summary)
+ return;
+
+ /* Look at the IDT entry the Guest gave us for this interrupt. */
+ idt = &lite->idt[kvm_pop_irq(&lite->vcpu)];
+
+ /* Debugging: hard-coded address range of the guest's __switch_to. */
+ if (lite->vcpu.rip >= 0xc0103070 && lite->vcpu.rip < 0xc01031a0)
+ printk("Interrupt in __switch_to!\n");
+
+ /* If they don't have a handler (yet?), we just ignore it */
+ if (idt_present(idt->a, idt->b)) {
+ /* set_guest_interrupt() takes the interrupt descriptor and a
+ * flag to say whether this interrupt pushes an error code onto
+ * the stack as well: virtual interrupts never do. */
+ set_guest_interrupt(lite, idt->a, idt->b, 0, 0);
+ }
+}
+
+/* kvm_arch_ops.run: the main vcpu loop.  Prepare pending faults and irqs,
+ * enter the guest through the switcher, then handle whatever trap brought
+ * us back, looping until userspace attention is needed. */
+static int lite_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+ struct vcpu_lite *lite = to_lite(vcpu);
+ int r;
+ unsigned long trapnum, errcode, cr2 = 0; /* Damn gcc */
+ /* This is a dummy value we need for GCC's sake. */
+ unsigned int clobber;
+
+again:
+ if (unlikely(lite->dead)) {
+ r = -ENOENT;
+ goto out;
+ }
+
+ r = kvm_mmu_reload(vcpu);
+ if (unlikely(r))
+ goto out;
+
+ /* Set up the Guest's page tables to see this CPU's pages (and no
+ * other CPU's pages). */
+ kvm_use_nailed_mappings(vcpu);
+
+ /* Deliver a page fault deferred by lite_inject_page_fault. */
+ if (lite->wants_page_fault)
+ set_page_fault(lite);
+
+ if (!vcpu->mmio_read_completed)
+ maybe_do_interrupt(lite, run);
+
+ kvm_load_guest_fpu(vcpu);
+
+#if 0
+ if (trapnum < KVM_LITE_HCALL_TRAP)
+ dump_regs("Entering guest", -1, lite);
+#endif
+
+ /* OK, now we're ready to jump into the Guest. First we put up
+ * the "Do Not Disturb" sign: */
+ local_irq_disable();
+
+ /* SYSENTER is an optimized way of doing system calls. We
+ * can't allow it because it always jumps to privilege level 0.
+ * A normal Guest won't try it because we don't advertise it in
+ * CPUID, but a malicious Guest (or malicious Guest userspace
+ * program) could, so we tell the CPU to disable it before
+ * running the Guest. */
+ /* FIXME: move to vcpu_load */
+ if (boot_cpu_has(X86_FEATURE_SEP))
+ wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);
+
+ pre_lite_run(lite);
+
+ /* Set trap to impossible number so we know if Switcher faulted. */
+ lite->curr_pages->regs.trapnum = 256;
+
+ /* Save the current Host top-level page directory. */
+ lite->curr_pages->state.host_cr3 = __pa(current->mm->pgd);
+
+ /* Now: we push the "eflags" register on the stack, then do an "lcall".
+ * This is how we change from using the kernel code segment to using
+ * the dedicated lite code segment, as well as jumping into the
+ * Switcher.
+ *
+ * The lcall also pushes the old code segment (KERNEL_CS) onto the
+ * stack, then the address of this call. This stack layout happens to
+ * exactly match the stack of an interrupt... */
+ asm volatile("pushf; lcall *lite_entry"
+ /* This is how we tell GCC that %eax ("a") and %ebx ("b")
+ * are changed by this routine. The "=" means output. */
+ : "=a"(clobber), "=b"(clobber)
+ /* %eax contains the pages pointer. ("0" refers to the
+ * 0-th argument above, ie "a"). %ebx contains the
+ * physical address of the Guest's top-level page
+ * directory. */
+ : "0"(lite->curr_pages), "1"(lite->pgtable)
+ /* We tell gcc that all these registers could change,
+ * which means we don't have to save and restore them in
+ * the Switcher. */
+ : "memory", "%edx", "%ecx", "%edi", "%esi");
+
+ trapnum = lite->curr_pages->regs.trapnum;
+ errcode = lite->curr_pages->regs.errcode;
+
+ /* If the Guest page faulted, then the cr2 register will tell us the
+ * bad virtual address. We have to grab this now, because once we
+ * re-enable interrupts an interrupt could fault and thus overwrite
+ * cr2, or we could even move off to a different CPU. */
+ if (trapnum == 14)
+ cr2 = read_cr2();
+
+ /* Restore SYSENTER if it's supposed to be on. */
+ if (boot_cpu_has(X86_FEATURE_SEP))
+ wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
+
+ post_lite_run(lite);
+
+ /* Now we're ready to be interrupted or moved to other CPUs */
+ local_irq_enable();
+
+ /* Debugging: dump state for every exit that isn't a hypercall. */
+ if (trapnum < KVM_LITE_HCALL_TRAP)
+ dump_regs("Exiting guest", trapnum, lite);
+
+ /* Remove nailed entries from page tables. */
+ kvm_remove_nailed_mappings(vcpu);
+
+ /* Positive means we've handled it, go around again. */
+ r = handle_exit(lite, run, trapnum, errcode, cr2);
+ if (r > 0) {
+ /* Give scheduler a chance to reschedule. */
+ if (signal_pending(current)) {
+ r = -EINTR;
+ run->exit_reason = KVM_EXIT_INTR;
+ ++vcpu->stat.signal_exits;
+ goto out;
+ }
+ if (dm_request_for_irq_injection(lite, run)) {
+ r = -EINTR;
+ run->exit_reason = KVM_EXIT_INTR;
+ ++vcpu->stat.request_irq_exits;
+ goto out;
+ }
+ kvm_resched(vcpu);
+ goto again;
+ }
+
+out:
+ run->ready_for_interrupt_injection
+ = (vcpu->interrupt_window_open && vcpu->irq_summary == 0);
+ run->if_flag = get_guest_eflags_if(lite);
+#ifdef CONFIG_X86_64
+#error FIXME: cr8 handling
+#endif
+ run->apic_base = vcpu->apic_base;
+ return r;
+}
+
+/* This is called after our "fake" IO, so we don't do anything. */
+static void lite_skip_emulated(struct kvm_vcpu *vcpu)
+{
+#if 0
+ printk("how to skip emulated insn %#lx?", vcpu->rip);
+ WARN_ON(1);
+#endif
+}
+
+/* Write the 3-byte hypercall sequence into the guest's patch site:
+ * "int $KVM_LITE_HCALL_TRAP" (2 bytes) padded with a nop. */
+static void lite_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *insns)
+{
+ /* "int <lite-trap-entry>". */
+ insns[0] = 0xcd;
+ insns[1] = KVM_LITE_HCALL_TRAP;
+
+ /* GENERIC_NOP1 */
+ insns[2] = 0x90;
+}
+
+/* The kvm_arch_ops table registering kvm-lite as a KVM backend. */
+static struct kvm_arch_ops lite_arch_ops = {
+ .cpu_has_kvm_support = lite_is_always_supported,
+ .disabled_by_bios = you_cant_stop_the_lite,
+ .hardware_enable = hardware_enable,
+ .hardware_disable = hardware_disable,
+ .check_processor_compatibility = check_processor_compatibility,
+ .hardware_setup = map_switcher,
+ .hardware_unsetup = unmap_switcher,
+ .vcpu_create = lite_vcpu_create,
+ .vcpu_free = lite_vcpu_free,
+ .vcpu_load = lite_vcpu_load,
+ .vcpu_put = lite_vcpu_put,
+ .vcpu_decache = lite_vcpu_decache,
+
+ .set_guest_debug = lite_guest_debug,
+ .get_msr = kvm_get_msr_common,
+ .set_msr = kvm_set_msr_common,
+ .get_segment_base = lite_get_segment_base,
+ .get_segment = lite_get_segment,
+ .set_segment = lite_set_segment,
+ .get_cs_db_l_bits = generic_get_cs_db_l_bits,
+ .decache_cr4_guest_bits = lite_decache_cr4_guest_bits,
+
+ .set_cr0 = lite_set_cr0,
+ .set_cr3 = lite_set_cr3,
+ .set_cr4 = lite_set_cr4,
+ .get_idt = lite_get_idt,
+ .set_idt = lite_set_idt,
+ .get_gdt = lite_get_gdt,
+ .set_gdt = lite_set_gdt,
+ .get_dr = lite_get_dr,
+ .set_dr = lite_set_dr,
+ .cache_regs = lite_cache_regs,
+ .decache_regs = lite_decache_regs,
+ .get_rflags = lite_get_rflags,
+ .set_rflags = lite_set_rflags,
+ .tlb_flush = lite_flush_tlb,
+
+ .inject_page_fault = lite_inject_page_fault,
+ .inject_gp = lite_inject_gp,
+
+ .run = lite_run,
+ .skip_emulated_instruction = lite_skip_emulated,
+ .patch_hypercall = lite_patch_hypercall,
+};
+
+/* Module init: register this backend with the KVM core. */
+static int __init lite_init(void)
+{
+ return kvm_init_arch(&lite_arch_ops, sizeof(struct vcpu_lite),
+ THIS_MODULE);
+}
+
+/* Module exit: unregister from the KVM core. */
+static void __exit lite_exit(void)
+{
+ kvm_exit_arch();
+}
+
+module_init(lite_init);
+module_exit(lite_exit);
+MODULE_LICENSE("GPL");
diff -r 039995825488 drivers/kvm/lite.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/drivers/kvm/lite.h Sun Sep 02 10:29:54 2007 +1000
@@ -0,0 +1,65 @@
+#ifndef __KVM_LITE_H
+#define __KVM_LITE_H
+
+#define GDT_ENTRY_LITE_CS 10
+#define GDT_ENTRY_LITE_DS 11
+#define LITE_CS (GDT_ENTRY_LITE_CS * 8)
+#define LITE_DS (GDT_ENTRY_LITE_DS * 8)
+
+#define KVM_LITE_HCALL_TRAP 0x1F
+
+#ifndef __ASSEMBLY__
+#include "kvm.h"
+#include "segment_descriptor.h"
+
+/* Full 4G segment descriptors, suitable for CS and DS. */
+#define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00})
+#define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300})
+
+/* Guest register file as saved/restored by the switcher (layout must
+ * match lite_switcher.S, via asm-offsets). */
+struct lite_regs
+{
+ /* Manually saved part. */
+ unsigned long ebx, ecx, edx;
+ unsigned long esi, edi, ebp;
+ unsigned long gs;
+ unsigned long eax;
+ unsigned long fs, ds, es;
+ unsigned long trapnum, errcode;
+ /* Trap pushed part */
+ unsigned long eip;
+ unsigned long cs;
+ unsigned long eflags;
+ unsigned long esp;
+ unsigned long ss;
+};
+
+/* This is a guest-specific page mapped read-only into the guest. */
+struct lite_ro_state
+{
+ /* Host information we need to restore when we switch back. */
+ u32 host_cr3;
+ struct descriptor_table host_idt_desc;
+ struct descriptor_table host_gdt_desc;
+ u32 host_sp;
+
+ /* Fields which are used when guest is running. */
+ struct descriptor_table guest_idt_desc;
+ struct descriptor_table guest_gdt_desc;
+ struct i386_hw_tss guest_tss;
+ struct desc_struct guest_idt[IDT_ENTRIES];
+ struct segment_descriptor guest_gdt[GDT_ENTRIES];
+};
+
+/* We have two pages shared with guests, per cpu.  Page-aligned so the
+ * rw stack page and the ro state page land on separate pages. */
+struct lite_pages
+{
+ /* This is the stack page mapped rw in guest; regs sit at its top. */
+ char spare[PAGE_SIZE - sizeof(struct lite_regs)];
+ struct lite_regs regs;
+
+ /* This is the host state & guest descriptor page, ro in guest */
+ struct lite_ro_state state;
+} __attribute__((aligned(PAGE_SIZE)));
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __KVM_LITE_H */
diff -r 039995825488 drivers/kvm/lite_guest.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/drivers/kvm/lite_guest.c Wed Sep 05 11:46:18 2007 +1000
@@ -0,0 +1,527 @@
+#include <linux/init.h>
+#include <linux/kvm_lite.h>
+#include <linux/start_kernel.h>
+#include <linux/console.h>
+#include <linux/irq.h>
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+#include <asm/page.h>
+#include <asm/bootparam.h>
+#include <asm/paravirt.h>
+#include <asm/setup.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/sections.h>
+#include <asm/fixmap.h>
+#include <asm/mce.h>
+#include "../char/hvc_console.h"
+
+/* This is accessed by lite_guest_asm.S */
+struct kvm_lite_data kvm_lite_data;
+
+static cycle_t clock_base;
+#define LITE_CLOCK_MIN_DELTA 100UL
+#define LITE_CLOCK_MAX_DELTA ULONG_MAX
+
+/* This is inside lite_guest_asm.S. */
+extern void kvm_lite_iret(void);
+
+/* Issue a hypercall to the Host.
+ *
+ * The call number goes in %ebx and the three arguments in %eax, %ecx and
+ * %edx (per the asm constraints below); the Host's return value comes back
+ * in %eax.  The trap vector is 0x1f (KVM_LITE_HCALL_TRAP); the trailing
+ * "nop" pads the instruction to 3 bytes -- presumably so the Host can skip
+ * or rewrite a fixed-size instruction; TODO confirm against the Host side. */
+static unsigned long hcall(unsigned long call,
+ unsigned long arg1,
+ unsigned long arg2,
+ unsigned long arg3)
+{
+ /* FIXME: VMCALL does #GP, but we can't hand all #GP to emulator 8( */
+ asm volatile("int $0x1f; nop" /* Must be 3 bytes */
+ : "=a"(arg1)
+ : "b"(call), "a"(arg1), "c"(arg2), "d"(arg3)
+ : "memory");
+ return arg1;
+}
+
+/* save_flags() is expected to return the processor state (ie. "eflags"). The
+ * eflags word contains all kind of stuff, but in practice Linux only cares
+ * about the interrupt flag. Our "save_flags()" just returns that. */
+static unsigned long save_fl(void)
+{
+ return kvm_lite_data.irq_enabled;
+}
+
+/* "restore_flags" just sets the flags back to the value given. */
+static void restore_fl(unsigned long flags)
+{
+ kvm_lite_data.irq_enabled = flags;
+}
+
+/* Interrupts go off... */
+static void irq_disable(void)
+{
+ kvm_lite_data.irq_enabled = 0;
+}
+
+/* Interrupts go on... */
+static void irq_enable(void)
+{
+ kvm_lite_data.irq_enabled = X86_EFLAGS_IF;
+}
+
+/* Note that these assume we're writing to the active IDT/GDT. */
+static void lite_write_idt_entry(struct desc_struct *dt,
+ int entrynum, u32 low, u32 high)
+{
+ /* Keep the local copy up to date. */
+ write_dt_entry(dt, entrynum, low, high);
+ /* Tell Host about this new entry. */
+ hcall(KVM_HCALL_LOAD_IDT_ENTRY, entrynum, low, high);
+}
+
+static void lite_write_gdt_entry(struct desc_struct *dt,
+ int entrynum, u32 low, u32 high)
+{
+ write_dt_entry(dt, entrynum, low, high);
+ hcall(KVM_HCALL_LOAD_GDT_ENTRY, entrynum, low, high);
+}
+
+/* We always tell host where our kvm_lite_data is using base of entry
+ * 0 in GDT. */
+static void lite_load_gdt(const struct Xgt_desc_struct *desc)
+{
+ struct desc_struct *gdt = (void *)desc->address;
+
+ gdt[0].a = ((unsigned long)&kvm_lite_data << 16);
+ gdt[0].b = (((unsigned long)&kvm_lite_data >> 16) & 0xFF)
+ | ((unsigned long)&kvm_lite_data & 0xFF000000);
+ native_load_gdt(desc);
+}
+
+/* CR2 is the virtual address of the last page fault, which the Guest only ever
+ * reads. The Host kindly writes this into our "struct kvm_lite_data", so we
+ * just read it out of there. */
+static unsigned long lite_read_cr2(void)
+{
+ return kvm_lite_data.cr2;
+}
+
+static void lite_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+ unsigned int i;
+ struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+
+ /* There's one problem which normal hardware doesn't have: the Host
+ * can't handle us removing entries we're currently using. So we clear
+ * the GS register here: if it's needed it'll be reloaded anyway. */
+ loadsegment(gs, 0);
+
+ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
+ lite_write_gdt_entry(gdt, GDT_ENTRY_TLS_MIN + i,
+ t->tls_array[i].a, t->tls_array[i].b);
+}
+
+/* FIXME: Host should decide what we report here. */
+static void lite_cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ int function = *eax;
+
+ native_cpuid(eax, ebx, ecx, edx);
+ switch (function) {
+ case 1: /* Basic feature request. */
+ /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
+ *ecx &= 0x00002201;
+ /* SSE, SSE2, FXSR, MMX, CMOV, PGE, CMPXCHG8B, PAE, PSE, FPU. */
+ *edx &= 0x0780A149;
+ break;
+ case 0x80000000:
+ /* Futureproof this a little: if they ask how much extended
+ * processor information there is, limit it to known fields. */
+ if (*eax > 0x80000008)
+ *eax = 0x80000008;
+ break;
+ }
+}
+
+static void lite_load_esp0(struct tss_struct *tss,
+ struct thread_struct *thread)
+{
+ hcall(KVM_HCALL_SET_STACK, __KERNEL_DS|0x1,
+ thread->esp0, THREAD_SIZE/PAGE_SIZE);
+}
+
+static void lite_safe_halt(void)
+{
+ hcall(KVM_HCALL_HALT, 0, 0, 0);
+}
+
+static void disable_lite_irq(unsigned int irq)
+{
+#if 0 /* FIXME */
+ set_bit(irq, lguest_data.blocked_interrupts);
+#endif
+}
+
+static void enable_lite_irq(unsigned int irq)
+{
+#if 0 /* FIXME */
+ clear_bit(irq, lguest_data.blocked_interrupts);
+#endif
+}
+
+/* This structure describes the IRQ controller. */
+static struct irq_chip lite_irq_controller = {
+ .name = "kvm-lite",
+ .mask = disable_lite_irq,
+ .mask_ack = disable_lite_irq,
+ .unmask = enable_lite_irq,
+};
+
+/* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
+ * interrupt (except 128, which is used for system calls), and then tells the
+ * Linux infrastructure that each interrupt is controlled by our level-based
+ * interrupt controller. */
+static void __init lite_init_IRQ(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < 256 - FIRST_EXTERNAL_VECTOR; i++) {
+ int vector = FIRST_EXTERNAL_VECTOR + i;
+ if (vector != SYSCALL_VECTOR) {
+ set_intr_gate(vector, interrupt[i]);
+ set_irq_chip_and_handler(i, &lite_irq_controller,
+ handle_level_irq);
+ }
+ }
+ /* This call is required to set up for 4k stacks, where we have
+ * separate stacks for hard and soft interrupts. */
+ irq_ctx_init(smp_processor_id());
+}
+
+/*
+ * Time.
+ *
+ * It would be far better for everyone if the Guest had its own clock, but
+ * until then the Host gives us the time on every interrupt.
+ */
+static unsigned long lite_get_wallclock(void)
+{
+ return kvm_lite_data.time.tv_sec;
+}
+
+static cycle_t lite_clock_read(void)
+{
+ unsigned long sec, nsec;
+
+ /* We read the time value written by the Host. Since it's in two parts
+ * (seconds and nanoseconds), we risk reading it just as it's changing
+ * from 99 & 0.999999999 to 100 and 0, and getting 99 and 0. As Linux
+ * tends to come apart under the stress of time travel, we must be
+ * careful: */
+ do {
+ /* First we read the seconds part. */
+ sec = kvm_lite_data.time.tv_sec;
+ /* This read memory barrier tells the compiler and the CPU that
+ * this can't be reordered: we have to complete the above
+ * before going on. */
+ rmb();
+ /* Now we read the nanoseconds part. */
+ nsec = kvm_lite_data.time.tv_nsec;
+ /* Make sure we've done that. */
+ rmb();
+ /* Now if the seconds part has changed, try again. */
+ } while (unlikely(kvm_lite_data.time.tv_sec != sec));
+
+ /* Our non-TSC clock is in real nanoseconds. */
+ return sec*1000000000ULL + nsec;
+}
+
+/* This is what we tell the kernel is our clocksource. */
+static struct clocksource lite_clock = {
+ .name = "kvm-lite",
+ .rating = 400,
+ .read = lite_clock_read,
+ .mask = CLOCKSOURCE_MASK(64),
+ .mult = 1 << 22,
+ .shift = 22,
+};
+
+/* The "scheduler clock" is just our real clock, adjusted to start at zero */
+static unsigned long long lite_sched_clock(void)
+{
+ return cyc2ns(&lite_clock, lite_clock_read() - clock_base);
+}
+
+/* We also need a "struct clock_event_device": Linux asks us to set it to go
+ * off some time in the future. Actually, James Morris figured all this out, I
+ * just applied the patch. */
+static int lite_clockevent_set_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ if (delta < LITE_CLOCK_MIN_DELTA) {
+ if (printk_ratelimit())
+ printk(KERN_DEBUG "%s: small delta %lu ns\n",
+ __FUNCTION__, delta);
+ return -ETIME;
+ }
+ hcall(KVM_HCALL_SET_CLOCKEVENT, delta, 0, 0);
+ return 0;
+}
+
+static void lite_clockevent_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *evt)
+{
+ switch (mode) {
+ case CLOCK_EVT_MODE_UNUSED:
+ case CLOCK_EVT_MODE_SHUTDOWN:
+ /* A 0 argument shuts the clock down. */
+ hcall(KVM_HCALL_SET_CLOCKEVENT, 0, 0, 0);
+ break;
+ case CLOCK_EVT_MODE_ONESHOT:
+ /* This is what we expect. */
+ break;
+ case CLOCK_EVT_MODE_PERIODIC:
+ BUG();
+ case CLOCK_EVT_MODE_RESUME:
+ break;
+ }
+}
+
+/* This describes our primitive timer chip. */
+static struct clock_event_device lite_clockevent = {
+ .name = "kvm-lite",
+ .features = CLOCK_EVT_FEAT_ONESHOT,
+ .set_next_event = lite_clockevent_set_next_event,
+ .set_mode = lite_clockevent_set_mode,
+ .rating = INT_MAX,
+ .mult = 1,
+ .shift = 0,
+ .min_delta_ns = LITE_CLOCK_MIN_DELTA,
+ .max_delta_ns = LITE_CLOCK_MAX_DELTA,
+};
+
+/* This is the Guest timer interrupt handler (hardware interrupt 0). We just
+ * call the clockevent infrastructure and it does whatever needs doing. */
+static void lite_time_irq(unsigned int irq, struct irq_desc *desc)
+{
+ unsigned long flags;
+
+ /* Don't interrupt us while this is running. */
+ local_irq_save(flags);
+ lite_clockevent.event_handler(&lite_clockevent);
+ local_irq_restore(flags);
+}
+
+/* At some point in the boot process, we get asked to set up our timing
+ * infrastructure. The kernel doesn't expect timer interrupts before this, but
+ * we cleverly initialized the "blocked_interrupts" field of "struct
+ * lite_data" so that timer interrupts were blocked until now. */
+static void lite_time_init(void)
+{
+ /* Set up the timer interrupt (0) to go to our simple timer routine */
+ set_irq_handler(0, lite_time_irq);
+
+ clock_base = lite_clock_read();
+ clocksource_register(&lite_clock);
+
+ /* Now we've set up our clock, we can use it as the scheduler clock */
+ paravirt_ops.sched_clock = lite_sched_clock;
+
+ /* We can't set cpumask in the initializer: damn C limitations! Set it
+ * here and register our timer device. */
+ lite_clockevent.cpumask = cpumask_of_cpu(0);
+ clockevents_register_device(&lite_clockevent);
+
+ /* Finally, we unblock the timer interrupt. */
+ enable_lite_irq(0);
+}
+
+/* FIXME */
+static unsigned lite_patch(u8 type, u16 clobbers, void *insnbuf,
+ unsigned long addr, unsigned len)
+{
+ return len;
+}
+
+static int lite_panic(struct notifier_block *nb, unsigned long l, void *p)
+{
+ hcall(9999, __pa(p), 0, 0);
+ /* The hcall won't return, but to keep gcc happy, we're "done". */
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block paniced = {
+ .notifier_call = lite_panic
+};
+
+/* Install our single-entry e820 memory map and name the platform.
+ *
+ * FIXME: we only need this because copy_e820_map disbelieves our 1-element
+ * memory map.  If we had emulated VGA, our mem map would probably pass.
+ *
+ * This is also a convenient early hook to register the panic notifier. */
+static __init char *lite_memory_setup(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list, &paniced);
+
+ add_memory_region(E820_MAP->addr, E820_MAP->size, E820_MAP->type);
+
+ /* This string is for the boot messages. */
+ return "kvm-lite";
+}
+
+/* Console output for the hvc console: hand each byte to the Host via the
+ * (temporary, deliberately sucky) KVM_HCALL_HACK_WRITE hypercall.
+ * Returns the number of bytes written, which is always all of them. */
+static int put_chars(u32 vtermno, const char *buf, int count)
+{
+ int i;
+
+ /* The index must be signed like "count": comparing an unsigned index
+ * against a negative count would promote count and loop ~4G times. */
+ for (i = 0; i < count; i++)
+ hcall(KVM_HCALL_HACK_WRITE, buf[i], 0, 0);
+
+ /* We're expected to return the amount of data we wrote: all of it. */
+ return count;
+}
+
+static int get_chars(u32 vtermno, char *buf, int count)
+{
+ return 0;
+}
+
+static struct hv_ops lite_cons = {
+ .get_chars = get_chars,
+ .put_chars = put_chars,
+};
+
+static int __init cons_init(void)
+{
+ if (strcmp(paravirt_ops.name, "kvm-lite") != 0)
+ return 0;
+
+ return hvc_instantiate(0, 0, &lite_cons);
+}
+console_initcall(cons_init);
+
+/* The standard init function */
+static int __init hvc_lite_init(void)
+{
+ hvc_alloc(0, 0, &lite_cons, 256);
+ return 0;
+}
+module_init(hvc_lite_init);
+
+void lite_trace_on(void)
+{
+ kvm_lite_data.trace = 1;
+}
+
+static void lite_set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+ /* FIXME: This is the UP version, works around a qemu cmpxchg8b bug */
+ *pmdp = pmd;
+}
+
+static void lite_set_pte_atomic(pte_t *ptep, pte_t pteval)
+{
+ /* FIXME: This is the UP version, works around a qemu cmpxchg8b bug */
+ *ptep = pteval;
+}
+
+/* The boot parameters also tell us where the command-line is: save it. */
+static __init void copy_cmdline(void)
+{
+ const char *cmdline;
+
+ /* QEMU uses an old boot protocol version: hardcoded cmdline addr */
+ if (boot_params.hdr.cmd_line_ptr == 0) {
+ u16 *cl_offset = __va(OLD_CL_OFFSET);
+ cmdline = __va(OLD_CL_BASE_ADDR) + *cl_offset;
+ } else
+ cmdline = __va(boot_params.hdr.cmd_line_ptr);
+
+ memcpy(boot_command_line, cmdline, COMMAND_LINE_SIZE);
+}
+
+/* Main C entry point, reached from lite_guest_asm.S with a virtual pointer
+ * to the boot parameters.  Fills in paravirt_ops for kvm-lite, repeats the
+ * CPU setup that head.S would normally do, then calls start_kernel().
+ * Never returns. */
+__init void lite_init(void *boot)
+{
+ extern struct Xgt_desc_struct boot_gdt_descr;
+
+ /* Ensures host knows where our kvm_lite_data is */
+ lite_load_gdt(&boot_gdt_descr);
+
+ /* Copy boot parameters first: the Launcher put the physical location
+ * in %esi, and lite_guest_asm.S converted that to a virtual address
+ * and handed it to us. */
+ memcpy(&boot_params, boot, PARAM_SIZE);
+
+ copy_cmdline();
+
+ /* We're under kvm-lite, paravirt is enabled, and we're running at
+ * privilege level 1, not 0 as normal. */
+ paravirt_ops.name = "kvm-lite";
+ paravirt_ops.paravirt_enabled = 1;
+ paravirt_ops.kernel_rpl = 1;
+
+ paravirt_ops.save_fl = save_fl;
+ paravirt_ops.restore_fl = restore_fl;
+ paravirt_ops.irq_disable = irq_disable;
+ paravirt_ops.irq_enable = irq_enable;
+ paravirt_ops.cpuid = lite_cpuid;
+ paravirt_ops.iret = kvm_lite_iret;
+ paravirt_ops.load_esp0 = lite_load_esp0;
+ paravirt_ops.load_tls = lite_load_tls;
+ paravirt_ops.read_cr2 = lite_read_cr2;
+ paravirt_ops.safe_halt = lite_safe_halt;
+ paravirt_ops.write_gdt_entry = lite_write_gdt_entry;
+ paravirt_ops.write_idt_entry = lite_write_idt_entry;
+ paravirt_ops.memory_setup = lite_memory_setup;
+ paravirt_ops.init_IRQ = lite_init_IRQ;
+ paravirt_ops.get_wallclock = lite_get_wallclock;
+ paravirt_ops.time_init = lite_time_init;
+ paravirt_ops.patch = lite_patch;
+ paravirt_ops.set_pmd = lite_set_pmd;
+ paravirt_ops.set_pte_atomic = lite_set_pte_atomic;
+
+ /* FIXME: If the emulator handled ltr, we wouldn't need this. But we
+ * probably want to suppress it and simply write the tr value into the
+ * kvm_lite_para. */
+ paravirt_ops.load_tr_desc = paravirt_nop;
+ /* FIXME */
+ paravirt_ops.set_ldt = paravirt_nop;
+
+ /* Load the %fs segment register (the per-cpu segment register) with
+ * the normal data segment to get through booting. */
+ asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
+
+ /* The Host uses the top of the Guest's virtual address space for the
+ * Host<->Guest Switcher, and it tells us how much it needs in
+ * lite.reserve_mem, set up by the initial wrmsr. */
+ reserve_top_address(kvm_lite_data.reserve_mem);
+
+ /* If we don't initialize the lock dependency checker now, it crashes
+ * paravirt_disable_iospace. */
+ lockdep_init();
+
+ /* FIXME: If we want emulated devices, remove this. */
+ /* The IDE code spends about 3 seconds probing for disks: if we reserve
+ * all the I/O ports up front it can't get them and so doesn't probe.
+ * Other device drivers are similar (but less severe). This cuts the
+ * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. */
+ paravirt_disable_iospace();
+
+ /* This is messy CPU setup stuff which the native boot code does before
+ * start_kernel, so we have to do, too: */
+ cpu_detect(&new_cpu_data);
+ /* head.S usually sets up the first capability word, so do it here. */
+ new_cpu_data.x86_capability[0] = cpuid_edx(1);
+
+ /* Math is always hard! */
+ new_cpu_data.hard_math = 1;
+
+#ifdef CONFIG_X86_MCE
+ mce_disabled = 1;
+#endif
+#ifdef CONFIG_ACPI
+ acpi_disabled = 1;
+ acpi_ht = 0;
+#endif
+
+ /* We set the preferred console to "hvc" */
+ add_preferred_console("hvc", 0, NULL);
+
+ /* Now we're set up, call start_kernel() in init/main.c and we proceed
+ * to boot as normal. It never returns. */
+ start_kernel();
+}
diff -r 039995825488 drivers/kvm/lite_guest_asm.S
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/drivers/kvm/lite_guest_asm.S Wed Sep 05 11:46:48 2007 +1000
@@ -0,0 +1,92 @@
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/processor-flags.h>
+
+LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)
+
+#if PTRS_PER_PMD > 1
+PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD
+#else
+PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD)
+#endif
+BOOTBITMAP_SIZE = LOW_PAGES / 8
+ALLOCATOR_SLOP = 4
+
+INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm
+
+/* This is where we begin: head.S notes that the boot header's platform
+ * type field is "2" (kvm-lite), so calls us here. The boot header is in %esi.
+ *
+ * WARNING: be very careful here! We're running at addresses equal to physical
+ * addresses (around 0), not above PAGE_OFFSET as most code expects
+ * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any
+ * data.
+ *
+ * The .section line puts this code in .init.text so it will be discarded after
+ * boot. */
+.section .init.text, "ax", @progbits
+ENTRY(kvm_lite_init_asm)
+ /* Clear BSS first so that there are no surprises... */
+ xorl %eax,%eax
+ movl $__bss_start - __PAGE_OFFSET,%edi
+ movl $__bss_stop - __PAGE_OFFSET,%ecx
+ subl %edi,%ecx
+ shrl $2,%ecx
+ rep ; stosl
+
+ /* Set up swapper_pg_dir page tables as per head.S */
+page_pde_offset = (__PAGE_OFFSET >> 20);
+
+ movl $(pg0 - __PAGE_OFFSET), %edi
+ movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
+ movl $0x007, %eax /* 0x007 = PRESENT+RW+USER */
+10:
+ leal 0x007(%edi),%ecx /* Create PDE entry */
+ movl %ecx,(%edx) /* Store identity PDE entry */
+ movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
+ addl $4,%edx
+ movl $1024, %ecx
+11:
+ stosl
+ addl $0x1000,%eax
+ loop 11b
+ /* End condition: we must map up to and including INIT_MAP_BEYOND_END */
+ /* bytes beyond the end of our own page tables; the +0x007 is the attribute bits */
+ leal (INIT_MAP_BEYOND_END+0x007)(%edi),%ebp
+ cmpl %ebp,%eax
+ jb 10b
+ movl %edi,(init_pg_tables_end - __PAGE_OFFSET)
+
+ movl $swapper_pg_dir-__PAGE_OFFSET,%eax
+ movl %eax,%cr3 /* set the page table pointer.. */
+
+ /* Set up the initial stack so we can run C code. */
+ movl $(init_thread_union+THREAD_SIZE),%esp
+
+ /* Set up boot information pointer to hand to lite_init(): it wants
+ * a virtual address. */
+ movl %esi, %eax
+ addl $__PAGE_OFFSET, %eax
+
+ pushl $0 /* fake return address for unwinder */
+
+ /* Jumps are relative, and we're running __PAGE_OFFSET too low at the
+ * moment. */
+ jmp lite_init+__PAGE_OFFSET
+
+/* FIXME: tell host not to interrupt us between lite_noirq_start & end? */
+.text
+ENTRY(kvm_lite_iret)
+ pushl %eax
+ movl 12(%esp), %eax
+lite_noirq_start:
+ /* Note the %ss: segment prefix here. Normal data accesses use the
+ * "ds" segment, but that will have already been restored for whatever
+ * we're returning to (such as userspace): we can't trust it. The %ss:
+ * prefix makes sure we use the stack segment, which is still valid. */
+ movl %eax,%ss:kvm_lite_data+LITE_DATA_irq_enabled
+ popl %eax
+ iret
+lite_noirq_end:
+
diff -r 039995825488 drivers/kvm/lite_switcher.S
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/drivers/kvm/lite_switcher.S Fri Aug 31 15:42:48 2007 +1000
@@ -0,0 +1,309 @@
+// Not all kernel headers work from assembler
+// But these ones are needed: the ENTRY() define
+// And constants extracted from struct offsets
+// To avoid magic numbers and breakage:
+// Should they change the compiler can't save us
+// Down here in the depths of assembler code.
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+#include "lite.h"
+
+// We mark the start of the code to copy
+// It's placed in .text tho it's never run here
+// You'll see the trick macro at the end
+// Which interleaves data and text to effect.
+.text
+ENTRY(start_switcher_text)
+
+// When we reach switch_to_guest we have just left
+// The safe and comforting shores of C code
+// %eax has the "struct lite_pages" that we need
+// Where we save state and still see it from the Guest
+// And %ebx holds the Guest shadow pagetable:
+// Once set we have truly left Host behind.
+ENTRY(switch_to_guest)
+ // We told gcc all its regs could fade,
+ // Clobbered by our journey into the Guest
+ // We could have saved them, if we tried
+ // But time is our master and cycles count.
+
+ // Segment registers must be saved for the Host
+ // We push them on the Host stack for later
+ pushl %es
+ pushl %ds
+ pushl %gs
+ pushl %fs
+ // But the compiler is fickle, and heeds
+ // No warning of %ebp clobbers
+ // When frame pointers are used. That register
+ // Must be saved and restored or chaos strikes.
+ pushl %ebp
+ // The Host's stack is done, now save it away
+ // In our "struct lite_pages" at the offset
+ // Distilled into asm-offsets.h
+ movl %esp, LITE_PAGES_host_sp(%eax)
+
+ // All saved and there's now five steps before us:
+ // Stack, GDT, IDT, TSS
+ // And last of all the page tables are flipped.
+
+ // Yet beware that our stack pointer must be
+ // Always valid lest an NMI hits
+ // %edx does the duty here as we juggle
+ // %eax is lite_pages and our stack lies within.
+ movl %eax, %edx
+ addl $LITE_PAGES_regs, %edx
+ movl %edx, %esp
+
+ // The Guest's GDT we so carefully
+ // Placed in the "struct lite_pages" earlier
+ lgdt LITE_PAGES_guest_gdt_desc(%eax)
+
+ // The Guest's IDT we did partially
+ // Copy to the "struct lite_pages" as well.
+ lidt LITE_PAGES_guest_idt_desc(%eax)
+
+ // The TSS entry which controls traps
+ // Must be loaded up with "ltr" now:
+ // For after we switch over our page tables
+ // It (as the rest) will be writable no more.
+ // (The GDT entry TSS needs
+ // Changes type when we load it: damn Intel!)
+ movl $(GDT_ENTRY_TSS*8), %edx
+ ltr %dx
+
+ // Look back now, before we take this last step!
+ // The Host's TSS entry was also marked used;
+ // Let's clear it again, ere we return.
+ // The GDT descriptor of the Host
+ // Points to the table after two "size" bytes
+ movl (LITE_PAGES_host_gdt_desc+2)(%eax), %edx
+ // Clear the type field of "used" (byte 5, bit 2)
+ andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx)
+
+ // Once our page table's switched, the Guest is live!
+ // The Host fades as we run this final step.
+ // Our "struct lite_pages" is now half read-only.
+ movl %ebx, %cr3
+
+ // The Host did put our registers in "regs"
+ // Which waits deep within the "struct lite_pages"
+ // We can simply pop off all Guest regs.
+ popl %ebx
+ popl %ecx
+ popl %edx
+ popl %esi
+ popl %edi
+ popl %ebp
+ popl %gs
+ popl %eax
+ popl %fs
+ popl %ds
+ popl %es
+
+ // Near the base of the stack lurk two strange fields
+ // Which we fill as we exit the Guest
+ // These are the trap number and its error
+ // We can simply step past them on our way.
+ addl $8, %esp
+
+ // The last five stack slots hold return address
+ // And everything needed to change privilege
+ // Into the Guest privilege level of 1,
+ // And the stack where the Guest had last left it.
+ // Interrupts are turned back on: we are Guest.
+ iret
+
+// There are two paths where we switch to the Host
+// So we put the routine in a macro.
+// We are on our way home, back to the Host
+// Interrupted out of the Guest, we come here.
+#define SWITCH_TO_HOST \
+ /* We save the Guest state: all registers first \
+ * Laid out just as "struct lite_regs" has defined */ \
+ pushl %es; \
+ pushl %ds; \
+ pushl %fs; \
+ pushl %eax; \
+ pushl %gs; \
+ pushl %ebp; \
+ pushl %edi; \
+ pushl %esi; \
+ pushl %edx; \
+ pushl %ecx; \
+ pushl %ebx; \
+ /* Our stack and our code are using segments \
+ * Set in the TSS and IDT \
+ * Yet if we were to touch data we'd use \
+ * Whatever data segment the Guest had. \
+ * Load the lite ds segment to begin. */ \
+ movl $(LITE_DS), %eax; \
+ movl %eax, %ds; \
+ /* So where are we? Which CPU, which struct? \
+ * The stack is our clue: our TSS starts \
+ * It at the end of the "struct lite_pages". \
+ * Or we may have stumbled while restoring \
+ * Our Guest segment regs while in switch_to_guest, \
+ * The fault pushed atop that part-unwound stack. \
+ * If we round the stack down to the page start \
+ * To find the start of our "struct lite_pages". */ \
+ movl %esp, %eax; \
+ andl $(~(1 << PAGE_SHIFT - 1)), %eax; \
+ /* Save our trap number: the switch will obscure it \
+ * (The Guest regs are not mapped here in the Host) \
+ * %ebx holds it safe for deliver_to_host */ \
+ movl LITE_PAGES_regs_trapnum(%eax), %ebx; \
+ /* The Host GDT, IDT and stack! \
+ * All these lie safely hidden from the Guest: \
+ * We must return to the Host page tables \
+ * (Hence that was saved in our struct lite_pages) */ \
+ movl LITE_PAGES_host_cr3(%eax), %edx; \
+ movl %edx, %cr3; \
+ /* As before, when we looked back at the Host \
+ * As we left and marked TSS unused \
+ * So must we now for the Guest left behind. */ \
+ andb $0xFD, (LITE_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \
+ /* Switch to Host's GDT, IDT. */ \
+ lgdt LITE_PAGES_host_gdt_desc(%eax); \
+ lidt LITE_PAGES_host_idt_desc(%eax); \
+ /* Restore the Host's stack where it's saved regs lie */ \
+ movl LITE_PAGES_host_sp(%eax), %esp; \
+ /* Last the TSS: our Host is complete */ \
+ movl $(GDT_ENTRY_TSS*8), %edx; \
+ ltr %dx; \
+ /* Restore now the regs saved right at the first. */ \
+ popl %ebp; \
+ popl %fs; \
+ popl %gs; \
+ popl %ds; \
+ popl %es
+
+// Here's where we come when the Guest has just trapped:
+// (Which trap we'll see has been pushed on the stack).
+// We need only switch back, and the Host will decode
+// Why we came home, and what needs to be done.
+return_to_host:
+ SWITCH_TO_HOST
+ iret
+
+// An interrupt, with some cause external
+// Has jerked us rudely from the Guest's code
+// Again we must return home to the Host
+deliver_to_host:
+ SWITCH_TO_HOST
+ // But now we must go home via that place
+ // Where that interrupt was supposed to go
+ // Had we not been ensconced, running the Guest.
+ // Here we see the cleverness of our stack:
+ // The Host stack is formed like an interrupt
+ // With EIP, CS and EFLAGS layered.
+ // Interrupt handlers end with "iret"
+ // And that will take us home at long long last.
+
+ // But first we must find the handler to call!
+ // The IDT descriptor for the Host
+ // Has two bytes for size, and four for address:
+ // %edx will hold it for us for now.
+ movl (LITE_PAGES_host_idt_desc+2)(%eax), %edx
+ // We now know the table address we need,
+ // And saved the trap's number inside %ebx.
+ // Yet the pointer to the handler is smeared
+ // Across the bits of the table entry.
+ // What oracle can tell us how to extract
+ // From such a convoluted encoding?
+ // I consulted gcc, and it gave
+ // These instructions, which I gladly credit:
+ leal (%edx,%ebx,8), %eax
+ movzwl (%eax),%edx
+ movl 4(%eax), %eax
+ xorw %ax, %ax
+ orl %eax, %edx
+ // Now the address of the handler's in %edx
+ // We call it now: its "iret" takes us home.
+ jmp *%edx
+
+// Every interrupt can come to us here
+// But we must truly tell each apart.
+// They number two hundred and fifty six
+// And each must land in a different spot,
+// Push its number on stack, and join the stream.
+
+// And worse, a mere six of the traps stand apart
+// And push on their stack an addition:
+// An error number, thirty two bits long
+// So we punish the other two fifty
+// And make them push a zero so they match.
+
+// Yet two fifty six entries is long
+// And all will look most the same as the last
+// So we create a macro which can make
+// As many entries as we need to fill.
+
+// Note the change to .data then .text:
+// We plant the address of each entry
+// Into a (data) table for the Host
+// To know where each Guest interrupt should go.
+.macro IRQ_STUB N TARGET
+ .data; .long 1f; .text; 1:
+ // Trap eight, ten through fourteen and seventeen
+ // Supply an error number. Else zero.
+ .if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17)
+ pushl $0
+ .endif
+ pushl $\N
+ jmp \TARGET
+ ALIGN
+.endm
+
+// This macro creates numerous entries
+// Using GAS macros which out-power C's.
+.macro IRQ_STUBS FIRST LAST TARGET
+ irq=\FIRST
+ .rept \LAST-\FIRST+1
+ IRQ_STUB irq \TARGET
+ irq=irq+1
+ .endr
+.endm
+
+// Here's the marker for our pointer table
+// Laid in the data section just before
+// Each macro places the address of code
+// Forming an array: each one points to text
+// Which handles interrupt in its turn.
+.data
+.global default_idt_entries
+default_idt_entries:
+.text
+ // The first two traps go straight back to the Host
+ IRQ_STUBS 0 1 return_to_host
+ // We'll say nothing, yet, about NMI
+ IRQ_STUB 2 handle_nmi
+ // Other traps also return to the Host
+ IRQ_STUBS 3 31 return_to_host
+ // All interrupts go via their handlers
+ IRQ_STUBS 32 127 deliver_to_host
+ // 'Cept system calls coming from userspace
+ // Are to go to the Guest, never the Host.
+ IRQ_STUB 128 return_to_host
+ IRQ_STUBS 129 255 deliver_to_host
+
+// The NMI, what a fabulous beast
+// Which swoops in and stops us no matter that
+// We're suspended between heaven and hell,
+// (Or more likely between the Host and Guest)
+// When in it comes! We are dazed and confused
+// So we do the simplest thing which one can.
+// Though we've pushed the trap number and zero
+// We discard them, return, and hope we live.
+handle_nmi:
+ addl $8, %esp
+ iret
+
+// We are done; all that's left is Mastery
+// And "make Mastery" is a journey long
+// Designed to make your fingers itch to code.
+
+// Here ends the text, the file and poem.
+ENTRY(end_switcher_text)
diff -r 039995825488 drivers/kvm/mmu.c
--- a/drivers/kvm/mmu.c Fri Aug 31 15:38:42 2007 +1000
+++ b/drivers/kvm/mmu.c Fri Aug 31 15:42:48 2007 +1000
@@ -26,6 +26,7 @@
#include <linux/highmem.h>
#include <linux/module.h>
+#include <asm/io.h>
#include <asm/page.h>
#include <asm/cmpxchg.h>
@@ -132,6 +133,7 @@ static int dbg = 1;
#define PT32_DIR_BASE_ADDR_MASK \
(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
+#define SHADOW_NAILED_RESERVE ((unsigned long)-(PT64_ENT_PER_PAGE * PAGE_SIZE))
#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK (1U << 1)
@@ -1120,6 +1122,131 @@ static void mmu_pte_write_new_pte(struct
else
paging64_update_pte(vcpu, page, spte, new, bytes);
}
+
+/* FIXME: SMP guests cannot share toplevels with this. */
+struct nailed_mappings
+{
+ /* We might need to fake up numerous levels. */
+#ifdef CONFIG_X86_64
+ struct page *ptes[4];
+#else
+ struct page *ptes[2];
+#endif
+};
+static DEFINE_PER_CPU(struct nailed_mappings, nailed_mappings);
+
+static u64 nailed_pte_of(struct page *page)
+{
+ return ((u64)page_to_pfn(page) << PAGE_SHIFT)
+ | PT_PRESENT_MASK | PT_WRITABLE_MASK
+ | PT_ACCESSED_MASK | PT_DIRTY_MASK;
+}
+
+/* Nailed mappings are 4k page mappings at -2M */
+int kvm_mmu_init_nailed_mapping(int cpu, struct page *page[], unsigned int num)
+{
+ int i;
+ u64 *ptepage, pte;
+ struct nailed_mappings *nm = &per_cpu(nailed_mappings, cpu);
+
+ printk("SHADOW_NAILED_RESERVE = %#lx\n", SHADOW_NAILED_RESERVE);
+
+ printk("Initializing %u nailed mappings for cpu %i\n", num, cpu);
+ for (i = 0; i < ARRAY_SIZE(nm->ptes); i++) {
+ nm->ptes[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
+ if (!nm->ptes[i])
+ goto free;
+ }
+
+ /* Chain last pte entries to point to our PTE page. */
+ for (i = 1; i < ARRAY_SIZE(nm->ptes); i++) {
+ ptepage = page_address(nm->ptes[i]);
+ ptepage[PT64_ENT_PER_PAGE - 1] = nailed_pte_of(nm->ptes[i-1]);
+ }
+
+ /* Bottom page contains pages as given by args. */
+ ptepage = page_address(nm->ptes[0]);
+ for (i = 0; i < num; i++) {
+ if (!page[i])
+ continue;
+ pte = (u64)page_to_pfn(page[i]) << PAGE_SHIFT;
+ pte |= (PT_PRESENT_MASK|PT_ACCESSED_MASK|PT_DIRTY_MASK);
+ if (page[i]->private & 1)
+ pte |= PT_WRITABLE_MASK;
+ printk("%p[%i] = %llx\n", ptepage, i, pte);
+ ptepage[i] = pte;
+ }
+ return 0;
+
+free:
+ while (--i >= 0)
+ __free_page(nm->ptes[i]);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_init_nailed_mapping);
+
+void kvm_mmu_free_nailed_mapping(int cpu)
+{
+ unsigned int i;
+ struct nailed_mappings *nm = &per_cpu(nailed_mappings, cpu);
+
+ for (i = 0; i < ARRAY_SIZE(nm->ptes); i++)
+ __free_page(nm->ptes[i]);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_free_nailed_mapping);
+
+/* Splice this CPU's nailed-mapping page chain into the vcpu's shadow page
+ * tables at the topmost non-present last-slot entry, and mark the mmu as
+ * nailed (which makes the guest-walker in paging_tmpl.h refuse guest access
+ * to the SHADOW_NAILED_RESERVE area).
+ *
+ * NOTE(review): if every level's last entry is already present, the loop
+ * runs to completion with level == PT_PAGE_TABLE_LEVEL and the final store
+ * indexes nm->ptes[level-2] == nm->ptes[-1] -- out of bounds.  Verify the
+ * top slot can never be present here, or guard the loop exit. */
+void kvm_use_nailed_mappings(struct kvm_vcpu *vcpu)
+{
+ struct nailed_mappings *nm = &__get_cpu_var(nailed_mappings);
+ hpa_t shadow_page = vcpu->mmu.root_hpa;
+ u64 *shadow_ent = NULL;
+ int level;
+
+ vcpu->mmu.nailed = 1;
+
+ /* PAE has a 4-entry top level; descend into the top-quadrant root. */
+ level = vcpu->mmu.shadow_root_level;
+ if (level == PT32E_ROOT_LEVEL) {
+ shadow_page = vcpu->mmu.pae_root[3] & PT64_BASE_ADDR_MASK;
+ level--;
+ }
+
+ /* Walk down the last slot of each level until one is empty. */
+ for (; level > PT_PAGE_TABLE_LEVEL; level--) {
+ shadow_ent = ((u64 *)__va(shadow_page)) + PT64_ENT_PER_PAGE-1;
+ if (!is_present_pte(*shadow_ent))
+ break;
+ shadow_page = *shadow_ent & PT64_BASE_ADDR_MASK;
+ }
+ *shadow_ent = nailed_pte_of(nm->ptes[level-2]);
+}
+EXPORT_SYMBOL_GPL(kvm_use_nailed_mappings);
+
+/* FIXME: We should be able to handle this in the rest of the code, and only
+ * remove when needed. */
+void kvm_remove_nailed_mappings(struct kvm_vcpu *vcpu)
+{
+ struct nailed_mappings *nm = &__get_cpu_var(nailed_mappings);
+ hpa_t shadow_page = vcpu->mmu.root_hpa;
+ u64 *shadow_ent = NULL;
+ int level;
+
+ vcpu->mmu.nailed = 1;
+
+ level = vcpu->mmu.shadow_root_level;
+ if (level == PT32E_ROOT_LEVEL) {
+ shadow_page = vcpu->mmu.pae_root[3] & PT64_BASE_ADDR_MASK;
+ level--;
+ }
+
+ for (; level > PT_PAGE_TABLE_LEVEL; level--) {
+ shadow_ent = ((u64 *)__va(shadow_page)) + PT64_ENT_PER_PAGE-1;
+ if (*shadow_ent == nailed_pte_of(nm->ptes[level-2]))
+ break;
+ BUG_ON(!is_present_pte(*shadow_ent));
+ shadow_page = *shadow_ent & PT64_BASE_ADDR_MASK;
+ }
+ *shadow_ent = 0;
+}
+EXPORT_SYMBOL_GPL(kvm_remove_nailed_mappings);
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
const u8 *new, int bytes)
diff -r 039995825488 drivers/kvm/paging_tmpl.h
--- a/drivers/kvm/paging_tmpl.h Fri Aug 31 15:38:42 2007 +1000
+++ b/drivers/kvm/paging_tmpl.h Fri Aug 31 15:42:48 2007 +1000
@@ -86,6 +86,11 @@ static int FNAME(walk_addr)(struct guest
walker->page = NULL;
walker->ptep = NULL;
root = vcpu->cr3;
+
+ /* Don't let them do anything with the nailed area. */
+ if (vcpu->mmu.nailed && addr >= SHADOW_NAILED_RESERVE)
+ goto not_present;
+
#if PTTYPE == 64
if (!is_long_mode(vcpu)) {
walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3];
diff -r 039995825488 drivers/kvm/x86_emulate.c
--- a/drivers/kvm/x86_emulate.c Fri Aug 31 15:38:42 2007 +1000
+++ b/drivers/kvm/x86_emulate.c Fri Aug 31 15:42:48 2007 +1000
@@ -1451,7 +1451,9 @@ twobyte_special_insn:
rc = kvm_set_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], msr_data);
if (rc) {
kvm_arch_ops->inject_gp(ctxt->vcpu, 0);
- _eip = ctxt->vcpu->rip;
+ /* Don't restore regs (inject_gp can change them) */
+ rc = X86EMUL_CONTINUE;
+ goto done;
}
rc = X86EMUL_CONTINUE;
break;
@@ -1460,7 +1462,9 @@ twobyte_special_insn:
rc = kvm_get_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], &msr_data);
if (rc) {
kvm_arch_ops->inject_gp(ctxt->vcpu, 0);
- _eip = ctxt->vcpu->rip;
+ /* Don't restore regs (inject_gp can change them) */
+ rc = X86EMUL_CONTINUE;
+ goto done;
} else {
_regs[VCPU_REGS_RAX] = (u32)msr_data;
_regs[VCPU_REGS_RDX] = msr_data >> 32;
diff -r 039995825488 include/linux/cpumask.h
--- a/include/linux/cpumask.h Fri Aug 31 15:38:42 2007 +1000
+++ b/include/linux/cpumask.h Fri Aug 31 15:42:48 2007 +1000
@@ -218,8 +218,8 @@ int __next_cpu(int n, const cpumask_t *s
int __next_cpu(int n, const cpumask_t *srcp);
#define next_cpu(n, src) __next_cpu((n), &(src))
#else
-#define first_cpu(src) 0
-#define next_cpu(n, src) 1
+#define first_cpu(src) ((src).bits[0]&1?0:NR_CPUS)
+#define next_cpu(n, src) NR_CPUS
#endif
#define cpumask_of_cpu(cpu) \
diff -r 039995825488 include/linux/kvm_lite.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/include/linux/kvm_lite.h Sun Sep 02 10:30:28 2007 +1000
@@ -0,0 +1,40 @@
+#ifndef __LINUX_KVM_LITE_H
+#define __LINUX_KVM_LITE_H
+#include <linux/time.h>
+
+/* Temporary hypercall numbers (the cover letter calls them "deliberately
+ * sucky").  Duplicated in include/linux/kvm_para.h -- keep in sync, or
+ * better, define them in one place only. */
+#define KVM_HCALL_LOAD_IDT_ENTRY 100
+#define KVM_HCALL_LOAD_GDT_ENTRY 101
+#define KVM_HCALL_SET_STACK 102
+#define KVM_HCALL_HALT 103
+#define KVM_HCALL_HACK_WRITE 104
+#define KVM_HCALL_SET_CLOCKEVENT 105
+
+/* Data shared between the Host and a kvm-lite Guest. */
+struct kvm_lite_data
+{
+ /* 512 == enabled (same as eflags in normal hardware). The Guest
+ * changes interrupts so often that a hypercall is too slow. */
+ unsigned int irq_enabled;
+
+ /* The Host writes the virtual address of the last page fault here,
+ * which saves the Guest a hypercall. CR2 is the native register where
+ * this address would normally be found. */
+ unsigned long cr2;
+
+ /* NOTE(review): debugging flag -- purpose not shown in this patch;
+ * TODO document or remove before merging. */
+ int trace;
+
+/* Fields initialized by the Host at first hypercall: */
+ /* Memory not to try to access */
+ unsigned long reserve_mem;
+ /* Current time. */
+ struct timespec time;
+
+/* Fields initialized by the Guest at boot: */
+};
+#endif /* __LINUX_KVM_LITE_H */
diff -r 039995825488 include/linux/kvm_para.h
--- a/include/linux/kvm_para.h Fri Aug 31 15:38:42 2007 +1000
+++ b/include/linux/kvm_para.h Thu Sep 06 01:30:36 2007 +1000
@@ -90,4 +90,11 @@ static inline int kvm_para_has_feature(u
#define KVM_ENOSYS ENOSYS
+#define KVM_HCALL_LOAD_IDT_ENTRY 100
+#define KVM_HCALL_LOAD_GDT_ENTRY 101
+#define KVM_HCALL_SET_STACK 102
+#define KVM_HCALL_HALT 103
+#define KVM_HCALL_HACK_WRITE 104
+#define KVM_HCALL_SET_CLOCKEVENT 105
+
#endif
-------------------------------------------------------------------------
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems? Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
^ permalink raw reply [flat|nested] 18+ messages in thread