From: Rusty Russell
Subject: [PATCH 6/10] lguest code: the little linux hypervisor.
Date: Fri, 09 Feb 2007 20:20:27 +1100
Message-ID: <1171012827.2718.42.camel@localhost.localdomain>
In-Reply-To: <1171012761.2718.40.camel@localhost.localdomain>
References: <1171012296.2718.26.camel@localhost.localdomain>
	 <1171012458.2718.30.camel@localhost.localdomain>
	 <1171012693.2718.37.camel@localhost.localdomain>
	 <1171012761.2718.40.camel@localhost.localdomain>
To: lkml - Kernel Mailing List
Cc: Andrew Morton, Andi Kleen, Stephen Rothwell, Paul Mackerras, virtualization
List-Id: virtualization@lists.linuxfoundation.org

This is the core of lguest: both the guest code (always compiled into
the image so it can boot under lguest), and the host code (lg.ko).

There is only one config prompt at the moment: lguest is currently
designed to run exactly the same guest and host kernels, so we can
frob the ABI freely.

Unfortunately, we don't have the build infrastructure for "private"
asm-offsets.h files, so there's a not-so-neat include in
arch/i386/kernel/asm-offsets.c.

Signed-off-by: Rusty Russell

===================================================================
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -226,6 +226,27 @@ config ES7000_CLUSTERED_APIC
 	depends on SMP && X86_ES7000 && MPENTIUMIII
 
 source "arch/i386/Kconfig.cpu"
+
+config LGUEST
+	tristate "Linux hypervisor example code"
+	depends on X86 && PARAVIRT && EXPERIMENTAL && !X86_PAE
+	select LGUEST_GUEST
+	select HVC_DRIVER
+	---help---
+	  This is a very simple module which allows you to run
+	  multiple instances of the same Linux kernel, using the
+	  "lguest" command found in the Documentation/lguest directory.
+	  Note that "lguest" is pronounced to rhyme with "fell quest",
+	  not "rustyvisor". See Documentation/lguest/lguest.txt.
+
+	  If unsure, say N. If curious, say M. If masochistic, say Y.
+
+config LGUEST_GUEST
+	bool
+	help
+	  The guest needs code built-in, even if the host has lguest
+	  support as a module. The drivers are tiny, so we build them
+	  in too.
 
 config HPET_TIMER
 	bool "HPET Timer Support"
===================================================================
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -108,6 +108,7 @@ drivers-$(CONFIG_PCI) += arch/i386/pci
 # must be linked after kernel/
 drivers-$(CONFIG_OPROFILE) += arch/i386/oprofile/
 drivers-$(CONFIG_PM) += arch/i386/power/
+drivers-$(CONFIG_LGUEST_GUEST) += arch/i386/lguest/
 
 CFLAGS += $(mflags-y)
 AFLAGS += $(mflags-y)
===================================================================
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -16,6 +16,10 @@
 #include
 #include
 #include
+#ifdef CONFIG_LGUEST_GUEST
+#include
+#include "../lguest/lg.h"
+#endif
 
 #define DEFINE(sym, val) \
 	asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -111,4 +115,19 @@ void foo(void)
 	OFFSET(PARAVIRT_iret, paravirt_ops, iret);
 	OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
 #endif
+
+#ifdef CONFIG_LGUEST_GUEST
+	BLANK();
+	OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
+	OFFSET(LGUEST_STATE_host_stackptr, lguest_state, host.stackptr);
+	OFFSET(LGUEST_STATE_host_pgdir, lguest_state, host.pgdir);
+	OFFSET(LGUEST_STATE_host_gdt, lguest_state, host.gdt);
+	OFFSET(LGUEST_STATE_host_idt, lguest_state, host.idt);
+	OFFSET(LGUEST_STATE_regs, lguest_state, regs);
+	OFFSET(LGUEST_STATE_gdt, lguest_state, gdt);
+	OFFSET(LGUEST_STATE_idt, lguest_state, idt);
+	OFFSET(LGUEST_STATE_gdt_table, lguest_state, gdt_table);
+	OFFSET(LGUEST_STATE_trapnum, lguest_state, regs.trapnum);
+	OFFSET(LGUEST_STATE_errcode, lguest_state, regs.errcode);
+#endif
 }
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/Makefile
@@ -0,0 +1,22 @@
+# Guest requires the paravirt_ops replacement and the bus driver.
+obj-$(CONFIG_LGUEST_GUEST) += lguest.o lguest_bus.o
+
+# Host requires the other files, which can be a module.
+obj-$(CONFIG_LGUEST) += lg.o
+lg-objs := core.o hypercalls.o page_tables.o interrupts_and_traps.o \
+	segments.o io.o lguest_user.o
+
+# We use the top 4MB for the guest traps page, then the hypervisor.
+HYPE_ADDR := (0xFFC00000+4096)
+# The data is only 1k (256 interrupt handler pointers)
+HYPE_DATA_SIZE := 1024
+CFLAGS += -DHYPE_ADDR="$(HYPE_ADDR)" -DHYPE_DATA_SIZE="$(HYPE_DATA_SIZE)"
+
+$(obj)/core.o: $(obj)/hypervisor-blob.c
+# This links the hypervisor in the right place and turns it into a C array.
+$(obj)/hypervisor-raw: $(obj)/hypervisor.o
+	@$(LD) -static -Tdata=`printf %#x $$(($(HYPE_ADDR)))` -Ttext=`printf %#x $$(($(HYPE_ADDR)+$(HYPE_DATA_SIZE)))` -o $@ $< && $(OBJCOPY) -O binary $@
+$(obj)/hypervisor-blob.c: $(obj)/hypervisor-raw
+	@od -tx1 -An -v $< | sed -e 's/^ /0x/' -e 's/$$/,/' -e 's/ /,0x/g' > $@
+
+clean-files := hypervisor-blob.c hypervisor-raw
===================================================================
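[Aside for readers following the Makefile: od -tx1 dumps the raw switcher
image as hex bytes and the sed pipeline rewrites them into comma-separated
"0x..," tokens, so the generated hypervisor-blob.c can be #included inside
an array initializer (as core.c does below). A standalone sketch of the
same trick, with invented bytes rather than the real blob:

	#include <stdio.h>

	/* What an od|sed-generated blob file boils down to: the
	 * included text is just a valid C initializer list. */
	static const unsigned char blob[] = {
		0x55, 0x89, 0xe5,	/* stand-in bytes */
	};

	int main(void)
	{
		printf("blob is %zu bytes; first byte %#x\n",
		       sizeof(blob), blob[0]);
		return 0;
	}
]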
--- /dev/null
+++ b/arch/i386/lguest/core.c
@@ -0,0 +1,425 @@
+/* World's simplest hypervisor, to test paravirt_ops and show
+ * unbelievers that virtualization is the future.  Plus, it's fun! */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "lg.h"
+
+/* This is our hypervisor, compiled from hypervisor.S. */
+static char __initdata hypervisor_blob[] = {
+#include "hypervisor-blob.c"
+};
+
+#define MAX_LGUEST_GUESTS \
+	((HYPERVISOR_SIZE-sizeof(hypervisor_blob))/sizeof(struct lguest_state))
+
+static struct vm_struct *hypervisor_vma;
+static int cpu_had_pge;
+static struct {
+	unsigned long offset;
+	unsigned short segment;
+} lguest_entry;
+struct page *hype_pages; /* Contiguous pages. */
+struct lguest lguests[MAX_LGUEST_GUESTS];
+DECLARE_MUTEX(lguest_lock);
+
+/* IDT entries are at start of hypervisor. */
+const unsigned long *__lguest_default_idt_entries(void)
+{
+	return (void *)HYPE_ADDR;
+}
+
+/* Next is switch_to_guest */
+static void *__lguest_switch_to_guest(void)
+{
+	return (void *)HYPE_ADDR + HYPE_DATA_SIZE;
+}
+
+/* Then we use everything else to hold guest state. */
+struct lguest_state *__lguest_states(void)
+{
+	return (void *)HYPE_ADDR + sizeof(hypervisor_blob);
+}
+
+static __init int map_hypervisor(void)
+{
+	unsigned int i;
+	int err;
+	struct page *pages[HYPERVISOR_PAGES], **pagep = pages;
+
+	hype_pages = alloc_pages(GFP_KERNEL|__GFP_ZERO,
+				 get_order(HYPERVISOR_SIZE));
+	if (!hype_pages)
+		return -ENOMEM;
+
+	hypervisor_vma = __get_vm_area(HYPERVISOR_SIZE, VM_ALLOC,
+				       HYPE_ADDR, VMALLOC_END);
+	if (!hypervisor_vma) {
+		err = -ENOMEM;
+		printk("lguest: could not map hypervisor pages high\n");
+		goto free_pages;
+	}
+
+	for (i = 0; i < HYPERVISOR_PAGES; i++)
+		pages[i] = hype_pages + i;
+
+	err = map_vm_area(hypervisor_vma, PAGE_KERNEL, &pagep);
+	if (err) {
+		printk("lguest: map_vm_area failed: %i\n", err);
+		goto free_vma;
+	}
+	memcpy(hypervisor_vma->addr, hypervisor_blob, sizeof(hypervisor_blob));
+
+	/* Setup LGUEST segments on all cpus */
+	for_each_possible_cpu(i) {
+		get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
+		get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
+	}
+
+	/* Initialize entry point into hypervisor. */
+	lguest_entry.offset = (long)__lguest_switch_to_guest();
+	lguest_entry.segment = LGUEST_CS;
+
+	printk("lguest: mapped hypervisor at %p\n", hypervisor_vma->addr);
+	return 0;
+
+free_vma:
+	vunmap(hypervisor_vma->addr);
+free_pages:
+	__free_pages(hype_pages, get_order(HYPERVISOR_SIZE));
+	return err;
+}
+
+static __exit void unmap_hypervisor(void)
+{
+	vunmap(hypervisor_vma->addr);
+	__free_pages(hype_pages, get_order(HYPERVISOR_SIZE));
+}
+
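[Aside: the three lookup helpers above encode the fixed layout of the
high-mapped region: 1k of default IDT entry pointers first, the
switch_to_guest text after that, and lguest_state slots filling whatever
the blob leaves free. A runnable sketch of the same arithmetic, using
the constants from the Makefile and lg.h; the blob and state sizes here
are invented stand-ins:

	#include <stdio.h>

	#define HYPE_ADDR       (0xFFC00000UL + 4096)	/* from the Makefile */
	#define HYPE_DATA_SIZE  1024UL
	#define HYPERVISOR_SIZE 65536UL			/* from lg.h */

	int main(void)
	{
		/* Stand-ins for sizeof(hypervisor_blob) and
		 * sizeof(struct lguest_state). */
		unsigned long blob_size = 5000, state_size = 700;

		printf("idt entry table at %#lx\n", HYPE_ADDR);
		printf("switch_to_guest at %#lx\n", HYPE_ADDR + HYPE_DATA_SIZE);
		printf("guest states at    %#lx\n", HYPE_ADDR + blob_size);
		printf("MAX_LGUEST_GUESTS  %lu\n",
		       (HYPERVISOR_SIZE - blob_size) / state_size);
		return 0;
	}
]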
+/* IN/OUT insns: enough to get us past boot-time probing. */
+static int emulate_insn(struct lguest *lg)
+{
+	u8 insn;
+	unsigned int insnlen = 0, in = 0, shift = 0;
+	unsigned long physaddr = guest_pa(lg, lg->state->regs.eip);
+
+	/* This only works for addresses in linear mapping... */
+	if (lg->state->regs.eip < lg->page_offset)
+		return 0;
+	lhread(lg, &insn, physaddr, 1);
+
+	/* Operand size prefix means it's actually for ax. */
+	if (insn == 0x66) {
+		shift = 16;
+		insnlen = 1;
+		lhread(lg, &insn, physaddr + insnlen, 1);
+	}
+
+	switch (insn & 0xFE) {
+	case 0xE4: /* in <next byte>,%al */
+		insnlen += 2;
+		in = 1;
+		break;
+	case 0xEC: /* in (%dx),%al */
+		insnlen += 1;
+		in = 1;
+		break;
+	case 0xE6: /* out %al,<next byte> */
+		insnlen += 2;
+		break;
+	case 0xEE: /* out %al,(%dx) */
+		insnlen += 1;
+		break;
+	default:
+		return 0;
+	}
+
+	if (in) {
+		/* Lower bit tells us whether it's a 16 or 32 bit access */
+		if (insn & 0x1)
+			lg->state->regs.eax = 0xFFFFFFFF;
+		else
+			lg->state->regs.eax |= (0xFFFF << shift);
+	}
+	lg->state->regs.eip += insnlen;
+	return 1;
+}
+
+int find_free_guest(void)
+{
+	unsigned int i;
+	for (i = 0; i < MAX_LGUEST_GUESTS; i++)
+		if (!lguests[i].state)
+			return i;
+	return -1;
+}
+
+int lguest_address_ok(const struct lguest *lg, unsigned long addr)
+{
+	return addr / PAGE_SIZE < lg->pfn_limit;
+}
+
+/* Just like get_user, but don't let guest access lguest binary. */
+u32 lhread_u32(struct lguest *lg, u32 addr)
+{
+	u32 val = 0;
+
+	/* Don't let them access the lguest binary. */
+	if (!lguest_address_ok(lg, addr)
+	    || get_user(val, (u32 __user *)addr) != 0)
+		kill_guest(lg, "bad read address %u", addr);
+	return val;
+}
+
+void lhwrite_u32(struct lguest *lg, u32 addr, u32 val)
+{
+	if (!lguest_address_ok(lg, addr)
+	    || put_user(val, (u32 __user *)addr) != 0)
+		kill_guest(lg, "bad write address %u", addr);
+}
+
+void lhread(struct lguest *lg, void *b, u32 addr, unsigned bytes)
+{
+	if (addr + bytes < addr || !lguest_address_ok(lg, addr+bytes)
+	    || copy_from_user(b, (void __user *)addr, bytes) != 0) {
+		/* copy_from_user should do this, but as we rely on it... */
+		memset(b, 0, bytes);
+		kill_guest(lg, "bad read address %u len %u", addr, bytes);
+	}
+}
+
+void lhwrite(struct lguest *lg, u32 addr, const void *b, unsigned bytes)
+{
+	if (addr + bytes < addr
+	    || !lguest_address_ok(lg, addr+bytes)
+	    || copy_to_user((void __user *)addr, b, bytes) != 0)
+		kill_guest(lg, "bad write address %u len %u", addr, bytes);
+}
+
+/* Saves exporting idt_table from kernel */
+static struct desc_struct *get_idt_table(void)
+{
+	struct Xgt_desc_struct idt;
+
+	asm("sidt %0":"=m" (idt));
+	return (void *)idt.address;
+}
+
+extern asmlinkage void math_state_restore(void);
+
+static int usermode(struct lguest_regs *regs)
+{
+	return (regs->cs & SEGMENT_RPL_MASK) == USER_RPL;
+}
+
+/* Trap page resets this when it reloads gs. */
+static int new_gpf_eip(struct lguest *lg, struct lguest_regs *regs)
+{
+	u32 eip;
+	get_user(eip, &lg->lguest_data->gs_gpf_eip);
+	if (eip == regs->eip)
+		return 0;
+	put_user(regs->eip, &lg->lguest_data->gs_gpf_eip);
+	return 1;
+}
+
+static void set_ts(unsigned int guest_ts)
+{
+	u32 cr0;
+	if (guest_ts) {
+		asm("movl %%cr0,%0":"=r" (cr0));
+		if (!(cr0 & 8))		/* 8 == CR0.TS */
+			asm("movl %0,%%cr0": :"r" (cr0|8));
+	}
+}
+
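[Aside: the lhread()/lhwrite() helpers above all share one validation
pattern: reject integer wrap-around (addr + bytes < addr), reject
anything past the guest's pfn_limit, then fall through to an ordinary
user-copy. The same checks, as a standalone sketch with an invented
page limit:

	#include <stdio.h>
	#include <stdint.h>

	#define PAGE_SIZE 4096UL

	/* Stand-in for lg->pfn_limit. */
	static unsigned long pfn_limit = 16384;

	static int address_ok(unsigned long addr)
	{
		return addr / PAGE_SIZE < pfn_limit;	/* like lguest_address_ok() */
	}

	/* Mirrors the checks at the top of lhread()/lhwrite(). */
	static int range_ok(uint32_t addr, unsigned bytes)
	{
		if (addr + bytes < addr)	/* 32-bit wrap-around */
			return 0;
		return address_ok(addr + bytes);
	}

	int main(void)
	{
		printf("%d\n", range_ok(0x1000, 64));        /* 1: inside guest */
		printf("%d\n", range_ok(0xFFFFFFF0u, 0x20)); /* 0: wraps */
		return 0;
	}
]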
+/* Put eflags on stack, lcall does rest. */
+static void run_guest_once(struct lguest *lg)
+{
+	unsigned int clobber;
+
+	asm volatile("pushf; lcall *lguest_entry"
+		     : "=a"(clobber), "=d"(clobber)
+		     : "0"(lg->state), "1"(get_idt_table())
+		     : "memory");
+}
+
+int run_guest(struct lguest *lg, char __user *user)
+{
+	struct lguest_regs *regs = &lg->state->regs;
+
+	while (!lg->dead) {
+		unsigned int cr2 = 0; /* Damn gcc */
+
+		/* Hypercalls first: we might have been out to userspace */
+		if (do_async_hcalls(lg))
+			goto pending_dma;
+
+		if (regs->trapnum == LGUEST_TRAP_ENTRY) {
+			/* Only do hypercall once. */
+			regs->trapnum = 255;
+			if (hypercall(lg, regs))
+				goto pending_dma;
+		}
+
+		if (signal_pending(current))
+			return -EINTR;
+		maybe_do_interrupt(lg);
+
+		if (lg->dead)
+			break;
+
+		if (lg->halted) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(1);
+			continue;
+		}
+
+		/* Restore limits on TLS segments if in user mode. */
+		if (usermode(regs)) {
+			unsigned int i;
+			for (i = 0; i < ARRAY_SIZE(lg->tls_limits); i++)
+				lg->state->gdt_table[GDT_ENTRY_TLS_MIN+i].a
+					|= lg->tls_limits[i];
+		}
+
+		local_irq_disable();
+		map_trap_page(lg);
+
+		/* Host state to be restored after the guest returns. */
+		asm("sidt %0":"=m"(lg->state->host.idt));
+		lg->state->host.gdt = __get_cpu_var(cpu_gdt_descr);
+
+		/* Even if *we* don't want FPU trap, guest might... */
+		set_ts(lg->ts);
+
+		run_guest_once(lg);
+
+		/* Save cr2 now if we page-faulted. */
+		if (regs->trapnum == 14)
+			asm("movl %%cr2,%0" :"=r" (cr2));
+		else if (regs->trapnum == 7)
+			math_state_restore();
+		local_irq_enable();
+
+		switch (regs->trapnum) {
+		case 13: /* We've intercepted a GPF. */
+			if (regs->errcode == 0) {
+				if (emulate_insn(lg))
+					continue;
+
+				/* FIXME: If it's reloading %gs in a loop? */
+				if (usermode(regs) && new_gpf_eip(lg, regs))
+					continue;
+			}
+
+			if (reflect_trap(lg, &lg->gpf_trap, 1))
+				continue;
+			break;
+		case 14: /* We've intercepted a page fault. */
+			if (demand_page(lg, cr2, regs->errcode & 2))
+				continue;
+
+			/* If lguest_data is NULL, this won't hurt. */
+			put_user(cr2, &lg->lguest_data->cr2);
+			if (reflect_trap(lg, &lg->page_trap, 1))
+				continue;
+			kill_guest(lg, "unhandled page fault at %#x"
+				   " (eip=%#x, errcode=%#x)",
+				   cr2, regs->eip, regs->errcode);
+			break;
+		case 7: /* We've intercepted a Device Not Available fault. */
+			/* If they don't want to know, just absorb it. */
+			if (!lg->ts)
+				continue;
+			if (reflect_trap(lg, &lg->fpu_trap, 0))
+				continue;
+			kill_guest(lg, "unhandled FPU fault at %#x",
+				   regs->eip);
+			break;
+		case 32 ... 255: /* Real interrupt, fall thru */
+			cond_resched();
+		case LGUEST_TRAP_ENTRY: /* Handled at top of loop */
+			continue;
+		case 6: /* Invalid opcode before they installed handler */
+			check_bug_kill(lg);
+		}
+		kill_guest(lg, "unhandled trap %i at %#x (err=%i)",
+			   regs->trapnum, regs->eip, regs->errcode);
+	}
+	return -ENOENT;
+
+pending_dma:
+	put_user(lg->pending_dma, (unsigned long *)user);
+	put_user(lg->pending_addr, (unsigned long *)user+1);
+	return sizeof(unsigned long)*2;
+}
+
+#define STRUCT_LGUEST_ELEM_SIZE(elem) sizeof(((struct lguest_state *)0)->elem)
+
+static void adjust_pge(void *on)
+{
+	if (on)
+		write_cr4(read_cr4() | X86_CR4_PGE);
+	else
+		write_cr4(read_cr4() & ~X86_CR4_PGE);
+}
+
+static int __init init(void)
+{
+	int err;
+
+	if (paravirt_enabled())
+		return -EPERM;
+
+	err = map_hypervisor();
+	if (err)
+		return err;
+
+	err = init_pagetables(hype_pages);
+	if (err) {
+		unmap_hypervisor();
+		return err;
+	}
+	lguest_io_init();
+
+	err = lguest_device_init();
+	if (err) {
+		free_pagetables();
+		unmap_hypervisor();
+		return err;
+	}
+	if (cpu_has_pge) { /* We have a broader idea of "global". */
+		cpu_had_pge = 1;
+		on_each_cpu(adjust_pge, 0, 0, 1);
+		clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+	}
+	return 0;
+}
+
+static void __exit fini(void)
+{
+	lguest_device_remove();
+	free_pagetables();
+	unmap_hypervisor();
+	if (cpu_had_pge) {
+		set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+		on_each_cpu(adjust_pge, (void *)1, 0, 1);
+	}
+}
+
+module_init(init);
+module_exit(fini);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell");
===================================================================
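[Aside: the heart of the host side is run_guest() above: resume the
guest, then dispatch on whichever trap number it exited with. The same
control structure, reduced to a runnable toy; the trap "script" and
messages are invented:

	#include <stdio.h>

	enum { TRAP_NODEV = 7, TRAP_GPF = 13, TRAP_PAGE = 14 };

	/* Stand-in for one guest exit; replays a fixed script. */
	static int next_trap(void)
	{
		static const int script[] = { TRAP_PAGE, TRAP_GPF, TRAP_NODEV, -1 };
		static int i;
		return script[i++];
	}

	int main(void)
	{
		int trap;

		while ((trap = next_trap()) >= 0) {
			switch (trap) {
			case TRAP_GPF:
				printf("GPF: try insn emulation, else reflect\n");
				break;
			case TRAP_PAGE:
				printf("page fault: demand-fault shadow table\n");
				break;
			case TRAP_NODEV:
				printf("FPU trap: restore or reflect\n");
				break;
			default:
				printf("trap %d: deliver elsewhere\n", trap);
			}
		}
		return 0;
	}
]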
--- /dev/null
+++ b/arch/i386/lguest/hypercalls.c
@@ -0,0 +1,199 @@
+/* Actual hypercalls, which allow guests to actually do something.
+    Copyright (C) 2006 Rusty Russell IBM Corporation
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "lg.h"
+
+static void guest_set_stack(struct lguest *lg,
+			    u32 seg, u32 esp, unsigned int pages)
+{
+	/* You cannot have a stack segment with priv level 0. */
+	if ((seg & 0x3) != GUEST_DPL)
+		kill_guest(lg, "bad stack segment %i", seg);
+	if (pages > 2)
+		kill_guest(lg, "bad stack pages %u", pages);
+	lg->state->tss.ss1 = seg;
+	lg->state->tss.esp1 = esp;
+	lg->stack_pages = pages;
+	pin_stack_pages(lg);
+}
+
+/* Return true if DMA to host userspace is now pending. */
+static int do_hcall(struct lguest *lg, struct lguest_regs *regs)
+{
+	switch (regs->eax) {
+	case LHCALL_FLUSH_ASYNC:
+		break;
+	case LHCALL_LGUEST_INIT:
+		kill_guest(lg, "already have lguest_data");
+		break;
+	case LHCALL_CRASH: {
+		char msg[128];
+		lhread(lg, msg, regs->edx, sizeof(msg));
+		msg[sizeof(msg)-1] = '\0';
+		kill_guest(lg, "CRASH: %s", msg);
+		break;
+	}
+	case LHCALL_LOAD_GDT:
+		load_guest_gdt(lg, regs->edx, regs->ebx);
+		break;
+	case LHCALL_NEW_PGTABLE:
+		guest_new_pagetable(lg, regs->edx);
+		break;
+	case LHCALL_FLUSH_TLB:
+		if (regs->edx)
+			guest_pagetable_clear_all(lg);
+		else
+			guest_pagetable_flush_user(lg);
+		break;
+	case LHCALL_LOAD_IDT_ENTRY:
+		load_guest_idt_entry(lg, regs->edx, regs->ebx, regs->ecx);
+		break;
+	case LHCALL_SET_STACK:
+		guest_set_stack(lg, regs->edx, regs->ebx, regs->ecx);
+		break;
+	case LHCALL_TS:
+		lg->ts = regs->edx;
+		break;
+	case LHCALL_TIMER_READ: {
+		u32 now = jiffies;
+		mb();
+		regs->eax = now - lg->last_timer;
+		lg->last_timer = now;
+		break;
+	}
+	case LHCALL_TIMER_START:
+		lg->timer_on = 1;
+		if (regs->edx != HZ)
+			kill_guest(lg, "Bad clock speed %i", regs->edx);
+		lg->last_timer = jiffies;
+		break;
+	case LHCALL_HALT:
+		lg->halted = 1;
+		break;
+	case LHCALL_GET_WALLCLOCK: {
+		struct timeval tv;
+		do_gettimeofday(&tv);
+		regs->eax = tv.tv_sec;
+		break;
+	}
+	case LHCALL_BIND_DMA:
+		regs->eax = bind_dma(lg, regs->edx, regs->ebx,
+				     regs->ecx >> 8, regs->ecx & 0xFF);
+		break;
+	case LHCALL_SEND_DMA:
+		return send_dma(lg, regs->edx, regs->ebx);
+	case LHCALL_SET_PTE:
+		guest_set_pte(lg, regs->edx, regs->ebx, regs->ecx);
+		break;
+	case LHCALL_SET_UNKNOWN_PTE:
+		guest_pagetable_clear_all(lg);
+		break;
+	case LHCALL_SET_PUD:
+		guest_set_pud(lg, regs->edx, regs->ebx);
+		break;
+	case LHCALL_LOAD_TLS:
+		guest_load_tls(lg, (struct desc_struct __user*)regs->edx);
+		break;
+	default:
+		kill_guest(lg, "Bad hypercall %i\n", regs->eax);
+	}
+	return 0;
+}
+
+#define log(...)					\
+	do {						\
+		mm_segment_t oldfs = get_fs();		\
+		char buf[100];				\
+		sprintf(buf, "lguest:" __VA_ARGS__);	\
+		set_fs(KERNEL_DS);			\
+		sys_write(1, buf, strlen(buf));		\
+		set_fs(oldfs);				\
+	} while(0)
+
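[Aside: do_async_hcalls() below drains a fixed-size ring the guest fills
from async_hcall() (in lguest.c further down): a status byte of 0xFF
means the slot is free, the guest publishes the arguments and sets the
status to 0, and the host sets it back to 0xFF once consumed. The
handshake in miniature, as a single-threaded toy with invented sizes:

	#include <stdio.h>
	#include <string.h>

	#define RING_SIZE 4
	#define SLOT_FREE 0xFF

	struct call { unsigned eax, edx; };

	static unsigned char status[RING_SIZE];
	static struct call ring[RING_SIZE];

	/* Guest side: claim a free slot, publish args, mark it ready. */
	static int guest_queue(unsigned eax, unsigned edx)
	{
		static unsigned next;
		if (status[next] != SLOT_FREE)
			return -1;		/* ring full */
		ring[next].eax = eax;
		ring[next].edx = edx;
		status[next] = 0;		/* ready: host may consume */
		next = (next + 1) % RING_SIZE;
		return 0;
	}

	/* Host side: consume in order until a free slot is seen. */
	static void host_drain(void)
	{
		static unsigned next;
		while (status[next] != SLOT_FREE) {
			printf("hcall eax=%u edx=%u\n",
			       ring[next].eax, ring[next].edx);
			status[next] = SLOT_FREE;	/* hand slot back */
			next = (next + 1) % RING_SIZE;
		}
	}

	int main(void)
	{
		memset(status, SLOT_FREE, sizeof(status));
		guest_queue(1, 10);
		guest_queue(2, 20);
		host_drain();
		return 0;
	}
]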
+/* We always do queued calls before the actual hypercall. */
+int do_async_hcalls(struct lguest *lg)
+{
+	unsigned int i, pending;
+	u8 st[LHCALL_RING_SIZE];
+
+	if (!lg->lguest_data)
+		return 0;
+
+	copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st));
+	for (i = 0; i < ARRAY_SIZE(st); i++) {
+		struct lguest_regs regs;
+		unsigned int n = lg->next_hcall;
+
+		if (st[n] == 0xFF)
+			break;
+
+		if (++lg->next_hcall == LHCALL_RING_SIZE)
+			lg->next_hcall = 0;
+
+		get_user(regs.eax, &lg->lguest_data->hcalls[n].eax);
+		get_user(regs.edx, &lg->lguest_data->hcalls[n].edx);
+		get_user(regs.ecx, &lg->lguest_data->hcalls[n].ecx);
+		get_user(regs.ebx, &lg->lguest_data->hcalls[n].ebx);
+		pending = do_hcall(lg, &regs);
+		put_user(0xFF, &lg->lguest_data->hcall_status[n]);
+		if (pending)
+			return 1;
+	}
+
+	set_wakeup_process(lg, NULL);
+	return 0;
+}
+
+int hypercall(struct lguest *lg, struct lguest_regs *regs)
+{
+	int pending;
+
+	if (!lg->lguest_data) {
+		if (regs->eax != LHCALL_LGUEST_INIT) {
+			kill_guest(lg, "hypercall %i before LGUEST_INIT",
+				   regs->eax);
+			return 0;
+		}
+
+		lg->lguest_data = (struct lguest_data __user *)regs->edx;
+		/* We check here so we can simply copy_to_user/from_user */
+		if (!lguest_address_ok(lg, (long)lg->lguest_data)
+		    || !lguest_address_ok(lg, (long)(lg->lguest_data+1))) {
+			kill_guest(lg, "bad guest page %p", lg->lguest_data);
+			return 0;
+		}
+		get_user(lg->noirq_start, &lg->lguest_data->noirq_start);
+		get_user(lg->noirq_end, &lg->lguest_data->noirq_end);
+		/* We reserve the top pgd entry. */
+		put_user(4U*1024*1024, &lg->lguest_data->reserve_mem);
+		put_user(lg->guestid, &lg->lguest_data->guestid);
+		put_user(clocksource_khz2mult(tsc_khz, 22),
+			 &lg->lguest_data->clock_mult);
+		return 0;
+	}
+	pending = do_hcall(lg, regs);
+	set_wakeup_process(lg, NULL);
+	return pending;
+}
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/hypervisor.S
@@ -0,0 +1,170 @@
+/* This code sits at HYPE_ADDR, near the top of virtual memory, to do the
+   low-level guest<->host switch.
+   Layout is: default_idt_entries (1k), then switch_to_guest entry point. */
+#include
+#include
+#include "lg.h"
+
+#define SAVE_REGS				\
+	/* Save old guest/host state */		\
+	pushl %es;				\
+	pushl %ds;				\
+	pushl %fs;				\
+	pushl %eax;				\
+	pushl %gs;				\
+	pushl %ebp;				\
+	pushl %edi;				\
+	pushl %esi;				\
+	pushl %edx;				\
+	pushl %ecx;				\
+	pushl %ebx;				\
+
+.text
+ENTRY(_start) /* ld complains unless _start is defined. */
+/* %eax contains ptr to target guest state, %edx contains host idt. */
+switch_to_guest:
+	pushl %ss
+	SAVE_REGS
+	/* Save old stack, switch to guest's stack. */
+	movl %esp, LGUEST_STATE_host_stackptr(%eax)
+	movl %eax, %esp
+	/* Guest registers will be at: %esp-$LGUEST_STATE_regs */
+	addl $LGUEST_STATE_regs, %esp
+	/* Switch to guest's GDT, IDT. */
+	lgdt LGUEST_STATE_gdt(%eax)
+	lidt LGUEST_STATE_idt(%eax)
+	/* Save page table top. */
+	movl %cr3, %ebx
+	movl %ebx, LGUEST_STATE_host_pgdir(%eax)
+	/* Set host's TSS to available (clear byte 5 bit 2). */
+	movl (LGUEST_STATE_host_gdt+2)(%eax), %ebx
+	andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%ebx)
+	/* Switch to guest page tables */
+	popl %ebx
+	movl %ebx, %cr3
+	/* Switch to guest's TSS. */
+	movl $(GDT_ENTRY_TSS*8), %ebx
+	ltr %bx
+	/* Restore guest regs */
+	popl %ebx
+	popl %ecx
+	popl %edx
+	popl %esi
+	popl %edi
+	popl %ebp
+	popl %gs
+	/* Now we've loaded gs, neuter the TLS entries down to 1 byte/page */
+	addl $(LGUEST_STATE_gdt_table+GDT_ENTRY_TLS_MIN*8), %eax
+	movw $0,(%eax)
+	movw $0,8(%eax)
+	movw $0,16(%eax)
+	popl %eax
+	popl %fs
+	popl %ds
+	popl %es
+	/* Skip error code and trap number */
+	addl $8, %esp
+	iret
+
+#define SWITCH_TO_HOST							\
+	SAVE_REGS;							\
+	/* Save old pgdir */						\
+	movl %cr3, %eax;						\
+	pushl %eax;							\
+	/* Load lguest ds segment for convenience. */			\
+	movl $(LGUEST_DS), %eax;					\
+	movl %eax, %ds;							\
+	/* Now figure out who we are */					\
+	movl %esp, %eax;						\
+	subl $LGUEST_STATE_regs, %eax;					\
+	/* Switch to host page tables (GDT, IDT and stack are in host	\
+	   mem, so need this first) */					\
+	movl LGUEST_STATE_host_pgdir(%eax), %ebx;			\
+	movl %ebx, %cr3;						\
+	/* Set guest's TSS to available (clear byte 5 bit 2). */	\
+	andb $0xFD, (LGUEST_STATE_gdt_table+GDT_ENTRY_TSS*8+5)(%eax);	\
+	/* Switch to host's GDT & IDT. */				\
+	lgdt LGUEST_STATE_host_gdt(%eax);				\
+	lidt LGUEST_STATE_host_idt(%eax);				\
+	/* Switch to host's stack. */					\
+	movl LGUEST_STATE_host_stackptr(%eax), %esp;			\
+	/* Switch to host's TSS */					\
+	movl $(GDT_ENTRY_TSS*8), %eax;					\
+	ltr %ax;							\
+	/* Restore host regs */						\
+	popl %ebx;							\
+	popl %ecx;							\
+	popl %edx;							\
+	popl %esi;							\
+	popl %edi;							\
+	popl %ebp;							\
+	popl %gs;							\
+	popl %eax;							\
+	popl %fs;							\
+	popl %ds;							\
+	popl %es;							\
+	popl %ss
+
+/* Return to run_guest_once. */
+return_to_host:
+	SWITCH_TO_HOST
+	iret
+
+deliver_to_host:
+	SWITCH_TO_HOST
+decode_idt_and_jmp:
+	/* Decode IDT and jump to host's irq handler.  When that does iret, it
+	 * will return to run_guest_once.  This is a feature. */
+	/* We told gcc we'd clobber edx and eax... */
+	movl LGUEST_STATE_trapnum(%eax), %eax
+	leal (%edx,%eax,8), %eax
+	movzwl (%eax),%edx
+	movl 4(%eax), %eax
+	xorw %ax, %ax
+	orl %eax, %edx
+	jmp *%edx
+
+deliver_to_host_with_errcode:
+	SWITCH_TO_HOST
+	pushl LGUEST_STATE_errcode(%eax)
+	jmp decode_idt_and_jmp
+
+/* Real hardware interrupts are delivered straight to the host.  Others
+   cause us to return to run_guest_once so it can decide what to do.  Note
+   that some of these are overridden by the guest to deliver directly, and
+   never enter here (see load_guest_idt_entry). */
+.macro IRQ_STUB N TARGET
+	.data; .long 1f; .text; 1:
+	/* Make an error number for most traps, which don't have one. */
+	.if (\N <> 2) && (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17)
+	pushl $0
+	.endif
+	pushl $\N
+	jmp \TARGET
+	ALIGN
+.endm
+
+.macro IRQ_STUBS FIRST LAST TARGET
+	irq=\FIRST
+	.rept \LAST-\FIRST+1
+	IRQ_STUB irq \TARGET
+	irq=irq+1
+	.endr
+.endm
+
+/* We intercept every interrupt, because we may need to switch back to
+ * host.  Unfortunately we can't tell them apart except by entry
+ * point, so we need 256 entry points.
+ */
+irq_stubs:
+.data
+default_idt_entries:
+.text
+	IRQ_STUBS 0 1 return_to_host		/* First two traps */
+	IRQ_STUB 2 deliver_to_host_with_errcode	/* NMI */
+	IRQ_STUBS 3 31 return_to_host		/* Rest of traps */
+	IRQ_STUBS 32 127 deliver_to_host	/* Real interrupts */
+	IRQ_STUB 128 return_to_host		/* System call (overridden) */
+	IRQ_STUBS 129 255 deliver_to_host	/* Other real interrupts */
+
+/* Everything after this is used for the lguest_state structs. */
+ALIGN
===================================================================
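[Aside: decode_idt_and_jmp above reassembles a handler address from an
IDT gate descriptor: offset bits 0-15 live in the first dword, bits
16-31 in the second. The same bit-surgery in C, with an invented gate
value (it also matches copy_trap() in interrupts_and_traps.c below):

	#include <stdio.h>
	#include <stdint.h>

	/* An IDT gate stores the handler offset split across two dwords. */
	struct idt_gate { uint32_t a, b; };

	static uint32_t gate_offset(struct idt_gate g)
	{
		return (g.a & 0x0000FFFF) | (g.b & 0xFFFF0000);
	}

	int main(void)
	{
		/* Pretend gate: selector 0x0008, handler 0xC0123456,
		 * present 32-bit interrupt gate flags. */
		struct idt_gate g = { .a = 0x00083456, .b = 0xC0128E00 };
		printf("handler at %#x\n", gate_offset(g));
		return 0;
	}
]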
--- /dev/null
+++ b/arch/i386/lguest/interrupts_and_traps.c
@@ -0,0 +1,221 @@
+#include
+#include "lg.h"
+
+static void push_guest_stack(struct lguest *lg, u32 __user **gstack, u32 val)
+{
+	lhwrite_u32(lg, (u32)--(*gstack), val);
+}
+
+int reflect_trap(struct lguest *lg, const struct host_trap *trap, int has_err)
+{
+	u32 __user *gstack;
+	u32 eflags, ss, irq_enable;
+	struct lguest_regs *regs = &lg->state->regs;
+
+	if (!trap->addr)
+		return 0;
+
+	/* If they want a ring change, we use new stack and push old ss/esp */
+	if ((regs->ss&0x3) != GUEST_DPL) {
+		gstack = (u32 __user *)guest_pa(lg, lg->state->tss.esp1);
+		ss = lg->state->tss.ss1;
+		push_guest_stack(lg, &gstack, regs->ss);
+		push_guest_stack(lg, &gstack, regs->esp);
+	} else {
+		gstack = (u32 __user *)guest_pa(lg, regs->esp);
+		ss = regs->ss;
+	}
+
+	/* We use the IF bit in eflags to indicate whether irqs were disabled
+	   (it's always 0, since irqs are enabled when the guest is running). */
+	eflags = regs->eflags;
+	get_user(irq_enable, &lg->lguest_data->irq_enabled);
+	eflags |= (irq_enable & 512);
+
+	push_guest_stack(lg, &gstack, eflags);
+	push_guest_stack(lg, &gstack, regs->cs);
+	push_guest_stack(lg, &gstack, regs->eip);
+
+	if (has_err)
+		push_guest_stack(lg, &gstack, regs->errcode);
+
+	/* Change the real stack so hypervisor returns to trap handler */
+	regs->ss = ss;
+	regs->esp = (u32)gstack + lg->page_offset;
+	regs->cs = (__KERNEL_CS|GUEST_DPL);
+	regs->eip = trap->addr;
+
+	/* GS will be neutered on way back to guest. */
+	put_user(0, &lg->lguest_data->gs_gpf_eip);
+
+	/* Disable interrupts for an interrupt gate. */
+	if (trap->disable_interrupts)
+		put_user(0, &lg->lguest_data->irq_enabled);
+	return 1;
+}
+
+void maybe_do_interrupt(struct lguest *lg)
+{
+	unsigned int irq;
+	DECLARE_BITMAP(irqs, LGUEST_IRQS);
+
+	if (!lg->lguest_data)
+		return;
+
+	/* If timer has changed, set timer interrupt. */
+	if (lg->timer_on && jiffies != lg->last_timer)
+		set_bit(0, lg->irqs_pending);
+
+	/* Mask out any interrupts they have blocked. */
+	copy_from_user(&irqs, lg->lguest_data->interrupts, sizeof(irqs));
+	bitmap_andnot(irqs, lg->irqs_pending, irqs, LGUEST_IRQS);
+
+	irq = find_first_bit(irqs, LGUEST_IRQS);
+	if (irq >= LGUEST_IRQS)
+		return;
+
+	/* If they're halted, we re-enable interrupts. */
+	if (lg->halted) {
+		/* Re-enable interrupts. */
+		put_user(512, &lg->lguest_data->irq_enabled);
+		lg->halted = 0;
+	} else {
+		/* Maybe they have interrupts disabled? */
+		u32 irq_enabled;
+		get_user(irq_enabled, &lg->lguest_data->irq_enabled);
+		if (!irq_enabled)
+			return;
+	}
+
+	if (lg->interrupt[irq].addr != 0) {
+		clear_bit(irq, lg->irqs_pending);
+		reflect_trap(lg, &lg->interrupt[irq], 0);
+	}
+}
+
+void check_bug_kill(struct lguest *lg)
+{
+#ifdef CONFIG_BUG
+	u32 eip = lg->state->regs.eip - PAGE_OFFSET;
+	u16 insn;
+
+	/* This only works for addresses in linear mapping... */
+	if (lg->state->regs.eip < PAGE_OFFSET)
+		return;
+	lhread(lg, &insn, eip, sizeof(insn));
+	if (insn == 0x0b0f) {
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+		u16 l;
+		u32 f;
+		char file[128];
+		lhread(lg, &l, eip+sizeof(insn), sizeof(l));
+		lhread(lg, &f, eip+sizeof(insn)+sizeof(l), sizeof(f));
+		lhread(lg, file, f - PAGE_OFFSET, sizeof(file));
+		file[sizeof(file)-1] = 0;
+		kill_guest(lg, "BUG() at %#x %s:%u", eip, file, l);
+#else
+		kill_guest(lg, "BUG() at %#x", eip);
+#endif /* CONFIG_DEBUG_BUGVERBOSE */
+	}
+#endif /* CONFIG_BUG */
+}
+
+static void copy_trap(struct lguest *lg,
+		      struct host_trap *trap,
+		      const struct desc_struct *desc)
+{
+	u8 type = ((desc->b >> 8) & 0xF);
+
+	/* Not present? */
+	if (!(desc->b & 0x8000)) {
+		trap->addr = 0;
+		return;
+	}
+	if (type != 0xE && type != 0xF)
+		kill_guest(lg, "bad IDT type %i", type);
+	trap->disable_interrupts = (type == 0xE);
+	trap->addr = ((desc->a & 0x0000FFFF) | (desc->b & 0xFFFF0000));
+}
+
+/* FIXME: Put this in hypervisor.S and do something clever with relocs? */
+static u8 tramp[]
+= { 0x0f, 0xa8, 0x0f, 0xa9,	/* push %gs; pop %gs */
+    0x36, 0xc7, 0x05, 0x55, 0x55, 0x55, 0x55, 0x00, 0x00, 0x00, 0x00,
+				/* movl 0, %ss:lguest_data.gs_gpf_eip */
+    0xe9, 0x55, 0x55, 0x55, 0x55	/* jmp dstaddr */
+};
+#define TRAMP_MOVL_TARGET_OFF 7
+#define TRAMP_JMP_TARGET_OFF 16
+
+static u32 setup_trampoline(struct lguest *lg, unsigned int i, u32 dstaddr)
+{
+	u32 addr, off;
+
+	off = sizeof(tramp)*i;
+	memcpy(lg->trap_page + off, tramp, sizeof(tramp));
+
+	/* 0 is to be placed in lguest_data.gs_gpf_eip. */
+	addr = (u32)&lg->lguest_data->gs_gpf_eip + lg->page_offset;
+	memcpy(lg->trap_page + off + TRAMP_MOVL_TARGET_OFF, &addr, 4);
+
+	/* Address is relative to where end of jmp will be. */
+	addr = dstaddr - ((-4*1024*1024) + off + sizeof(tramp));
+	memcpy(lg->trap_page + off + TRAMP_JMP_TARGET_OFF, &addr, 4);
+	return (-4*1024*1024) + off;
+}
+
+/* We bounce through the trap page, for two reasons: firstly, we need
+   the interrupt destination always mapped, to avoid double faults,
+   secondly we want to reload %gs to make it innocuous on entering kernel.
+ */
+static void setup_idt(struct lguest *lg,
+		      unsigned int i,
+		      const struct desc_struct *desc)
+{
+	u8 type = ((desc->b >> 8) & 0xF);
+	u32 taddr;
+
+	/* Not present? */
+	if (!(desc->b & 0x8000)) {
+		/* FIXME: When we need this, we'll know... */
+		if (lg->state->idt_table[i].a & 0x8000)
+			kill_guest(lg, "removing interrupts not supported");
+		return;
+	}
+
+	/* We could reflect and disable interrupts, but the guest can do
+	   that itself. */
+	if (type != 0xF)
+		kill_guest(lg, "bad direct IDT %i type %i", i, type);
+
+	taddr = setup_trampoline(lg, i, (desc->a&0xFFFF)|(desc->b&0xFFFF0000));
+
+	lg->state->idt_table[i].a = (((__KERNEL_CS|GUEST_DPL)<<16)
+				     | (taddr & 0x0000FFFF));
+	lg->state->idt_table[i].b = (desc->b&0xEF00)|(taddr&0xFFFF0000);
+}
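[Aside: setup_trampoline() above stamps out one copy of tramp[] per
vector and patches its two 0x55555555 placeholders: an absolute address
for the movl, and a jmp displacement relative to the byte after the jmp.
A userspace sketch of the same patching; the template bytes, offsets and
addresses here are invented, not the real tramp[]:

	#include <stdio.h>
	#include <string.h>
	#include <stdint.h>

	static const uint8_t template[15] = {
		0xC7, 0x05,			/* movl $imm32, abs32 */
		0x55, 0x55, 0x55, 0x55,		/* abs32 placeholder */
		0x00, 0x00, 0x00, 0x00,		/* imm32: the value 0 */
		0xE9,				/* jmp rel32 */
		0x55, 0x55, 0x55, 0x55		/* rel32 placeholder */
	};
	#define MOVL_OFF 2
	#define JMP_OFF  11

	int main(void)
	{
		uint8_t buf[sizeof(template)];
		uint32_t base = 0xFFC00000u;	/* -4MB, like the trap page */
		uint32_t abs_target = 0xC0001234u;
		uint32_t jmp_target = 0xC0105678u;
		uint32_t rel;

		memcpy(buf, template, sizeof(template));
		memcpy(buf + MOVL_OFF, &abs_target, 4);

		/* jmp rel32 is relative to the address *after* the jmp. */
		rel = jmp_target - (base + sizeof(template));
		memcpy(buf + JMP_OFF, &rel, 4);

		for (unsigned i = 0; i < sizeof(buf); i++)
			printf("%02x ", buf[i]);
		printf("\n");
		return 0;
	}
]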
+void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 high)
+{
+	struct desc_struct d = { low, high };
+
+	/* Ignore NMI, doublefault, hypercall, spurious interrupt. */
+	if (i == 2 || i == 8 || i == 15 || i == LGUEST_TRAP_ENTRY)
+		return;
+	/* FIXME: We should handle debug and int3 */
+	else if (i == 1 || i == 3)
+		return;
+	/* We intercept page fault, general protection fault and fpu missing */
+	else if (i == 13)
+		copy_trap(lg, &lg->gpf_trap, &d);
+	else if (i == 14)
+		copy_trap(lg, &lg->page_trap, &d);
+	else if (i == 7)
+		copy_trap(lg, &lg->fpu_trap, &d);
+	/* Other traps go straight to guest. */
+	else if (i < FIRST_EXTERNAL_VECTOR || i == SYSCALL_VECTOR)
+		setup_idt(lg, i, &d);
+	/* A virtual interrupt */
+	else if (i < FIRST_EXTERNAL_VECTOR + LGUEST_IRQS)
+		copy_trap(lg, &lg->interrupt[i-FIRST_EXTERNAL_VECTOR], &d);
+}
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/io.c
@@ -0,0 +1,413 @@
+/* Simple I/O model for guests, based on shared memory.
+ * Copyright (C) 2006 Rusty Russell IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include "lg.h"
+
+static struct list_head dma_hash[64];
+
+/* FIXME: allow multi-page lengths. */
+static int check_dma_list(struct lguest *lg, const struct lguest_dma *dma)
+{
+	unsigned int i;
+
+	for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
+		if (!dma->len[i])
+			return 1;
+		if (!lguest_address_ok(lg, dma->addr[i]))
+			goto kill;
+		if (dma->len[i] > PAGE_SIZE)
+			goto kill;
+		/* We could do over a page, but is it worth it? */
+		if ((dma->addr[i] % PAGE_SIZE) + dma->len[i] > PAGE_SIZE)
+			goto kill;
+	}
+	return 1;
+
+kill:
+	kill_guest(lg, "bad DMA entry: %u@%#x", dma->len[i], dma->addr[i]);
+	return 0;
+}
+
+static unsigned int hash(const union futex_key *key)
+{
+	return jhash2((u32*)&key->both.word,
+		      (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
+		      key->both.offset)
+		% ARRAY_SIZE(dma_hash);
+}
+
+/* Must hold read lock on dmainfo owner's current->mm->mmap_sem */
+static void unlink_dma(struct lguest_dma_info *dmainfo)
+{
+	BUG_ON(down_trylock(&lguest_lock) == 0);
+	dmainfo->interrupt = 0;
+	list_del(&dmainfo->list);
+	drop_futex_key_refs(&dmainfo->key);
+}
+
+static inline int key_eq(const union futex_key *a, const union futex_key *b)
+{
+	return (a->both.word == b->both.word
+		&& a->both.ptr == b->both.ptr
+		&& a->both.offset == b->both.offset);
+}
+
+static u32 unbind_dma(struct lguest *lg,
+		      const union futex_key *key,
+		      unsigned long dmas)
+{
+	int i, ret = 0;
+
+	for (i = 0; i < LGUEST_MAX_DMA; i++) {
+		if (key_eq(key, &lg->dma[i].key) && dmas == lg->dma[i].dmas) {
+			unlink_dma(&lg->dma[i]);
+			ret = 1;
+			break;
+		}
+	}
+	return ret;
+}
+
+u32 bind_dma(struct lguest *lg,
+	     unsigned long addr, unsigned long dmas, u16 numdmas, u8 interrupt)
+{
+	unsigned int i;
+	u32 ret = 0;
+	union futex_key key;
+
+	if (interrupt >= LGUEST_IRQS)
+		return 0;
+
+	down(&lguest_lock);
+	down_read(&current->mm->mmap_sem);
+	if (get_futex_key((u32 __user *)addr, &key) != 0) {
+		kill_guest(lg, "bad dma address %#lx", addr);
+		goto unlock;
+	}
+	get_futex_key_refs(&key);
+
+	if (interrupt == 0)
+		ret = unbind_dma(lg, &key, dmas);
+	else {
+		for (i = 0; i < LGUEST_MAX_DMA; i++) {
+			if (lg->dma[i].interrupt == 0) {
+				lg->dma[i].dmas = dmas;
+				lg->dma[i].num_dmas = numdmas;
+				lg->dma[i].next_dma = 0;
+				lg->dma[i].key = key;
+				lg->dma[i].guestid = lg->guestid;
+				lg->dma[i].interrupt = interrupt;
+				list_add(&lg->dma[i].list,
+					 &dma_hash[hash(&key)]);
+				ret = 1;
+				goto unlock;
+			}
+		}
+	}
+	drop_futex_key_refs(&key);
+unlock:
+	up_read(&current->mm->mmap_sem);
+	up(&lguest_lock);
+	return ret;
+}
+
+/* lhread from another guest */
+static int lhread_other(struct lguest *lg,
+			void *buf, u32 addr, unsigned bytes)
+{
+	if (addr + bytes < addr
+	    || !lguest_address_ok(lg, addr+bytes)
+	    || access_process_vm(lg->tsk, addr, buf, bytes, 0) != bytes) {
+		memset(buf, 0, bytes);
+		kill_guest(lg, "bad address in registered DMA struct");
+		return 0;
+	}
+	return 1;
+}
+
+/* lhwrite to another guest */
+static int lhwrite_other(struct lguest *lg, u32 addr,
+			 const void *buf, unsigned bytes)
+{
+	if (addr + bytes < addr
+	    || !lguest_address_ok(lg, addr+bytes)
+	    || (access_process_vm(lg->tsk, addr, (void *)buf, bytes, 1)
+		!= bytes)) {
+		kill_guest(lg, "bad address writing to registered DMA");
+		return 0;
+	}
+	return 1;
+}
+
+static u32 copy_data(const struct lguest_dma *src,
+		     const struct lguest_dma *dst,
+		     struct page *pages[])
+{
+	unsigned int totlen, si, di, srcoff, dstoff;
+	void *maddr = NULL;
+
+	totlen = 0;
+	si = di = 0;
+	srcoff = dstoff = 0;
+	while (si < LGUEST_MAX_DMA_SECTIONS && src->len[si]
+	       && di < LGUEST_MAX_DMA_SECTIONS && dst->len[di]) {
+		u32 len = min(src->len[si] - srcoff, dst->len[di] - dstoff);
+
+		if (!maddr)
+			maddr = kmap(pages[di]);
+
+		/* FIXME: This is not completely portable, since
+		   archs do different things for copy_to_user_page. */
+		if (copy_from_user(maddr + (dst->addr[di] + dstoff)%PAGE_SIZE,
+				   (void __user *)src->addr[si], len) != 0) {
+			totlen = 0;
+			break;
+		}
+
+		totlen += len;
+		srcoff += len;
+		dstoff += len;
+		if (srcoff == src->len[si]) {
+			si++;
+			srcoff = 0;
+		}
+		if (dstoff == dst->len[di]) {
+			kunmap(pages[di]);
+			maddr = NULL;
+			di++;
+			dstoff = 0;
+		}
+	}
+
+	if (maddr)
+		kunmap(pages[di]);
+
+	return totlen;
+}
+
+/* Src is us, ie. current. */
+static u32 do_dma(struct lguest *srclg, const struct lguest_dma *src,
+		  struct lguest *dstlg, const struct lguest_dma *dst)
+{
+	int i;
+	u32 ret;
+	struct page *pages[LGUEST_MAX_DMA_SECTIONS];
+
+	if (!check_dma_list(dstlg, dst) || !check_dma_list(srclg, src))
+		return 0;
+
+	/* First get the destination pages */
+	for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
+		if (dst->len[i] == 0)
+			break;
+		if (get_user_pages(dstlg->tsk, dstlg->mm,
+				   dst->addr[i], 1, 1, 1, pages+i, NULL)
+		    != 1) {
+			ret = 0;
+			goto drop_pages;
+		}
+	}
+
+	/* Now copy until we run out of src or dst. */
+	ret = copy_data(src, dst, pages);
+
+drop_pages:
+	while (--i >= 0)
+		put_page(pages[i]);
+	return ret;
+}
+
+/* We cache one process to wakeup: helps for batching & wakes outside locks. */
+void set_wakeup_process(struct lguest *lg, struct task_struct *p)
+{
+	if (p == lg->wake)
+		return;
+
+	if (lg->wake) {
+		wake_up_process(lg->wake);
+		put_task_struct(lg->wake);
+	}
+	lg->wake = p;
+	if (lg->wake)
+		get_task_struct(lg->wake);
+}
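[Aside: copy_data() above walks two scatter-gather lists with
independent cursors, advancing whichever side runs out of its current
section. The same two-cursor walk as a standalone toy, counting (not
copying) the bytes the two lists can exchange; section sizes invented:

	#include <stdio.h>

	#define MAX_SECTIONS 4

	struct sg { unsigned len[MAX_SECTIONS]; };

	/* Mirrors copy_data()'s loop structure. */
	static unsigned overlap(const struct sg *src, const struct sg *dst)
	{
		unsigned total = 0, si = 0, di = 0, soff = 0, doff = 0;

		while (si < MAX_SECTIONS && src->len[si]
		       && di < MAX_SECTIONS && dst->len[di]) {
			unsigned len = src->len[si] - soff;
			if (dst->len[di] - doff < len)
				len = dst->len[di] - doff;
			total += len;
			soff += len;
			doff += len;
			if (soff == src->len[si]) { si++; soff = 0; }
			if (doff == dst->len[di]) { di++; doff = 0; }
		}
		return total;
	}

	int main(void)
	{
		struct sg src = { { 100, 50, 0 } };
		struct sg dst = { { 30, 30, 30, 30 } };
		printf("%u bytes exchangeable\n", overlap(&src, &dst)); /* 120 */
		return 0;
	}
]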
+static int dma_transfer(struct lguest *srclg,
+			unsigned long udma,
+			struct lguest_dma_info *dst)
+{
+	struct lguest_dma dst_dma, src_dma;
+	struct lguest *dstlg;
+	u32 i, dma = 0;
+
+	dstlg = &lguests[dst->guestid];
+	/* Get our dma list. */
+	lhread(srclg, &src_dma, udma, sizeof(src_dma));
+
+	/* We can't deadlock against them dmaing to us, because this
+	 * is all under the lguest_lock. */
+	down_read(&dstlg->mm->mmap_sem);
+
+	for (i = 0; i < dst->num_dmas; i++) {
+		dma = (dst->next_dma + i) % dst->num_dmas;
+		if (!lhread_other(dstlg, &dst_dma,
+				  dst->dmas + dma * sizeof(struct lguest_dma),
+				  sizeof(dst_dma))) {
+			goto fail;
+		}
+		if (!dst_dma.used_len)
+			break;
+	}
+	if (i != dst->num_dmas) {
+		unsigned long used_lenp;
+		unsigned int ret;
+
+		ret = do_dma(srclg, &src_dma, dstlg, &dst_dma);
+		/* Put used length in src. */
+		lhwrite_u32(srclg,
+			    udma+offsetof(struct lguest_dma, used_len), ret);
+		if (ret == 0 && src_dma.len[0] != 0)
+			goto fail;
+
+		/* Make sure destination sees contents before length. */
+		mb();
+		used_lenp = dst->dmas
+			+ dma * sizeof(struct lguest_dma)
+			+ offsetof(struct lguest_dma, used_len);
+		lhwrite_other(dstlg, used_lenp, &ret, sizeof(ret));
+		dst->next_dma++;
+	}
+	up_read(&dstlg->mm->mmap_sem);
+
+	/* Do this last so dst doesn't simply sleep on lock. */
+	set_bit(dst->interrupt, dstlg->irqs_pending);
+	set_wakeup_process(srclg, dstlg->tsk);
+	return i == dst->num_dmas;
+
+fail:
+	up_read(&dstlg->mm->mmap_sem);
+	return 0;
+}
+
+int send_dma(struct lguest *lg, unsigned long addr, unsigned long udma)
+{
+	union futex_key key;
+	int pending = 0, empty = 0;
+
+again:
+	down(&lguest_lock);
+	down_read(&current->mm->mmap_sem);
+	if (get_futex_key((u32 __user *)addr, &key) != 0) {
+		kill_guest(lg, "bad sending DMA address");
+		goto unlock;
+	}
+	/* Shared mapping?  Look for other guests... */
+	if (key.shared.offset & 1) {
+		struct lguest_dma_info *i, *n;
+		list_for_each_entry_safe(i, n, &dma_hash[hash(&key)], list) {
+			if (i->guestid == lg->guestid)
+				continue;
+			if (!key_eq(&key, &i->key))
+				continue;
+
+			empty += dma_transfer(lg, udma, i);
+			break;
+		}
+		if (empty == 1) {
+			/* Give any recipients one chance to restock. */
+			up_read(&current->mm->mmap_sem);
+			up(&lguest_lock);
+			yield();
+			empty++;
+			goto again;
+		}
+		pending = 0;
+	} else {
+		/* Private mapping: tell our userspace. */
+		lg->dma_is_pending = 1;
+		lg->pending_dma = udma;
+		lg->pending_addr = addr;
+		pending = 1;
+	}
+unlock:
+	up_read(&current->mm->mmap_sem);
+	up(&lguest_lock);
+	return pending;
+}
+
+void release_all_dma(struct lguest *lg)
+{
+	unsigned int i;
+
+	BUG_ON(down_trylock(&lguest_lock) == 0);
+
+	down_read(&lg->mm->mmap_sem);
+	for (i = 0; i < LGUEST_MAX_DMA; i++) {
+		if (lg->dma[i].interrupt)
+			unlink_dma(&lg->dma[i]);
+	}
+	up_read(&lg->mm->mmap_sem);
+}
+
+/* Userspace wants a dma buffer from this guest. */
+unsigned long get_dma_buffer(struct lguest *lg,
+			     unsigned long addr, unsigned long *interrupt)
+{
+	unsigned long ret = 0;
+	union futex_key key;
+	struct lguest_dma_info *i;
+
+	down(&lguest_lock);
+	down_read(&current->mm->mmap_sem);
+	if (get_futex_key((u32 __user *)addr, &key) != 0) {
+		kill_guest(lg, "bad registered DMA buffer");
+		goto unlock;
+	}
+	list_for_each_entry(i, &dma_hash[hash(&key)], list) {
+		if (key_eq(&key, &i->key) && i->guestid == lg->guestid) {
+			unsigned int j;
+			for (j = 0; j < i->num_dmas; j++) {
+				struct lguest_dma dma;
+
+				ret = i->dmas + j * sizeof(struct lguest_dma);
+				lhread(lg, &dma, ret, sizeof(dma));
+				if (dma.used_len == 0)
+					break;
+			}
+			*interrupt = i->interrupt;
+			break;
+		}
+	}
+unlock:
+	up_read(&current->mm->mmap_sem);
+	up(&lguest_lock);
+	return ret;
+}
+
+void lguest_io_init(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(dma_hash); i++)
+		INIT_LIST_HEAD(&dma_hash[i]);
+}
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/lg.h
@@ -0,0 +1,274 @@
+#ifndef _LGUEST_H
+#define _LGUEST_H
+
+#include
+/* 64k ought to be enough for anybody! */
+#define HYPERVISOR_SIZE 65536
+#define HYPERVISOR_PAGES (HYPERVISOR_SIZE/PAGE_SIZE)
+
+#define GDT_ENTRY_LGUEST_CS	10
+#define GDT_ENTRY_LGUEST_DS	11
+#define LGUEST_CS		(GDT_ENTRY_LGUEST_CS * 8)
+#define LGUEST_DS		(GDT_ENTRY_LGUEST_DS * 8)
+
+#if 0
+/* FIXME: Use asm-offsets here... */
+#define LGUEST_TSS_OFF		0
+#define LGUEST_TSS_SIZE		(26*4)
+#define LGUEST_GDT_OFF		(LGUEST_TSS_OFF + LGUEST_TSS_SIZE)
+#define LGUEST_GDTABLE_OFF	(LGUEST_GDT_OFF + 8)
+#define LGUEST_GDTABLE_SIZE	(8 * GDT_ENTRIES)
+#define LGUEST_IDT_OFF		(LGUEST_GDTABLE_OFF + LGUEST_GDTABLE_SIZE)
+#define LGUEST_IDTABLE_SIZE	(8 * IDT_ENTRIES)
+#define LGUEST_IDTABLE_OFF	(LGUEST_IDT_OFF + 8)
+#define LGUEST_HOST_OFF		(LGUEST_IDTABLE_OFF + LGUEST_IDTABLE_SIZE)
+#define LGUEST_HOST_GDT_OFF	LGUEST_HOST_OFF
+#define LGUEST_HOST_IDT_OFF	(LGUEST_HOST_OFF + 8)
+#define LGUEST_HOST_PGDIR_OFF	(LGUEST_HOST_IDT_OFF + 8)
+#define LGUEST_HOST_STKP_OFF	(LGUEST_HOST_PGDIR_OFF + 4)
+#define LGUEST_HOST_SIZE	(8+8+4+4)
+#define LGUEST_REGS_OFF		(LGUEST_HOST_OFF + LGUEST_HOST_SIZE)
+#define LGUEST_TRAPNUM_OFF	(LGUEST_REGS_OFF + 12*4)
+#define LGUEST_ERRCODE_OFF	(LGUEST_REGS_OFF + 13*4)
+#endif
+
+#ifndef __ASSEMBLY__
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "irq_vectors.h"
+
+#define GUEST_DPL 1
+
+struct lguest_regs
+{
+	/* Manually saved part. */
+	u32 cr3;
+	u32 ebx, ecx, edx;
+	u32 esi, edi, ebp;
+	u32 gs;
+	u32 eax;
+	u32 fs, ds, es;
+	u32 trapnum, errcode;
+	/* Trap pushed part */
+	u32 eip;
+	u32 cs;
+	u32 eflags;
+	u32 esp;
+	u32 ss;
+};
+
+__exit void free_pagetables(void);
+__init int init_pagetables(struct page *hype_pages);
+
+/* Full 4G segment descriptors, suitable for CS and DS. */
+#define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00})
+#define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300})
+
+/* Simplified version of IDT. */
+struct host_trap
+{
+	unsigned long addr;
+	int disable_interrupts;
+};
+
+struct lguest_dma_info
+{
+	struct list_head list;
+	union futex_key key;
+	unsigned long dmas;
+	u16 next_dma;
+	u16 num_dmas;
+	u16 guestid;
+	u8 interrupt;	/* 0 when not registered */
+};
+
+struct pgdir
+{
+	u32 cr3;
+	u32 *pgdir;
+};
+
+/* The private info the thread maintains about the guest. */
+struct lguest
+{
+	struct lguest_state *state;
+	struct lguest_data __user *lguest_data;
+	struct task_struct *tsk;
+	struct mm_struct *mm;	/* == tsk->mm, but that becomes NULL on exit */
+	u16 guestid;
+	u32 pfn_limit;
+	u32 page_offset;
+	u32 cr2;
+	int timer_on;
+	int halted;
+	int ts;
+	u32 gpf_eip;
+	u32 last_timer;
+	u32 next_hcall;
+	u16 tls_limits[GDT_ENTRY_TLS_ENTRIES];
+
+	/* We keep a small number of these. */
+	u32 pgdidx;
+	struct pgdir pgdirs[4];
+	void *trap_page;
+
+	/* Cached wakeup: we hold a reference to this task. */
+	struct task_struct *wake;
+
+	unsigned long noirq_start, noirq_end;
+	int dma_is_pending;
+	unsigned long pending_dma;	/* struct lguest_dma */
+	unsigned long pending_addr;	/* address they're sending to */
+
+	unsigned int stack_pages;
+
+	struct lguest_dma_info dma[LGUEST_MAX_DMA];
+
+	/* Dead? */
+	const char *dead;
+
+	/* We intercept page fault (demand shadow paging & cr2 saving),
+	   protection fault (in/out emulation, TLS handling) and
+	   device not available (TS handling). */
+	struct host_trap page_trap, gpf_trap, fpu_trap;
+
+	/* Virtual interrupts */
+	DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
+	struct host_trap interrupt[LGUEST_IRQS];
+};
+
+extern struct page *hype_pages; /* Contiguous pages. */
+extern struct lguest lguests[];
+extern struct semaphore lguest_lock;
+
+/* core.c: */
+/* Entry points in hypervisor */
+const unsigned long *__lguest_default_idt_entries(void);
+struct lguest_state *__lguest_states(void);
+u32 lhread_u32(struct lguest *lg, u32 addr);
+void lhwrite_u32(struct lguest *lg, u32 addr, u32 val);
+void lhread(struct lguest *lg, void *buf, u32 addr, unsigned bytes);
+void lhwrite(struct lguest *lg, u32 addr, const void *buf, unsigned bytes);
+int lguest_address_ok(const struct lguest *lg, unsigned long addr);
+int run_guest(struct lguest *lg, char __user *user);
+int find_free_guest(void);
+
+/* interrupts_and_traps.c: */
+void maybe_do_interrupt(struct lguest *lg);
+int reflect_trap(struct lguest *lg, const struct host_trap *trap, int has_err);
+void check_bug_kill(struct lguest *lg);
+void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 hi);
+
+/* segments.c: */
+void load_guest_gdt(struct lguest *lg, u32 table, u32 num);
+void guest_load_tls(struct lguest *lg,
+		    const struct desc_struct __user *tls_array);
+
+int init_guest_pagetable(struct lguest *lg, u32 pgtable);
+void free_guest_pagetable(struct lguest *lg);
+void guest_new_pagetable(struct lguest *lg, u32 pgtable);
+void guest_set_pud(struct lguest *lg, unsigned long cr3, u32 i);
+void guest_pagetable_clear_all(struct lguest *lg);
+void guest_pagetable_flush_user(struct lguest *lg);
+void guest_set_pte(struct lguest *lg, unsigned long cr3,
+		   unsigned long vaddr, u32 val);
+void map_trap_page(struct lguest *info);
+int demand_page(struct lguest *info, u32 cr2, int write);
+void pin_stack_pages(struct lguest *lg);
+
+int lguest_device_init(void);
+void lguest_device_remove(void);
+void lguest_io_init(void);
+u32 bind_dma(struct lguest *lg,
+	     unsigned long addr, unsigned long udma, u16 numdmas, u8 interrupt);
+int send_dma(struct lguest *info, unsigned long addr,
+	     unsigned long udma);
+void release_all_dma(struct lguest *lg);
+unsigned long get_dma_buffer(struct lguest *lg, unsigned long addr,
+			     unsigned long *interrupt);
+
+void set_wakeup_process(struct lguest *lg, struct task_struct *p);
+int do_async_hcalls(struct lguest *info);
+int hypercall(struct lguest *info, struct lguest_regs *regs);
+
+#define kill_guest(lg, fmt...)					\
+do {								\
+	if (!(lg)->dead) {					\
+		(lg)->dead = kasprintf(GFP_ATOMIC, fmt);	\
+		if (!(lg)->dead)				\
+			(lg)->dead = (void *)1;			\
+	}							\
+} while(0)
+
+static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr)
+{
+	return vaddr - lg->page_offset;
+}
+
+/* Hardware-defined TSS structure. */
+struct x86_tss
+{
+	unsigned short	back_link,__blh;
+	unsigned long	esp0;
+	unsigned short	ss0,__ss0pad;
+	unsigned long	esp1;
+	unsigned short	ss1,__ss1pad;
+	unsigned long	esp2;
+	unsigned short	ss2,__ss2pad;
+	unsigned long	cr3;
+	unsigned long	eip;
+	unsigned long	eflags;
+	unsigned long	eax,ecx,edx,ebx;
+	unsigned long	esp;	/* We actually use this one to save esp. */
+	unsigned long	ebp;
+	unsigned long	esi;
+	unsigned long	edi;
+	unsigned short	es, __espad;
+	unsigned short	cs, __cspad;
+	unsigned short	ss, __sspad;
+	unsigned short	ds, __dspad;
+	unsigned short	fs, __fspad;
+	unsigned short	gs, __gspad;
+	unsigned short	ldt, __ldtpad;
+	unsigned short	trace, io_bitmap_base;
+};
+
+int fixup_gdt_table(struct desc_struct *gdt, unsigned int num,
+		    struct lguest_regs *regs, struct x86_tss *tss);
+
+struct lguest_host_state
+{
+	struct Xgt_desc_struct	gdt;
+	struct Xgt_desc_struct	idt;
+	unsigned long		pgdir;
+	unsigned long		stackptr;
+};
+
+/* This sits in the high-mapped shim. */
+struct lguest_state
+{
+	/* Task struct. */
+	struct x86_tss tss;
+
+	/* Gate descriptor table. */
+	struct Xgt_desc_struct gdt;
+	struct desc_struct gdt_table[GDT_ENTRIES];
+
+	/* Interrupt descriptor table. */
+	struct Xgt_desc_struct idt;
+	struct desc_struct idt_table[IDT_ENTRIES];
+
+	/* Host state we store while the guest runs. */
+	struct lguest_host_state host;
+
+	/* This is the stack on which we push our regs. */
+	struct lguest_regs regs;
+};
+#endif /* __ASSEMBLY__ */
+#endif /* _LGUEST_H */
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/lguest.c
@@ -0,0 +1,595 @@
+/*
+ * Lguest specific paravirt-ops implementation
+ *
+ * Copyright (C) 2006, Rusty Russell IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+extern int mce_disabled;
+
+struct lguest_data lguest_data;
+struct lguest_device_desc *lguest_devices;
+static __initdata const struct lguest_boot_info *boot = __va(0);
+
+void async_hcall(unsigned long call,
+		 unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+	/* Note: This code assumes we're uniprocessor. */
+	static unsigned int next_call;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	if (lguest_data.hcall_status[next_call] != 0xFF) {
+		/* Table full, so do normal hcall which will flush table. */
+		hcall(call, arg1, arg2, arg3);
+	} else {
+		lguest_data.hcalls[next_call].eax = call;
+		lguest_data.hcalls[next_call].edx = arg1;
+		lguest_data.hcalls[next_call].ebx = arg2;
+		lguest_data.hcalls[next_call].ecx = arg3;
+		wmb();
+		lguest_data.hcall_status[next_call] = 0;
+		if (++next_call == LHCALL_RING_SIZE)
+			next_call = 0;
+	}
+	local_irq_restore(flags);
+}
+
+#ifdef PARAVIRT_LAZY_NONE /* Not in 2.6.20. */
+static int lazy_mode;
+static void fastcall lguest_lazy_mode(int mode)
+{
+	lazy_mode = mode;
+	if (mode == PARAVIRT_LAZY_NONE)
+		hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
+}
+
+static void lazy_hcall(unsigned long call,
+		       unsigned long arg1,
+		       unsigned long arg2,
+		       unsigned long arg3)
+{
+	if (lazy_mode == PARAVIRT_LAZY_NONE)
+		hcall(call, arg1, arg2, arg3);
+	else
+		async_hcall(call, arg1, arg2, arg3);
+}
+#else
+#define lazy_hcall hcall
+#endif
+
+static unsigned long fastcall save_fl(void)
+{
+	return lguest_data.irq_enabled;
+}
+
+static void fastcall restore_fl(unsigned long flags)
+{
+	/* FIXME: Check if interrupt pending... */
+	lguest_data.irq_enabled = flags;
+}
+
+static void fastcall irq_disable(void)
+{
+	lguest_data.irq_enabled = 0;
+}
+
+static void fastcall irq_enable(void)
+{
+	/* Linux i386 code expects bit 9 set. */
+	/* FIXME: Check if interrupt pending... */
+	lguest_data.irq_enabled = 512;
+}
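[Aside: the guest never executes cli/sti; disabling interrupts above is
just a store to lguest_data.irq_enabled, which the host consults (see
maybe_do_interrupt() in interrupts_and_traps.c) before injecting
anything. A toy model of that shared-word convention; 512 mirrors
EFLAGS.IF, and both "sides" run in one process here:

	#include <stdio.h>

	/* Shared between guest and host sides; 512 == X86_EFLAGS_IF. */
	static unsigned long irq_enabled = 512;

	static void guest_irq_disable(void) { irq_enabled = 0; }
	static void guest_irq_enable(void)  { irq_enabled = 512; }

	/* Host side: only inject when the guest's virtual IF is set. */
	static void host_try_inject(int irq)
	{
		if (!irq_enabled) {
			printf("irq %d left pending\n", irq);
			return;
		}
		printf("inject irq %d\n", irq);
	}

	int main(void)
	{
		guest_irq_disable();
		host_try_inject(3);	/* stays pending */
		guest_irq_enable();
		host_try_inject(3);	/* delivered */
		return 0;
	}
]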
+static fastcall void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
+				  unsigned int *ecx, unsigned int *edx)
+{
+	int is_feature = (*eax == 1);
+
+	asm volatile ("cpuid"
+		      : "=a" (*eax),
+			"=b" (*ebx),
+			"=c" (*ecx),
+			"=d" (*edx)
+		      : "0" (*eax), "2" (*ecx));
+
+	if (is_feature) {
+		unsigned long *excap = (unsigned long *)ecx,
+			      *features = (unsigned long *)edx;
+		/* Hypervisor needs to know when we flush kernel pages. */
+		set_bit(X86_FEATURE_PGE, features);
+		/* We don't have any features! */
+		clear_bit(X86_FEATURE_VME, features);
+		clear_bit(X86_FEATURE_DE, features);
+		clear_bit(X86_FEATURE_PSE, features);
+		clear_bit(X86_FEATURE_PAE, features);
+		clear_bit(X86_FEATURE_SEP, features);
+		clear_bit(X86_FEATURE_APIC, features);
+		clear_bit(X86_FEATURE_MTRR, features);
+		/* No MWAIT, either. */
+		clear_bit(3, excap);
+	}
+}
+
+static unsigned long current_cr3;
+static void fastcall lguest_write_cr3(unsigned long cr3)
+{
+	hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0);
+	current_cr3 = cr3;
+}
+
+static void fastcall lguest_flush_tlb(void)
+{
+	lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0);
+}
+
+static void fastcall lguest_flush_tlb_kernel(void)
+{
+	lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
+}
+
+static void fastcall lguest_flush_tlb_single(u32 addr)
+{
+	/* Simply set it to zero, and it will fault back in. */
+	lazy_hcall(LHCALL_SET_PTE, current_cr3, addr, 0);
+}
+
+/* FIXME: Eliminate all callers of this. */
+static fastcall void lguest_set_pte(pte_t *ptep, pte_t pteval)
+{
+	*ptep = pteval;
+	/* Don't bother with hypercall before initial setup. */
+	if (current_cr3)
+		hcall(LHCALL_SET_UNKNOWN_PTE, 0, 0, 0);
+}
+
+static fastcall void lguest_set_pte_at(struct mm_struct *mm, u32 addr,
+				       pte_t *ptep, pte_t pteval)
+{
+	*ptep = pteval;
+	lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), addr, pteval.pte_low);
+}
+
+/* We only support two-level pagetables at the moment. */
+static fastcall void lguest_set_pud(pmd_t *pmdp, pmd_t pmdval)
+{
+	*pmdp = pmdval;
+	lazy_hcall(LHCALL_SET_PUD, __pa(pmdp)&PAGE_MASK,
+		   (__pa(pmdp)&(PAGE_SIZE-1))/4, 0);
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static fastcall void lguest_apic_write(unsigned long reg, unsigned long v)
+{
+}
+
+static fastcall void lguest_apic_write_atomic(unsigned long reg, unsigned long v)
+{
+}
+
+static fastcall unsigned long lguest_apic_read(unsigned long reg)
+{
+	return 0;
+}
+#endif
+/* We move the eflags word to lguest_data.irq_enabled to restore interrupt
+   state.  For page faults, GPFs and virtual interrupts, the hypervisor has
+   saved eflags manually, otherwise it was delivered directly and so eflags
+   reflects the real machine IF state, ie. interrupts on.  Since the kernel
+   always dies if it takes such a trap with interrupts disabled anyway,
+   turning interrupts back on unconditionally here is OK. */
+asm("lguest_iret:"
+    " pushl %eax;"
+    " movl 12(%esp), %eax;"
+    "lguest_noirq_start:;"
+    " movl %eax,%ss:lguest_data+"__stringify(LGUEST_DATA_irq_enabled)";"
+    " popl %eax;"
+    " iret;"
+    "lguest_noirq_end:");
+extern void fastcall lguest_iret(void);
+extern char lguest_noirq_start[], lguest_noirq_end[];
+
+static void fastcall lguest_load_esp0(struct tss_struct *tss,
+				      struct thread_struct *thread)
+{
+	lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->esp0,
+		   THREAD_SIZE/PAGE_SIZE);
+}
+
+static fastcall void lguest_load_tr_desc(void)
+{
+}
+
+static fastcall void lguest_set_ldt(const void *addr, unsigned entries)
+{
+	/* FIXME: Implement. */
+	BUG_ON(entries);
+}
+
+static fastcall void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+	lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0);
+}
+
+static fastcall void lguest_set_debugreg(int regno, unsigned long value)
+{
+	/* FIXME: Implement. */
+}
+
+static unsigned int lguest_cr0;
+static fastcall void lguest_clts(void)
+{
+	lazy_hcall(LHCALL_TS, 0, 0, 0);
+	lguest_cr0 &= ~8U;
+}
+
+static fastcall unsigned long lguest_read_cr0(void)
+{
+	return lguest_cr0;
+}
+
+static fastcall void lguest_write_cr0(unsigned long val)
+{
+	hcall(LHCALL_TS, val & 8, 0, 0);
+	lguest_cr0 = val;
+}
+
+static fastcall unsigned long lguest_read_cr2(void)
+{
+	return lguest_data.cr2;
+}
+
+static fastcall unsigned long lguest_read_cr3(void)
+{
+	return current_cr3;
+}
+
+/* Used to enable/disable PGE, but we don't care. */
+static fastcall unsigned long lguest_read_cr4(void)
+{
+	return 0;
+}
+
+static fastcall void lguest_write_cr4(unsigned long val)
+{
+}
+
+/* FIXME: These should be in a header somewhere. */
+extern unsigned long init_pg_tables_end;
+
+static void fastcall lguest_time_irq(unsigned int irq, struct irq_desc *desc)
+{
+	do_timer(hcall(LHCALL_TIMER_READ, 0, 0, 0));
+	update_process_times(user_mode_vm(get_irq_regs()));
+}
+
+static void disable_lguest_irq(unsigned int irq)
+{
+	set_bit(irq, lguest_data.interrupts);
+}
+
+static void enable_lguest_irq(unsigned int irq)
+{
+	clear_bit(irq, lguest_data.interrupts);
+	/* FIXME: If it's pending? */
+}
+
+static struct irq_chip lguest_irq_controller = {
+	.name		= "lguest",
+	.mask		= disable_lguest_irq,
+	.mask_ack	= disable_lguest_irq,
+	.unmask		= enable_lguest_irq,
+};
+
+static void lguest_time_init(void)
+{
+	set_irq_handler(0, lguest_time_irq);
+	hcall(LHCALL_TIMER_START, HZ, 0, 0);
+}
+
+static void __init lguest_init_IRQ(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < LGUEST_IRQS; i++) {
+		int vector = FIRST_EXTERNAL_VECTOR + i;
+		if (i >= NR_IRQS)
+			break;
+		if (vector != SYSCALL_VECTOR) {
+			set_intr_gate(vector, interrupt[i]);
+			set_irq_chip_and_handler(i, &lguest_irq_controller,
+						 handle_level_irq);
+		}
+	}
+	irq_ctx_init(smp_processor_id());
+}
+static inline void native_write_dt_entry(void *dt, int entry,
+					 u32 entry_low, u32 entry_high)
+{
+	u32 *lp = (u32 *)((char *)dt + entry*8);
+	lp[0] = entry_low;
+	lp[1] = entry_high;
+}
+
+static fastcall void lguest_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
+{
+	/* FIXME: Allow this. */
+	BUG();
+}
+
+static fastcall void lguest_write_gdt_entry(void *dt, int entrynum,
+					    u32 low, u32 high)
+{
+	native_write_dt_entry(dt, entrynum, low, high);
+	hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0);
+}
+
+static fastcall void lguest_write_idt_entry(void *dt, int entrynum,
+					    u32 low, u32 high)
+{
+	native_write_dt_entry(dt, entrynum, low, high);
+	hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high);
+}
+
+#define LGUEST_IRQ "lguest_data+"__stringify(LGUEST_DATA_irq_enabled)
+#define DEF_LGUEST(name, code)					\
+	extern const char start_##name[], end_##name[];		\
+	asm("start_" #name ": " code "; end_" #name ":")
+DEF_LGUEST(cli, "movl $0," LGUEST_IRQ);
+DEF_LGUEST(sti, "movl $512," LGUEST_IRQ);
+DEF_LGUEST(popf, "movl %eax," LGUEST_IRQ);
+DEF_LGUEST(pushf, "movl " LGUEST_IRQ ",%eax");
+DEF_LGUEST(pushf_cli, "movl " LGUEST_IRQ ",%eax; movl $0," LGUEST_IRQ);
+DEF_LGUEST(iret, ".byte 0xE9,0,0,0,0"); /* jmp ... */
+
+static const struct lguest_insns
+{
+	const char *start, *end;
+} lguest_insns[] = {
+	[PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli },
+	[PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti },
+	[PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf },
+	[PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf },
+	[PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli },
+	[PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret },
+};
+static unsigned lguest_patch(u8 type, u16 clobber, void *insns, unsigned len)
+{
+	unsigned int insn_len;
+
+	/* Don't touch it if we don't have a replacement. */
+	if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start)
+		return len;
+
+	insn_len = lguest_insns[type].end - lguest_insns[type].start;
+
+	/* Similarly if we can't fit replacement. */
+	if (len < insn_len)
+		return len;
+
+	memcpy(insns, lguest_insns[type].start, insn_len);
+	if (type == PARAVIRT_INTERRUPT_RETURN) {
+		/* Jumps are relative. */
+		u32 off = (u32)lguest_iret - ((u32)insns + insn_len);
+		memcpy(insns+1, &off, sizeof(off));
+	}
+	return insn_len;
+}
+
+static void fastcall lguest_safe_halt(void)
+{
+	hcall(LHCALL_HALT, 0, 0, 0);
+}
+
+static unsigned long lguest_get_wallclock(void)
+{
+	return hcall(LHCALL_GET_WALLCLOCK, 0, 0, 0);
+}
+
+static void lguest_power_off(void)
+{
+	hcall(LHCALL_CRASH, __pa("Power down"), 0, 0);
+}
+static __attribute_used__ __init void lguest_init(void)
+{
+	extern struct Xgt_desc_struct cpu_gdt_descr;
+	extern struct i386_pda boot_pda;
+
+	paravirt_ops.name = "lguest";
+	paravirt_ops.paravirt_enabled = 1;
+	paravirt_ops.kernel_rpl = 1;
+
+	paravirt_ops.save_fl = save_fl;
+	paravirt_ops.restore_fl = restore_fl;
+	paravirt_ops.irq_disable = irq_disable;
+	paravirt_ops.irq_enable = irq_enable;
+	paravirt_ops.load_gdt = lguest_load_gdt;
+	paravirt_ops.memory_setup = lguest_memory_setup;
+	paravirt_ops.cpuid = lguest_cpuid;
+	paravirt_ops.write_cr3 = lguest_write_cr3;
+	paravirt_ops.flush_tlb_user = lguest_flush_tlb;
+	paravirt_ops.flush_tlb_single = lguest_flush_tlb_single;
+	paravirt_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
+	paravirt_ops.set_pte = lguest_set_pte;
+	paravirt_ops.set_pte_at = lguest_set_pte_at;
+	paravirt_ops.set_pmd = lguest_set_pud;
+#ifdef CONFIG_X86_LOCAL_APIC
+	paravirt_ops.apic_write = lguest_apic_write;
+	paravirt_ops.apic_write_atomic = lguest_apic_write_atomic;
+	paravirt_ops.apic_read = lguest_apic_read;
+#endif
+	paravirt_ops.load_idt = lguest_load_idt;
+	paravirt_ops.iret = lguest_iret;
+	paravirt_ops.load_esp0 = lguest_load_esp0;
+	paravirt_ops.load_tr_desc = lguest_load_tr_desc;
+	paravirt_ops.set_ldt = lguest_set_ldt;
+	paravirt_ops.load_tls = lguest_load_tls;
+	paravirt_ops.set_debugreg = lguest_set_debugreg;
+	paravirt_ops.clts = lguest_clts;
+	paravirt_ops.read_cr0 = lguest_read_cr0;
+	paravirt_ops.write_cr0 = lguest_write_cr0;
+	paravirt_ops.init_IRQ = lguest_init_IRQ;
+	paravirt_ops.read_cr2 = lguest_read_cr2;
+	paravirt_ops.read_cr3 = lguest_read_cr3;
+	paravirt_ops.read_cr4 = lguest_read_cr4;
+	paravirt_ops.write_cr4 = lguest_write_cr4;
+	paravirt_ops.write_ldt_entry = lguest_write_ldt_entry;
+	paravirt_ops.write_gdt_entry = lguest_write_gdt_entry;
+	paravirt_ops.write_idt_entry = lguest_write_idt_entry;
+	paravirt_ops.patch = lguest_patch;
+	paravirt_ops.safe_halt = lguest_safe_halt;
+	paravirt_ops.get_wallclock = lguest_get_wallclock;
+	paravirt_ops.time_init = lguest_time_init;
+#ifdef PARAVIRT_LAZY_NONE
+	paravirt_ops.set_lazy_mode = lguest_lazy_mode;
+#endif
+
+	memset(lguest_data.hcall_status, 0xFF, sizeof(lguest_data.hcall_status));
+	lguest_data.noirq_start = (u32)lguest_noirq_start;
+	lguest_data.noirq_end = (u32)lguest_noirq_end;
+	hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0);
+	strncpy(saved_command_line, boot->cmdline, COMMAND_LINE_SIZE);
+
+	/* We use top of mem for initial pagetables. */
+	init_pg_tables_end = __pa(pg0);
+
+	/* Set up the PDA descriptor. */
+	pack_descriptor((u32 *)&cpu_gdt_table[GDT_ENTRY_PDA].a,
+			(u32 *)&cpu_gdt_table[GDT_ENTRY_PDA].b,
+			(unsigned)&boot_pda, sizeof(boot_pda)-1,
+			0x80 | DESCTYPE_S | 0x02, 0);
+	load_gdt(&cpu_gdt_descr);
+	asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory");
+
+	reserve_top_address(lguest_data.reserve_mem);
+
+	cpu_detect(&new_cpu_data);
+	/* Need this before paging_init. */
+	set_bit(X86_FEATURE_PGE, new_cpu_data.x86_capability);
+	/* Math is always hard! */
+	new_cpu_data.hard_math = 1;
+
+	/* FIXME: Better way? */
+	/* Suppress vgacon startup code. */
+	SCREEN_INFO.orig_video_isVGA = VIDEO_TYPE_VLFB;
+
+	add_preferred_console("hvc", 0, NULL);
+
+#ifdef CONFIG_X86_MCE
+	mce_disabled = 1;
+#endif
+
+#ifdef CONFIG_ACPI
+	acpi_disabled = 1;
+	acpi_ht = 0;
+#endif
+
+	if (boot->initrd_size) {
+		/* We stash this at top of memory. */
+		INITRD_START = boot->max_pfn*PAGE_SIZE - boot->initrd_size;
+		INITRD_SIZE = boot->initrd_size;
+		LOADER_TYPE = 0xFF;
+	}
+
+	pm_power_off = lguest_power_off;
+	start_kernel();
+}
+
+asm("lguest_maybe_init:\n"
+    "	cmpl $"__stringify(LGUEST_MAGIC_EBP)", %ebp\n"
+    "	jne 1f\n"
+    "	cmpl $"__stringify(LGUEST_MAGIC_EDI)", %edi\n"
+    "	jne 1f\n"
+    "	cmpl $"__stringify(LGUEST_MAGIC_ESI)", %esi\n"
+    "	je lguest_init\n"
+    "1:	ret");
+extern void asmlinkage lguest_maybe_init(void);
+paravirt_probe(lguest_maybe_init);
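The magic-register compare above is the entire boot handshake: the host
loads %ebp, %edi and %esi with the three magic constants before the guest's
first instruction (see setup_guest_state() in lguest_user.c below), and
lguest_maybe_init simply re-checks them.  For reference, the C equivalent of
that asm probe, as an illustrative sketch only:

/* Sketch of the probe logic (the real check is the asm above). */
static int is_lguest_boot(unsigned long ebp, unsigned long edi,
			  unsigned long esi)
{
	return ebp == LGUEST_MAGIC_EBP
	       && edi == LGUEST_MAGIC_EDI
	       && esi == LGUEST_MAGIC_ESI;
}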
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/lguest_bus.c
@@ -0,0 +1,180 @@
+#include
+#include
+#include
+#include
+#include
+
+static ssize_t type_show(struct device *_dev,
+			 struct device_attribute *attr, char *buf)
+{
+	struct lguest_device *dev = container_of(_dev, struct lguest_device, dev);
+	return sprintf(buf, "%hu", lguest_devices[dev->index].type);
+}
+static ssize_t features_show(struct device *_dev,
+			     struct device_attribute *attr, char *buf)
+{
+	struct lguest_device *dev = container_of(_dev, struct lguest_device, dev);
+	return sprintf(buf, "%hx", lguest_devices[dev->index].features);
+}
+static ssize_t pfn_show(struct device *_dev,
+			struct device_attribute *attr, char *buf)
+{
+	struct lguest_device *dev = container_of(_dev, struct lguest_device, dev);
+	return sprintf(buf, "%u", lguest_devices[dev->index].pfn);
+}
+static ssize_t status_show(struct device *_dev,
+			   struct device_attribute *attr, char *buf)
+{
+	struct lguest_device *dev = container_of(_dev, struct lguest_device, dev);
+	return sprintf(buf, "%hx", lguest_devices[dev->index].status);
+}
+static ssize_t status_store(struct device *_dev, struct device_attribute *attr,
+			    const char *buf, size_t count)
+{
+	struct lguest_device *dev = container_of(_dev, struct lguest_device, dev);
+	if (sscanf(buf, "%hi", &lguest_devices[dev->index].status) != 1)
+		return -EINVAL;
+	return count;
+}
+static struct device_attribute lguest_dev_attrs[] = {
+	__ATTR_RO(type),
+	__ATTR_RO(features),
+	__ATTR_RO(pfn),
+	__ATTR(status, 0644, status_show, status_store),
+	__ATTR_NULL
+};
+
+static int lguest_dev_match(struct device *_dev, struct device_driver *_drv)
+{
+	struct lguest_device *dev = container_of(_dev, struct lguest_device, dev);
+	struct lguest_driver *drv = container_of(_drv, struct lguest_driver, drv);
+
+	return (drv->device_type == lguest_devices[dev->index].type);
+}
+
+struct lguest_bus {
+	struct bus_type bus;
+	struct device dev;
+};
+
+static struct lguest_bus lguest_bus = {
+	.bus = {
+		.name  = "lguest",
+		.match = lguest_dev_match,
+		.dev_attrs = lguest_dev_attrs,
+	},
+	.dev = {
+		.parent = NULL,
+		.bus_id = "lguest",
+	}
+};
+
+static int lguest_dev_probe(struct device *_dev)
+{
+	int ret;
+	struct lguest_device *dev = container_of(_dev, struct lguest_device, dev);
+	struct lguest_driver *drv = container_of(dev->dev.driver,
+						 struct lguest_driver, drv);
+
+	lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER;
+	ret = drv->probe(dev);
+	if (ret == 0)
+		lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER_OK;
+	return ret;
+}
+
+static int lguest_dev_remove(struct device *_dev)
+{
+	struct lguest_device *dev = container_of(_dev, struct lguest_device, dev);
+	struct lguest_driver *drv = container_of(dev->dev.driver,
+						 struct lguest_driver, drv);
+
+	if (dev->dev.driver && drv->remove)
+		drv->remove(dev);
+	put_device(&dev->dev);
+	return 0;
+}
+
+int register_lguest_driver(struct lguest_driver *drv)
+{
+	if (!lguest_devices)
+		return 0;
+
+	drv->drv.bus = &lguest_bus.bus;
+	drv->drv.name = drv->name;
+	drv->drv.owner = drv->owner;
+	drv->drv.probe = lguest_dev_probe;
+	drv->drv.remove = lguest_dev_remove;
+
+	return driver_register(&drv->drv);
+}
+EXPORT_SYMBOL_GPL(register_lguest_driver);
+
+void unregister_lguest_driver(struct lguest_driver *drv)
+{
+	if (!lguest_devices)
+		return;
+
+	driver_unregister(&drv->drv);
+}
+EXPORT_SYMBOL_GPL(unregister_lguest_driver);
+
+static void release_lguest_device(struct device *_dev)
+{
+	struct lguest_device *dev = container_of(_dev, struct lguest_device, dev);
+
+	lguest_devices[dev->index].status |= LGUEST_DEVICE_S_REMOVED_ACK;
+	kfree(dev);
+}
+
+static void add_lguest_device(unsigned int index)
+{
+	struct lguest_device *new;
+
+	lguest_devices[index].status |= LGUEST_DEVICE_S_ACKNOWLEDGE;
+	new = kmalloc(sizeof(struct lguest_device), GFP_KERNEL);
+	if (!new) {
+		printk(KERN_EMERG "Cannot allocate lguest device %u\n", index);
+		lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED;
+		return;
+	}
+
+	new->index = index;
+	new->private = NULL;
+	memset(&new->dev, 0, sizeof(new->dev));
+	new->dev.parent = &lguest_bus.dev;
+	new->dev.bus = &lguest_bus.bus;
+	new->dev.release = release_lguest_device;
+	sprintf(new->dev.bus_id, "%u", index);
+	if (device_register(&new->dev) != 0) {
+		printk(KERN_EMERG "Cannot register lguest device %u\n", index);
+		lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED;
+		kfree(new);
+	}
+}
+
+static void scan_devices(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < LGUEST_MAX_DEVICES; i++)
+		if (lguest_devices[i].type)
+			add_lguest_device(i);
+}
+
+static int __init lguest_bus_init(void)
+{
+	if (strcmp(paravirt_ops.name, "lguest") != 0)
+		return 0;
+
+	/* Devices are in a page above top of "normal" mem. */
+	lguest_devices = ioremap(max_pfn << PAGE_SHIFT, PAGE_SIZE);
+
+	if (bus_register(&lguest_bus.bus) != 0
+	    || device_register(&lguest_bus.dev) != 0)
+		panic("lguest bus registration failed");
+
+	scan_devices();
+	return 0;
+}
+postcore_initcall(lguest_bus_init);
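scan_devices() above treats any descriptor with a non-zero type as live, so
creating a device is simply a matter of the launcher filling in a slot of
the shared descriptor page.  A hypothetical console entry, as a hedged
illustration (the field values here are assumptions, not taken from this
patch):

/* Hypothetical descriptor the launcher might write; scan_devices()
 * would then register it as device "0" on the lguest bus. */
static const struct lguest_device_desc example_console = {
	.type	   = LGUEST_DEVICE_T_CONSOLE,
	.features  = 0,
	.status	   = 0,	/* bus and driver update this as they bind */
	.num_pages = 1,	/* assumption */
	.pfn	   = 0,	/* assumption: launcher fills in the real pfn */
};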
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/lguest_user.c
@@ -0,0 +1,242 @@
+/* Userspace control of the guest, via /dev/lguest. */
+#include
+#include
+#include
+#include "lg.h"
+
+static struct lguest_state *setup_guest_state(unsigned int num, void *pgdir,
+					      unsigned long start)
+{
+	struct lguest_state *guest = &__lguest_states()[num];
+	unsigned int i;
+	const long *def = __lguest_default_idt_entries();
+	struct lguest_regs *regs;
+
+	guest->gdt_table[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
+	guest->gdt_table[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
+	guest->gdt.size = GDT_ENTRIES*8-1;
+	guest->gdt.address = (unsigned long)&guest->gdt_table;
+
+	/* The guest's IDT entries are initialized from the defaults. */
+	guest->idt.size = 8 * IDT_ENTRIES;
+	guest->idt.address = (long)guest->idt_table;
+	for (i = 0; i < IDT_ENTRIES; i++) {
+		u32 flags = 0x8e00;
+
+		/* They can't "int" into any of them except hypercall. */
+		if (i == LGUEST_TRAP_ENTRY)
+			flags |= (GUEST_DPL << 13);
+
+		guest->idt_table[i].a = (LGUEST_CS<<16) | (def[i]&0x0000FFFF);
+		guest->idt_table[i].b = (def[i]&0xFFFF0000) | flags;
+	}
+
+	memset(&guest->tss, 0, sizeof(guest->tss));
+	guest->tss.ss0 = LGUEST_DS;
+	guest->tss.esp0 = (unsigned long)(guest+1);
+	guest->tss.io_bitmap_base = sizeof(guest->tss); /* No I/O for you! */
+
+	/* Write out stack in format lguest expects, so we can switch to it. */
+	regs = &guest->regs;
+	regs->cr3 = __pa(pgdir);
+	regs->eax = regs->ebx = regs->ecx = regs->edx = regs->esp = 0;
+	regs->edi = LGUEST_MAGIC_EDI;
+	regs->ebp = LGUEST_MAGIC_EBP;
+	regs->esi = LGUEST_MAGIC_ESI;
+	regs->gs = regs->fs = 0;
+	regs->ds = regs->es = __KERNEL_DS|GUEST_DPL;
+	regs->trapnum = regs->errcode = 0;
+	regs->eip = start;
+	regs->cs = __KERNEL_CS|GUEST_DPL;
+	regs->eflags = 0x202; /* Interrupts enabled. */
+	regs->ss = __KERNEL_DS|GUEST_DPL;
+
+	if (!fixup_gdt_table(guest->gdt_table, ARRAY_SIZE(guest->gdt_table),
+			     &guest->regs, &guest->tss))
+		return NULL;
+
+	return guest;
+}
+
+/* + addr */
+static long user_get_dma(struct lguest *lg, const u32 __user *input)
+{
+	unsigned long addr, udma, irq;
+
+	if (get_user(addr, input) != 0)
+		return -EFAULT;
+	udma = get_dma_buffer(lg, addr, &irq);
+	if (!udma)
+		return -ENOENT;
+
+	/* We put the irq number in udma->used_len. */
+	lhwrite_u32(lg, udma + offsetof(struct lguest_dma, used_len), irq);
+	return udma;
+}
+
+/* + irq */
+static int user_send_irq(struct lguest *lg, const u32 __user *input)
+{
+	u32 irq;
+
+	if (get_user(irq, input) != 0)
+		return -EFAULT;
+	if (irq >= LGUEST_IRQS)
+		return -EINVAL;
+	set_bit(irq, lg->irqs_pending);
+	return 0;
+}
+
+static ssize_t read(struct file *file, char __user *user, size_t size, loff_t *o)
+{
+	struct lguest *lg = file->private_data;
+
+	if (!lg)
+		return -EINVAL;
+
+	if (lg->dead) {
+		size_t len;
+
+		if (lg->dead == (void *)-1)
+			return -ENOMEM;
+
+		len = min(size, strlen(lg->dead)+1);
+		if (copy_to_user(user, lg->dead, len) != 0)
+			return -EFAULT;
+		return len;
+	}
+
+	if (lg->dma_is_pending)
+		lg->dma_is_pending = 0;
+
+	return run_guest(lg, user);
+}
+
+/* Take: pfnlimit, pgdir, start, pageoffset. */
+static int initialize(struct file *file, const u32 __user *input)
+{
+	struct lguest *lg;
+	int err, i;
+	u32 args[4];
+
+	if (file->private_data)
+		return -EBUSY;
+
+	if (copy_from_user(args, input, sizeof(args)) != 0)
+		return -EFAULT;
+
+	if (args[1] <= PAGE_SIZE)
+		return -EINVAL;
+
+	down(&lguest_lock);
+	i = find_free_guest();
+	if (i < 0) {
+		err = -ENOSPC;
+		goto unlock;
+	}
+	lg = &lguests[i];
+	lg->guestid = i;
+	lg->pfn_limit = args[0];
+	lg->page_offset = args[3];
+
+	lg->trap_page = (u32 *)get_zeroed_page(GFP_KERNEL);
+	if (!lg->trap_page) {
+		err = -ENOMEM;
+		goto release_guest;
+	}
+
+	err = init_guest_pagetable(lg, args[1]);
+	if (err)
+		goto free_trap_page;
+
+	lg->state = setup_guest_state(i, lg->pgdirs[lg->pgdidx].pgdir, args[2]);
+	if (!lg->state) {
+		err = -ENOEXEC;
+		goto release_pgtable;
+	}
+	up(&lguest_lock);
+
+	lg->tsk = current;
+	lg->mm = get_task_mm(current);
+	file->private_data = lg;
+	return sizeof(args);
+
+release_pgtable:
+	free_guest_pagetable(lg);
+free_trap_page:
+	free_page((long)lg->trap_page);
+release_guest:
+	memset(lg, 0, sizeof(*lg));
+unlock:
+	up(&lguest_lock);
+	return err;
+}
+
+static ssize_t write(struct file *file, const char __user *input,
+		     size_t size, loff_t *off)
+{
+	struct lguest *lg = file->private_data;
+	u32 req;
+
+	if (get_user(req, input) != 0)
+		return -EFAULT;
+	input += sizeof(req);
+
+	if (req != LHREQ_INITIALIZE && !lg)
+		return -EINVAL;
+	if (lg && lg->dead)
+		return -ENOENT;
+
+	switch (req) {
+	case LHREQ_INITIALIZE:
+		return initialize(file, (const u32 __user *)input);
+	case LHREQ_GETDMA:
+		return user_get_dma(lg, (const u32 __user *)input);
+	case LHREQ_IRQ:
+		return user_send_irq(lg, (const u32 __user *)input);
+	default:
+		return -EINVAL;
+	}
+}
+
+static int close(struct inode *inode, struct file *file)
+{
+	struct lguest *lg = file->private_data;
+
+	if (!lg)
+		return 0;
+
+	down(&lguest_lock);
+	release_all_dma(lg);
+	free_page((long)lg->trap_page);
+	free_guest_pagetable(lg);
+	mmput(lg->mm);
+	if (lg->dead != (void *)1)
+		kfree(lg->dead);
+	memset(lg->state, 0, sizeof(*lg->state));
+	memset(lg, 0, sizeof(*lg));
+	up(&lguest_lock);
+	return 0;
+}
+
+static struct file_operations lguest_fops = {
+	.owner	 = THIS_MODULE,
+	.release = close,
+	.write	 = write,
+	.read	 = read,
+};
+static struct miscdevice lguest_dev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name  = "lguest",
+	.fops  = &lguest_fops,
+};
+
+int __init lguest_device_init(void)
+{
+	return misc_register(&lguest_dev);
+}
+
+void __exit lguest_device_remove(void)
+{
+	misc_deregister(&lguest_dev);
+}
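To summarize the control interface just defined: the first u32 written
selects the request, LHREQ_INITIALIZE takes four more words, and read()
runs the guest until it needs servicing (or returns the death reason once
lg->dead is set).  A hedged userspace sketch, assuming the constants from
include/asm-i386/lguest_user.h (below) are in scope; run_lguest() and its
arguments are hypothetical names:

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

static int run_lguest(uint32_t pfnlimit, uint32_t pgdir,
		      uint32_t start, uint32_t pageoffset)
{
	/* First word is the request, then the LHREQ_INITIALIZE args. */
	uint32_t args[5] = { LHREQ_INITIALIZE,
			     pfnlimit, pgdir, start, pageoffset };
	char buf[256];
	int fd = open("/dev/lguest", O_RDWR);

	if (fd < 0)
		return -1;
	if (write(fd, args, sizeof(args)) < 0)
		return -1;
	/* Each read() resumes the guest; a dead guest yields its
	 * death reason as a string instead. */
	while (read(fd, buf, sizeof(buf)) > 0)
		;
	return 0;
}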
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/page_tables.c
@@ -0,0 +1,374 @@
+/* Shadow page table operations.
+ * Copyright (C) Rusty Russell IBM Corporation 2006.
+ * GPL v2 and any later version. */
+#include
+#include
+#include
+#include
+#include
+#include
+#include "lg.h"
+
+#define PTES_PER_PAGE_SHIFT 10
+#define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT)
+#define HYPERVISOR_PGD_ENTRY (PTES_PER_PAGE - 1)
+
+static DEFINE_PER_CPU(u32 *, hypervisor_pte_pages) = { NULL };
+#define hypervisor_pte_page(cpu) per_cpu(hypervisor_pte_pages, cpu)
+
+static unsigned vaddr_to_pgd(unsigned long vaddr)
+{
+	return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
+}
+
+/* These access the real versions. */
+static u32 *toplev(struct lguest *lg, u32 i, unsigned long vaddr)
+{
+	unsigned int index = vaddr_to_pgd(vaddr);
+
+	if (index >= HYPERVISOR_PGD_ENTRY) {
+		kill_guest(lg, "attempt to access hypervisor pages");
+		index = 0;
+	}
+	return &lg->pgdirs[i].pgdir[index];
+}
+
+static u32 *pteof(struct lguest *lg, u32 top, unsigned long vaddr)
+{
+	u32 *page = __va(top&PAGE_MASK);
+	BUG_ON(!(top & _PAGE_PRESENT));
+	return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE];
+}
+
+/* These access the guest versions. */
+static u32 gtoplev(struct lguest *lg, unsigned long vaddr)
+{
+	unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
+	return lg->pgdirs[lg->pgdidx].cr3 + index * sizeof(u32);
+}
+
+static u32 gpteof(struct lguest *lg, u32 gtop, unsigned long vaddr)
+{
+	u32 gpage = (gtop&PAGE_MASK);
+	BUG_ON(!(gtop & _PAGE_PRESENT));
+	return gpage + ((vaddr >> PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(u32);
+}
+
+static void release_pte(u32 pte)
+{
+	if (pte & _PAGE_PRESENT)
+		put_page(pfn_to_page(pte >> PAGE_SHIFT));
+}
+
+/* Do a virtual -> physical mapping on a user page. */
+static unsigned long get_pfn(unsigned long virtpfn, int write)
+{
+	struct vm_area_struct *vma;
+	struct page *page;
+	unsigned long ret = -1UL;
+
+	down_read(&current->mm->mmap_sem);
+	if (get_user_pages(current, current->mm, virtpfn << PAGE_SHIFT,
+			   1, write, 1, &page, &vma) == 1)
+		ret = page_to_pfn(page);
+	up_read(&current->mm->mmap_sem);
+	return ret;
+}
+
+static u32 check_pgtable_entry(struct lguest *lg, u32 entry)
+{
+	if ((entry & (_PAGE_PWT|_PAGE_PSE))
+	    || (entry >> PAGE_SHIFT) >= lg->pfn_limit)
+		kill_guest(lg, "bad page table entry");
+	return entry & ~_PAGE_GLOBAL;
+}
+
+static u32 get_pte(struct lguest *lg, u32 entry, int write)
+{
+	u32 pfn;
+
+	pfn = get_pfn(entry >> PAGE_SHIFT, write);
+	if (pfn == -1UL) {
+		kill_guest(lg, "failed to get page %u", entry>>PAGE_SHIFT);
+		return 0;
+	}
+	return ((pfn << PAGE_SHIFT) | (entry & (PAGE_SIZE-1)));
+}
+
+/* FIXME: We hold a reference to pages, which prevents them from being
+   swapped.  It'd be nice to have a callback when Linux wants to swap out. */
+/* We fault pages in, which allows us to update accessed/dirty bits.
+ * Returns 0 on failure, 1 once the page is mapped. */
+static int page_in(struct lguest *lg, u32 vaddr, unsigned flags)
+{
+	u32 gtop, gpte;
+	u32 *top, *pte, *ptepage;
+	u32 val;
+
+	gtop = gtoplev(lg, vaddr);
+	val = lhread_u32(lg, gtop);
+	if (!(val & _PAGE_PRESENT))
+		return 0;
+
+	top = toplev(lg, lg->pgdidx, vaddr);
+	if (!(*top & _PAGE_PRESENT)) {
+		/* Get a PTE page for them. */
+		ptepage = (void *)get_zeroed_page(GFP_KERNEL);
+		/* FIXME: Steal from self in this case? */
+		if (!ptepage) {
+			kill_guest(lg, "out of memory allocating pte page");
+			return 0;
+		}
+		val = check_pgtable_entry(lg, val);
+		*top = (__pa(ptepage) | (val & (PAGE_SIZE-1)));
+	} else
+		ptepage = __va(*top & PAGE_MASK);
+
+	gpte = gpteof(lg, val, vaddr);
+	val = lhread_u32(lg, gpte);
+
+	/* No page, or write to readonly page? */
+	if (!(val&_PAGE_PRESENT) || ((flags&_PAGE_DIRTY) && !(val&_PAGE_RW)))
+		return 0;
+
+	pte = pteof(lg, *top, vaddr);
+	val = check_pgtable_entry(lg, val) | flags;
+
+	/* We're done with the old pte. */
+	release_pte(*pte);
+
+	/* We don't make it writable if this isn't a write: a later
+	 * write will fault so we can set the dirty bit in the guest. */
+	if (val & _PAGE_DIRTY)
+		*pte = get_pte(lg, val, 1);
+	else
+		*pte = get_pte(lg, val & ~_PAGE_RW, 0);
+
+	/* Now we update dirty/accessed on the guest. */
+	lhwrite_u32(lg, gpte, val);
+	return 1;
+}
+
+int demand_page(struct lguest *lg, u32 vaddr, int write)
+{
+	return page_in(lg, vaddr, (write ? _PAGE_DIRTY : 0)|_PAGE_ACCESSED);
+}
+
+void pin_stack_pages(struct lguest *lg)
+{
+	unsigned int i;
+	u32 stack = lg->state->tss.esp1;
+
+	for (i = 0; i < lg->stack_pages; i++)
+		if (!demand_page(lg, stack - i*PAGE_SIZE, 1))
+			kill_guest(lg, "bad stack page %i@%#x", i, stack);
+}
+
+static unsigned int find_pgdir(struct lguest *lg, u32 pgtable)
+{
+	unsigned int i;
+	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
+		if (lg->pgdirs[i].cr3 == pgtable)
+			break;
+	return i;
+}
+
+static void release_pgd(struct lguest *lg, u32 *pgd)
+{
+	if (*pgd & _PAGE_PRESENT) {
+		unsigned int i;
+		u32 *ptepage = __va(*pgd & ~(PAGE_SIZE-1));
+		for (i = 0; i < PTES_PER_PAGE; i++)
+			release_pte(ptepage[i]);
+		free_page((long)ptepage);
+		*pgd = 0;
+	}
+}
+
+static void flush_user_mappings(struct lguest *lg, int idx)
+{
+	unsigned int i;
+	for (i = 0; i < vaddr_to_pgd(lg->page_offset); i++)
+		release_pgd(lg, lg->pgdirs[idx].pgdir + i);
+}
+
+void guest_pagetable_flush_user(struct lguest *lg)
+{
+	flush_user_mappings(lg, lg->pgdidx);
+}
+
+static unsigned int new_pgdir(struct lguest *lg, u32 cr3)
+{
+	unsigned int next;
+
+	next = (lg->pgdidx + random32()) % ARRAY_SIZE(lg->pgdirs);
+	if (!lg->pgdirs[next].pgdir) {
+		lg->pgdirs[next].pgdir = (u32 *)get_zeroed_page(GFP_KERNEL);
+		if (!lg->pgdirs[next].pgdir)
+			next = lg->pgdidx;
+	}
+	lg->pgdirs[next].cr3 = cr3;
+	/* Release all the non-kernel mappings. */
+	flush_user_mappings(lg, next);
+
+	return next;
+}
+void guest_new_pagetable(struct lguest *lg, u32 pgtable)
+{
+	int newpgdir;
+
+	newpgdir = find_pgdir(lg, pgtable);
+	if (newpgdir == ARRAY_SIZE(lg->pgdirs))
+		newpgdir = new_pgdir(lg, pgtable);
+	lg->pgdidx = newpgdir;
+	lg->state->regs.cr3 = __pa(lg->pgdirs[lg->pgdidx].pgdir);
+	pin_stack_pages(lg);
+}
+
+static void release_all_pagetables(struct lguest *lg)
+{
+	unsigned int i, j;
+
+	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
+		if (lg->pgdirs[i].pgdir)
+			for (j = 0; j < HYPERVISOR_PGD_ENTRY; j++)
+				release_pgd(lg, lg->pgdirs[i].pgdir + j);
+}
+
+void guest_pagetable_clear_all(struct lguest *lg)
+{
+	release_all_pagetables(lg);
+	pin_stack_pages(lg);
+}
+
+static void do_set_pte(struct lguest *lg, int idx,
+		       unsigned long vaddr, u32 val)
+{
+	u32 *top = toplev(lg, idx, vaddr);
+	if (*top & _PAGE_PRESENT) {
+		u32 *pte = pteof(lg, *top, vaddr);
+		release_pte(*pte);
+		if (val & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
+			val = check_pgtable_entry(lg, val);
+			*pte = get_pte(lg, val, val & _PAGE_DIRTY);
+		} else
+			*pte = 0;
+	}
+}
+
+void guest_set_pte(struct lguest *lg,
+		   unsigned long cr3, unsigned long vaddr, u32 val)
+{
+	/* Kernel mappings must be changed on all top levels. */
+	if (vaddr >= lg->page_offset) {
+		unsigned int i;
+		for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
+			if (lg->pgdirs[i].pgdir)
+				do_set_pte(lg, i, vaddr, val);
+	} else {
+		int pgdir = find_pgdir(lg, cr3);
+		if (pgdir != ARRAY_SIZE(lg->pgdirs))
+			do_set_pte(lg, pgdir, vaddr, val);
+	}
+}
+
+void guest_set_pud(struct lguest *lg, unsigned long cr3, u32 idx)
+{
+	int pgdir;
+
+	if (idx >= HYPERVISOR_PGD_ENTRY)
+		return;
+
+	pgdir = find_pgdir(lg, cr3);
+	if (pgdir < ARRAY_SIZE(lg->pgdirs))
+		release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx);
+}
+
+int init_guest_pagetable(struct lguest *lg, u32 pgtable)
+{
+	/* We assume this in flush_user_mappings, so check now. */
+	if (vaddr_to_pgd(lg->page_offset) >= HYPERVISOR_PGD_ENTRY)
+		return -EINVAL;
+	lg->pgdidx = 0;
+	lg->pgdirs[lg->pgdidx].cr3 = pgtable;
+	lg->pgdirs[lg->pgdidx].pgdir = (u32 *)get_zeroed_page(GFP_KERNEL);
+	if (!lg->pgdirs[lg->pgdidx].pgdir)
+		return -ENOMEM;
+	return 0;
+}
+
+void free_guest_pagetable(struct lguest *lg)
+{
+	unsigned int i;
+
+	release_all_pagetables(lg);
+	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
+		free_page((long)lg->pgdirs[i].pgdir);
+}
+/* Caller must be preempt-safe. */
+void map_trap_page(struct lguest *lg)
+{
+	int cpu = smp_processor_id();
+
+	hypervisor_pte_page(cpu)[0] = (__pa(lg->trap_page)|_PAGE_PRESENT);
+
+	/* Since the hypervisor is less than 4MB, we simply mug the top pte page. */
+	lg->pgdirs[lg->pgdidx].pgdir[HYPERVISOR_PGD_ENTRY] =
+		(__pa(hypervisor_pte_page(cpu))| _PAGE_KERNEL);
+}
+
+static void free_hypervisor_pte_pages(void)
+{
+	int i;
+
+	for_each_possible_cpu(i)
+		free_page((long)hypervisor_pte_page(i));
+}
+
+static __init int alloc_hypervisor_pte_pages(void)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		hypervisor_pte_page(i) = (u32 *)get_zeroed_page(GFP_KERNEL);
+		if (!hypervisor_pte_page(i)) {
+			free_hypervisor_pte_pages();
+			return -ENOMEM;
+		}
+	}
+	return 0;
+}
+
+static __init void populate_hypervisor_pte_page(int cpu)
+{
+	int i;
+	u32 *pte = hypervisor_pte_page(cpu);
+
+	for (i = 0; i < HYPERVISOR_PAGES; i++) {
+		/* First entry set dynamically in map_trap_page. */
+		pte[i+1] = ((page_to_pfn(&hype_pages[i]) << PAGE_SHIFT)
+			    | _PAGE_KERNEL_EXEC);
+	}
+}
+
+__init int init_pagetables(struct page hype_pages[])
+{
+	int ret;
+	unsigned int i;
+
+	ret = alloc_hypervisor_pte_pages();
+	if (ret)
+		return ret;
+
+	for_each_possible_cpu(i)
+		populate_hypervisor_pte_page(i);
+	return 0;
+}
+
+__exit void free_pagetables(void)
+{
+	free_hypervisor_pte_pages();
+}
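For concreteness, the shadow code above uses the classic i386 two-level
10/10/12 split, with the top PGD slot (HYPERVISOR_PGD_ENTRY == 1023) stolen
for the switcher.  An illustrative sketch of the index arithmetic done by
vaddr_to_pgd() and pteof() (not part of the patch):

/* Illustrative only.  E.g. vaddr 0xC0000000 (a typical PAGE_OFFSET)
 * gives pgd index 768, pte index 0, offset 0; pgd index 1023 is the
 * hypervisor's and is refused by toplev(). */
static void split_vaddr(unsigned long vaddr, unsigned int *pgd_idx,
			unsigned int *pte_idx, unsigned int *offset)
{
	*pgd_idx = vaddr >> 22;		 /* PAGE_SHIFT + PTES_PER_PAGE_SHIFT */
	*pte_idx = (vaddr >> 12) & 1023; /* % PTES_PER_PAGE */
	*offset	 = vaddr & 4095;	 /* byte within the 4k page */
}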
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/segments.c
@@ -0,0 +1,171 @@
+#include "lg.h"
+
+/* Dealing with GDT entries is such a horror, I convert to sanity and back. */
+struct decoded_gdt_entry
+{
+	u32 base, limit;
+	union {
+		struct {
+			unsigned type:4;
+			unsigned dtype:1;
+			unsigned dpl:2;
+			unsigned present:1;
+			unsigned unused:4;
+			unsigned avl:1;
+			unsigned mbz:1;
+			unsigned def:1;
+			unsigned page_granularity:1;
+		};
+		u16 raw_attributes;
+	};
+};
+
+static struct decoded_gdt_entry decode_gdt_entry(const struct desc_struct *en)
+{
+	struct decoded_gdt_entry de;
+	de.base = ((en->a >> 16) | ((en->b & 0xff) << 16)
+		   | (en->b & 0xFF000000));
+	de.limit = ((en->a & 0xFFFF) | (en->b & 0xF0000));
+	de.raw_attributes = (en->b >> 8);
+	return de;
+}
+
+static struct desc_struct encode_gdt_entry(const struct decoded_gdt_entry *de)
+{
+	struct desc_struct en;
+	en.a = ((de->limit & 0xFFFF) | (de->base << 16));
+	en.b = (((de->base >> 16) & 0xFF)
+		| ((((u32)de->raw_attributes) & 0xF0FF) << 8)
+		| (de->limit & 0xF0000)
+		| (de->base & 0xFF000000));
+	return en;
+}
+
+static int check_desc(const struct decoded_gdt_entry *dec)
+{
+	return (dec->mbz == 0 && dec->dtype == 1 && (dec->type & 4) == 0);
+}
+
+static void check_segment(const struct desc_struct *gdt, u32 *segreg)
+{
+	if (*segreg > 255 || !(gdt[*segreg >> 3].b & 0x8000))
+		*segreg = 0;
+}
+
+/* Ensure our manually-loaded segment regs don't fault in switch_to_guest. */
+static void check_live_segments(const struct desc_struct *gdt,
+				struct lguest_regs *regs)
+{
+	check_segment(gdt, &regs->es);
+	check_segment(gdt, &regs->ds);
+	check_segment(gdt, &regs->fs);
+	check_segment(gdt, &regs->gs);
+}
+
+int fixup_gdt_table(struct desc_struct *gdt, unsigned int num,
+		    struct lguest_regs *regs, struct x86_tss *tss)
+{
+	unsigned int i;
+	struct decoded_gdt_entry dec;
+
+	for (i = 0; i < num; i++) {
+		unsigned long base, length;
+
+		/* We override these ones, so we don't care what they give. */
+		if (i == GDT_ENTRY_TSS
+		    || i == GDT_ENTRY_LGUEST_CS
+		    || i == GDT_ENTRY_LGUEST_DS
+		    || i == GDT_ENTRY_DOUBLEFAULT_TSS)
+			continue;
+
+		dec = decode_gdt_entry(&gdt[i]);
+		if (!dec.present)
+			continue;
+
+		if (!check_desc(&dec))
+			return 0;
+
+		base = dec.base;
+		length = dec.limit + 1;
+		if (dec.page_granularity) {
+			base *= PAGE_SIZE;
+			length *= PAGE_SIZE;
+		}
+
+		/* Unacceptable base? */
+		if (base >= HYPE_ADDR)
+			return 0;
+
+		/* Wrap around or segment overlaps hypervisor mem? */
+		if (!length
+		    || base + length < base
+		    || base + length > HYPE_ADDR) {
+			/* Trim to edge of hypervisor. */
+			length = HYPE_ADDR - base;
+			if (dec.page_granularity)
+				dec.limit = (length / PAGE_SIZE) - 1;
+			else
+				dec.limit = length - 1;
+		}
+		if (dec.dpl == 0)
+			dec.dpl = GUEST_DPL;
+		gdt[i] = encode_gdt_entry(&dec);
+	}
+	check_live_segments(gdt, regs);
+
+	/* Now put in hypervisor data and code segments. */
+	gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
+	gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
+
+	/* Finally, the TSS entry. */
+	dec.base = (unsigned long)tss;
+	dec.limit = sizeof(*tss)-1;
+	dec.type = 0x9;
+	dec.dtype = 0;
+	dec.def = 0;
+	dec.present = 1;
+	dec.mbz = 0;
+	dec.page_granularity = 0;
+	gdt[GDT_ENTRY_TSS] = encode_gdt_entry(&dec);
+
+	return 1;
+}
+
+void load_guest_gdt(struct lguest *lg, u32 table, u32 num)
+{
+	if (num > GDT_ENTRIES)
+		kill_guest(lg, "too many gdt entries %i", num);
+
+	lhread(lg, lg->state->gdt_table, table,
+	       num * sizeof(lg->state->gdt_table[0]));
+	if (!fixup_gdt_table(lg->state->gdt_table, num,
+			     &lg->state->regs, &lg->state->tss))
+		kill_guest(lg, "bad gdt table");
+}
+
+/* We don't care about the limit here, since we only let them use these in
+ * usermode (where lack of the USER bit in the pagetable protects hypervisor
+ * mem).  However, we want to ensure it doesn't fault when loaded, since *we*
+ * are the ones who will load it in switch_to_guest.
+ */
+void guest_load_tls(struct lguest *lg, const struct desc_struct __user *gtls)
+{
+	unsigned int i;
+	struct desc_struct *tls = &lg->state->gdt_table[GDT_ENTRY_TLS_MIN];
+
+	lhread(lg, tls, (u32)gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
+	for (i = 0; i < ARRAY_SIZE(lg->tls_limits); i++) {
+		struct decoded_gdt_entry dec = decode_gdt_entry(&tls[i]);
+
+		if (!dec.present)
+			continue;
+
+		/* We truncate to one byte/page (depending on G bit) to neuter
+		   it, so ensure it's more than 1 page below the trap page. */
+		tls[i].a &= 0xFFFF0000;
+		lg->tls_limits[i] = dec.limit;
+		if (!check_desc(&dec) || dec.base > HYPE_ADDR - PAGE_SIZE)
+			kill_guest(lg, "bad TLS descriptor %i", i);
+	}
+	check_live_segments(lg->state->gdt_table, &lg->state->regs);
+}
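A worked example may help here: the standard flat 4GB kernel code segment
(a = 0x0000FFFF, b = 0x00CF9A00) decodes to base 0, limit 0xFFFFF, type 0xA,
dtype 1, dpl 0, present 1, def 1, page_granularity 1.  A hedged sketch of
what fixup_gdt_table() then does to it (illustrative only, not part of the
patch):

/* Illustrative: a flat dpl-0 code segment gets its dpl demoted to
 * GUEST_DPL, and since base+length (4GB) overlaps the hypervisor, its
 * limit is trimmed so the segment ends at HYPE_ADDR. */
static void example_fixup(void)
{
	struct desc_struct flat_cs = { .a = 0x0000FFFF, .b = 0x00CF9A00 };
	struct decoded_gdt_entry de = decode_gdt_entry(&flat_cs);

	de.dpl = GUEST_DPL;			/* demoted from 0 */
	de.limit = (HYPE_ADDR / PAGE_SIZE) - 1;	/* trimmed, page units */
	flat_cs = encode_gdt_entry(&de);
}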
===================================================================
--- /dev/null
+++ b/include/asm-i386/lguest.h
@@ -0,0 +1,86 @@
+/* Things the lguest guest needs to know. */
+#ifndef _ASM_LGUEST_H
+#define _ASM_LGUEST_H
+
+#define LGUEST_MAGIC_EBP 0x4C687970
+#define LGUEST_MAGIC_EDI 0x652D4D65
+#define LGUEST_MAGIC_ESI 0xFFFFFFFF
+
+#define LHCALL_FLUSH_ASYNC	0
+#define LHCALL_LGUEST_INIT	1
+#define LHCALL_CRASH		2
+#define LHCALL_LOAD_GDT		3
+#define LHCALL_NEW_PGTABLE	4
+#define LHCALL_FLUSH_TLB	5
+#define LHCALL_LOAD_IDT_ENTRY	6
+#define LHCALL_SET_STACK	7
+#define LHCALL_TS		8
+#define LHCALL_TIMER_READ	9
+#define LHCALL_TIMER_START	10
+#define LHCALL_HALT		11
+#define LHCALL_GET_WALLCLOCK	12
+#define LHCALL_BIND_DMA		13
+#define LHCALL_SEND_DMA		14
+#define LHCALL_SET_PTE		15
+#define LHCALL_SET_UNKNOWN_PTE	16
+#define LHCALL_SET_PUD		17
+#define LHCALL_LOAD_TLS		18
+
+#define LGUEST_TRAP_ENTRY 0x1F
+
+static inline unsigned long
+hcall(unsigned long call,
+      unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+	asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
+		     : "=a"(call)
+		     : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)
+		     : "memory");
+	return call;
+}
+
+void async_hcall(unsigned long call,
+		 unsigned long arg1, unsigned long arg2, unsigned long arg3);
+
+#define LGUEST_IRQS 32
+
+#define LHCALL_RING_SIZE 64
+struct hcall_ring
+{
+	u32 eax, edx, ebx, ecx;
+};
+
+/* All the good stuff happens here: guest registers it with LHCALL_LGUEST_INIT. */
+struct lguest_data
+{
+/* Fields which change during running: */
+	/* 512 == enabled (same as eflags). */
+	unsigned int irq_enabled;
+	/* Blocked interrupts. */
+	DECLARE_BITMAP(interrupts, LGUEST_IRQS);
+
+	/* Last (userspace) address we got a GPF & reloaded gs. */
+	unsigned int gs_gpf_eip;
+
+	/* Virtual address of page fault. */
+	unsigned long cr2;
+
+	/* Async hypercall ring.  0xFF == done, 0 == pending. */
+	u8 hcall_status[LHCALL_RING_SIZE];
+	struct hcall_ring hcalls[LHCALL_RING_SIZE];
+
+/* Fields initialized by the hypervisor at boot: */
+	/* Memory not to try to access. */
+	unsigned long reserve_mem;
+	/* ID of this guest (used by network driver to set ethernet address). */
+	u16 guestid;
+	/* Multiplier for TSC clock. */
+	u32 clock_mult;
+
+/* Fields initialized by the guest at boot: */
+	/* Instruction range to suppress interrupts even if enabled. */
+	unsigned long noirq_start, noirq_end;
+};
+extern struct lguest_data lguest_data;
+extern struct lguest_device_desc *lguest_devices; /* Just past max_pfn. */
+#endif /* _ASM_LGUEST_H */
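The calling convention baked into hcall() above mirrors the hcall_ring
layout: %eax carries the call number and doubles as the return value, with
%edx, %ebx and %ecx as the three arguments.  A small hedged usage sketch
(example_crash is a hypothetical name; LHCALL_CRASH's argument convention is
taken from lguest.c above):

/* Synchronous hypercall: traps via int $0x1F and returns %eax. */
static inline void example_crash(const char *reason)
{
	hcall(LHCALL_CRASH, __pa(reason), 0, 0);
}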
===================================================================
--- /dev/null
+++ b/include/asm-i386/lguest_device.h
@@ -0,0 +1,31 @@
+#ifndef _ASM_LGUEST_DEVICE_H
+#define _ASM_LGUEST_DEVICE_H
+/* Everything you need to know about lguest devices. */
+#include
+#include
+#include
+
+struct lguest_device {
+	/* Unique busid, and index into lguest_page->devices[]. */
+	/* By convention, each device can use irq index+1 if it wants to. */
+	unsigned int index;
+
+	struct device dev;
+
+	/* Driver can hang data off here. */
+	void *private;
+};
+
+struct lguest_driver {
+	const char *name;
+	struct module *owner;
+	u16 device_type;
+	int (*probe)(struct lguest_device *dev);
+	void (*remove)(struct lguest_device *dev);
+
+	struct device_driver drv;
+};
+
+extern int register_lguest_driver(struct lguest_driver *drv);
+extern void unregister_lguest_driver(struct lguest_driver *drv);
+#endif /* _ASM_LGUEST_DEVICE_H */
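Pairing this header with the bus code earlier: a driver fills in name,
owner, device_type and the probe/remove hooks, and register_lguest_driver()
does the rest; matching is purely by device_type.  A minimal hedged skeleton
(all the example_* names are hypothetical):

static int example_probe(struct lguest_device *dev)
{
	dev->private = NULL;	/* driver-private state hangs here */
	return 0;		/* 0 makes the bus set LGUEST_DEVICE_S_DRIVER_OK */
}

static struct lguest_driver example_driver = {
	.name		= "lguest_example",
	.owner		= THIS_MODULE,
	.device_type	= LGUEST_DEVICE_T_CONSOLE,
	.probe		= example_probe,
};

static int __init example_init(void)
{
	return register_lguest_driver(&example_driver);
}
module_init(example_init);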
===================================================================
--- /dev/null
+++ b/include/asm-i386/lguest_user.h
@@ -0,0 +1,86 @@
+#ifndef _ASM_LGUEST_USER
+#define _ASM_LGUEST_USER
+/* Everything the "lguest" userspace program needs to know. */
+/* They can register up to 32 arrays of lguest_dma. */
+#define LGUEST_MAX_DMA		32
+/* At most we can dma 16 lguest_dma in one op. */
+#define LGUEST_MAX_DMA_SECTIONS	16
+
+/* How many devices?  Assume each one wants up to two dma arrays per device. */
+#define LGUEST_MAX_DEVICES (LGUEST_MAX_DMA/2)
+
+struct lguest_dma
+{
+	/* 0 if free to be used, filled by hypervisor. */
+	u32 used_len;
+	u32 addr[LGUEST_MAX_DMA_SECTIONS];
+	u16 len[LGUEST_MAX_DMA_SECTIONS];
+};
+
+/* This is found at address 0. */
+struct lguest_boot_info
+{
+	u32 max_pfn;
+	u32 initrd_size;
+	char cmdline[256];
+};
+
+struct lguest_block_page
+{
+	/* 0 is a read, 1 is a write. */
+	int type;
+	u32 sector;	/* Offset in device = sector * 512. */
+	u32 bytes;	/* Length expected to be read/written in bytes. */
+	/* 0 = pending, 1 = done, 2 = done, error. */
+	int result;
+	u32 num_sectors; /* Disk length = num_sectors * 512. */
+};
+
+/* There is a shared page of these. */
+struct lguest_net
+{
+	union {
+		unsigned char mac[6];
+		struct {
+			u8 promisc;
+			u8 pad;
+			u16 guestid;
+		};
+	};
+};
+
+/* lguest_device_desc->type */
+#define LGUEST_DEVICE_T_CONSOLE	1
+#define LGUEST_DEVICE_T_NET	2
+#define LGUEST_DEVICE_T_BLOCK	3
+
+/* lguest_device_desc->status.  256 and above are device specific. */
+#define LGUEST_DEVICE_S_ACKNOWLEDGE	1   /* We have seen device. */
+#define LGUEST_DEVICE_S_DRIVER		2   /* We have found a driver. */
+#define LGUEST_DEVICE_S_DRIVER_OK	4   /* Driver says OK! */
+#define LGUEST_DEVICE_S_REMOVED		8   /* Device has gone away. */
+#define LGUEST_DEVICE_S_REMOVED_ACK	16  /* Driver has been told. */
+#define LGUEST_DEVICE_S_FAILED		128 /* Something actually failed. */
+
+#define LGUEST_NET_F_NOCSUM		0x4000 /* Don't bother checksumming. */
+#define LGUEST_DEVICE_F_RANDOMNESS	0x8000 /* IRQ is fairly random. */
+
+/* We have a page of these descriptors in the lguest_device page. */
+struct lguest_device_desc {
+	u16 type;
+	u16 features;
+	u16 status;
+	u16 num_pages;
+	u32 pfn;
+};
+
+/* Write command first word is a request. */
+enum lguest_req
+{
+	LHREQ_INITIALIZE, /* + pfnlimit, pgdir, start, pageoffset */
+	LHREQ_GETDMA, /* + addr (returns &lguest_dma, irq in ->used_len) */
+	LHREQ_IRQ, /* + irq */
+};
+
+#endif /* _ASM_LGUEST_USER */
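Finally, a hedged sketch of how a guest driver might fill in a struct
lguest_dma before handing it to the host via LHCALL_BIND_DMA; the
list-termination convention is an assumption on my part, not something this
patch spells out:

static void example_dma_setup(struct lguest_dma *dma,
			      u32 buf_phys, u16 buf_len)
{
	dma->used_len = 0;	/* 0 == free for the hypervisor to fill */
	dma->addr[0] = buf_phys;
	dma->len[0] = buf_len;
	dma->len[1] = 0;	/* assumption: a zero length ends the list */
}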