* [PATCH 1/8] lguest: the guest code [not found] <1176203068.26372.21.camel@localhost.localdomain> @ 2007-04-10 11:05 ` Rusty Russell [not found] ` <1176203130.26372.23.camel@localhost.localdomain> 1 sibling, 0 replies; 13+ messages in thread From: Rusty Russell @ 2007-04-10 11:05 UTC (permalink / raw) To: Andrew Morton; +Cc: Andi Kleen, lkml - Kernel Mailing List, virtualization This is the code and headers required to make an i386 kernel an lguest guest. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> --- drivers/lguest/lguest.c | 509 +++++++++++++++++++++++++++++++++++++++++++ drivers/lguest/lguest_asm.S | 59 ++++ drivers/lguest/lguest_bus.c | 148 ++++++++++++ include/linux/lguest.h | 85 +++++++ include/linux/lguest_bus.h | 33 ++ 5 files changed, 834 insertions(+) =================================================================== --- /dev/null +++ b/drivers/lguest/lguest.c @@ -0,0 +1,509 @@ +/* + * Lguest specific paravirt-ops implementation + * + * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include <linux/kernel.h> +#include <linux/start_kernel.h> +#include <linux/string.h> +#include <linux/console.h> +#include <linux/screen_info.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <linux/lguest.h> +#include <linux/lguest_launcher.h> +#include <asm/paravirt.h> +#include <asm/param.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/desc.h> +#include <asm/setup.h> +#include <asm/e820.h> +#include <asm/pda.h> +#include <asm/mce.h> + +/* Declarations for definitions in lguest_guest.S */ +extern char lguest_noirq_start[], lguest_noirq_end[]; +extern const char lgstart_cli[], lgend_cli[]; +extern const char lgstart_sti[], lgend_sti[]; +extern const char lgstart_popf[], lgend_popf[]; +extern const char lgstart_pushf[], lgend_pushf[]; +extern const char lgstart_iret[], lgend_iret[]; +extern asmlinkage void lguest_maybe_init(void); +extern void lguest_iret(void); + +struct lguest_data lguest_data = { + .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, + .noirq_start = (u32)lguest_noirq_start, + .noirq_end = (u32)lguest_noirq_end, + .blocked_interrupts = { 1 }, /* Block timer interrupts */ +}; +struct lguest_device_desc *lguest_devices; +static __initdata const struct lguest_boot_info *boot = __va(0); + +static enum paravirt_lazy_mode lazy_mode; +static void lguest_lazy_mode(enum paravirt_lazy_mode mode) +{ + lazy_mode = mode; + if (mode == PARAVIRT_LAZY_NONE) + hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0); +} + +static void lazy_hcall(unsigned long call, + unsigned long arg1, + unsigned long arg2, + unsigned long arg3) +{ + if (lazy_mode == PARAVIRT_LAZY_NONE) + hcall(call, arg1, arg2, arg3); + else + async_hcall(call, arg1, arg2, arg3); +} + +void async_hcall(unsigned long call, + unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ + /* Note: This code assumes we're uniprocessor. */ + static unsigned int next_call; + unsigned long flags; + + local_irq_save(flags); + if (lguest_data.hcall_status[next_call] != 0xFF) { + /* Table full, so do normal hcall which will flush table. */ + hcall(call, arg1, arg2, arg3); + } else { + lguest_data.hcalls[next_call].eax = call; + lguest_data.hcalls[next_call].edx = arg1; + lguest_data.hcalls[next_call].ebx = arg2; + lguest_data.hcalls[next_call].ecx = arg3; + /* Make sure host sees arguments before "valid" flag. */ + wmb(); + lguest_data.hcall_status[next_call] = 0; + if (++next_call == LHCALL_RING_SIZE) + next_call = 0; + } + local_irq_restore(flags); +} + +static unsigned long save_fl(void) +{ + return lguest_data.irq_enabled; +} + +static void restore_fl(unsigned long flags) +{ + /* FIXME: Check if interrupt pending... */ + lguest_data.irq_enabled = flags; +} + +static void irq_disable(void) +{ + lguest_data.irq_enabled = 0; +} + +static void irq_enable(void) +{ + /* FIXME: Check if interrupt pending... */ + lguest_data.irq_enabled = X86_EFLAGS_IF; +} + +static void lguest_write_idt_entry(struct desc_struct *dt, + int entrynum, u32 low, u32 high) +{ + write_dt_entry(dt, entrynum, low, high); + hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high); +} + +static void lguest_load_idt(const struct Xgt_desc_struct *desc) +{ + unsigned int i; + struct desc_struct *idt = (void *)desc->address; + + for (i = 0; i < (desc->size+1)/8; i++) + hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b); +} + +static void lguest_load_gdt(const struct Xgt_desc_struct *desc) +{ + BUG_ON((desc->size+1)/8 != GDT_ENTRIES); + hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0); +} + +static void lguest_write_gdt_entry(struct desc_struct *dt, + int entrynum, u32 low, u32 high) +{ + write_dt_entry(dt, entrynum, low, high); + hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0); +} + +static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) +{ + lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0); +} + +static void lguest_set_ldt(const void *addr, unsigned entries) +{ +} + +static void lguest_load_tr_desc(void) +{ +} + +static void lguest_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + int function = *eax; + + native_cpuid(eax, ebx, ecx, edx); + switch (function) { + case 1: /* Basic feature request. */ + /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ + *ecx &= 0x00002201; + /* Similarly: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */ + *edx &= 0x07808101; + /* Host wants to know when we flush kernel pages: set PGE. */ + *edx |= 0x00002000; + break; + case 0x80000000: + /* Futureproof this a little: if they ask how much extended + * processor information, limit it to known fields. */ + if (*eax > 0x80000008) + *eax = 0x80000008; + break; + } +} + +static unsigned long current_cr0, current_cr3; +static void lguest_write_cr0(unsigned long val) +{ + hcall(LHCALL_TS, val & 8, 0, 0); + current_cr0 = val; +} + +static unsigned long lguest_read_cr0(void) +{ + return current_cr0; +} + +static void lguest_clts(void) +{ + lazy_hcall(LHCALL_TS, 0, 0, 0); + current_cr0 &= ~8U; +} + +static unsigned long lguest_read_cr2(void) +{ + return lguest_data.cr2; +} + +static void lguest_write_cr3(unsigned long cr3) +{ + hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0); + current_cr3 = cr3; +} + +static unsigned long lguest_read_cr3(void) +{ + return current_cr3; +} + +/* Used to enable/disable PGE, but we don't care. */ +static unsigned long lguest_read_cr4(void) +{ + return 0; +} + +static void lguest_write_cr4(unsigned long val) +{ +} + +static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval) +{ + *ptep = pteval; + lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), addr, pteval.pte_low); +} + +/* We only support two-level pagetables at the moment. */ +static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) +{ + *pmdp = pmdval; + lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK, + (__pa(pmdp)&(PAGE_SIZE-1))/4, 0); +} + +/* FIXME: Eliminate all callers of this. */ +static void lguest_set_pte(pte_t *ptep, pte_t pteval) +{ + *ptep = pteval; + /* Don't bother with hypercall before initial setup. */ + if (current_cr3) + hcall(LHCALL_FLUSH_TLB, 1, 0, 0); +} + +static void lguest_flush_tlb_single(unsigned long addr) +{ + /* Simply set it to zero, and it will fault back in. */ + lazy_hcall(LHCALL_SET_PTE, current_cr3, addr, 0); +} + +static void lguest_flush_tlb_user(void) +{ + lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0); +} + +static void lguest_flush_tlb_kernel(void) +{ + lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0); +} + +static void disable_lguest_irq(unsigned int irq) +{ + set_bit(irq, lguest_data.blocked_interrupts); +} + +static void enable_lguest_irq(unsigned int irq) +{ + clear_bit(irq, lguest_data.blocked_interrupts); + /* FIXME: If it's pending? */ +} + +static struct irq_chip lguest_irq_controller = { + .name = "lguest", + .mask = disable_lguest_irq, + .mask_ack = disable_lguest_irq, + .unmask = enable_lguest_irq, +}; + +static void __init lguest_init_IRQ(void) +{ + unsigned int i; + + for (i = 0; i < LGUEST_IRQS; i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + if (vector != SYSCALL_VECTOR) { + set_intr_gate(vector, interrupt[i]); + set_irq_chip_and_handler(i, &lguest_irq_controller, + handle_level_irq); + } + } + irq_ctx_init(smp_processor_id()); +} + +static unsigned long lguest_get_wallclock(void) +{ + return hcall(LHCALL_GET_WALLCLOCK, 0, 0, 0); +} + +static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) +{ + do_timer(hcall(LHCALL_TIMER_READ, 0, 0, 0)); + update_process_times(user_mode_vm(get_irq_regs())); +} + +static void lguest_time_init(void) +{ + set_irq_handler(0, lguest_time_irq); + hcall(LHCALL_TIMER_READ, 0, 0, 0); + enable_lguest_irq(0); +} + +static void lguest_load_esp0(struct tss_struct *tss, + struct thread_struct *thread) +{ + lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->esp0, + THREAD_SIZE/PAGE_SIZE); +} + +static void lguest_set_debugreg(int regno, unsigned long value) +{ + /* FIXME: Implement */ +} + +static void lguest_wbinvd(void) +{ +} + +#ifdef CONFIG_X86_LOCAL_APIC +static void lguest_apic_write(unsigned long reg, unsigned long v) +{ +} + +static unsigned long lguest_apic_read(unsigned long reg) +{ + return 0; +} +#endif + +static void lguest_safe_halt(void) +{ + hcall(LHCALL_HALT, 0, 0, 0); +} + +static void lguest_power_off(void) +{ + hcall(LHCALL_CRASH, __pa("Power down"), 0, 0); +} + +static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) +{ + hcall(LHCALL_CRASH, __pa(p), 0, 0); + return NOTIFY_DONE; +} + +static struct notifier_block paniced = { + .notifier_call = lguest_panic +}; + +static __init char *lguest_memory_setup(void) +{ + /* We do this here because lockcheck barfs if before start_kernel */ + atomic_notifier_chain_register(&panic_notifier_list, &paniced); + + e820.nr_map = 0; + add_memory_region(0, PFN_PHYS(boot->max_pfn), E820_RAM); + return "LGUEST"; +} + +static const struct lguest_insns +{ + const char *start, *end; +} lguest_insns[] = { + [PARAVIRT_PATCH(irq_disable)] = { lgstart_cli, lgend_cli }, + [PARAVIRT_PATCH(irq_enable)] = { lgstart_sti, lgend_sti }, + [PARAVIRT_PATCH(restore_fl)] = { lgstart_popf, lgend_popf }, + [PARAVIRT_PATCH(save_fl)] = { lgstart_pushf, lgend_pushf }, + [PARAVIRT_PATCH(iret)] = { lgstart_iret, lgend_iret }, +}; +static unsigned lguest_patch(u8 type, u16 clobber, void *insns, unsigned len) +{ + unsigned int insn_len; + + /* Don't touch it if we don't have a replacement */ + if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start) + return len; + + insn_len = lguest_insns[type].end - lguest_insns[type].start; + + /* Similarly if we can't fit replacement. */ + if (len < insn_len) + return len; + + memcpy(insns, lguest_insns[type].start, insn_len); + if (type == PARAVIRT_PATCH(iret)) { + /* Jumps are relative. */ + u32 off = (u32)lguest_iret - ((u32)insns + insn_len); + memcpy(insns+1, &off, sizeof(off)); + } + return insn_len; +} + +/* From head.S */ +extern void setup_pda(void); +extern struct Xgt_desc_struct early_gdt_descr; + +__init void lguest_init(void) +{ + paravirt_ops.name = "lguest"; + paravirt_ops.paravirt_enabled = 1; + paravirt_ops.kernel_rpl = 1; + + paravirt_ops.save_fl = save_fl; + paravirt_ops.restore_fl = restore_fl; + paravirt_ops.irq_disable = irq_disable; + paravirt_ops.irq_enable = irq_enable; + paravirt_ops.load_gdt = lguest_load_gdt; + paravirt_ops.memory_setup = lguest_memory_setup; + paravirt_ops.cpuid = lguest_cpuid; + paravirt_ops.write_cr3 = lguest_write_cr3; + paravirt_ops.flush_tlb_user = lguest_flush_tlb_user; + paravirt_ops.flush_tlb_single = lguest_flush_tlb_single; + paravirt_ops.flush_tlb_kernel = lguest_flush_tlb_kernel; + paravirt_ops.set_pte = lguest_set_pte; + paravirt_ops.set_pte_at = lguest_set_pte_at; + paravirt_ops.set_pmd = lguest_set_pmd; +#ifdef CONFIG_X86_LOCAL_APIC + paravirt_ops.apic_write = lguest_apic_write; + paravirt_ops.apic_write_atomic = lguest_apic_write; + paravirt_ops.apic_read = lguest_apic_read; +#endif + paravirt_ops.load_idt = lguest_load_idt; + paravirt_ops.iret = lguest_iret; + paravirt_ops.load_esp0 = lguest_load_esp0; + paravirt_ops.load_tr_desc = lguest_load_tr_desc; + paravirt_ops.set_ldt = lguest_set_ldt; + paravirt_ops.load_tls = lguest_load_tls; + paravirt_ops.set_debugreg = lguest_set_debugreg; + paravirt_ops.clts = lguest_clts; + paravirt_ops.read_cr0 = lguest_read_cr0; + paravirt_ops.write_cr0 = lguest_write_cr0; + paravirt_ops.init_IRQ = lguest_init_IRQ; + paravirt_ops.read_cr2 = lguest_read_cr2; + paravirt_ops.read_cr3 = lguest_read_cr3; + paravirt_ops.read_cr4 = lguest_read_cr4; + paravirt_ops.write_cr4 = lguest_write_cr4; + paravirt_ops.write_gdt_entry = lguest_write_gdt_entry; + paravirt_ops.write_idt_entry = lguest_write_idt_entry; + paravirt_ops.patch = lguest_patch; + paravirt_ops.safe_halt = lguest_safe_halt; + paravirt_ops.get_wallclock = lguest_get_wallclock; + paravirt_ops.time_init = lguest_time_init; + paravirt_ops.set_lazy_mode = lguest_lazy_mode; + paravirt_ops.wbinvd = lguest_wbinvd; + + hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0); + strncpy(boot_command_line, boot->cmdline, COMMAND_LINE_SIZE); + + /* We use top of mem for initial pagetables. */ + init_pg_tables_end = __pa(pg0); + + /* set up PDA descriptor */ + setup_pda(); + load_gdt(&early_gdt_descr); + asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory"); + + reserve_top_address(lguest_data.reserve_mem); + + cpu_detect(&new_cpu_data); + /* Need this before paging_init. */ + set_bit(X86_FEATURE_PGE, new_cpu_data.x86_capability); + /* Math is always hard! */ + new_cpu_data.hard_math = 1; + +#ifdef CONFIG_X86_MCE + mce_disabled = 1; +#endif + +#ifdef CONFIG_ACPI + acpi_disabled = 1; + acpi_ht = 0; +#endif + + add_preferred_console("hvc", 0, NULL); + + if (boot->initrd_size) { + /* We stash this at top of memory. */ + INITRD_START = boot->max_pfn*PAGE_SIZE - boot->initrd_size; + INITRD_SIZE = boot->initrd_size; + LOADER_TYPE = 0xFF; + } + + pm_power_off = lguest_power_off; + start_kernel(); +} +paravirt_probe(lguest_maybe_init); =================================================================== --- /dev/null +++ b/drivers/lguest/lguest_asm.S @@ -0,0 +1,59 @@ +#include <linux/linkage.h> +#include <linux/lguest.h> +#include <asm/asm-offsets.h> + +/* FIXME: Once asm/processor-flags.h goes in, include that */ +#define X86_EFLAGS_IF 0x00000200 + +/* + * This is where we begin: head.S notes that paging is already enabled (which + * doesn't happen in native boot) and calls the registered paravirt_probe + * functions one at a time. Ours is a simple assembler test for magic + * registers. If they're correct we jump to lguest_init. + * + * We put it in .init.text will be discarded after boot. + */ +.section .init.text, "ax", @progbits +ENTRY(lguest_maybe_init) + cmpl $LGUEST_MAGIC_EBP, %ebp + jne out + cmpl $LGUEST_MAGIC_EDI, %edi + jne out + cmpl $LGUEST_MAGIC_ESI, %esi + jne out + je lguest_init +out: + ret + +/* The templates for inline patching. */ +#define LGUEST_PATCH(name, insns...) \ + lgstart_##name: insns; lgend_##name:; \ + .globl lgstart_##name; .globl lgend_##name + +LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) +LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled) +LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled) +LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) +LGUEST_PATCH(iret, .byte 0xE9,0,0,0,0) /* jmp <to-be-patched> */ + +.text +/* These demark the EIP range where host should never deliver interrupts. */ +.global lguest_noirq_start +.global lguest_noirq_end + +/* + * We move eflags word to lguest_data.irq_enabled to restore interrupt state. + * For page faults, gpfs and virtual interrupts, the hypervisor has saved + * eflags manually, otherwise it was delivered directly and so eflags reflects + * the real machine IF state, ie. interrupts on. Since the kernel always dies + * if it takes such a trap with interrupts disabled anyway, turning interrupts + * back on unconditionally here is OK. + */ +ENTRY(lguest_iret) + pushl %eax + movl 12(%esp), %eax +lguest_noirq_start: + movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled + popl %eax + iret +lguest_noirq_end: =================================================================== --- /dev/null +++ b/drivers/lguest/lguest_bus.c @@ -0,0 +1,148 @@ +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/lguest_bus.h> +#include <asm/io.h> + +static ssize_t type_show(struct device *_dev, + struct device_attribute *attr, char *buf) +{ + struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); + return sprintf(buf, "%hu", lguest_devices[dev->index].type); +} +static ssize_t features_show(struct device *_dev, + struct device_attribute *attr, char *buf) +{ + struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); + return sprintf(buf, "%hx", lguest_devices[dev->index].features); +} +static ssize_t pfn_show(struct device *_dev, + struct device_attribute *attr, char *buf) +{ + struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); + return sprintf(buf, "%u", lguest_devices[dev->index].pfn); +} +static ssize_t status_show(struct device *_dev, + struct device_attribute *attr, char *buf) +{ + struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); + return sprintf(buf, "%hx", lguest_devices[dev->index].status); +} +static ssize_t status_store(struct device *_dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); + if (sscanf(buf, "%hi", &lguest_devices[dev->index].status) != 1) + return -EINVAL; + return count; +} +static struct device_attribute lguest_dev_attrs[] = { + __ATTR_RO(type), + __ATTR_RO(features), + __ATTR_RO(pfn), + __ATTR(status, 0644, status_show, status_store), + __ATTR_NULL +}; + +static int lguest_dev_match(struct device *_dev, struct device_driver *_drv) +{ + struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); + struct lguest_driver *drv = container_of(_drv,struct lguest_driver,drv); + + return (drv->device_type == lguest_devices[dev->index].type); +} + +struct lguest_bus { + struct bus_type bus; + struct device dev; +}; + +static struct lguest_bus lguest_bus = { + .bus = { + .name = "lguest", + .match = lguest_dev_match, + .dev_attrs = lguest_dev_attrs, + }, + .dev = { + .parent = NULL, + .bus_id = "lguest", + } +}; + +static int lguest_dev_probe(struct device *_dev) +{ + int ret; + struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); + struct lguest_driver *drv = container_of(dev->dev.driver, + struct lguest_driver, drv); + + lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER; + ret = drv->probe(dev); + if (ret == 0) + lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER_OK; + return ret; +} + +int register_lguest_driver(struct lguest_driver *drv) +{ + if (!lguest_devices) + return 0; + + drv->drv.bus = &lguest_bus.bus; + drv->drv.name = drv->name; + drv->drv.owner = drv->owner; + drv->drv.probe = lguest_dev_probe; + + return driver_register(&drv->drv); +} +EXPORT_SYMBOL_GPL(register_lguest_driver); + +static void add_lguest_device(unsigned int index) +{ + struct lguest_device *new; + + lguest_devices[index].status |= LGUEST_DEVICE_S_ACKNOWLEDGE; + new = kmalloc(sizeof(struct lguest_device), GFP_KERNEL); + if (!new) { + printk(KERN_EMERG "Cannot allocate lguest device %u\n", index); + lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED; + return; + } + + new->index = index; + new->private = NULL; + memset(&new->dev, 0, sizeof(new->dev)); + new->dev.parent = &lguest_bus.dev; + new->dev.bus = &lguest_bus.bus; + sprintf(new->dev.bus_id, "%u", index); + if (device_register(&new->dev) != 0) { + printk(KERN_EMERG "Cannot register lguest device %u\n", index); + lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED; + kfree(new); + } +} + +static void scan_devices(void) +{ + unsigned int i; + + for (i = 0; i < LGUEST_MAX_DEVICES; i++) + if (lguest_devices[i].type) + add_lguest_device(i); +} + +static int __init lguest_bus_init(void) +{ + if (strcmp(paravirt_ops.name, "lguest") != 0) + return 0; + + /* Devices are in page above top of "normal" mem. */ + lguest_devices = ioremap(max_pfn << PAGE_SHIFT, PAGE_SIZE); + + if (bus_register(&lguest_bus.bus) != 0 + || device_register(&lguest_bus.dev) != 0) + panic("lguest bus registration failed"); + + scan_devices(); + return 0; +} +postcore_initcall(lguest_bus_init); =================================================================== --- /dev/null +++ b/include/linux/lguest.h @@ -0,0 +1,85 @@ +/* Things the lguest guest needs to know. Note: like all lguest interfaces, + * this is subject to wild and random change between versions. */ +#ifndef _ASM_LGUEST_H +#define _ASM_LGUEST_H + +/* These are randomly chosen numbers which indicate we're an lguest at boot */ +#define LGUEST_MAGIC_EBP 0x4C687970 +#define LGUEST_MAGIC_EDI 0x652D4D65 +#define LGUEST_MAGIC_ESI 0xFFFFFFFF + +#ifndef __ASSEMBLY__ +#include <asm/irq.h> + +#define LHCALL_FLUSH_ASYNC 0 +#define LHCALL_LGUEST_INIT 1 +#define LHCALL_CRASH 2 +#define LHCALL_LOAD_GDT 3 +#define LHCALL_NEW_PGTABLE 4 +#define LHCALL_FLUSH_TLB 5 +#define LHCALL_LOAD_IDT_ENTRY 6 +#define LHCALL_SET_STACK 7 +#define LHCALL_TS 8 +#define LHCALL_TIMER_READ 9 +#define LHCALL_HALT 10 +#define LHCALL_GET_WALLCLOCK 11 +#define LHCALL_BIND_DMA 12 +#define LHCALL_SEND_DMA 13 +#define LHCALL_SET_PTE 14 +#define LHCALL_SET_PMD 15 +#define LHCALL_LOAD_TLS 16 + +#define LGUEST_TRAP_ENTRY 0x1F + +static inline unsigned long +hcall(unsigned long call, + unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ + asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY) + : "=a"(call) + : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3) + : "memory"); + return call; +} + +void async_hcall(unsigned long call, + unsigned long arg1, unsigned long arg2, unsigned long arg3); + +/* Can't use our min() macro here: needs to be a constant */ +#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) + +#define LHCALL_RING_SIZE 64 +struct hcall_ring +{ + u32 eax, edx, ebx, ecx; +}; + +/* All the good stuff happens here: guest registers it with LGUEST_INIT */ +struct lguest_data +{ +/* Fields which change during running: */ + /* 512 == enabled (same as eflags) */ + unsigned int irq_enabled; + /* Interrupts blocked by guest. */ + DECLARE_BITMAP(blocked_interrupts, LGUEST_IRQS); + + /* Virtual address of page fault. */ + unsigned long cr2; + + /* Async hypercall ring. 0xFF == done, 0 == pending. */ + u8 hcall_status[LHCALL_RING_SIZE]; + struct hcall_ring hcalls[LHCALL_RING_SIZE]; + +/* Fields initialized by the hypervisor at boot: */ + /* Memory not to try to access */ + unsigned long reserve_mem; + /* ID of this guest (used by network driver to set ethernet address) */ + u16 guestid; + +/* Fields initialized by the guest at boot: */ + /* Instruction range to suppress interrupts even if enabled */ + unsigned long noirq_start, noirq_end; +}; +extern struct lguest_data lguest_data; +#endif /* __ASSEMBLY__ */ +#endif /* _ASM_LGUEST_H */ =================================================================== --- /dev/null +++ b/include/linux/lguest_bus.h @@ -0,0 +1,33 @@ +#ifndef _ASM_LGUEST_DEVICE_H +#define _ASM_LGUEST_DEVICE_H +/* Everything you need to know about lguest devices. */ +#include <linux/device.h> +#include <linux/lguest.h> +#include <linux/lguest_launcher.h> + +struct lguest_device { + /* Unique busid, and index into lguest_page->devices[] */ + /* By convention, each device can use irq index+1 if it wants to. */ + unsigned int index; + + struct device dev; + + /* Driver can hang data off here. */ + void *private; +}; + +struct lguest_driver { + const char *name; + struct module *owner; + u16 device_type; + int (*probe)(struct lguest_device *dev); + void (*remove)(struct lguest_device *dev); + + struct device_driver drv; +}; + +extern int register_lguest_driver(struct lguest_driver *drv); +extern void unregister_lguest_driver(struct lguest_driver *drv); + +extern struct lguest_device_desc *lguest_devices; /* Just past max_pfn */ +#endif /* _ASM_LGUEST_DEVICE_H */ ^ permalink raw reply [flat|nested] 13+ messages in thread
[parent not found: <1176203130.26372.23.camel@localhost.localdomain>]
* [PATCH 2/8] lguest: the host code [not found] ` <1176203130.26372.23.camel@localhost.localdomain> @ 2007-04-10 11:06 ` Rusty Russell [not found] ` <1176203176.26372.25.camel@localhost.localdomain> 1 sibling, 0 replies; 13+ messages in thread From: Rusty Russell @ 2007-04-10 11:06 UTC (permalink / raw) To: Andrew Morton; +Cc: Andi Kleen, lkml - Kernel Mailing List, virtualization This is the code for the "lg.ko" module, which allows lguest guests to be launched. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> --- drivers/lguest/core.c | 444 +++++++++++++++++++++++++++++++++ drivers/lguest/hypercalls.c | 172 ++++++++++++ drivers/lguest/interrupts_and_traps.c | 258 +++++++++++++++++++ drivers/lguest/io.c | 412 ++++++++++++++++++++++++++++++ drivers/lguest/lg.h | 278 ++++++++++++++++++++ drivers/lguest/lguest_user.c | 192 ++++++++++++++ drivers/lguest/page_tables.c | 407 ++++++++++++++++++++++++++++++ drivers/lguest/segments.c | 115 ++++++++ drivers/lguest/switcher.S | 159 +++++++++++ include/linux/lguest_launcher.h | 80 +++++ 10 files changed, 2517 insertions(+) =================================================================== --- /dev/null +++ b/drivers/lguest/core.c @@ -0,0 +1,444 @@ +/* World's simplest hypervisor, to test paravirt_ops and show + * unbelievers that virtualization is the future. Plus, it's fun! */ +#include <linux/module.h> +#include <linux/stringify.h> +#include <linux/stddef.h> +#include <linux/io.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> +#include <linux/cpu.h> +#include <linux/freezer.h> +#include <asm/paravirt.h> +#include <asm/desc.h> +#include <asm/pgtable.h> +#include <asm/uaccess.h> +#include <asm/poll.h> +#include <asm/highmem.h> +#include <asm/asm-offsets.h> +#include <asm/i387.h> +#include "lg.h" + +/* Found in switcher.S */ +extern char start_switcher_text[], end_switcher_text[], switch_to_guest[]; +extern unsigned long default_idt_entries[]; + +/* Every guest maps the core switcher code. */ +#define SHARED_SWITCHER_PAGES \ + DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE) +/* Pages for switcher itself, then two pages per cpu */ +#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS) + +/* We map at -4M for ease of mapping into the guest (one PTE page). */ +#define SWITCHER_ADDR 0xFFC00000 + +static struct vm_struct *switcher_vma; +static struct page **switcher_page; + +static int cpu_had_pge; +static struct { + unsigned long offset; + unsigned short segment; +} lguest_entry; + +/* This One Big lock protects all inter-guest data structures. */ +DEFINE_MUTEX(lguest_lock); +static DEFINE_PER_CPU(struct lguest *, last_guest); + +/* FIXME: Make dynamic. */ +#define MAX_LGUEST_GUESTS 16 +struct lguest lguests[MAX_LGUEST_GUESTS]; + +/* Offset from where switcher.S was compiled to where we've copied it */ +static unsigned long switcher_offset(void) +{ + return SWITCHER_ADDR - (unsigned long)start_switcher_text; +} + +/* This cpu's struct lguest_pages. */ +static struct lguest_pages *lguest_pages(unsigned int cpu) +{ + return &(((struct lguest_pages *) + (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); +} + +static __init int map_switcher(void) +{ + int i, err; + struct page **pagep; + + switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, + GFP_KERNEL); + if (!switcher_page) { + err = -ENOMEM; + goto out; + } + + for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { + unsigned long addr = get_zeroed_page(GFP_KERNEL); + if (!addr) { + err = -ENOMEM; + goto free_some_pages; + } + switcher_page[i] = virt_to_page(addr); + } + + switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, + VM_ALLOC, SWITCHER_ADDR, VMALLOC_END); + if (!switcher_vma) { + err = -ENOMEM; + printk("lguest: could not map switcher pages high\n"); + goto free_pages; + } + + pagep = switcher_page; + err = map_vm_area(switcher_vma, PAGE_KERNEL, &pagep); + if (err) { + printk("lguest: map_vm_area failed: %i\n", err); + goto free_vma; + } + memcpy(switcher_vma->addr, start_switcher_text, + end_switcher_text - start_switcher_text); + + /* Fix up IDT entries to point into copied text. */ + for (i = 0; i < IDT_ENTRIES; i++) + default_idt_entries[i] += switcher_offset(); + + for_each_possible_cpu(i) { + struct lguest_pages *pages = lguest_pages(i); + struct lguest_ro_state *state = &pages->state; + + /* These fields are static: rest done in copy_in_guest_info */ + state->host_gdt_desc.size = GDT_SIZE-1; + state->host_gdt_desc.address = (long)get_cpu_gdt_table(i); + store_idt(&state->host_idt_desc); + state->guest_idt_desc.size = sizeof(state->guest_idt)-1; + state->guest_idt_desc.address = (long)&state->guest_idt; + state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1; + state->guest_gdt_desc.address = (long)&state->guest_gdt; + state->guest_tss.esp0 = (long)(&pages->regs + 1); + state->guest_tss.ss0 = LGUEST_DS; + /* No I/O for you! */ + state->guest_tss.io_bitmap_base = sizeof(state->guest_tss); + setup_default_gdt_entries(state); + setup_default_idt_entries(state, default_idt_entries); + + /* Setup LGUEST segments on all cpus */ + get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; + get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; + } + + /* Initialize entry point into switcher. */ + lguest_entry.offset = (long)switch_to_guest + switcher_offset(); + lguest_entry.segment = LGUEST_CS; + + printk(KERN_INFO "lguest: mapped switcher at %p\n", + switcher_vma->addr); + return 0; + +free_vma: + vunmap(switcher_vma->addr); +free_pages: + i = TOTAL_SWITCHER_PAGES; +free_some_pages: + for (--i; i >= 0; i--) + __free_pages(switcher_page[i], 0); + kfree(switcher_page); +out: + return err; +} + +static void unmap_switcher(void) +{ + unsigned int i; + + vunmap(switcher_vma->addr); + for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) + __free_pages(switcher_page[i], 0); +} + +/* IN/OUT insns: enough to get us past boot-time probing. */ +static int emulate_insn(struct lguest *lg) +{ + u8 insn; + unsigned int insnlen = 0, in = 0, shift = 0; + unsigned long physaddr = guest_pa(lg, lg->regs->eip); + + /* This only works for addresses in linear mapping... */ + if (lg->regs->eip < lg->page_offset) + return 0; + lgread(lg, &insn, physaddr, 1); + + /* Operand size prefix means it's actually for ax. */ + if (insn == 0x66) { + shift = 16; + insnlen = 1; + lgread(lg, &insn, physaddr + insnlen, 1); + } + + switch (insn & 0xFE) { + case 0xE4: /* in <next byte>,%al */ + insnlen += 2; + in = 1; + break; + case 0xEC: /* in (%dx),%al */ + insnlen += 1; + in = 1; + break; + case 0xE6: /* out %al,<next byte> */ + insnlen += 2; + break; + case 0xEE: /* out %al,(%dx) */ + insnlen += 1; + break; + default: + return 0; + } + + if (in) { + /* Lower bit tells is whether it's a 16 or 32 bit access */ + if (insn & 0x1) + lg->regs->eax = 0xFFFFFFFF; + else + lg->regs->eax |= (0xFFFF << shift); + } + lg->regs->eip += insnlen; + return 1; +} + +int lguest_address_ok(const struct lguest *lg, + unsigned long addr, unsigned long len) +{ + return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr); +} + +/* Just like get_user, but don't let guest access lguest binary. */ +u32 lgread_u32(struct lguest *lg, u32 addr) +{ + u32 val = 0; + + /* Don't let them access lguest binary */ + if (!lguest_address_ok(lg, addr, sizeof(val)) + || get_user(val, (u32 __user *)addr) != 0) + kill_guest(lg, "bad read address %u", addr); + return val; +} + +void lgwrite_u32(struct lguest *lg, u32 addr, u32 val) +{ + if (!lguest_address_ok(lg, addr, sizeof(val)) + || put_user(val, (u32 __user *)addr) != 0) + kill_guest(lg, "bad write address %u", addr); +} + +void lgread(struct lguest *lg, void *b, u32 addr, unsigned bytes) +{ + if (!lguest_address_ok(lg, addr, bytes) + || copy_from_user(b, (void __user *)addr, bytes) != 0) { + /* copy_from_user should do this, but as we rely on it... */ + memset(b, 0, bytes); + kill_guest(lg, "bad read address %u len %u", addr, bytes); + } +} + +void lgwrite(struct lguest *lg, u32 addr, const void *b, unsigned bytes) +{ + if (!lguest_address_ok(lg, addr, bytes) + || copy_to_user((void __user *)addr, b, bytes) != 0) + kill_guest(lg, "bad write address %u len %u", addr, bytes); +} + +static void set_ts(void) +{ + u32 cr0; + + cr0 = read_cr0(); + if (!(cr0 & 8)) + write_cr0(cr0|8); +} + +static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages) +{ + if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) { + __get_cpu_var(last_guest) = lg; + lg->last_pages = pages; + lg->changed = CHANGED_ALL; + } + + /* These are pretty cheap, so we do them unconditionally. */ + pages->state.host_cr3 = __pa(current->mm->pgd); + map_switcher_in_guest(lg, pages); + pages->state.guest_tss.esp1 = lg->esp1; + pages->state.guest_tss.ss1 = lg->ss1; + + /* Copy direct trap entries. */ + if (lg->changed & CHANGED_IDT) + copy_traps(lg, pages->state.guest_idt, default_idt_entries); + + /* Copy all GDT entries but the TSS. */ + if (lg->changed & CHANGED_GDT) + copy_gdt(lg, pages->state.guest_gdt); + + lg->changed = 0; +} + +static void run_guest_once(struct lguest *lg, struct lguest_pages *pages) +{ + unsigned int clobber; + + copy_in_guest_info(lg, pages); + + /* Put eflags on stack, lcall does rest: suitable for iret return. */ + asm volatile("pushf; lcall *lguest_entry" + : "=a"(clobber), "=b"(clobber) + : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir)) + : "memory", "%edx", "%ecx", "%edi", "%esi"); +} + +int run_guest(struct lguest *lg, char *__user user) +{ + while (!lg->dead) { + unsigned int cr2 = 0; /* Damn gcc */ + + /* Hypercalls first: we might have been out to userspace */ + do_hypercalls(lg); + if (lg->dma_is_pending) { + put_user(lg->pending_dma, (unsigned long *)user); + put_user(lg->pending_key, (unsigned long *)user+1); + return sizeof(unsigned long)*2; + } + + if (signal_pending(current)) + return -EINTR; + maybe_do_interrupt(lg); + + try_to_freeze(); + + if (lg->dead) + break; + + if (lg->halted) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(1); + continue; + } + + local_irq_disable(); + + /* Even if *we* don't want FPU trap, guest might... */ + if (lg->ts) + set_ts(); + + run_guest_once(lg, lguest_pages(raw_smp_processor_id())); + + /* Save cr2 now if we page-faulted. */ + if (lg->regs->trapnum == 14) + cr2 = read_cr2(); + else if (lg->regs->trapnum == 7) + math_state_restore(); + local_irq_enable(); + + switch (lg->regs->trapnum) { + case 13: /* We've intercepted a GPF. */ + if (lg->regs->errcode == 0) { + if (emulate_insn(lg)) + continue; + } + break; + case 14: /* We've intercepted a page fault. */ + if (demand_page(lg, cr2, lg->regs->errcode & 2)) + continue; + + /* If lguest_data is NULL, this won't hurt. */ + put_user(cr2, &lg->lguest_data->cr2); + break; + case 7: /* We've intercepted a Device Not Available fault. */ + /* If they don't want to know, just absorb it. */ + if (!lg->ts) + continue; + break; + case 32 ... 255: /* Real interrupt, fall thru */ + cond_resched(); + case LGUEST_TRAP_ENTRY: /* Handled at top of loop */ + continue; + } + + if (deliver_trap(lg, lg->regs->trapnum)) + continue; + + kill_guest(lg, "unhandled trap %i at %#x (%#x)", + lg->regs->trapnum, lg->regs->eip, + lg->regs->trapnum == 14 ? cr2 : lg->regs->errcode); + } + return -ENOENT; +} + +int find_free_guest(void) +{ + unsigned int i; + for (i = 0; i < MAX_LGUEST_GUESTS; i++) + if (!lguests[i].tsk) + return i; + return -1; +} + +static void adjust_pge(void *on) +{ + if (on) + write_cr4(read_cr4() | X86_CR4_PGE); + else + write_cr4(read_cr4() & ~X86_CR4_PGE); +} + +static int __init init(void) +{ + int err; + + if (paravirt_enabled()) { + printk("lguest is afraid of %s\n", paravirt_ops.name); + return -EPERM; + } + + err = map_switcher(); + if (err) + return err; + + err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES); + if (err) { + unmap_switcher(); + return err; + } + lguest_io_init(); + + err = lguest_device_init(); + if (err) { + free_pagetables(); + unmap_switcher(); + return err; + } + lock_cpu_hotplug(); + if (cpu_has_pge) { /* We have a broader idea of "global". */ + cpu_had_pge = 1; + on_each_cpu(adjust_pge, 0, 0, 1); + clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); + } + unlock_cpu_hotplug(); + return 0; +} + +static void __exit fini(void) +{ + lguest_device_remove(); + free_pagetables(); + unmap_switcher(); + lock_cpu_hotplug(); + if (cpu_had_pge) { + set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); + on_each_cpu(adjust_pge, (void *)1, 0, 1); + } + unlock_cpu_hotplug(); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); =================================================================== --- /dev/null +++ b/drivers/lguest/hypercalls.c @@ -0,0 +1,172 @@ +/* Actual hypercalls, which allow guests to actually do something. + Copyright (C) 2006 Rusty Russell IBM Corporation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ +#include <linux/uaccess.h> +#include <linux/syscalls.h> +#include <linux/mm.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <irq_vectors.h> +#include "lg.h" + +static void do_hcall(struct lguest *lg, struct lguest_regs *regs) +{ + switch (regs->eax) { + case LHCALL_FLUSH_ASYNC: + break; + case LHCALL_LGUEST_INIT: + kill_guest(lg, "already have lguest_data"); + break; + case LHCALL_CRASH: { + char msg[128]; + lgread(lg, msg, regs->edx, sizeof(msg)); + msg[sizeof(msg)-1] = '\0'; + kill_guest(lg, "CRASH: %s", msg); + break; + } + case LHCALL_FLUSH_TLB: + if (regs->edx) + guest_pagetable_clear_all(lg); + else + guest_pagetable_flush_user(lg); + break; + case LHCALL_TIMER_READ: { + u32 now = jiffies; + mb(); + regs->eax = now - lg->last_timer; + lg->last_timer = now; + break; + } + case LHCALL_GET_WALLCLOCK: { + struct timespec ts; + ktime_get_real_ts(&ts); + regs->eax = ts.tv_sec; + break; + } + case LHCALL_BIND_DMA: + regs->eax = bind_dma(lg, regs->edx, regs->ebx, + regs->ecx >> 8, regs->ecx & 0xFF); + break; + case LHCALL_SEND_DMA: + send_dma(lg, regs->edx, regs->ebx); + break; + case LHCALL_LOAD_GDT: + load_guest_gdt(lg, regs->edx, regs->ebx); + break; + case LHCALL_LOAD_IDT_ENTRY: + load_guest_idt_entry(lg, regs->edx, regs->ebx, regs->ecx); + break; + case LHCALL_NEW_PGTABLE: + guest_new_pagetable(lg, regs->edx); + break; + case LHCALL_SET_STACK: + guest_set_stack(lg, regs->edx, regs->ebx, regs->ecx); + break; + case LHCALL_SET_PTE: + guest_set_pte(lg, regs->edx, regs->ebx, mkgpte(regs->ecx)); + break; + case LHCALL_SET_PMD: + guest_set_pmd(lg, regs->edx, regs->ebx); + break; + case LHCALL_LOAD_TLS: + guest_load_tls(lg, (struct desc_struct __user*)regs->edx); + break; + case LHCALL_TS: + lg->ts = regs->edx; + break; + case LHCALL_HALT: + lg->halted = 1; + break; + default: + kill_guest(lg, "Bad hypercall %i\n", regs->eax); + } +} + +/* We always do queued calls before actual hypercall. */ +static void do_async_hcalls(struct lguest *lg) +{ + unsigned int i; + u8 st[LHCALL_RING_SIZE]; + + copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st)); + for (i = 0; i < ARRAY_SIZE(st); i++) { + struct lguest_regs regs; + unsigned int n = lg->next_hcall; + + if (st[n] == 0xFF) + break; + + if (++lg->next_hcall == LHCALL_RING_SIZE) + lg->next_hcall = 0; + + get_user(regs.eax, &lg->lguest_data->hcalls[n].eax); + get_user(regs.edx, &lg->lguest_data->hcalls[n].edx); + get_user(regs.ecx, &lg->lguest_data->hcalls[n].ecx); + get_user(regs.ebx, &lg->lguest_data->hcalls[n].ebx); + do_hcall(lg, ®s); + put_user(0xFF, &lg->lguest_data->hcall_status[n]); + if (lg->dma_is_pending) + break; + } +} + +static void initialize(struct lguest *lg) +{ + if (lg->regs->eax != LHCALL_LGUEST_INIT) { + kill_guest(lg, "hypercall %i before LGUEST_INIT", + lg->regs->eax); + return; + } + + lg->lguest_data = (struct lguest_data __user *)lg->regs->edx; + /* We check here so we can simply copy_to_user/from_user */ + if (!lguest_address_ok(lg, (long)lg->lguest_data, + sizeof(*lg->lguest_data))) { + kill_guest(lg, "bad guest page %p", lg->lguest_data); + return; + } + get_user(lg->noirq_start, &lg->lguest_data->noirq_start); + get_user(lg->noirq_end, &lg->lguest_data->noirq_end); + /* We reserve the top pgd entry. */ + put_user(4U*1024*1024, &lg->lguest_data->reserve_mem); + put_user(lg->guestid, &lg->lguest_data->guestid); +} + +/* Even if we go out to userspace and come back, we don't want to do + * the hypercall again. */ +static void clear_hcall(struct lguest *lg) +{ + lg->regs->trapnum = 255; +} + +void do_hypercalls(struct lguest *lg) +{ + if (unlikely(!lg->lguest_data)) { + if (lg->regs->trapnum == LGUEST_TRAP_ENTRY) { + initialize(lg); + clear_hcall(lg); + } + return; + } + + do_async_hcalls(lg); + if (!lg->dma_is_pending && lg->regs->trapnum == LGUEST_TRAP_ENTRY) { + do_hcall(lg, lg->regs); + clear_hcall(lg); + } + set_wakeup_process(lg, NULL); +} =================================================================== --- /dev/null +++ b/drivers/lguest/interrupts_and_traps.c @@ -0,0 +1,258 @@ +#include <linux/uaccess.h> +#include "lg.h" + +static unsigned long idt_address(u32 lo, u32 hi) +{ + return (lo & 0x0000FFFF) | (hi & 0xFFFF0000); +} + +static int idt_type(u32 lo, u32 hi) +{ + return (hi >> 8) & 0xF; +} + +static int idt_present(u32 lo, u32 hi) +{ + return (hi & 0x8000); +} + +static void push_guest_stack(struct lguest *lg, u32 __user **gstack, u32 val) +{ + lgwrite_u32(lg, (u32)--(*gstack), val); +} + +static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) +{ + u32 __user *gstack; + u32 eflags, ss, irq_enable; + + /* If they want a ring change, we use new stack and push old ss/esp */ + if ((lg->regs->ss&0x3) != GUEST_PL) { + gstack = (u32 __user *)guest_pa(lg, lg->esp1); + ss = lg->ss1; + push_guest_stack(lg, &gstack, lg->regs->ss); + push_guest_stack(lg, &gstack, lg->regs->esp); + } else { + gstack = (u32 __user *)guest_pa(lg, lg->regs->esp); + ss = lg->regs->ss; + } + + /* We use IF bit in eflags to indicate whether irqs were disabled + (it's always 0, since irqs are enabled when guest is running). */ + eflags = lg->regs->eflags; + get_user(irq_enable, &lg->lguest_data->irq_enabled); + eflags |= (irq_enable & X86_EFLAGS_IF); + + push_guest_stack(lg, &gstack, eflags); + push_guest_stack(lg, &gstack, lg->regs->cs); + push_guest_stack(lg, &gstack, lg->regs->eip); + + if (has_err) + push_guest_stack(lg, &gstack, lg->regs->errcode); + + /* Change the real stack so switcher returns to trap handler */ + lg->regs->ss = ss; + lg->regs->esp = (u32)gstack + lg->page_offset; + lg->regs->cs = (__KERNEL_CS|GUEST_PL); + lg->regs->eip = idt_address(lo, hi); + + /* Disable interrupts for an interrupt gate. */ + if (idt_type(lo, hi) == 0xE) + put_user(0, &lg->lguest_data->irq_enabled); +} + +void maybe_do_interrupt(struct lguest *lg) +{ + unsigned int irq; + DECLARE_BITMAP(blk, LGUEST_IRQS); + struct desc_struct *idt; + + if (!lg->lguest_data) + return; + + /* If timer has changed, set timer interrupt. */ + if (jiffies != lg->last_timer) + set_bit(0, lg->irqs_pending); + + /* Mask out any interrupts they have blocked. */ + copy_from_user(&blk, lg->lguest_data->blocked_interrupts, sizeof(blk)); + bitmap_andnot(blk, lg->irqs_pending, blk, LGUEST_IRQS); + + irq = find_first_bit(blk, LGUEST_IRQS); + if (irq >= LGUEST_IRQS) + return; + + if (lg->regs->eip >= lg->noirq_start && lg->regs->eip < lg->noirq_end) + return; + + /* If they're halted, we re-enable interrupts. */ + if (lg->halted) { + /* Re-enable interrupts. */ + put_user(X86_EFLAGS_IF, &lg->lguest_data->irq_enabled); + lg->halted = 0; + } else { + /* Maybe they have interrupts disabled? */ + u32 irq_enabled; + get_user(irq_enabled, &lg->lguest_data->irq_enabled); + if (!irq_enabled) + return; + } + + idt = &lg->idt[FIRST_EXTERNAL_VECTOR+irq]; + if (idt_present(idt->a, idt->b)) { + clear_bit(irq, lg->irqs_pending); + set_guest_interrupt(lg, idt->a, idt->b, 0); + } +} + +static int has_err(unsigned int trap) +{ + return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17); +} + +int deliver_trap(struct lguest *lg, unsigned int num) +{ + u32 lo = lg->idt[num].a, hi = lg->idt[num].b; + + if (!idt_present(lo, hi)) + return 0; + set_guest_interrupt(lg, lo, hi, has_err(num)); + return 1; +} + +static int direct_trap(const struct lguest *lg, + const struct desc_struct *trap, + unsigned int num) +{ + /* Hardware interrupts don't go to guest (except syscall). */ + if (num >= FIRST_EXTERNAL_VECTOR && num != SYSCALL_VECTOR) + return 0; + + /* We intercept page fault (demand shadow paging & cr2 saving) + protection fault (in/out emulation) and device not + available (TS handling), and hypercall */ + if (num == 14 || num == 13 || num == 7 || num == LGUEST_TRAP_ENTRY) + return 0; + + /* Interrupt gates (0xE) or not present (0x0) can't go direct. */ + return idt_type(trap->a, trap->b) == 0xF; +} + +static void pin_stack_pages(struct lguest *lg) +{ + unsigned int i; + + for (i = 0; i < lg->stack_pages; i++) + pin_page(lg, lg->esp1 - i * PAGE_SIZE); +} + +/* We need to ensure all the direct trap pages are mapped after we + * clear shadow mappings. */ +void pin_trap_pages(struct lguest *lg) +{ + unsigned int i; + struct desc_struct *trap; + + for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) { + trap = &lg->idt[i]; + if (direct_trap(lg, trap, i)) + pin_page(lg, idt_address(trap->a, trap->b)); + } + + trap = &lg->syscall_idt; + if (direct_trap(lg, trap, SYSCALL_VECTOR)) + pin_page(lg, idt_address(trap->a, trap->b)); + pin_stack_pages(lg); +} + +void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages) +{ + /* You cannot have a stack segment with priv level 0. */ + if ((seg & 0x3) != GUEST_PL) + kill_guest(lg, "bad stack segment %i", seg); + if (pages > 2) + kill_guest(lg, "bad stack pages %u", pages); + lg->ss1 = seg; + lg->esp1 = esp; + lg->stack_pages = pages; + pin_stack_pages(lg); +} + +/* Set up trap in IDT. */ +static void set_trap(struct lguest *lg, struct desc_struct *trap, + unsigned int num, u32 lo, u32 hi) +{ + u8 type = idt_type(lo, hi); + + if (!idt_present(lo, hi)) { + trap->a = trap->b = 0; + return; + } + + if (type != 0xE && type != 0xF) + kill_guest(lg, "bad IDT type %i", type); + + trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF); + trap->b = (hi&0xFFFFEF00); + + /* Make sure trap address is available so we don't fault. In + * theory, it could overlap two pages, in practice it's aligned. */ + if (direct_trap(lg, trap, num)) + pin_page(lg, idt_address(lo, hi)); +} + +void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi) +{ + /* Guest never handles: NMI, doublefault, hypercall, spurious irq. */ + if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY) + return; + + lg->changed |= CHANGED_IDT; + if (num < ARRAY_SIZE(lg->idt)) + set_trap(lg, &lg->idt[num], num, lo, hi); + else if (num == SYSCALL_VECTOR) + set_trap(lg, &lg->syscall_idt, num, lo, hi); +} + +static void default_idt_entry(struct desc_struct *idt, + int trap, + const unsigned long handler) +{ + u32 flags = 0x8e00; + + /* They can't "int" into any of them except hypercall. */ + if (trap == LGUEST_TRAP_ENTRY) + flags |= (GUEST_PL << 13); + + idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF); + idt->b = (handler&0xFFFF0000) | flags; +} + +void setup_default_idt_entries(struct lguest_ro_state *state, + const unsigned long *def) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(state->guest_idt); i++) + default_idt_entry(&state->guest_idt[i], i, def[i]); +} + +void copy_traps(const struct lguest *lg, struct desc_struct *idt, + const unsigned long *def) +{ + unsigned int i; + + /* All hardware interrupts are same whatever the guest: only the + * traps might be different. */ + for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) { + if (direct_trap(lg, &lg->idt[i], i)) + idt[i] = lg->idt[i]; + else + default_idt_entry(&idt[i], i, def[i]); + } + i = SYSCALL_VECTOR; + if (direct_trap(lg, &lg->syscall_idt, i)) + idt[i] = lg->syscall_idt; + else + default_idt_entry(&idt[i], i, def[i]); +} =================================================================== --- /dev/null +++ b/drivers/lguest/io.c @@ -0,0 +1,412 @@ +/* Simple I/O model for guests, based on shared memory. + * Copyright (C) 2006 Rusty Russell IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <linux/types.h> +#include <linux/futex.h> +#include <linux/jhash.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/uaccess.h> +#include "lg.h" + +static struct list_head dma_hash[61]; + +void lguest_io_init(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(dma_hash); i++) + INIT_LIST_HEAD(&dma_hash[i]); +} + +/* FIXME: allow multi-page lengths. */ +static int check_dma_list(struct lguest *lg, const struct lguest_dma *dma) +{ + unsigned int i; + + for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) { + if (!dma->len[i]) + return 1; + if (!lguest_address_ok(lg, dma->addr[i], dma->len[i])) + goto kill; + if (dma->len[i] > PAGE_SIZE) + goto kill; + /* We could do over a page, but is it worth it? */ + if ((dma->addr[i] % PAGE_SIZE) + dma->len[i] > PAGE_SIZE) + goto kill; + } + return 1; + +kill: + kill_guest(lg, "bad DMA entry: %u@%#x", dma->len[i], dma->addr[i]); + return 0; +} + +static unsigned int hash(const union futex_key *key) +{ + return jhash2((u32*)&key->both.word, + (sizeof(key->both.word)+sizeof(key->both.ptr))/4, + key->both.offset) + % ARRAY_SIZE(dma_hash); +} + +static inline int key_eq(const union futex_key *a, const union futex_key *b) +{ + return (a->both.word == b->both.word + && a->both.ptr == b->both.ptr + && a->both.offset == b->both.offset); +} + +/* Must hold read lock on dmainfo owner's current->mm->mmap_sem */ +static void unlink_dma(struct lguest_dma_info *dmainfo) +{ + BUG_ON(!mutex_is_locked(&lguest_lock)); + dmainfo->interrupt = 0; + list_del(&dmainfo->list); + drop_futex_key_refs(&dmainfo->key); +} + +static int unbind_dma(struct lguest *lg, + const union futex_key *key, + unsigned long dmas) +{ + int i, ret = 0; + + for (i = 0; i < LGUEST_MAX_DMA; i++) { + if (key_eq(key, &lg->dma[i].key) && dmas == lg->dma[i].dmas) { + unlink_dma(&lg->dma[i]); + ret = 1; + break; + } + } + return ret; +} + +int bind_dma(struct lguest *lg, + unsigned long ukey, unsigned long dmas, u16 numdmas, u8 interrupt) +{ + unsigned int i; + int ret = 0; + union futex_key key; + + if (interrupt >= LGUEST_IRQS) + return 0; + + mutex_lock(&lguest_lock); + down_read(¤t->mm->mmap_sem); + if (get_futex_key((u32 __user *)ukey, &key) != 0) { + kill_guest(lg, "bad dma key %#lx", ukey); + goto unlock; + } + get_futex_key_refs(&key); + + if (interrupt == 0) + ret = unbind_dma(lg, &key, dmas); + else { + for (i = 0; i < LGUEST_MAX_DMA; i++) { + if (lg->dma[i].interrupt) + continue; + + lg->dma[i].dmas = dmas; + lg->dma[i].num_dmas = numdmas; + lg->dma[i].next_dma = 0; + lg->dma[i].key = key; + lg->dma[i].guestid = lg->guestid; + lg->dma[i].interrupt = interrupt; + list_add(&lg->dma[i].list, &dma_hash[hash(&key)]); + ret = 1; + goto unlock; + } + } + drop_futex_key_refs(&key); +unlock: + up_read(¤t->mm->mmap_sem); + mutex_unlock(&lguest_lock); + return ret; +} + +/* lgread from another guest */ +static int lgread_other(struct lguest *lg, + void *buf, u32 addr, unsigned bytes) +{ + if (!lguest_address_ok(lg, addr, bytes) + || access_process_vm(lg->tsk, addr, buf, bytes, 0) != bytes) { + memset(buf, 0, bytes); + kill_guest(lg, "bad address in registered DMA struct"); + return 0; + } + return 1; +} + +/* lgwrite to another guest */ +static int lgwrite_other(struct lguest *lg, u32 addr, + const void *buf, unsigned bytes) +{ + if (!lguest_address_ok(lg, addr, bytes) + || (access_process_vm(lg->tsk, addr, (void *)buf, bytes, 1) + != bytes)) { + kill_guest(lg, "bad address writing to registered DMA"); + return 0; + } + return 1; +} + +static u32 copy_data(struct lguest *srclg, + const struct lguest_dma *src, + const struct lguest_dma *dst, + struct page *pages[]) +{ + unsigned int totlen, si, di, srcoff, dstoff; + void *maddr = NULL; + + totlen = 0; + si = di = 0; + srcoff = dstoff = 0; + while (si < LGUEST_MAX_DMA_SECTIONS && src->len[si] + && di < LGUEST_MAX_DMA_SECTIONS && dst->len[di]) { + u32 len = min(src->len[si] - srcoff, dst->len[di] - dstoff); + + if (!maddr) + maddr = kmap(pages[di]); + + /* FIXME: This is not completely portable, since + archs do different things for copy_to_user_page. */ + if (copy_from_user(maddr + (dst->addr[di] + dstoff)%PAGE_SIZE, + (void *__user)src->addr[si], len) != 0) { + kill_guest(srclg, "bad address in sending DMA"); + totlen = 0; + break; + } + + totlen += len; + srcoff += len; + dstoff += len; + if (srcoff == src->len[si]) { + si++; + srcoff = 0; + } + if (dstoff == dst->len[di]) { + kunmap(pages[di]); + maddr = NULL; + di++; + dstoff = 0; + } + } + + if (maddr) + kunmap(pages[di]); + + return totlen; +} + +/* Src is us, ie. current. */ +static u32 do_dma(struct lguest *srclg, const struct lguest_dma *src, + struct lguest *dstlg, const struct lguest_dma *dst) +{ + int i; + u32 ret; + struct page *pages[LGUEST_MAX_DMA_SECTIONS]; + + if (!check_dma_list(dstlg, dst) || !check_dma_list(srclg, src)) + return 0; + + /* First get the destination pages */ + for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) { + if (dst->len[i] == 0) + break; + if (get_user_pages(dstlg->tsk, dstlg->mm, + dst->addr[i], 1, 1, 1, pages+i, NULL) + != 1) { + kill_guest(dstlg, "Error mapping DMA pages"); + ret = 0; + goto drop_pages; + } + } + + /* Now copy until we run out of src or dst. */ + ret = copy_data(srclg, src, dst, pages); + +drop_pages: + while (--i >= 0) + put_page(pages[i]); + return ret; +} + +static int dma_transfer(struct lguest *srclg, + unsigned long udma, + struct lguest_dma_info *dst) +{ + struct lguest_dma dst_dma, src_dma; + struct lguest *dstlg; + u32 i, dma = 0; + + dstlg = &lguests[dst->guestid]; + /* Get our dma list. */ + lgread(srclg, &src_dma, udma, sizeof(src_dma)); + + /* We can't deadlock against them dmaing to us, because this + * is all under the lguest_lock. */ + down_read(&dstlg->mm->mmap_sem); + + for (i = 0; i < dst->num_dmas; i++) { + dma = (dst->next_dma + i) % dst->num_dmas; + if (!lgread_other(dstlg, &dst_dma, + dst->dmas + dma * sizeof(struct lguest_dma), + sizeof(dst_dma))) { + goto fail; + } + if (!dst_dma.used_len) + break; + } + if (i != dst->num_dmas) { + unsigned long used_lenp; + unsigned int ret; + + ret = do_dma(srclg, &src_dma, dstlg, &dst_dma); + /* Put used length in src. */ + lgwrite_u32(srclg, + udma+offsetof(struct lguest_dma, used_len), ret); + if (ret == 0 && src_dma.len[0] != 0) + goto fail; + + /* Make sure destination sees contents before length. */ + wmb(); + used_lenp = dst->dmas + + dma * sizeof(struct lguest_dma) + + offsetof(struct lguest_dma, used_len); + lgwrite_other(dstlg, used_lenp, &ret, sizeof(ret)); + dst->next_dma++; + } + up_read(&dstlg->mm->mmap_sem); + + /* Do this last so dst doesn't simply sleep on lock. */ + set_bit(dst->interrupt, dstlg->irqs_pending); + set_wakeup_process(srclg, dstlg->tsk); + return i == dst->num_dmas; + +fail: + up_read(&dstlg->mm->mmap_sem); + return 0; +} + +void send_dma(struct lguest *lg, unsigned long ukey, unsigned long udma) +{ + union futex_key key; + int empty = 0; + +again: + mutex_lock(&lguest_lock); + down_read(¤t->mm->mmap_sem); + if (get_futex_key((u32 __user *)ukey, &key) != 0) { + kill_guest(lg, "bad sending DMA key"); + goto unlock; + } + /* Shared mapping? Look for other guests... */ + if (key.shared.offset & 1) { + struct lguest_dma_info *i; + list_for_each_entry(i, &dma_hash[hash(&key)], list) { + if (i->guestid == lg->guestid) + continue; + if (!key_eq(&key, &i->key)) + continue; + + empty += dma_transfer(lg, udma, i); + break; + } + if (empty == 1) { + /* Give any recipients one chance to restock. */ + up_read(¤t->mm->mmap_sem); + mutex_unlock(&lguest_lock); + set_wakeup_process(lg, NULL); + empty++; + goto again; + } + } else { + /* Private mapping: tell our userspace. */ + lg->dma_is_pending = 1; + lg->pending_dma = udma; + lg->pending_key = ukey; + } +unlock: + up_read(¤t->mm->mmap_sem); + mutex_unlock(&lguest_lock); +} + +void release_all_dma(struct lguest *lg) +{ + unsigned int i; + + BUG_ON(!mutex_is_locked(&lguest_lock)); + + down_read(&lg->mm->mmap_sem); + for (i = 0; i < LGUEST_MAX_DMA; i++) { + if (lg->dma[i].interrupt) + unlink_dma(&lg->dma[i]); + } + up_read(&lg->mm->mmap_sem); +} + +/* We cache one process to wakeup: helps for batching & wakes outside locks. */ +void set_wakeup_process(struct lguest *lg, struct task_struct *p) +{ + if (p == lg->wake) + return; + + if (lg->wake) { + wake_up_process(lg->wake); + put_task_struct(lg->wake); + } + lg->wake = p; + if (lg->wake) + get_task_struct(lg->wake); +} + +/* Userspace wants a dma buffer from this guest. */ +unsigned long get_dma_buffer(struct lguest *lg, + unsigned long ukey, unsigned long *interrupt) +{ + unsigned long ret = 0; + union futex_key key; + struct lguest_dma_info *i; + + mutex_lock(&lguest_lock); + down_read(¤t->mm->mmap_sem); + if (get_futex_key((u32 __user *)ukey, &key) != 0) { + kill_guest(lg, "bad registered DMA buffer"); + goto unlock; + } + list_for_each_entry(i, &dma_hash[hash(&key)], list) { + if (key_eq(&key, &i->key) && i->guestid == lg->guestid) { + unsigned int j; + for (j = 0; j < i->num_dmas; j++) { + struct lguest_dma dma; + + ret = i->dmas + j * sizeof(struct lguest_dma); + lgread(lg, &dma, ret, sizeof(dma)); + if (dma.used_len == 0) + break; + } + *interrupt = i->interrupt; + break; + } + } +unlock: + up_read(¤t->mm->mmap_sem); + mutex_unlock(&lguest_lock); + return ret; +} + =================================================================== --- /dev/null +++ b/drivers/lguest/lg.h @@ -0,0 +1,278 @@ +#ifndef _LGUEST_H +#define _LGUEST_H + +#include <asm/desc.h> + +#define GDT_ENTRY_LGUEST_CS 10 +#define GDT_ENTRY_LGUEST_DS 11 +#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8) +#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8) + +#ifndef __ASSEMBLY__ +#include <linux/types.h> +#include <linux/init.h> +#include <linux/stringify.h> +#include <linux/binfmts.h> +#include <linux/futex.h> +#include <linux/lguest.h> +#include <linux/lguest_launcher.h> +#include <linux/err.h> +#include <asm/semaphore.h> +#include "irq_vectors.h" + +#define GUEST_PL 1 + +struct lguest_regs +{ + /* Manually saved part. */ + u32 ebx, ecx, edx; + u32 esi, edi, ebp; + u32 gs; + u32 eax; + u32 fs, ds, es; + u32 trapnum, errcode; + /* Trap pushed part */ + u32 eip; + u32 cs; + u32 eflags; + u32 esp; + u32 ss; +}; + +void free_pagetables(void); +int init_pagetables(struct page **switcher_page, unsigned int pages); + +/* Full 4G segment descriptors, suitable for CS and DS. */ +#define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00}) +#define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300}) + +struct lguest_dma_info +{ + struct list_head list; + union futex_key key; + unsigned long dmas; + u16 next_dma; + u16 num_dmas; + u16 guestid; + u8 interrupt; /* 0 when not registered */ +}; + +/* We have separate types for the guest's ptes & pgds and the shadow ptes & + * pgds. Since this host might use three-level pagetables and the guest and + * shadow pagetables don't, we can't use the normal pte_t/pgd_t. */ +typedef union { + struct { unsigned flags:12, pfn:20; }; + struct { unsigned long val; } raw; +} spgd_t; +typedef union { + struct { unsigned flags:12, pfn:20; }; + struct { unsigned long val; } raw; +} spte_t; +typedef union { + struct { unsigned flags:12, pfn:20; }; + struct { unsigned long val; } raw; +} gpgd_t; +typedef union { + struct { unsigned flags:12, pfn:20; }; + struct { unsigned long val; } raw; +} gpte_t; +#define mkgpte(_val) ((gpte_t){.raw.val = _val}) +#define mkgpgd(_val) ((gpgd_t){.raw.val = _val}) + +struct pgdir +{ + unsigned long cr3; + spgd_t *pgdir; +}; + +/* FIXME: When this is defined in processor.h, delete this definition. */ +struct i386_hw_tss { + unsigned short back_link,__blh; + unsigned long esp0; + unsigned short ss0,__ss0h; + unsigned long esp1; + unsigned short ss1,__ss1h; /* ss1 is used to cache MSR_IA32_SYSENTER_CS */ + unsigned long esp2; + unsigned short ss2,__ss2h; + unsigned long __cr3; + unsigned long eip; + unsigned long eflags; + unsigned long eax,ecx,edx,ebx; + unsigned long esp; + unsigned long ebp; + unsigned long esi; + unsigned long edi; + unsigned short es, __esh; + unsigned short cs, __csh; + unsigned short ss, __ssh; + unsigned short ds, __dsh; + unsigned short fs, __fsh; + unsigned short gs, __gsh; + unsigned short ldt, __ldth; + unsigned short trace, io_bitmap_base; +}; + +/* This is a guest-specific page (mapped ro) into the guest. */ +struct lguest_ro_state +{ + /* Host information we need to restore when we switch back. */ + u32 host_cr3; + struct Xgt_desc_struct host_idt_desc; + struct Xgt_desc_struct host_gdt_desc; + u32 host_sp; + + /* Fields which are used when guest is running. */ + struct Xgt_desc_struct guest_idt_desc; + struct Xgt_desc_struct guest_gdt_desc; + struct i386_hw_tss guest_tss; + struct desc_struct guest_idt[IDT_ENTRIES]; + struct desc_struct guest_gdt[GDT_ENTRIES]; +}; + +/* We have two pages shared with guests, per cpu. */ +struct lguest_pages +{ + /* This is the stack page mapped rw in guest */ + char spare[PAGE_SIZE - sizeof(struct lguest_regs)]; + struct lguest_regs regs; + + /* This is the host state & guest descriptor page, ro in guest */ + struct lguest_ro_state state; +} __attribute__((aligned(PAGE_SIZE))); + +#define CHANGED_IDT 1 +#define CHANGED_GDT 2 +#define CHANGED_ALL 3 + +/* The private info the thread maintains about the guest. */ +struct lguest +{ + /* At end of a page shared mapped over lguest_pages in guest. */ + unsigned long regs_page; + struct lguest_regs *regs; + struct lguest_data __user *lguest_data; + struct task_struct *tsk; + struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */ + u16 guestid; + u32 pfn_limit; + u32 page_offset; + u32 cr2; + int halted; + int ts; + u32 last_timer; + u32 next_hcall; + u32 esp1; + u8 ss1; + + /* Bitmap of what has changed: see CHANGED_* above. */ + int changed; + struct lguest_pages *last_pages; + + /* We keep a small number of these. */ + u32 pgdidx; + struct pgdir pgdirs[4]; + + /* Cached wakeup: we hold a reference to this task. */ + struct task_struct *wake; + + unsigned long noirq_start, noirq_end; + int dma_is_pending; + unsigned long pending_dma; /* struct lguest_dma */ + unsigned long pending_key; /* address they're sending to */ + + unsigned int stack_pages; + + struct lguest_dma_info dma[LGUEST_MAX_DMA]; + + /* Dead? */ + const char *dead; + + /* The GDT entries copied into lguest_ro_state when running. */ + struct desc_struct gdt[GDT_ENTRIES]; + + /* The IDT entries: some copied into lguest_ro_state when running. */ + struct desc_struct idt[FIRST_EXTERNAL_VECTOR+LGUEST_IRQS]; + struct desc_struct syscall_idt; + + /* Pending virtual interrupts */ + DECLARE_BITMAP(irqs_pending, LGUEST_IRQS); +}; + +extern struct lguest lguests[]; +extern struct mutex lguest_lock; + +/* core.c: */ +u32 lgread_u32(struct lguest *lg, u32 addr); +void lgwrite_u32(struct lguest *lg, u32 val, u32 addr); +void lgread(struct lguest *lg, void *buf, u32 addr, unsigned bytes); +void lgwrite(struct lguest *lg, u32 addr, const void *buf, unsigned bytes); +int find_free_guest(void); +int lguest_address_ok(const struct lguest *lg, + unsigned long addr, unsigned long len); +int run_guest(struct lguest *lg, char *__user user); + + +/* interrupts_and_traps.c: */ +void maybe_do_interrupt(struct lguest *lg); +int deliver_trap(struct lguest *lg, unsigned int num); +void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 hi); +void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages); +void pin_trap_pages(struct lguest *lg); +void setup_default_idt_entries(struct lguest_ro_state *state, + const unsigned long *def); +void copy_traps(const struct lguest *lg, struct desc_struct *idt, + const unsigned long *def); + +/* segments.c: */ +void setup_default_gdt_entries(struct lguest_ro_state *state); +void setup_guest_gdt(struct lguest *lg); +void load_guest_gdt(struct lguest *lg, u32 table, u32 num); +void guest_load_tls(struct lguest *lg, + const struct desc_struct __user *tls_array); +void copy_gdt(const struct lguest *lg, struct desc_struct *gdt); + +/* page_tables.c: */ +int init_guest_pagetable(struct lguest *lg, unsigned long pgtable); +void free_guest_pagetable(struct lguest *lg); +void guest_new_pagetable(struct lguest *lg, unsigned long pgtable); +void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 i); +void guest_pagetable_clear_all(struct lguest *lg); +void guest_pagetable_flush_user(struct lguest *lg); +void guest_set_pte(struct lguest *lg, unsigned long cr3, + unsigned long vaddr, gpte_t val); +void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages); +int demand_page(struct lguest *info, unsigned long cr2, int write); +void pin_page(struct lguest *lg, unsigned long vaddr); + +/* lguest_user.c: */ +int lguest_device_init(void); +void lguest_device_remove(void); + +/* io.c: */ +void lguest_io_init(void); +int bind_dma(struct lguest *lg, + unsigned long key, unsigned long udma, u16 numdmas, u8 interrupt); +void send_dma(struct lguest *info, unsigned long key, unsigned long udma); +void release_all_dma(struct lguest *lg); +unsigned long get_dma_buffer(struct lguest *lg, unsigned long key, + unsigned long *interrupt); +void set_wakeup_process(struct lguest *lg, struct task_struct *p); + +/* hypercalls.c: */ +void do_hypercalls(struct lguest *lg); + +#define kill_guest(lg, fmt...) \ +do { \ + if (!(lg)->dead) { \ + (lg)->dead = kasprintf(GFP_ATOMIC, fmt); \ + if (!(lg)->dead) \ + (lg)->dead = ERR_PTR(-ENOMEM); \ + } \ +} while(0) + +static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr) +{ + return vaddr - lg->page_offset; +} +#endif /* __ASSEMBLY__ */ +#endif /* _LGUEST_H */ =================================================================== --- /dev/null +++ b/drivers/lguest/lguest_user.c @@ -0,0 +1,192 @@ +/* Userspace control of the guest, via /dev/lguest. */ +#include <linux/uaccess.h> +#include <linux/miscdevice.h> +#include <linux/fs.h> +#include "lg.h" + +static void setup_regs(struct lguest_regs *regs, unsigned long start) +{ + /* Write out stack in format lguest expects, so we can switch to it. */ + regs->edi = LGUEST_MAGIC_EDI; + regs->ebp = LGUEST_MAGIC_EBP; + regs->esi = LGUEST_MAGIC_ESI; + regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL; + regs->cs = __KERNEL_CS|GUEST_PL; + regs->eflags = 0x202; /* Interrupts enabled. */ + regs->eip = start; +} + +/* + addr */ +static long user_get_dma(struct lguest *lg, const u32 __user *input) +{ + unsigned long key, udma, irq; + + if (get_user(key, input) != 0) + return -EFAULT; + udma = get_dma_buffer(lg, key, &irq); + if (!udma) + return -ENOENT; + + /* We put irq number in udma->used_len. */ + lgwrite_u32(lg, udma + offsetof(struct lguest_dma, used_len), irq); + return udma; +} + +/* + irq */ +static int user_send_irq(struct lguest *lg, const u32 __user *input) +{ + u32 irq; + + if (get_user(irq, input) != 0) + return -EFAULT; + if (irq >= LGUEST_IRQS) + return -EINVAL; + set_bit(irq, lg->irqs_pending); + return 0; +} + +static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) +{ + struct lguest *lg = file->private_data; + + if (!lg) + return -EINVAL; + + if (lg->dead) { + size_t len; + + if (IS_ERR(lg->dead)) + return PTR_ERR(lg->dead); + + len = min(size, strlen(lg->dead)+1); + if (copy_to_user(user, lg->dead, len) != 0) + return -EFAULT; + return len; + } + + if (lg->dma_is_pending) + lg->dma_is_pending = 0; + + return run_guest(lg, user); +} + +/* Take: pfnlimit, pgdir, start, pageoffset. */ +static int initialize(struct file *file, const u32 __user *input) +{ + struct lguest *lg; + int err, i; + u32 args[4]; + + if (file->private_data) + return -EBUSY; + + if (copy_from_user(args, input, sizeof(args)) != 0) + return -EFAULT; + + mutex_lock(&lguest_lock); + i = find_free_guest(); + if (i < 0) { + err = -ENOSPC; + goto unlock; + } + lg = &lguests[i]; + lg->guestid = i; + lg->pfn_limit = args[0]; + lg->page_offset = args[3]; + lg->regs_page = get_zeroed_page(GFP_KERNEL); + if (!lg->regs_page) { + err = -ENOMEM; + goto release_guest; + } + lg->regs = (void *)lg->regs_page + PAGE_SIZE - sizeof(*lg->regs); + + err = init_guest_pagetable(lg, args[1]); + if (err) + goto free_regs; + + setup_regs(lg->regs, args[2]); + setup_guest_gdt(lg); + lg->tsk = current; + lg->mm = get_task_mm(current); + lg->last_pages = NULL; + mutex_unlock(&lguest_lock); + + file->private_data = lg; + return sizeof(args); + +free_regs: + free_page(lg->regs_page); +release_guest: + memset(lg, 0, sizeof(*lg)); +unlock: + mutex_unlock(&lguest_lock); + return err; +} + +static ssize_t write(struct file *file, const char __user *input, + size_t size, loff_t *off) +{ + struct lguest *lg = file->private_data; + u32 req; + + if (get_user(req, input) != 0) + return -EFAULT; + input += sizeof(req); + + if (req != LHREQ_INITIALIZE && !lg) + return -EINVAL; + if (lg && lg->dead) + return -ENOENT; + + switch (req) { + case LHREQ_INITIALIZE: + return initialize(file, (const u32 __user *)input); + case LHREQ_GETDMA: + return user_get_dma(lg, (const u32 __user *)input); + case LHREQ_IRQ: + return user_send_irq(lg, (const u32 __user *)input); + default: + return -EINVAL; + } +} + +static int close(struct inode *inode, struct file *file) +{ + struct lguest *lg = file->private_data; + + if (!lg) + return 0; + + mutex_lock(&lguest_lock); + release_all_dma(lg); + free_guest_pagetable(lg); + mmput(lg->mm); + if (!IS_ERR(lg->dead)) + kfree(lg->dead); + free_page(lg->regs_page); + memset(lg, 0, sizeof(*lg)); + mutex_unlock(&lguest_lock); + return 0; +} + +static struct file_operations lguest_fops = { + .owner = THIS_MODULE, + .release = close, + .write = write, + .read = read, +}; +static struct miscdevice lguest_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "lguest", + .fops = &lguest_fops, +}; + +int __init lguest_device_init(void) +{ + return misc_register(&lguest_dev); +} + +void __exit lguest_device_remove(void) +{ + misc_deregister(&lguest_dev); +} =================================================================== --- /dev/null +++ b/drivers/lguest/page_tables.c @@ -0,0 +1,407 @@ +/* Shadow page table operations. + * Copyright (C) Rusty Russell IBM Corporation 2006. + * GPL v2 and any later version */ +#include <linux/mm.h> +#include <linux/types.h> +#include <linux/spinlock.h> +#include <linux/random.h> +#include <linux/percpu.h> +#include <asm/tlbflush.h> +#include "lg.h" + +#define PTES_PER_PAGE_SHIFT 10 +#define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT) +#define SWITCHER_PGD_INDEX (PTES_PER_PAGE - 1) + +static DEFINE_PER_CPU(spte_t *, switcher_pte_pages) = { NULL }; +#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) + +static unsigned vaddr_to_pgd_index(unsigned long vaddr) +{ + return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT); +} + +/* These access the shadow versions (ie. the ones used by the CPU). */ +static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) +{ + unsigned int index = vaddr_to_pgd_index(vaddr); + + if (index >= SWITCHER_PGD_INDEX) { + kill_guest(lg, "attempt to access switcher pages"); + index = 0; + } + return &lg->pgdirs[i].pgdir[index]; +} + +static spte_t *spte_addr(struct lguest *lg, spgd_t spgd, unsigned long vaddr) +{ + spte_t *page = __va(spgd.pfn << PAGE_SHIFT); + BUG_ON(!(spgd.flags & _PAGE_PRESENT)); + return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE]; +} + +/* These access the guest versions. */ +static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr) +{ + unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT); + return lg->pgdirs[lg->pgdidx].cr3 + index * sizeof(gpgd_t); +} + +static unsigned long gpte_addr(struct lguest *lg, + gpgd_t gpgd, unsigned long vaddr) +{ + unsigned long gpage = gpgd.pfn << PAGE_SHIFT; + BUG_ON(!(gpgd.flags & _PAGE_PRESENT)); + return gpage + ((vaddr>>PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(gpte_t); +} + +/* Do a virtual -> physical mapping on a user page. */ +static unsigned long get_pfn(unsigned long virtpfn, int write) +{ + struct page *page; + unsigned long ret = -1UL; + + down_read(¤t->mm->mmap_sem); + if (get_user_pages(current, current->mm, virtpfn << PAGE_SHIFT, + 1, write, 1, &page, NULL) == 1) + ret = page_to_pfn(page); + up_read(¤t->mm->mmap_sem); + return ret; +} + +static spte_t gpte_to_spte(struct lguest *lg, gpte_t gpte, int write) +{ + spte_t spte; + unsigned long pfn; + + /* We ignore the global flag. */ + spte.flags = (gpte.flags & ~_PAGE_GLOBAL); + pfn = get_pfn(gpte.pfn, write); + if (pfn == -1UL) { + kill_guest(lg, "failed to get page %u", gpte.pfn); + /* Must not put_page() bogus page on cleanup. */ + spte.flags = 0; + } + spte.pfn = pfn; + return spte; +} + +static void release_pte(spte_t pte) +{ + if (pte.flags & _PAGE_PRESENT) + put_page(pfn_to_page(pte.pfn)); +} + +static void check_gpte(struct lguest *lg, gpte_t gpte) +{ + if ((gpte.flags & (_PAGE_PWT|_PAGE_PSE)) || gpte.pfn >= lg->pfn_limit) + kill_guest(lg, "bad page table entry"); +} + +static void check_gpgd(struct lguest *lg, gpgd_t gpgd) +{ + if ((gpgd.flags & ~_PAGE_TABLE) || gpgd.pfn >= lg->pfn_limit) + kill_guest(lg, "bad page directory entry"); +} + +/* FIXME: We hold reference to pages, which prevents them from being + swapped. It'd be nice to have a callback when Linux wants to swap out. */ + +/* We fault pages in, which allows us to update accessed/dirty bits. + * Return true if we got page. */ +int demand_page(struct lguest *lg, unsigned long vaddr, int write) +{ + gpgd_t gpgd; + spgd_t *spgd; + unsigned long gpte_ptr; + gpte_t gpte; + spte_t *spte; + + gpgd = mkgpgd(lgread_u32(lg, gpgd_addr(lg, vaddr))); + if (!(gpgd.flags & _PAGE_PRESENT)) + return 0; + + spgd = spgd_addr(lg, lg->pgdidx, vaddr); + if (!(spgd->flags & _PAGE_PRESENT)) { + /* Get a page of PTEs for them. */ + unsigned long ptepage = get_zeroed_page(GFP_KERNEL); + /* FIXME: Steal from self in this case? */ + if (!ptepage) { + kill_guest(lg, "out of memory allocating pte page"); + return 0; + } + check_gpgd(lg, gpgd); + spgd->raw.val = (__pa(ptepage) | gpgd.flags); + } + + gpte_ptr = gpte_addr(lg, gpgd, vaddr); + gpte = mkgpte(lgread_u32(lg, gpte_ptr)); + + /* No page? */ + if (!(gpte.flags & _PAGE_PRESENT)) + return 0; + + /* Write to read-only page? */ + if (write && !(gpte.flags & _PAGE_RW)) + return 0; + + check_gpte(lg, gpte); + gpte.flags |= _PAGE_ACCESSED; + if (write) + gpte.flags |= _PAGE_DIRTY; + + /* We're done with the old pte. */ + spte = spte_addr(lg, *spgd, vaddr); + release_pte(*spte); + + /* We don't make it writable if this isn't a write: later + * write will fault so we can set dirty bit in guest. */ + if (gpte.flags & _PAGE_DIRTY) + *spte = gpte_to_spte(lg, gpte, 1); + else { + gpte_t ro_gpte = gpte; + ro_gpte.flags &= ~_PAGE_RW; + *spte = gpte_to_spte(lg, ro_gpte, 0); + } + + /* Now we update dirty/accessed on guest. */ + lgwrite_u32(lg, gpte_ptr, gpte.raw.val); + return 1; +} + +/* This is much faster than the full demand_page logic. */ +static int page_writable(struct lguest *lg, unsigned long vaddr) +{ + spgd_t *spgd; + unsigned long flags; + + spgd = spgd_addr(lg, lg->pgdidx, vaddr); + if (!(spgd->flags & _PAGE_PRESENT)) + return 0; + + flags = spte_addr(lg, *spgd, vaddr)->flags; + return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); +} + +void pin_page(struct lguest *lg, unsigned long vaddr) +{ + if (!page_writable(lg, vaddr) && !demand_page(lg, vaddr, 0)) + kill_guest(lg, "bad trap page %#lx", vaddr); +} + +static void release_pgd(struct lguest *lg, spgd_t *spgd) +{ + if (spgd->flags & _PAGE_PRESENT) { + unsigned int i; + spte_t *ptepage = __va(spgd->pfn << PAGE_SHIFT); + for (i = 0; i < PTES_PER_PAGE; i++) + release_pte(ptepage[i]); + free_page((long)ptepage); + spgd->raw.val = 0; + } +} + +static void flush_user_mappings(struct lguest *lg, int idx) +{ + unsigned int i; + for (i = 0; i < vaddr_to_pgd_index(lg->page_offset); i++) + release_pgd(lg, lg->pgdirs[idx].pgdir + i); +} + +void guest_pagetable_flush_user(struct lguest *lg) +{ + flush_user_mappings(lg, lg->pgdidx); +} + +static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) +{ + unsigned int i; + for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) + if (lg->pgdirs[i].cr3 == pgtable) + break; + return i; +} + +static unsigned int new_pgdir(struct lguest *lg, + unsigned long cr3, + int *blank_pgdir) +{ + unsigned int next; + + next = random32() % ARRAY_SIZE(lg->pgdirs); + if (!lg->pgdirs[next].pgdir) { + lg->pgdirs[next].pgdir = (spgd_t *)get_zeroed_page(GFP_KERNEL); + if (!lg->pgdirs[next].pgdir) + next = lg->pgdidx; + else + /* There are no mappings: you'll need to re-pin */ + *blank_pgdir = 1; + } + lg->pgdirs[next].cr3 = cr3; + /* Release all the non-kernel mappings. */ + flush_user_mappings(lg, next); + + return next; +} + +void guest_new_pagetable(struct lguest *lg, unsigned long pgtable) +{ + int newpgdir, repin = 0; + + newpgdir = find_pgdir(lg, pgtable); + if (newpgdir == ARRAY_SIZE(lg->pgdirs)) + newpgdir = new_pgdir(lg, pgtable, &repin); + lg->pgdidx = newpgdir; + if (repin) + pin_trap_pages(lg); +} + +static void release_all_pagetables(struct lguest *lg) +{ + unsigned int i, j; + + for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) + if (lg->pgdirs[i].pgdir) + for (j = 0; j < SWITCHER_PGD_INDEX; j++) + release_pgd(lg, lg->pgdirs[i].pgdir + j); +} + +void guest_pagetable_clear_all(struct lguest *lg) +{ + release_all_pagetables(lg); + pin_trap_pages(lg); +} + +static void do_set_pte(struct lguest *lg, int idx, + unsigned long vaddr, gpte_t gpte) +{ + spgd_t *spgd = spgd_addr(lg, idx, vaddr); + if (spgd->flags & _PAGE_PRESENT) { + spte_t *spte = spte_addr(lg, *spgd, vaddr); + release_pte(*spte); + if (gpte.flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) { + check_gpte(lg, gpte); + *spte = gpte_to_spte(lg, gpte, gpte.flags&_PAGE_DIRTY); + } else + spte->raw.val = 0; + } +} + +void guest_set_pte(struct lguest *lg, + unsigned long cr3, unsigned long vaddr, gpte_t gpte) +{ + /* Kernel mappings must be changed on all top levels. */ + if (vaddr >= lg->page_offset) { + unsigned int i; + for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) + if (lg->pgdirs[i].pgdir) + do_set_pte(lg, i, vaddr, gpte); + } else { + int pgdir = find_pgdir(lg, cr3); + if (pgdir != ARRAY_SIZE(lg->pgdirs)) + do_set_pte(lg, pgdir, vaddr, gpte); + } +} + +void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx) +{ + int pgdir; + + if (idx >= SWITCHER_PGD_INDEX) + return; + + pgdir = find_pgdir(lg, cr3); + if (pgdir < ARRAY_SIZE(lg->pgdirs)) + release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx); +} + +int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) +{ + /* We assume this in flush_user_mappings, so check now */ + if (vaddr_to_pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX) + return -EINVAL; + lg->pgdidx = 0; + lg->pgdirs[lg->pgdidx].cr3 = pgtable; + lg->pgdirs[lg->pgdidx].pgdir = (spgd_t*)get_zeroed_page(GFP_KERNEL); + if (!lg->pgdirs[lg->pgdidx].pgdir) + return -ENOMEM; + return 0; +} + +void free_guest_pagetable(struct lguest *lg) +{ + unsigned int i; + + release_all_pagetables(lg); + for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) + free_page((long)lg->pgdirs[i].pgdir); +} + +/* Caller must be preempt-safe */ +void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) +{ + spte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); + spgd_t switcher_pgd; + spte_t regs_pte; + + /* Since switcher less that 4MB, we simply mug top pte page. */ + switcher_pgd.pfn = __pa(switcher_pte_page) >> PAGE_SHIFT; + switcher_pgd.flags = _PAGE_KERNEL; + lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; + + /* Map our regs page over stack page. */ + regs_pte.pfn = __pa(lg->regs_page) >> PAGE_SHIFT; + regs_pte.flags = _PAGE_KERNEL; + switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTES_PER_PAGE] + = regs_pte; +} + +static void free_switcher_pte_pages(void) +{ + unsigned int i; + + for_each_possible_cpu(i) + free_page((long)switcher_pte_page(i)); +} + +static __init void populate_switcher_pte_page(unsigned int cpu, + struct page *switcher_page[], + unsigned int pages) +{ + unsigned int i; + spte_t *pte = switcher_pte_page(cpu); + + for (i = 0; i < pages; i++) { + pte[i].pfn = page_to_pfn(switcher_page[i]); + pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED; + } + + /* We only map this CPU's pages, so guest can't see others. */ + i = pages + cpu*2; + + /* First page (regs) is rw, second (state) is ro. */ + pte[i].pfn = page_to_pfn(switcher_page[i]); + pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW; + pte[i+1].pfn = page_to_pfn(switcher_page[i+1]); + pte[i+1].flags = _PAGE_PRESENT|_PAGE_ACCESSED; +} + +__init int init_pagetables(struct page **switcher_page, unsigned int pages) +{ + unsigned int i; + + for_each_possible_cpu(i) { + switcher_pte_page(i) = (spte_t *)get_zeroed_page(GFP_KERNEL); + if (!switcher_pte_page(i)) { + free_switcher_pte_pages(); + return -ENOMEM; + } + populate_switcher_pte_page(i, switcher_page, pages); + } + return 0; +} + +void free_pagetables(void) +{ + free_switcher_pte_pages(); +} =================================================================== --- /dev/null +++ b/drivers/lguest/segments.c @@ -0,0 +1,115 @@ +#include "lg.h" + +static int desc_ok(const struct desc_struct *gdt) +{ + /* MBZ=0, P=1, DT=1 */ + return ((gdt->b & 0x00209000) == 0x00009000); +} + +static int segment_present(const struct desc_struct *gdt) +{ + return gdt->b & 0x8000; +} + +static int ignored_gdt(unsigned int num) +{ + return (num == GDT_ENTRY_TSS + || num == GDT_ENTRY_LGUEST_CS + || num == GDT_ENTRY_LGUEST_DS + || num == GDT_ENTRY_DOUBLEFAULT_TSS); +} + +/* We don't allow removal of CS, DS or SS; it doesn't make sense. */ +static void check_segment_use(struct lguest *lg, unsigned int desc) +{ + if (lg->regs->gs / 8 == desc) + lg->regs->gs = 0; + if (lg->regs->fs / 8 == desc) + lg->regs->fs = 0; + if (lg->regs->es / 8 == desc) + lg->regs->es = 0; + if (lg->regs->ds / 8 == desc + || lg->regs->cs / 8 == desc + || lg->regs->ss / 8 == desc) + kill_guest(lg, "Removed live GDT entry %u", desc); +} + +static void fixup_gdt_table(struct lguest *lg) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(lg->gdt); i++) { + /* We never copy these ones to real gdt */ + if (ignored_gdt(i)) + continue; + + /* We could fault in switch_to_guest if they are using + * a removed segment. */ + if (!segment_present(&lg->gdt[i])) { + check_segment_use(lg, i); + continue; + } + + if (!desc_ok(&lg->gdt[i])) + kill_guest(lg, "Bad GDT descriptor %i", i); + + /* DPL 0 presumably means "for use by guest". */ + if ((lg->gdt[i].b & 0x00006000) == 0) + lg->gdt[i].b |= (GUEST_PL << 13); + + /* Set accessed bit, since gdt isn't writable. */ + lg->gdt[i].b |= 0x00000100; + } +} + +void setup_default_gdt_entries(struct lguest_ro_state *state) +{ + struct desc_struct *gdt = state->guest_gdt; + unsigned long tss = (unsigned long)&state->guest_tss; + + /* Hypervisor segments. */ + gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; + gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; + + /* This is the one which we *cannot* copy from guest, since tss + is depended on this lguest_ro_state, ie. this cpu. */ + gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16); + gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000) + | ((tss >> 16) & 0x000000FF); +} + +void setup_guest_gdt(struct lguest *lg) +{ + lg->gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; + lg->gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; + lg->gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); + lg->gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); +} + +void copy_gdt(const struct lguest *lg, struct desc_struct *gdt) +{ + unsigned int i; + + for (i = 0; i < GDT_ENTRIES; i++) + if (!ignored_gdt(i)) + gdt[i] = lg->gdt[i]; +} + +void load_guest_gdt(struct lguest *lg, u32 table, u32 num) +{ + if (num > ARRAY_SIZE(lg->gdt)) + kill_guest(lg, "too many gdt entries %i", num); + + lgread(lg, lg->gdt, table, num * sizeof(lg->gdt[0])); + fixup_gdt_table(lg); + lg->changed |= CHANGED_GDT; +} + +void guest_load_tls(struct lguest *lg, const struct desc_struct __user *gtls) +{ + struct desc_struct *tls = &lg->gdt[GDT_ENTRY_TLS_MIN]; + + lgread(lg, tls, (u32)gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES); + fixup_gdt_table(lg); + lg->changed |= CHANGED_GDT; +} =================================================================== --- /dev/null +++ b/drivers/lguest/switcher.S @@ -0,0 +1,159 @@ +/* This code sits at 0xFFC00000 to do the low-level guest<->host switch. + + There is are two pages above us for this CPU (struct lguest_pages). + The second page (struct lguest_ro_state) becomes read-only after the + context switch. The first page (the stack for traps) remains writable, + but while we're in here, the guest cannot be running. +*/ +#include <linux/linkage.h> +#include <asm/asm-offsets.h> +#include "lg.h" + +.text +ENTRY(start_switcher_text) + +/* %eax points to lguest pages for this CPU. %ebx contains cr3 value. + All normal registers can be clobbered! */ +ENTRY(switch_to_guest) + /* Save host segments on host stack. */ + pushl %es + pushl %ds + pushl %gs + pushl %fs + /* With CONFIG_FRAME_POINTER, gcc doesn't let us clobber this! */ + pushl %ebp + /* Save host stack. */ + movl %esp, LGUEST_PAGES_host_sp(%eax) + /* Switch to guest stack: if we get NMI we expect to be there. */ + movl %eax, %edx + addl $LGUEST_PAGES_regs, %edx + movl %edx, %esp + /* Switch to guest's GDT, IDT. */ + lgdt LGUEST_PAGES_guest_gdt_desc(%eax) + lidt LGUEST_PAGES_guest_idt_desc(%eax) + /* Switch to guest's TSS while GDT still writable. */ + movl $(GDT_ENTRY_TSS*8), %edx + ltr %dx + /* Set host's TSS GDT entry to available (clear byte 5 bit 2). */ + movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx + andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx) + /* Switch to guest page tables: lguest_pages->state now read-only. */ + movl %ebx, %cr3 + /* Restore guest regs */ + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %gs + popl %eax + popl %fs + popl %ds + popl %es + /* Skip error code and trap number */ + addl $8, %esp + iret + +#define SWITCH_TO_HOST \ + /* Save guest state */ \ + pushl %es; \ + pushl %ds; \ + pushl %fs; \ + pushl %eax; \ + pushl %gs; \ + pushl %ebp; \ + pushl %edi; \ + pushl %esi; \ + pushl %edx; \ + pushl %ecx; \ + pushl %ebx; \ + /* Load lguest ds segment for convenience. */ \ + movl $(LGUEST_DS), %eax; \ + movl %eax, %ds; \ + /* Figure out where we are, based on stack (at top of regs). */ \ + movl %esp, %eax; \ + subl $LGUEST_PAGES_regs, %eax; \ + /* Put trap number in %ebx before we switch cr3 and lose it. */ \ + movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \ + /* Switch to host page tables (host GDT, IDT and stack are in host \ + mem, so need this first) */ \ + movl LGUEST_PAGES_host_cr3(%eax), %edx; \ + movl %edx, %cr3; \ + /* Set guest's TSS to available (clear byte 5 bit 2). */ \ + andb $0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \ + /* Switch to host's GDT & IDT. */ \ + lgdt LGUEST_PAGES_host_gdt_desc(%eax); \ + lidt LGUEST_PAGES_host_idt_desc(%eax); \ + /* Switch to host's stack. */ \ + movl LGUEST_PAGES_host_sp(%eax), %esp; \ + /* Switch to host's TSS */ \ + movl $(GDT_ENTRY_TSS*8), %edx; \ + ltr %dx; \ + popl %ebp; \ + popl %fs; \ + popl %gs; \ + popl %ds; \ + popl %es + +/* Return to run_guest_once. */ +return_to_host: + SWITCH_TO_HOST + iret + +deliver_to_host: + SWITCH_TO_HOST + /* Decode IDT and jump to hosts' irq handler. When that does iret, it + * will return to run_guest_once. This is a feature. */ + movl (LGUEST_PAGES_host_idt_desc+2)(%eax), %edx + leal (%edx,%ebx,8), %eax + movzwl (%eax),%edx + movl 4(%eax), %eax + xorw %ax, %ax + orl %eax, %edx + jmp *%edx + +/* Real hardware interrupts are delivered straight to the host. Others + cause us to return to run_guest_once so it can decide what to do. Note + that some of these are overridden by the guest to deliver directly, and + never enter here (see load_guest_idt_entry). */ +.macro IRQ_STUB N TARGET + .data; .long 1f; .text; 1: + /* Make an error number for most traps, which don't have one. */ + .if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17) + pushl $0 + .endif + pushl $\N + jmp \TARGET + ALIGN +.endm + +.macro IRQ_STUBS FIRST LAST TARGET + irq=\FIRST + .rept \LAST-\FIRST+1 + IRQ_STUB irq \TARGET + irq=irq+1 + .endr +.endm + +/* We intercept every interrupt, because we may need to switch back to + * host. Unfortunately we can't tell them apart except by entry + * point, so we need 256 entry points. + */ +.data +.global default_idt_entries +default_idt_entries: +.text + IRQ_STUBS 0 1 return_to_host /* First two traps */ + IRQ_STUB 2 handle_nmi /* NMI */ + IRQ_STUBS 3 31 return_to_host /* Rest of traps */ + IRQ_STUBS 32 127 deliver_to_host /* Real interrupts */ + IRQ_STUB 128 return_to_host /* System call (overridden) */ + IRQ_STUBS 129 255 deliver_to_host /* Other real interrupts */ + +/* We ignore NMI and return. */ +handle_nmi: + addl $8, %esp + iret + +ENTRY(end_switcher_text) =================================================================== --- /dev/null +++ b/include/linux/lguest_launcher.h @@ -0,0 +1,80 @@ +#ifndef _ASM_LGUEST_USER +#define _ASM_LGUEST_USER +/* Everything the "lguest" userspace program needs to know. */ +/* They can register up to 32 arrays of lguest_dma. */ +#define LGUEST_MAX_DMA 32 +/* At most we can dma 16 lguest_dma in one op. */ +#define LGUEST_MAX_DMA_SECTIONS 16 + +/* How many devices? Assume each one wants up to two dma arrays per device. */ +#define LGUEST_MAX_DEVICES (LGUEST_MAX_DMA/2) + +struct lguest_dma +{ + /* 0 if free to be used, filled by hypervisor. */ + u32 used_len; + u32 addr[LGUEST_MAX_DMA_SECTIONS]; + u16 len[LGUEST_MAX_DMA_SECTIONS]; +}; + +/* This is found at address 0. */ +struct lguest_boot_info +{ + u32 max_pfn; + u32 initrd_size; + char cmdline[256]; +}; + +struct lguest_block_page +{ + /* 0 is a read, 1 is a write. */ + int type; + u32 sector; /* Offset in device = sector * 512. */ + u32 bytes; /* Length expected to be read/written in bytes */ + /* 0 = pending, 1 = done, 2 = done, error */ + int result; + u32 num_sectors; /* Disk length = num_sectors * 512 */ +}; + +/* There is a shared page of these. */ +struct lguest_net +{ + /* Simply the mac address (with multicast bit meaning promisc). */ + unsigned char mac[6]; +}; + +/* Where the Host expects the Guest to SEND_DMA console output to. */ +#define LGUEST_CONSOLE_DMA_KEY 0 + +/* We have a page of these descriptors in the lguest_device page. */ +struct lguest_device_desc { + u16 type; +#define LGUEST_DEVICE_T_CONSOLE 1 +#define LGUEST_DEVICE_T_NET 2 +#define LGUEST_DEVICE_T_BLOCK 3 + + u16 features; +#define LGUEST_NET_F_NOCSUM 0x4000 /* Don't bother checksumming */ +#define LGUEST_DEVICE_F_RANDOMNESS 0x8000 /* IRQ is fairly random */ + + u16 status; +/* 256 and above are device specific. */ +#define LGUEST_DEVICE_S_ACKNOWLEDGE 1 /* We have seen device. */ +#define LGUEST_DEVICE_S_DRIVER 2 /* We have found a driver */ +#define LGUEST_DEVICE_S_DRIVER_OK 4 /* Driver says OK! */ +#define LGUEST_DEVICE_S_REMOVED 8 /* Device has gone away. */ +#define LGUEST_DEVICE_S_REMOVED_ACK 16 /* Driver has been told. */ +#define LGUEST_DEVICE_S_FAILED 128 /* Something actually failed */ + + u16 num_pages; + u32 pfn; +}; + +/* Write command first word is a request. */ +enum lguest_req +{ + LHREQ_INITIALIZE, /* + pfnlimit, pgdir, start, pageoffset */ + LHREQ_GETDMA, /* + addr (returns &lguest_dma, irq in ->used_len) */ + LHREQ_IRQ, /* + irq */ +}; +#endif /* _ASM_LGUEST_USER */ ^ permalink raw reply [flat|nested] 13+ messages in thread
[parent not found: <1176203176.26372.25.camel@localhost.localdomain>]
* [PATCH 3/8] lguest: the asm offsets [not found] ` <1176203176.26372.25.camel@localhost.localdomain> @ 2007-04-10 11:07 ` Rusty Russell [not found] ` <1176203231.26372.27.camel@localhost.localdomain> 1 sibling, 0 replies; 13+ messages in thread From: Rusty Russell @ 2007-04-10 11:07 UTC (permalink / raw) To: Andrew Morton; +Cc: Andi Kleen, lkml - Kernel Mailing List, virtualization This is the structure offsets required by lg.ko's switcher.S. Unfortunately we don't have infrastructure for private asm-offsets creation. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> --- arch/i386/kernel/asm-offsets.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) =================================================================== --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -16,6 +16,10 @@ #include <asm/thread_info.h> #include <asm/elf.h> #include <asm/pda.h> +#ifdef CONFIG_LGUEST_GUEST +#include <linux/lguest.h> +#include "../../../drivers/lguest/lg.h" +#endif #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) @@ -114,4 +118,19 @@ void foo(void) OFFSET(PARAVIRT_iret, paravirt_ops, iret); OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0); #endif + +#ifdef CONFIG_LGUEST_GUEST + BLANK(); + OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); + OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); + OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); + OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3); + OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp); + OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc); + OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc); + OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt); + OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum); + OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); + OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); +#endif } ^ permalink raw reply [flat|nested] 13+ messages in thread
[parent not found: <1176203231.26372.27.camel@localhost.localdomain>]
* [PATCH 4/8] lguest: the Makefile and Kconfig [not found] ` <1176203231.26372.27.camel@localhost.localdomain> @ 2007-04-10 11:08 ` Rusty Russell 2007-04-10 11:08 ` [PATCH 5/8] lguest: the console driver Rusty Russell 0 siblings, 1 reply; 13+ messages in thread From: Rusty Russell @ 2007-04-10 11:08 UTC (permalink / raw) To: Andrew Morton; +Cc: Andi Kleen, lkml - Kernel Mailing List, virtualization This is the Kconfig and Makefile to allow lguest to actually be compiled. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> --- drivers/Kconfig | 2 ++ drivers/Makefile | 1 + drivers/lguest/Kconfig | 20 ++++++++++++++++++++ drivers/lguest/Makefile | 7 +++++++ 4 files changed, 30 insertions(+) =================================================================== --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -87,4 +87,6 @@ source "drivers/kvm/Kconfig" source "drivers/kvm/Kconfig" source "drivers/uio/Kconfig" + +source "drivers/lguest/Kconfig" endmenu =================================================================== --- a/drivers/Makefile +++ b/drivers/Makefile @@ -86,3 +86,4 @@ obj-$(CONFIG_HID) += hid/ obj-$(CONFIG_HID) += hid/ obj-$(CONFIG_PPC_PS3) += ps3/ obj-$(CONFIG_SSB) += ssb/ +obj-$(CONFIG_LGUEST_GUEST) += lguest/ =================================================================== --- /dev/null +++ b/drivers/lguest/Kconfig @@ -0,0 +1,20 @@ +config LGUEST + tristate "Linux hypervisor example code" + depends on X86 && PARAVIRT && NET && EXPERIMENTAL && !X86_PAE + select LGUEST_GUEST + select HVC_DRIVER + ---help--- + This is a very simple module which allows you to run + multiple instances of the same Linux kernel, using the + "lguest" command found in the Documentation/lguest directory. + Note that "lguest" is pronounced to rhyme with "fell quest", + not "rustyvisor". See Documentation/lguest/lguest.txt. + + If unsure, say N. If curious, say M. If masochistic, say Y. + +config LGUEST_GUEST + bool + help + The guest needs code built-in, even if the host has lguest + support as a module. The drivers are tiny, so we build them + in too. =================================================================== --- /dev/null +++ b/drivers/lguest/Makefile @@ -0,0 +1,7 @@ +# Guest requires the paravirt_ops replacement and the bus driver. +obj-$(CONFIG_LGUEST_GUEST) += lguest.o lguest_asm.o lguest_bus.o + +# Host requires the other files, which can be a module. +obj-$(CONFIG_LGUEST) += lg.o +lg-objs := core.o hypercalls.o page_tables.o interrupts_and_traps.o \ + segments.o io.o lguest_user.o switcher.o ^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH 5/8] lguest: the console driver 2007-04-10 11:08 ` [PATCH 4/8] lguest: the Makefile and Kconfig Rusty Russell @ 2007-04-10 11:08 ` Rusty Russell 2007-04-10 11:09 ` [PATCH 5/8] lguest: the net driver Rusty Russell ` (2 more replies) 0 siblings, 3 replies; 13+ messages in thread From: Rusty Russell @ 2007-04-10 11:08 UTC (permalink / raw) To: Andrew Morton; +Cc: Andi Kleen, lkml - Kernel Mailing List, virtualization A simple console driver for lguest. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> --- drivers/char/Makefile | 1 drivers/char/hvc_lguest.c | 99 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) =================================================================== --- a/drivers/char/Makefile +++ b/drivers/char/Makefile @@ -44,6 +44,7 @@ obj-$(CONFIG_SX) += sx.o generic_serial obj-$(CONFIG_SX) += sx.o generic_serial.o obj-$(CONFIG_RIO) += rio/ generic_serial.o obj-$(CONFIG_HVC_CONSOLE) += hvc_vio.o hvsi.o +obj-$(CONFIG_LGUEST_GUEST) += hvc_lguest.o obj-$(CONFIG_HVC_ISERIES) += hvc_iseries.o obj-$(CONFIG_HVC_RTAS) += hvc_rtas.o obj-$(CONFIG_HVC_BEAT) += hvc_beat.o =================================================================== --- /dev/null +++ b/drivers/char/hvc_lguest.c @@ -0,0 +1,99 @@ +/* Simple console for lguest. + * + * Copyright (C) 2006 Rusty Russell, IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <linux/err.h> +#include <linux/init.h> +#include <linux/lguest_bus.h> +#include "hvc_console.h" + +static char inbuf[256]; +static struct lguest_dma cons_input = { .used_len = 0, + .addr[0] = __pa(inbuf), + .len[0] = sizeof(inbuf), + .len[1] = 0 }; + +static int put_chars(u32 vtermno, const char *buf, int count) +{ + struct lguest_dma dma; + + /* FIXME: what if it's over a page boundary? */ + dma.len[0] = count; + dma.len[1] = 0; + dma.addr[0] = __pa(buf); + + hcall(LHCALL_SEND_DMA, LGUEST_CONSOLE_DMA_KEY, __pa(&dma), 0); + return count; +} + +static int get_chars(u32 vtermno, char *buf, int count) +{ + static int cons_offset; + + if (!cons_input.used_len) + return 0; + + if (cons_input.used_len - cons_offset < count) + count = cons_input.used_len - cons_offset; + + memcpy(buf, inbuf + cons_offset, count); + cons_offset += count; + if (cons_offset == cons_input.used_len) { + cons_offset = 0; + cons_input.used_len = 0; + } + return count; +} + +struct hv_ops lguest_cons = { + .get_chars = get_chars, + .put_chars = put_chars, +}; + +static int __init cons_init(void) +{ + if (strcmp(paravirt_ops.name, "lguest") != 0) + return 0; + + return hvc_instantiate(0, 0, &lguest_cons); +} +console_initcall(cons_init); + +static int lguestcons_probe(struct lguest_device *lgdev) +{ + lgdev->private = hvc_alloc(0, lgdev->index+1, &lguest_cons, 256); + if (IS_ERR(lgdev->private)) + return PTR_ERR(lgdev->private); + + if (!hcall(LHCALL_BIND_DMA, LGUEST_CONSOLE_DMA_KEY, __pa(&cons_input), + (1<<8) + lgdev->index+1)) + printk("lguest console: failed to bind buffer.\n"); + return 0; +} + +static struct lguest_driver lguestcons_drv = { + .name = "lguestcons", + .owner = THIS_MODULE, + .device_type = LGUEST_DEVICE_T_CONSOLE, + .probe = lguestcons_probe, +}; + +static int __init hvc_lguest_init(void) +{ + return register_lguest_driver(&lguestcons_drv); +} +module_init(hvc_lguest_init); ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 5/8] lguest: the net driver 2007-04-10 11:08 ` [PATCH 5/8] lguest: the console driver Rusty Russell @ 2007-04-10 11:09 ` Rusty Russell 2007-04-10 11:11 ` [PATCH 6/8] " Rusty Russell [not found] ` <1176203475.26372.37.camel@localhost.localdomain> 2 siblings, 0 replies; 13+ messages in thread From: Rusty Russell @ 2007-04-10 11:09 UTC (permalink / raw) To: Andrew Morton; +Cc: Andi Kleen, lkml - Kernel Mailing List, virtualization A simple net driver for lguest. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> --- drivers/net/Makefile | 1 drivers/net/lguest_net.c | 355 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 356 insertions(+) =================================================================== --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -220,3 +220,4 @@ obj-$(CONFIG_FS_ENET) += fs_enet/ obj-$(CONFIG_FS_ENET) += fs_enet/ obj-$(CONFIG_NETXEN_NIC) += netxen/ +obj-$(CONFIG_LGUEST_GUEST) += lguest_net.o =================================================================== --- /dev/null +++ b/drivers/net/lguest_net.c @@ -0,0 +1,355 @@ +/* A simple network driver for lguest. + * + * Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +//#define DEBUG +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/module.h> +#include <linux/mm_types.h> +#include <linux/lguest_bus.h> +#include <asm/io.h> + +#define SHARED_SIZE PAGE_SIZE +#define MAX_LANS 4 +#define NUM_SKBS 8 + +struct lguestnet_info +{ + /* The shared page(s). */ + struct lguest_net *peer; + unsigned long peer_phys; + unsigned long mapsize; + + /* My peerid. */ + unsigned int me; + + struct net_device_stats stats; + + /* Receive queue. */ + struct sk_buff *skb[NUM_SKBS]; + struct lguest_dma dma[NUM_SKBS]; +}; + +/* How many bytes left in this page. */ +static unsigned int rest_of_page(void *data) +{ + return PAGE_SIZE - ((unsigned long)data % PAGE_SIZE); +} + +/* Simple convention: offset 4 * peernum. */ +static unsigned long peer_key(struct lguestnet_info *info, unsigned peernum) +{ + return info->peer_phys + 4 * peernum; +} + +static void skb_to_dma(const struct sk_buff *skb, unsigned int headlen, + struct lguest_dma *dma) +{ + unsigned int i, seg; + + for (i = seg = 0; i < headlen; seg++, i += rest_of_page(skb->data+i)) { + dma->addr[seg] = virt_to_phys(skb->data + i); + dma->len[seg] = min((unsigned)(headlen - i), + rest_of_page(skb->data + i)); + } + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, seg++) { + const skb_frag_t *f = &skb_shinfo(skb)->frags[i]; + /* Should not happen with MTU less than 64k - 2 * PAGE_SIZE. */ + if (seg == LGUEST_MAX_DMA_SECTIONS) { + printk("Woah dude! Megapacket!\n"); + break; + } + dma->addr[seg] = page_to_phys(f->page) + f->page_offset; + dma->len[seg] = f->size; + } + if (seg < LGUEST_MAX_DMA_SECTIONS) + dma->len[seg] = 0; +} + +/* We overload multicast bit to show promiscuous mode. */ +#define PROMISC_BIT 0x01 + +static void lguestnet_set_multicast(struct net_device *dev) +{ + struct lguestnet_info *info = dev->priv; + + if ((dev->flags & (IFF_PROMISC|IFF_ALLMULTI)) || dev->mc_count) + info->peer[info->me].mac[0] |= PROMISC_BIT; + else + info->peer[info->me].mac[0] &= ~PROMISC_BIT; +} + +static int promisc(struct lguestnet_info *info, unsigned int peer) +{ + return info->peer[peer].mac[0] & PROMISC_BIT; +} + +static int mac_eq(const unsigned char mac[ETH_ALEN], + struct lguestnet_info *info, unsigned int peer) +{ + /* Ignore multicast bit, which peer turns on to mean promisc. */ + if ((info->peer[peer].mac[0] & (~PROMISC_BIT)) != mac[0]) + return 0; + return memcmp(mac+1, info->peer[peer].mac+1, ETH_ALEN-1) == 0; +} + +static void transfer_packet(struct net_device *dev, + struct sk_buff *skb, + unsigned int peernum) +{ + struct lguestnet_info *info = dev->priv; + struct lguest_dma dma; + + skb_to_dma(skb, skb_headlen(skb), &dma); + pr_debug("xfer length %04x (%u)\n", htons(skb->len), skb->len); + + hcall(LHCALL_SEND_DMA, peer_key(info,peernum), __pa(&dma), 0); + if (dma.used_len != skb->len) { + info->stats.tx_carrier_errors++; + pr_debug("Bad xfer to peer %i: %i of %i (dma %p/%i)\n", + peernum, dma.used_len, skb->len, + (void *)dma.addr[0], dma.len[0]); + } else { + info->stats.tx_bytes += skb->len; + info->stats.tx_packets++; + } +} + +static int unused_peer(const struct lguest_net peer[], unsigned int num) +{ + return peer[num].mac[0] == 0; +} + +static int lguestnet_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + unsigned int i; + int broadcast; + struct lguestnet_info *info = dev->priv; + const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest; + + pr_debug("%s: xmit %02x:%02x:%02x:%02x:%02x:%02x\n", + dev->name, dest[0],dest[1],dest[2],dest[3],dest[4],dest[5]); + + broadcast = is_multicast_ether_addr(dest); + for (i = 0; i < info->mapsize/sizeof(struct lguest_net); i++) { + if (i == info->me || unused_peer(info->peer, i)) + continue; + + if (!broadcast && !promisc(info, i) && !mac_eq(dest, info, i)) + continue; + + pr_debug("lguestnet %s: sending from %i to %i\n", + dev->name, info->me, i); + transfer_packet(dev, skb, i); + } + dev_kfree_skb(skb); + return 0; +} + +/* Find a new skb to put in this slot in shared mem. */ +static int fill_slot(struct net_device *dev, unsigned int slot) +{ + struct lguestnet_info *info = dev->priv; + /* Try to create and register a new one. */ + info->skb[slot] = netdev_alloc_skb(dev, ETH_HLEN + ETH_DATA_LEN); + if (!info->skb[slot]) { + printk("%s: could not fill slot %i\n", dev->name, slot); + return -ENOMEM; + } + + skb_to_dma(info->skb[slot], ETH_HLEN + ETH_DATA_LEN, &info->dma[slot]); + wmb(); + /* Now we tell hypervisor it can use the slot. */ + info->dma[slot].used_len = 0; + return 0; +} + +static irqreturn_t lguestnet_rcv(int irq, void *dev_id) +{ + struct net_device *dev = dev_id; + struct lguestnet_info *info = dev->priv; + unsigned int i, done = 0; + + for (i = 0; i < ARRAY_SIZE(info->dma); i++) { + unsigned int length; + struct sk_buff *skb; + + length = info->dma[i].used_len; + if (length == 0) + continue; + + done++; + skb = info->skb[i]; + fill_slot(dev, i); + + if (length < ETH_HLEN || length > ETH_HLEN + ETH_DATA_LEN) { + pr_debug(KERN_WARNING "%s: unbelievable skb len: %i\n", + dev->name, length); + dev_kfree_skb(skb); + continue; + } + + skb_put(skb, length); + skb->protocol = eth_type_trans(skb, dev); + /* This is a reliable transport. */ + if (dev->features & NETIF_F_NO_CSUM) + skb->ip_summed = CHECKSUM_UNNECESSARY; + pr_debug("Receiving skb proto 0x%04x len %i type %i\n", + ntohs(skb->protocol), skb->len, skb->pkt_type); + + info->stats.rx_bytes += skb->len; + info->stats.rx_packets++; + netif_rx(skb); + } + return done ? IRQ_HANDLED : IRQ_NONE; +} + +static int lguestnet_open(struct net_device *dev) +{ + int i; + struct lguestnet_info *info = dev->priv; + + /* Set up our MAC address */ + memcpy(info->peer[info->me].mac, dev->dev_addr, ETH_ALEN); + + /* Turn on promisc mode if needed */ + lguestnet_set_multicast(dev); + + for (i = 0; i < ARRAY_SIZE(info->dma); i++) { + if (fill_slot(dev, i) != 0) + goto cleanup; + } + if (!hcall(LHCALL_BIND_DMA, peer_key(info, info->me), __pa(info->dma), + (NUM_SKBS << 8) | dev->irq)) + goto cleanup; + return 0; + +cleanup: + while (--i >= 0) + dev_kfree_skb(info->skb[i]); + return -ENOMEM; +} + +static int lguestnet_close(struct net_device *dev) +{ + unsigned int i; + struct lguestnet_info *info = dev->priv; + + /* Clear all trace: others might deliver packets, we'll ignore it. */ + memset(&info->peer[info->me], 0xFF, sizeof(info->peer[info->me])); + + /* Deregister sg lists. */ + hcall(LHCALL_BIND_DMA, peer_key(info, info->me), __pa(info->dma), 0); + for (i = 0; i < ARRAY_SIZE(info->dma); i++) + dev_kfree_skb(info->skb[i]); + return 0; +} + +static struct net_device_stats *lguestnet_get_stats(struct net_device *dev) +{ + struct lguestnet_info *info = dev->priv; + + return &info->stats; +} + +static int lguestnet_probe(struct lguest_device *lgdev) +{ + int err, irqf = IRQF_SHARED; + struct net_device *dev; + struct lguestnet_info *info; + struct lguest_device_desc *desc = &lguest_devices[lgdev->index]; + + pr_debug("lguest_net: probing for device %i\n", lgdev->index); + + dev = alloc_etherdev(sizeof(struct lguestnet_info)); + if (!dev) + return -ENOMEM; + + SET_MODULE_OWNER(dev); + + /* Ethernet defaults with some changes */ + ether_setup(dev); + dev->set_mac_address = NULL; + + dev->dev_addr[0] = 0x02; /* set local assignment bit (IEEE802) */ + dev->dev_addr[1] = 0x00; + memcpy(&dev->dev_addr[2], &lguest_data.guestid, 2); + dev->dev_addr[4] = 0x00; + dev->dev_addr[5] = 0x00; + + dev->open = lguestnet_open; + dev->stop = lguestnet_close; + dev->hard_start_xmit = lguestnet_start_xmit; + dev->get_stats = lguestnet_get_stats; + + /* Turning on/off promisc will call dev->set_multicast_list. + * We don't actually support multicast yet */ + dev->set_multicast_list = lguestnet_set_multicast; + dev->mem_start = ((unsigned long)desc->pfn << PAGE_SHIFT); + dev->mem_end = dev->mem_start + PAGE_SIZE * desc->num_pages; + dev->irq = lgdev->index+1; + dev->features = NETIF_F_SG; + if (desc->features & LGUEST_NET_F_NOCSUM) + dev->features |= NETIF_F_NO_CSUM; + + info = dev->priv; + info->mapsize = PAGE_SIZE * desc->num_pages; + info->peer_phys = ((unsigned long)desc->pfn << PAGE_SHIFT); + info->peer = (void *)ioremap(info->peer_phys, info->mapsize); + /* This stores our peerid (upper bits reserved for future). */ + info->me = (desc->features & (info->mapsize-1)); + + err = register_netdev(dev); + if (err) { + pr_debug("lguestnet: registering device failed\n"); + goto free; + } + + if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS) + irqf |= IRQF_SAMPLE_RANDOM; + if (request_irq(dev->irq, lguestnet_rcv, irqf, "lguestnet", dev) != 0) { + pr_debug("lguestnet: could not get net irq %i\n", dev->irq); + goto unregister; + } + + pr_debug("lguestnet: registered device %s\n", dev->name); + lgdev->private = dev; + return 0; + +unregister: + unregister_netdev(dev); +free: + free_netdev(dev); + return err; +} + +static struct lguest_driver lguestnet_drv = { + .name = "lguestnet", + .owner = THIS_MODULE, + .device_type = LGUEST_DEVICE_T_NET, + .probe = lguestnet_probe, +}; + +static __init int lguestnet_init(void) +{ + return register_lguest_driver(&lguestnet_drv); +} +module_init(lguestnet_init); + +MODULE_DESCRIPTION("Lguest network driver"); +MODULE_LICENSE("GPL"); ^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH 6/8] lguest: the net driver 2007-04-10 11:08 ` [PATCH 5/8] lguest: the console driver Rusty Russell 2007-04-10 11:09 ` [PATCH 5/8] lguest: the net driver Rusty Russell @ 2007-04-10 11:11 ` Rusty Russell [not found] ` <1176203475.26372.37.camel@localhost.localdomain> 2 siblings, 0 replies; 13+ messages in thread From: Rusty Russell @ 2007-04-10 11:11 UTC (permalink / raw) To: Andrew Morton; +Cc: Andi Kleen, lkml - Kernel Mailing List, virtualization Lguest net driver A simple net driver for lguest. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> --- drivers/net/Makefile | 1 drivers/net/lguest_net.c | 355 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 356 insertions(+) =================================================================== --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -220,3 +220,4 @@ obj-$(CONFIG_FS_ENET) += fs_enet/ obj-$(CONFIG_FS_ENET) += fs_enet/ obj-$(CONFIG_NETXEN_NIC) += netxen/ +obj-$(CONFIG_LGUEST_GUEST) += lguest_net.o =================================================================== --- /dev/null +++ b/drivers/net/lguest_net.c @@ -0,0 +1,355 @@ +/* A simple network driver for lguest. + * + * Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +//#define DEBUG +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/module.h> +#include <linux/mm_types.h> +#include <linux/lguest_bus.h> +#include <asm/io.h> + +#define SHARED_SIZE PAGE_SIZE +#define MAX_LANS 4 +#define NUM_SKBS 8 + +struct lguestnet_info +{ + /* The shared page(s). */ + struct lguest_net *peer; + unsigned long peer_phys; + unsigned long mapsize; + + /* My peerid. */ + unsigned int me; + + struct net_device_stats stats; + + /* Receive queue. */ + struct sk_buff *skb[NUM_SKBS]; + struct lguest_dma dma[NUM_SKBS]; +}; + +/* How many bytes left in this page. */ +static unsigned int rest_of_page(void *data) +{ + return PAGE_SIZE - ((unsigned long)data % PAGE_SIZE); +} + +/* Simple convention: offset 4 * peernum. */ +static unsigned long peer_key(struct lguestnet_info *info, unsigned peernum) +{ + return info->peer_phys + 4 * peernum; +} + +static void skb_to_dma(const struct sk_buff *skb, unsigned int headlen, + struct lguest_dma *dma) +{ + unsigned int i, seg; + + for (i = seg = 0; i < headlen; seg++, i += rest_of_page(skb->data+i)) { + dma->addr[seg] = virt_to_phys(skb->data + i); + dma->len[seg] = min((unsigned)(headlen - i), + rest_of_page(skb->data + i)); + } + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, seg++) { + const skb_frag_t *f = &skb_shinfo(skb)->frags[i]; + /* Should not happen with MTU less than 64k - 2 * PAGE_SIZE. */ + if (seg == LGUEST_MAX_DMA_SECTIONS) { + printk("Woah dude! Megapacket!\n"); + break; + } + dma->addr[seg] = page_to_phys(f->page) + f->page_offset; + dma->len[seg] = f->size; + } + if (seg < LGUEST_MAX_DMA_SECTIONS) + dma->len[seg] = 0; +} + +/* We overload multicast bit to show promiscuous mode. */ +#define PROMISC_BIT 0x01 + +static void lguestnet_set_multicast(struct net_device *dev) +{ + struct lguestnet_info *info = dev->priv; + + if ((dev->flags & (IFF_PROMISC|IFF_ALLMULTI)) || dev->mc_count) + info->peer[info->me].mac[0] |= PROMISC_BIT; + else + info->peer[info->me].mac[0] &= ~PROMISC_BIT; +} + +static int promisc(struct lguestnet_info *info, unsigned int peer) +{ + return info->peer[peer].mac[0] & PROMISC_BIT; +} + +static int mac_eq(const unsigned char mac[ETH_ALEN], + struct lguestnet_info *info, unsigned int peer) +{ + /* Ignore multicast bit, which peer turns on to mean promisc. */ + if ((info->peer[peer].mac[0] & (~PROMISC_BIT)) != mac[0]) + return 0; + return memcmp(mac+1, info->peer[peer].mac+1, ETH_ALEN-1) == 0; +} + +static void transfer_packet(struct net_device *dev, + struct sk_buff *skb, + unsigned int peernum) +{ + struct lguestnet_info *info = dev->priv; + struct lguest_dma dma; + + skb_to_dma(skb, skb_headlen(skb), &dma); + pr_debug("xfer length %04x (%u)\n", htons(skb->len), skb->len); + + hcall(LHCALL_SEND_DMA, peer_key(info,peernum), __pa(&dma), 0); + if (dma.used_len != skb->len) { + info->stats.tx_carrier_errors++; + pr_debug("Bad xfer to peer %i: %i of %i (dma %p/%i)\n", + peernum, dma.used_len, skb->len, + (void *)dma.addr[0], dma.len[0]); + } else { + info->stats.tx_bytes += skb->len; + info->stats.tx_packets++; + } +} + +static int unused_peer(const struct lguest_net peer[], unsigned int num) +{ + return peer[num].mac[0] == 0; +} + +static int lguestnet_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + unsigned int i; + int broadcast; + struct lguestnet_info *info = dev->priv; + const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest; + + pr_debug("%s: xmit %02x:%02x:%02x:%02x:%02x:%02x\n", + dev->name, dest[0],dest[1],dest[2],dest[3],dest[4],dest[5]); + + broadcast = is_multicast_ether_addr(dest); + for (i = 0; i < info->mapsize/sizeof(struct lguest_net); i++) { + if (i == info->me || unused_peer(info->peer, i)) + continue; + + if (!broadcast && !promisc(info, i) && !mac_eq(dest, info, i)) + continue; + + pr_debug("lguestnet %s: sending from %i to %i\n", + dev->name, info->me, i); + transfer_packet(dev, skb, i); + } + dev_kfree_skb(skb); + return 0; +} + +/* Find a new skb to put in this slot in shared mem. */ +static int fill_slot(struct net_device *dev, unsigned int slot) +{ + struct lguestnet_info *info = dev->priv; + /* Try to create and register a new one. */ + info->skb[slot] = netdev_alloc_skb(dev, ETH_HLEN + ETH_DATA_LEN); + if (!info->skb[slot]) { + printk("%s: could not fill slot %i\n", dev->name, slot); + return -ENOMEM; + } + + skb_to_dma(info->skb[slot], ETH_HLEN + ETH_DATA_LEN, &info->dma[slot]); + wmb(); + /* Now we tell hypervisor it can use the slot. */ + info->dma[slot].used_len = 0; + return 0; +} + +static irqreturn_t lguestnet_rcv(int irq, void *dev_id) +{ + struct net_device *dev = dev_id; + struct lguestnet_info *info = dev->priv; + unsigned int i, done = 0; + + for (i = 0; i < ARRAY_SIZE(info->dma); i++) { + unsigned int length; + struct sk_buff *skb; + + length = info->dma[i].used_len; + if (length == 0) + continue; + + done++; + skb = info->skb[i]; + fill_slot(dev, i); + + if (length < ETH_HLEN || length > ETH_HLEN + ETH_DATA_LEN) { + pr_debug(KERN_WARNING "%s: unbelievable skb len: %i\n", + dev->name, length); + dev_kfree_skb(skb); + continue; + } + + skb_put(skb, length); + skb->protocol = eth_type_trans(skb, dev); + /* This is a reliable transport. */ + if (dev->features & NETIF_F_NO_CSUM) + skb->ip_summed = CHECKSUM_UNNECESSARY; + pr_debug("Receiving skb proto 0x%04x len %i type %i\n", + ntohs(skb->protocol), skb->len, skb->pkt_type); + + info->stats.rx_bytes += skb->len; + info->stats.rx_packets++; + netif_rx(skb); + } + return done ? IRQ_HANDLED : IRQ_NONE; +} + +static int lguestnet_open(struct net_device *dev) +{ + int i; + struct lguestnet_info *info = dev->priv; + + /* Set up our MAC address */ + memcpy(info->peer[info->me].mac, dev->dev_addr, ETH_ALEN); + + /* Turn on promisc mode if needed */ + lguestnet_set_multicast(dev); + + for (i = 0; i < ARRAY_SIZE(info->dma); i++) { + if (fill_slot(dev, i) != 0) + goto cleanup; + } + if (!hcall(LHCALL_BIND_DMA, peer_key(info, info->me), __pa(info->dma), + (NUM_SKBS << 8) | dev->irq)) + goto cleanup; + return 0; + +cleanup: + while (--i >= 0) + dev_kfree_skb(info->skb[i]); + return -ENOMEM; +} + +static int lguestnet_close(struct net_device *dev) +{ + unsigned int i; + struct lguestnet_info *info = dev->priv; + + /* Clear all trace: others might deliver packets, we'll ignore it. */ + memset(&info->peer[info->me], 0xFF, sizeof(info->peer[info->me])); + + /* Deregister sg lists. */ + hcall(LHCALL_BIND_DMA, peer_key(info, info->me), __pa(info->dma), 0); + for (i = 0; i < ARRAY_SIZE(info->dma); i++) + dev_kfree_skb(info->skb[i]); + return 0; +} + +static struct net_device_stats *lguestnet_get_stats(struct net_device *dev) +{ + struct lguestnet_info *info = dev->priv; + + return &info->stats; +} + +static int lguestnet_probe(struct lguest_device *lgdev) +{ + int err, irqf = IRQF_SHARED; + struct net_device *dev; + struct lguestnet_info *info; + struct lguest_device_desc *desc = &lguest_devices[lgdev->index]; + + pr_debug("lguest_net: probing for device %i\n", lgdev->index); + + dev = alloc_etherdev(sizeof(struct lguestnet_info)); + if (!dev) + return -ENOMEM; + + SET_MODULE_OWNER(dev); + + /* Ethernet defaults with some changes */ + ether_setup(dev); + dev->set_mac_address = NULL; + + dev->dev_addr[0] = 0x02; /* set local assignment bit (IEEE802) */ + dev->dev_addr[1] = 0x00; + memcpy(&dev->dev_addr[2], &lguest_data.guestid, 2); + dev->dev_addr[4] = 0x00; + dev->dev_addr[5] = 0x00; + + dev->open = lguestnet_open; + dev->stop = lguestnet_close; + dev->hard_start_xmit = lguestnet_start_xmit; + dev->get_stats = lguestnet_get_stats; + + /* Turning on/off promisc will call dev->set_multicast_list. + * We don't actually support multicast yet */ + dev->set_multicast_list = lguestnet_set_multicast; + dev->mem_start = ((unsigned long)desc->pfn << PAGE_SHIFT); + dev->mem_end = dev->mem_start + PAGE_SIZE * desc->num_pages; + dev->irq = lgdev->index+1; + dev->features = NETIF_F_SG; + if (desc->features & LGUEST_NET_F_NOCSUM) + dev->features |= NETIF_F_NO_CSUM; + + info = dev->priv; + info->mapsize = PAGE_SIZE * desc->num_pages; + info->peer_phys = ((unsigned long)desc->pfn << PAGE_SHIFT); + info->peer = (void *)ioremap(info->peer_phys, info->mapsize); + /* This stores our peerid (upper bits reserved for future). */ + info->me = (desc->features & (info->mapsize-1)); + + err = register_netdev(dev); + if (err) { + pr_debug("lguestnet: registering device failed\n"); + goto free; + } + + if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS) + irqf |= IRQF_SAMPLE_RANDOM; + if (request_irq(dev->irq, lguestnet_rcv, irqf, "lguestnet", dev) != 0) { + pr_debug("lguestnet: could not get net irq %i\n", dev->irq); + goto unregister; + } + + pr_debug("lguestnet: registered device %s\n", dev->name); + lgdev->private = dev; + return 0; + +unregister: + unregister_netdev(dev); +free: + free_netdev(dev); + return err; +} + +static struct lguest_driver lguestnet_drv = { + .name = "lguestnet", + .owner = THIS_MODULE, + .device_type = LGUEST_DEVICE_T_NET, + .probe = lguestnet_probe, +}; + +static __init int lguestnet_init(void) +{ + return register_lguest_driver(&lguestnet_drv); +} +module_init(lguestnet_init); + +MODULE_DESCRIPTION("Lguest network driver"); +MODULE_LICENSE("GPL"); ^ permalink raw reply [flat|nested] 13+ messages in thread
[parent not found: <1176203475.26372.37.camel@localhost.localdomain>]
* PATCH 7/8] lguest: the block driver [not found] ` <1176203475.26372.37.camel@localhost.localdomain> @ 2007-04-10 11:12 ` Rusty Russell [not found] ` <1176203528.26372.40.camel@localhost.localdomain> 1 sibling, 0 replies; 13+ messages in thread From: Rusty Russell @ 2007-04-10 11:12 UTC (permalink / raw) To: Andrew Morton; +Cc: Andi Kleen, lkml - Kernel Mailing List, virtualization Lguest block driver A simple block driver for lguest. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> --- drivers/block/Makefile | 1 drivers/block/lguest_blk.c | 271 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 272 insertions(+) =================================================================== --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -28,4 +28,5 @@ obj-$(CONFIG_VIODASD) += viodasd.o obj-$(CONFIG_VIODASD) += viodasd.o obj-$(CONFIG_BLK_DEV_SX8) += sx8.o obj-$(CONFIG_BLK_DEV_UB) += ub.o +obj-$(CONFIG_LGUEST_GUEST) += lguest_blk.o =================================================================== --- /dev/null +++ b/drivers/block/lguest_blk.c @@ -0,0 +1,271 @@ +/* A simple block driver for lguest. + * + * Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +//#define DEBUG +#include <linux/init.h> +#include <linux/types.h> +#include <linux/blkdev.h> +#include <linux/interrupt.h> +#include <linux/lguest_bus.h> + +static char next_block_index = 'a'; + +struct blockdev +{ + spinlock_t lock; + + /* The disk structure for the kernel. */ + struct gendisk *disk; + + /* The major number for this disk. */ + int major; + int irq; + + unsigned long phys_addr; + /* The ioremap'ed block page. */ + struct lguest_block_page *lb_page; + + /* We only have a single request outstanding at a time. */ + struct lguest_dma dma; + struct request *req; +}; + +/* Jens gave me this nice helper to end all chunks of a request. */ +static void end_entire_request(struct request *req, int uptodate) +{ + if (end_that_request_first(req, uptodate, req->hard_nr_sectors)) + BUG(); + add_disk_randomness(req->rq_disk); + blkdev_dequeue_request(req); + end_that_request_last(req, uptodate); +} + +static irqreturn_t lgb_irq(int irq, void *_bd) +{ + struct blockdev *bd = _bd; + unsigned long flags; + + if (!bd->req) { + pr_debug("No work!\n"); + return IRQ_NONE; + } + + if (!bd->lb_page->result) { + pr_debug("No result!\n"); + return IRQ_NONE; + } + + spin_lock_irqsave(&bd->lock, flags); + end_entire_request(bd->req, bd->lb_page->result == 1); + bd->req = NULL; + bd->dma.used_len = 0; + blk_start_queue(bd->disk->queue); + spin_unlock_irqrestore(&bd->lock, flags); + return IRQ_HANDLED; +} + +static unsigned int req_to_dma(struct request *req, struct lguest_dma *dma) +{ + unsigned int i = 0, idx, len = 0; + struct bio *bio; + + rq_for_each_bio(bio, req) { + struct bio_vec *bvec; + bio_for_each_segment(bvec, bio, idx) { + BUG_ON(i == LGUEST_MAX_DMA_SECTIONS); + BUG_ON(!bvec->bv_len); + dma->addr[i] = page_to_phys(bvec->bv_page) + + bvec->bv_offset; + dma->len[i] = bvec->bv_len; + len += bvec->bv_len; + i++; + } + } + if (i < LGUEST_MAX_DMA_SECTIONS) + dma->len[i] = 0; + return len; +} + +static void empty_dma(struct lguest_dma *dma) +{ + dma->len[0] = 0; +} + +static void setup_req(struct blockdev *bd, + int type, struct request *req, struct lguest_dma *dma) +{ + bd->lb_page->type = type; + bd->lb_page->sector = req->sector; + bd->lb_page->result = 0; + bd->req = req; + bd->lb_page->bytes = req_to_dma(req, dma); +} + +static void do_write(struct blockdev *bd, struct request *req) +{ + struct lguest_dma send; + + pr_debug("lgb: WRITE sector %li\n", (long)req->sector); + setup_req(bd, 1, req, &send); + + hcall(LHCALL_SEND_DMA, bd->phys_addr, __pa(&send), 0); +} + +static void do_read(struct blockdev *bd, struct request *req) +{ + struct lguest_dma ping; + + pr_debug("lgb: READ sector %li\n", (long)req->sector); + setup_req(bd, 0, req, &bd->dma); + + empty_dma(&ping); + hcall(LHCALL_SEND_DMA, bd->phys_addr, __pa(&ping), 0); +} + +static void do_lgb_request(request_queue_t *q) +{ + struct blockdev *bd; + struct request *req; + +again: + req = elv_next_request(q); + if (!req) + return; + + bd = req->rq_disk->private_data; + /* Sometimes we get repeated requests after blk_stop_queue. */ + if (bd->req) + return; + + if (!blk_fs_request(req)) { + pr_debug("Got non-command 0x%08x\n", req->cmd_type); + req->errors++; + end_entire_request(req, 0); + goto again; + } + + if (rq_data_dir(req) == WRITE) + do_write(bd, req); + else + do_read(bd, req); + + /* Wait for interrupt to tell us it's done. */ + blk_stop_queue(q); +} + +static struct block_device_operations lguestblk_fops = { + .owner = THIS_MODULE, +}; + +static int lguestblk_probe(struct lguest_device *lgdev) +{ + struct blockdev *bd; + int err; + int irqflags = IRQF_SHARED; + + bd = kmalloc(sizeof(*bd), GFP_KERNEL); + if (!bd) + return -ENOMEM; + + spin_lock_init(&bd->lock); + bd->irq = lgdev->index+1; + bd->req = NULL; + bd->dma.used_len = 0; + bd->dma.len[0] = 0; + bd->phys_addr = (lguest_devices[lgdev->index].pfn << PAGE_SHIFT); + + bd->lb_page = (void *)ioremap(bd->phys_addr, PAGE_SIZE); + if (!bd->lb_page) { + err = -ENOMEM; + goto out_free_bd; + } + + bd->major = register_blkdev(0, "lguestblk"); + if (bd->major < 0) { + err = bd->major; + goto out_unmap; + } + + bd->disk = alloc_disk(1); + if (!bd->disk) { + err = -ENOMEM; + goto out_unregister_blkdev; + } + + bd->disk->queue = blk_init_queue(do_lgb_request, &bd->lock); + if (!bd->disk->queue) { + err = -ENOMEM; + goto out_put_disk; + } + + /* We can only handle a certain number of sg entries */ + blk_queue_max_hw_segments(bd->disk->queue, LGUEST_MAX_DMA_SECTIONS); + /* Buffers must not cross page boundaries */ + blk_queue_segment_boundary(bd->disk->queue, PAGE_SIZE-1); + + sprintf(bd->disk->disk_name, "lgb%c", next_block_index++); + if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS) + irqflags |= IRQF_SAMPLE_RANDOM; + err = request_irq(bd->irq, lgb_irq, irqflags, bd->disk->disk_name, bd); + if (err) + goto out_cleanup_queue; + + hcall(LHCALL_BIND_DMA, bd->phys_addr, __pa(&bd->dma), (1<<8)+bd->irq); + + bd->disk->major = bd->major; + bd->disk->first_minor = 0; + bd->disk->private_data = bd; + bd->disk->fops = &lguestblk_fops; + /* This is initialized to the disk size by the other end. */ + set_capacity(bd->disk, bd->lb_page->num_sectors); + add_disk(bd->disk); + + printk(KERN_INFO "%s: device %i at major %d\n", + bd->disk->disk_name, lgdev->index, bd->major); + + lgdev->private = bd; + return 0; + +out_cleanup_queue: + blk_cleanup_queue(bd->disk->queue); +out_put_disk: + put_disk(bd->disk); +out_unregister_blkdev: + unregister_blkdev(bd->major, "lguestblk"); +out_unmap: + iounmap(bd->lb_page); +out_free_bd: + kfree(bd); + return err; +} + +static struct lguest_driver lguestblk_drv = { + .name = "lguestblk", + .owner = THIS_MODULE, + .device_type = LGUEST_DEVICE_T_BLOCK, + .probe = lguestblk_probe, +}; + +static __init int lguestblk_init(void) +{ + return register_lguest_driver(&lguestblk_drv); +} +module_init(lguestblk_init); + +MODULE_DESCRIPTION("Lguest block driver"); +MODULE_LICENSE("GPL"); ^ permalink raw reply [flat|nested] 13+ messages in thread
[parent not found: <1176203528.26372.40.camel@localhost.localdomain>]
* [PATCH 8/8] lguest: the documentation, example launcher [not found] ` <1176203528.26372.40.camel@localhost.localdomain> @ 2007-04-10 11:12 ` Rusty Russell 2007-04-10 11:28 ` PATCH 7/8] lguest: the block driver Pekka Enberg [not found] ` <84144f020704100428r6e35e34cued8758c94728a6a0@mail.gmail.com> 2 siblings, 0 replies; 13+ messages in thread From: Rusty Russell @ 2007-04-10 11:12 UTC (permalink / raw) To: Andrew Morton; +Cc: Andi Kleen, lkml - Kernel Mailing List, virtualization A brief document describing how to use lguest. Because lguest doesn't have an ABI we also include an example launcher in the Documentation directory. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> --- Documentation/lguest/Makefile | 20 Documentation/lguest/lguest.c | 982 +++++++++++++++++++++++++++++++++++++++ Documentation/lguest/lguest.txt | 125 ++++ 3 files changed, 1127 insertions(+) =================================================================== --- /dev/null +++ b/Documentation/lguest/Makefile @@ -0,0 +1,20 @@ +# This creates the demonstration utility "lguest" which runs a Linux guest. + +# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary. +include ../../.config +LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000) + +CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 \ + -static -DLGUEST_GUEST_TOP="$(LGUEST_GUEST_TOP)" -Wl,-T,lguest.lds +LDLIBS:=-lz + +all: lguest.lds lguest + +# The linker script on x86 is so complex the only way of creating one +# which will link our binary in the right place is to mangle the +# default one. +lguest.lds: + $(LD) --verbose | awk '/^==========/ { PRINT=1; next; } /SIZEOF_HEADERS/ { gsub(/0x[0-9A-F]*/, "$(LGUEST_GUEST_TOP)") } { if (PRINT) print $$0; }' > $@ + +clean: + rm -f lguest.lds lguest =================================================================== --- /dev/null +++ b/Documentation/lguest/lguest.c @@ -0,0 +1,982 @@ +/* Simple program to layout "physical" memory for new lguest guest. + * Linked high to avoid likely physical memory. */ +#define _LARGEFILE64_SOURCE +#define _GNU_SOURCE +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <err.h> +#include <stdint.h> +#include <stdlib.h> +#include <elf.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/wait.h> +#include <fcntl.h> +#include <stdbool.h> +#include <errno.h> +#include <signal.h> +#include <ctype.h> +#include <sys/socket.h> +#include <sys/ioctl.h> +#include <sys/time.h> +#include <time.h> +#include <netinet/in.h> +#include <net/if.h> +#include <linux/sockios.h> +#include <linux/if_tun.h> +#include <sys/uio.h> +#include <termios.h> +#include <getopt.h> +#include <zlib.h> +typedef uint32_t u32; +typedef uint16_t u16; +typedef uint8_t u8; +#include "../../include/linux/lguest_launcher.h" + +#define PAGE_PRESENT 0x7 /* Present, RW, Execute */ +#define NET_PEERNUM 1 +#define BRIDGE_PFX "bridge:" + +static bool verbose; +#define verbose(args...) \ + do { if (verbose) printf(args); } while(0) + +struct device_list +{ + fd_set infds; + int max_infd; + + struct device *dev; + struct device **lastdev; +}; + +struct device +{ + struct device *next; + struct lguest_device_desc *desc; + void *mem; + + /* Watch this fd if handle_input non-NULL. */ + int fd; + bool (*handle_input)(int fd, struct device *me); + + /* Watch DMA to this key if handle_input non-NULL. */ + unsigned long watch_key; + u32 (*handle_output)(int fd, const struct iovec *iov, + unsigned int num, struct device *me); + + /* Device-specific data. */ + void *priv; +}; + +static int open_or_die(const char *name, int flags) +{ + int fd = open(name, flags); + if (fd < 0) + err(1, "Failed to open %s", name); + return fd; +} + +static void *map_zeroed_pages(unsigned long addr, unsigned int num) +{ + static int fd = -1; + + if (fd == -1) + fd = open_or_die("/dev/zero", O_RDONLY); + + if (mmap((void *)addr, getpagesize() * num, + PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE, fd, 0) + != (void *)addr) + err(1, "Mmaping %u pages of /dev/zero @%p", num, (void *)addr); + return (void *)addr; +} + +/* Returns the entry point */ +static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr, + unsigned long *page_offset) +{ + void *addr; + Elf32_Phdr phdr[ehdr->e_phnum]; + unsigned int i; + + /* Sanity checks. */ + if (ehdr->e_type != ET_EXEC + || ehdr->e_machine != EM_386 + || ehdr->e_phentsize != sizeof(Elf32_Phdr) + || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr)) + errx(1, "Malformed elf header"); + + if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0) + err(1, "Seeking to program headers"); + if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) + err(1, "Reading program headers"); + + *page_offset = 0; + /* We map the loadable segments at virtual addresses corresponding + * to their physical addresses (our virtual == guest physical). */ + for (i = 0; i < ehdr->e_phnum; i++) { + if (phdr[i].p_type != PT_LOAD) + continue; + + verbose("Section %i: size %i addr %p\n", + i, phdr[i].p_memsz, (void *)phdr[i].p_paddr); + + /* We expect linear address space. */ + if (!*page_offset) + *page_offset = phdr[i].p_vaddr - phdr[i].p_paddr; + else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr) + errx(1, "Page offset of section %i different", i); + + /* We map everything private, writable. */ + addr = mmap((void *)phdr[i].p_paddr, + phdr[i].p_filesz, + PROT_READ|PROT_WRITE|PROT_EXEC, + MAP_FIXED|MAP_PRIVATE, + elf_fd, phdr[i].p_offset); + if (addr != (void *)phdr[i].p_paddr) + err(1, "Mmaping vmlinux seg %i gave %p not %p", + i, addr, (void *)phdr[i].p_paddr); + } + + /* Entry is physical address: convert to virtual */ + return ehdr->e_entry + *page_offset; +} + +/* This is amazingly reliable. */ +static unsigned long intuit_page_offset(unsigned char *img, unsigned long len) +{ + unsigned int i, possibilities[256] = { 0 }; + + for (i = 0; i + 4 < len; i++) { + /* mov 0xXXXXXXXX,%eax */ + if (img[i] == 0xA1 && ++possibilities[img[i+4]] > 3) + return (unsigned long)img[i+4] << 24; + } + errx(1, "could not determine page offset"); +} + +static unsigned long unpack_bzimage(int fd, unsigned long *page_offset) +{ + gzFile f; + int ret, len = 0; + void *img = (void *)0x100000; + + f = gzdopen(fd, "rb"); + if (gzdirect(f)) + errx(1, "did not find correct gzip header"); + while ((ret = gzread(f, img + len, 65536)) > 0) + len += ret; + if (ret < 0) + err(1, "reading image from bzImage"); + + verbose("Unpacked size %i addr %p\n", len, img); + *page_offset = intuit_page_offset(img, len); + + /* Entry is physical address: convert to virtual */ + return (unsigned long)img + *page_offset; +} + +static unsigned long load_bzimage(int fd, unsigned long *page_offset) +{ + unsigned char c; + int state = 0; + + /* Ugly brute force search for gzip header. */ + while (read(fd, &c, 1) == 1) { + switch (state) { + case 0: + if (c == 0x1F) + state++; + break; + case 1: + if (c == 0x8B) + state++; + else + state = 0; + break; + case 2 ... 8: + state++; + break; + case 9: + lseek(fd, -10, SEEK_CUR); + if (c != 0x03) /* Compressed under UNIX. */ + state = -1; + else + return unpack_bzimage(fd, page_offset); + } + } + errx(1, "Could not find kernel in bzImage"); +} + +static unsigned long load_kernel(int fd, unsigned long *page_offset) +{ + Elf32_Ehdr hdr; + + if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr)) + err(1, "Reading kernel"); + + if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) + return map_elf(fd, &hdr, page_offset); + + return load_bzimage(fd, page_offset); +} + +/* initrd gets loaded at top of memory: return length. */ +static unsigned long load_initrd(const char *name, unsigned long mem) +{ + int ifd; + struct stat st; + void *iaddr; + + ifd = open_or_die(name, O_RDONLY); + if (fstat(ifd, &st) < 0) + err(1, "fstat() on initrd '%s'", name); + + iaddr = mmap((void *)mem - st.st_size, st.st_size, + PROT_READ|PROT_EXEC|PROT_WRITE, + MAP_FIXED|MAP_PRIVATE, ifd, 0); + if (iaddr != (void *)mem - st.st_size) + err(1, "Mmaping initrd '%s' returned %p not %p", + name, iaddr, (void *)mem - st.st_size); + close(ifd); + verbose("mapped initrd %s size=%lu @ %p\n", name, st.st_size, iaddr); + return st.st_size; +} + +static inline unsigned long page_align(unsigned long addr) +{ + return ((addr + getpagesize()-1) & ~(getpagesize()-1)); +} + +static unsigned long setup_pagetables(unsigned long mem, + unsigned long initrd_size, + unsigned long page_offset) +{ + u32 *pgdir, *linear; + unsigned int mapped_pages, i, linear_pages; + unsigned int ptes_per_page = getpagesize()/sizeof(u32); + + /* If we can map all of memory above page_offset, we do so. */ + if (mem > -page_offset) + mapped_pages = mem/getpagesize(); + else + mapped_pages = -page_offset/getpagesize(); + + /* Each linear PTE page can map ptes_per_page pages. */ + linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page; + + /* We lay out top-level then linear mapping immediately below initrd */ + pgdir = (void *)mem - initrd_size - getpagesize(); + linear = (void *)pgdir - linear_pages*getpagesize(); + + for (i = 0; i < mapped_pages; i++) + linear[i] = ((i * getpagesize()) | PAGE_PRESENT); + + /* Now set up pgd so that this memory is at page_offset */ + for (i = 0; i < mapped_pages; i += ptes_per_page) { + pgdir[(i + page_offset/getpagesize())/ptes_per_page] + = (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT); + } + + verbose("Linear mapping of %u pages in %u pte pages at %p\n", + mapped_pages, linear_pages, linear); + + return (unsigned long)pgdir; +} + +static void concat(char *dst, char *args[]) +{ + unsigned int i, len = 0; + + for (i = 0; args[i]; i++) { + strcpy(dst+len, args[i]); + strcat(dst+len, " "); + len += strlen(args[i]) + 1; + } + /* In case it's empty. */ + dst[len] = '\0'; +} + +static int tell_kernel(u32 pgdir, u32 start, u32 page_offset) +{ + u32 args[] = { LHREQ_INITIALIZE, + LGUEST_GUEST_TOP/getpagesize(), /* Just below us */ + pgdir, start, page_offset }; + int fd; + + fd = open_or_die("/dev/lguest", O_RDWR); + if (write(fd, args, sizeof(args)) < 0) + err(1, "Writing to /dev/lguest"); + return fd; +} + +static void set_fd(int fd, struct device_list *devices) +{ + FD_SET(fd, &devices->infds); + if (fd > devices->max_infd) + devices->max_infd = fd; +} + +/* We send lguest_add signals while input is pending: avoids races. */ +static void wake_parent(int pipefd, struct device_list *devices) +{ + nice(19); + + set_fd(pipefd, devices); + + for (;;) { + fd_set rfds = devices->infds; + + select(devices->max_infd+1, &rfds, NULL, NULL, NULL); + if (FD_ISSET(pipefd, &rfds)) { + int ignorefd; + if (read(pipefd, &ignorefd, sizeof(ignorefd)) == 0) + exit(0); + FD_CLR(ignorefd, &devices->infds); + } + kill(getppid(), SIGUSR1); + } +} + +/* We don't want signal to kill us, just jerk us out of kernel. */ +static void wakeup(int signo) +{ +} + +static int setup_waker(struct device_list *device_list) +{ + int pipefd[2], child; + struct sigaction act; + + act.sa_handler = wakeup; + sigemptyset(&act.sa_mask); + act.sa_flags = 0; + sigaction(SIGUSR1, &act, NULL); + + pipe(pipefd); + child = fork(); + if (child == -1) + err(1, "forking"); + + if (child == 0) { + close(pipefd[1]); + wake_parent(pipefd[0], device_list); + } + close(pipefd[0]); + + return pipefd[1]; +} + +static void *_check_pointer(unsigned long addr, unsigned int size, + unsigned int line) +{ + if (addr >= LGUEST_GUEST_TOP || addr + size >= LGUEST_GUEST_TOP) + errx(1, "%s:%i: Invalid address %li", __FILE__, line, addr); + return (void *)addr; +} +#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) + +/* Returns pointer to dma->used_len */ +static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num) +{ + unsigned int i; + struct lguest_dma *udma; + + udma = check_pointer(dma, sizeof(*udma)); + for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) { + if (!udma->len[i]) + break; + + iov[i].iov_base = check_pointer(udma->addr[i], udma->len[i]); + iov[i].iov_len = udma->len[i]; + } + *num = i; + return &udma->used_len; +} + +static u32 *get_dma_buffer(int fd, void *key, + struct iovec iov[], unsigned int *num, u32 *irq) +{ + u32 buf[] = { LHREQ_GETDMA, (u32)key }; + unsigned long udma; + u32 *res; + + udma = write(fd, buf, sizeof(buf)); + if (udma == (unsigned long)-1) + return NULL; + + /* Kernel stashes irq in ->used_len. */ + res = dma2iov(udma, iov, num); + *irq = *res; + return res; +} + +static void trigger_irq(int fd, u32 irq) +{ + u32 buf[] = { LHREQ_IRQ, irq }; + if (write(fd, buf, sizeof(buf)) != 0) + err(1, "Triggering irq %i", irq); +} + +static void discard_iovec(struct iovec *iov, unsigned int *num) +{ + static char discard_buf[1024]; + *num = 1; + iov->iov_base = discard_buf; + iov->iov_len = sizeof(discard_buf); +} + +static struct termios orig_term; +static void restore_term(void) +{ + tcsetattr(STDIN_FILENO, TCSANOW, &orig_term); +} + +struct console_abort +{ + int count; + struct timeval start; +}; + +/* We DMA input to buffer bound at start of console page. */ +static bool handle_console_input(int fd, struct device *dev) +{ + u32 irq = 0, *lenp; + int len; + unsigned int num; + struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; + struct console_abort *abort = dev->priv; + + lenp = get_dma_buffer(fd, dev->mem, iov, &num, &irq); + if (!lenp) { + warn("console: no dma buffer!"); + discard_iovec(iov, &num); + } + + len = readv(dev->fd, iov, num); + if (len <= 0) { + warnx("Failed to get console input, ignoring console."); + len = 0; + } + + if (lenp) { + *lenp = len; + trigger_irq(fd, irq); + } + + /* Three ^C within one second? Exit. */ + if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) { + if (!abort->count++) + gettimeofday(&abort->start, NULL); + else if (abort->count == 3) { + struct timeval now; + gettimeofday(&now, NULL); + if (now.tv_sec <= abort->start.tv_sec+1) + exit(2); + abort->count = 0; + } + } else + abort->count = 0; + + if (!len) { + restore_term(); + return false; + } + return true; +} + +static u32 handle_console_output(int fd, const struct iovec *iov, + unsigned num, struct device*dev) +{ + return writev(STDOUT_FILENO, iov, num); +} + +static u32 handle_tun_output(int fd, const struct iovec *iov, + unsigned num, struct device *dev) +{ + /* Now we've seen output, we should warn if we can't get buffers. */ + *(bool *)dev->priv = true; + return writev(dev->fd, iov, num); +} + +static unsigned long peer_offset(unsigned int peernum) +{ + return 4 * peernum; +} + +static bool handle_tun_input(int fd, struct device *dev) +{ + u32 irq = 0, *lenp; + int len; + unsigned num; + struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; + + lenp = get_dma_buffer(fd, dev->mem+peer_offset(NET_PEERNUM), iov, &num, + &irq); + if (!lenp) { + if (*(bool *)dev->priv) + warn("network: no dma buffer!"); + discard_iovec(iov, &num); + } + + len = readv(dev->fd, iov, num); + if (len <= 0) + err(1, "reading network"); + if (lenp) { + *lenp = len; + trigger_irq(fd, irq); + } + verbose("tun input packet len %i [%02x %02x] (%s)\n", len, + ((u8 *)iov[0].iov_base)[0], ((u8 *)iov[0].iov_base)[1], + lenp ? "sent" : "discarded"); + return true; +} + +static u32 handle_block_output(int fd, const struct iovec *iov, + unsigned num, struct device *dev) +{ + struct lguest_block_page *p = dev->mem; + u32 irq, *lenp; + unsigned int len, reply_num; + struct iovec reply[LGUEST_MAX_DMA_SECTIONS]; + off64_t device_len, off = (off64_t)p->sector * 512; + + device_len = *(off64_t *)dev->priv; + + if (off >= device_len) + err(1, "Bad offset %llu vs %llu", off, device_len); + if (lseek64(dev->fd, off, SEEK_SET) != off) + err(1, "Bad seek to sector %i", p->sector); + + verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off); + + lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq); + if (!lenp) + err(1, "Block request didn't give us a dma buffer"); + + if (p->type) { + len = writev(dev->fd, iov, num); + if (off + len > device_len) { + ftruncate(dev->fd, device_len); + errx(1, "Write past end %llu+%u", off, len); + } + *lenp = 0; + } else { + len = readv(dev->fd, reply, reply_num); + *lenp = len; + } + + p->result = 1 + (p->bytes != len); + trigger_irq(fd, irq); + return 0; +} + +static void handle_output(int fd, unsigned long dma, unsigned long key, + struct device_list *devices) +{ + struct device *i; + u32 *lenp; + struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; + unsigned num = 0; + + lenp = dma2iov(dma, iov, &num); + for (i = devices->dev; i; i = i->next) { + if (i->handle_output && key == i->watch_key) { + *lenp = i->handle_output(fd, iov, num, i); + return; + } + } + warnx("Pending dma %p, key %p", (void *)dma, (void *)key); +} + +static void handle_input(int fd, int childfd, struct device_list *devices) +{ + struct timeval poll = { .tv_sec = 0, .tv_usec = 0 }; + + for (;;) { + struct device *i; + fd_set fds = devices->infds; + + if (select(devices->max_infd+1, &fds, NULL, NULL, &poll) == 0) + break; + + for (i = devices->dev; i; i = i->next) { + if (i->handle_input && FD_ISSET(i->fd, &fds)) { + if (!i->handle_input(fd, i)) { + FD_CLR(i->fd, &devices->infds); + /* Tell child to ignore it too... */ + write(childfd, &i->fd, sizeof(i->fd)); + } + } + } + } +} + +static struct lguest_device_desc *new_dev_desc(u16 type, u16 features, + u16 num_pages) +{ + static unsigned long top = LGUEST_GUEST_TOP; + struct lguest_device_desc *desc; + + desc = malloc(sizeof(*desc)); + desc->type = type; + desc->num_pages = num_pages; + desc->features = features; + desc->status = 0; + if (num_pages) { + top -= num_pages*getpagesize(); + map_zeroed_pages(top, num_pages); + desc->pfn = top / getpagesize(); + } else + desc->pfn = 0; + return desc; +} + +static struct device *new_device(struct device_list *devices, + u16 type, u16 num_pages, u16 features, + int fd, + bool (*handle_input)(int, struct device *), + unsigned long watch_off, + u32 (*handle_output)(int, + const struct iovec *, + unsigned, + struct device *)) +{ + struct device *dev = malloc(sizeof(*dev)); + + /* Append to device list. */ + *devices->lastdev = dev; + dev->next = NULL; + devices->lastdev = &dev->next; + + dev->fd = fd; + if (handle_input) + set_fd(dev->fd, devices); + dev->desc = new_dev_desc(type, features, num_pages); + dev->mem = (void *)(dev->desc->pfn * getpagesize()); + dev->handle_input = handle_input; + dev->watch_key = (unsigned long)dev->mem + watch_off; + dev->handle_output = handle_output; + return dev; +} + +static void setup_console(struct device_list *devices) +{ + struct device *dev; + + if (tcgetattr(STDIN_FILENO, &orig_term) == 0) { + struct termios term = orig_term; + term.c_lflag &= ~(ISIG|ICANON|ECHO); + tcsetattr(STDIN_FILENO, TCSANOW, &term); + atexit(restore_term); + } + + /* We don't currently require a page for the console. */ + dev = new_device(devices, LGUEST_DEVICE_T_CONSOLE, 0, 0, + STDIN_FILENO, handle_console_input, + LGUEST_CONSOLE_DMA_KEY, handle_console_output); + dev->priv = malloc(sizeof(struct console_abort)); + ((struct console_abort *)dev->priv)->count = 0; + verbose("device %p: console\n", + (void *)(dev->desc->pfn * getpagesize())); +} + +static void setup_block_file(const char *filename, struct device_list *devices) +{ + int fd; + struct device *dev; + off64_t *device_len; + struct lguest_block_page *p; + + fd = open_or_die(filename, O_RDWR|O_LARGEFILE|O_DIRECT); + dev = new_device(devices, LGUEST_DEVICE_T_BLOCK, 1, + LGUEST_DEVICE_F_RANDOMNESS, + fd, NULL, 0, handle_block_output); + device_len = dev->priv = malloc(sizeof(*device_len)); + *device_len = lseek64(fd, 0, SEEK_END); + p = dev->mem; + + p->num_sectors = *device_len/512; + verbose("device %p: block %i sectors\n", + (void *)(dev->desc->pfn * getpagesize()), p->num_sectors); +} + +/* We use fnctl locks to reserve network slots (autocleanup!) */ +static unsigned int find_slot(int netfd, const char *filename) +{ + struct flock fl; + + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + fl.l_len = 1; + for (fl.l_start = 0; + fl.l_start < getpagesize()/sizeof(struct lguest_net); + fl.l_start++) { + if (fcntl(netfd, F_SETLK, &fl) == 0) + return fl.l_start; + } + errx(1, "No free slots in network file %s", filename); +} + +static void setup_net_file(const char *filename, + struct device_list *devices) +{ + int netfd; + struct device *dev; + + netfd = open(filename, O_RDWR, 0); + if (netfd < 0) { + if (errno == ENOENT) { + netfd = open(filename, O_RDWR|O_CREAT, 0600); + if (netfd >= 0) { + char page[getpagesize()]; + memset(page, 0, sizeof(page)); + write(netfd, page, sizeof(page)); + } + } + if (netfd < 0) + err(1, "cannot open net file '%s'", filename); + } + + dev = new_device(devices, LGUEST_DEVICE_T_NET, 1, + find_slot(netfd, filename)|LGUEST_NET_F_NOCSUM, + -1, NULL, 0, NULL); + + /* We overwrite the /dev/zero mapping with the actual file. */ + if (mmap(dev->mem, getpagesize(), PROT_READ|PROT_WRITE, + MAP_FIXED|MAP_SHARED, netfd, 0) != dev->mem) + err(1, "could not mmap '%s'", filename); + verbose("device %p: shared net %s, peer %i\n", + (void *)(dev->desc->pfn * getpagesize()), filename, + dev->desc->features & ~LGUEST_NET_F_NOCSUM); +} + +static u32 str2ip(const char *ipaddr) +{ + unsigned int byte[4]; + + sscanf(ipaddr, "%u.%u.%u.%u", &byte[0], &byte[1], &byte[2], &byte[3]); + return (byte[0] << 24) | (byte[1] << 16) | (byte[2] << 8) | byte[3]; +} + +/* adapted from libbridge */ +static void add_to_bridge(int fd, const char *if_name, const char *br_name) +{ + int ifidx; + struct ifreq ifr; + + if (!*br_name) + errx(1, "must specify bridge name"); + + ifidx = if_nametoindex(if_name); + if (!ifidx) + errx(1, "interface %s does not exist!", if_name); + + strncpy(ifr.ifr_name, br_name, IFNAMSIZ); + ifr.ifr_ifindex = ifidx; + if (ioctl(fd, SIOCBRADDIF, &ifr) < 0) + err(1, "can't add %s to bridge %s", if_name, br_name); +} + +static void configure_device(int fd, const char *devname, u32 ipaddr, + unsigned char hwaddr[6]) +{ + struct ifreq ifr; + struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr; + + memset(&ifr, 0, sizeof(ifr)); + strcpy(ifr.ifr_name, devname); + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = htonl(ipaddr); + if (ioctl(fd, SIOCSIFADDR, &ifr) != 0) + err(1, "Setting %s interface address", devname); + ifr.ifr_flags = IFF_UP; + if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0) + err(1, "Bringing interface %s up", devname); + + if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0) + err(1, "getting hw address for %s", devname); + + memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6); +} + +static void setup_tun_net(const char *arg, struct device_list *devices) +{ + struct device *dev; + struct ifreq ifr; + int netfd, ipfd; + u32 ip; + const char *br_name = NULL; + + netfd = open_or_die("/dev/net/tun", O_RDWR); + memset(&ifr, 0, sizeof(ifr)); + ifr.ifr_flags = IFF_TAP | IFF_NO_PI; + strcpy(ifr.ifr_name, "tap%d"); + if (ioctl(netfd, TUNSETIFF, &ifr) != 0) + err(1, "configuring /dev/net/tun"); + + /* You will be peer 1: we should create enough jitter to randomize */ + dev = new_device(devices, LGUEST_DEVICE_T_NET, 1, + NET_PEERNUM|LGUEST_DEVICE_F_RANDOMNESS, netfd, + handle_tun_input, peer_offset(0), handle_tun_output); + dev->priv = malloc(sizeof(bool)); + *(bool *)dev->priv = false; + + ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); + if (ipfd < 0) + err(1, "opening IP socket"); + + if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) { + ip = INADDR_ANY; + br_name = arg + strlen(BRIDGE_PFX); + add_to_bridge(ipfd, ifr.ifr_name, br_name); + } else + ip = str2ip(arg); + + /* We are peer 0, ie. first slot. */ + configure_device(ipfd, ifr.ifr_name, ip, dev->mem); + close(ipfd); + + verbose("device %p: tun net %u.%u.%u.%u\n", + (void *)(dev->desc->pfn * getpagesize()), + (u8)(ip>>24), (u8)(ip>>16), (u8)(ip>>8), (u8)ip); + if (br_name) + verbose("attached to bridge: %s\n", br_name); +} + +/* Now we know how much memory we have, we copy in device descriptors */ +static void map_device_descriptors(struct device_list *devs, unsigned long mem) +{ + struct device *i; + unsigned int num; + struct lguest_device_desc *descs; + + /* Device descriptor array sits just above top of normal memory */ + descs = map_zeroed_pages(mem, 1); + + for (i = devs->dev, num = 0; i; i = i->next, num++) { + if (num == LGUEST_MAX_DEVICES) + errx(1, "too many devices"); + verbose("Device %i: %s\n", num, + i->desc->type == LGUEST_DEVICE_T_NET ? "net" + : i->desc->type == LGUEST_DEVICE_T_CONSOLE ? "console" + : i->desc->type == LGUEST_DEVICE_T_BLOCK ? "block" + : "unknown"); + descs[num] = *i->desc; + free(i->desc); + i->desc = &descs[num]; + } +} + +static void __attribute__((noreturn)) +run_guest(int lguest_fd, int waker_fd, struct device_list *device_list) +{ + sigset_t sigset; + + sigemptyset(&sigset); + sigaddset(&sigset, SIGUSR1); + for (;;) { + unsigned long arr[2]; + int readval; + + sigprocmask(SIG_UNBLOCK, &sigset, NULL); + readval = read(lguest_fd, arr, sizeof(arr)); + sigprocmask(SIG_BLOCK, &sigset, NULL); + + if (readval == sizeof(arr)) + handle_output(lguest_fd, arr[0], arr[1], device_list); + else if (errno == ENOENT) { + char reason[1024] = { 0 }; + read(lguest_fd, reason, sizeof(reason)-1); + errx(1, "%s", reason); + } else if (errno != EINTR) + err(1, "Running guest failed"); + handle_input(lguest_fd, waker_fd, device_list); + } +} + +static struct option opts[] = { + { "verbose", 0, NULL, 'v' }, + { "sharenet", 1, NULL, 's' }, + { "tunnet", 1, NULL, 't' }, + { "block", 1, NULL, 'b' }, + { "initrd", 1, NULL, 'i' }, +}; +static void usage(void) +{ + errx(1, "Usage: lguest [--verbose] " + "[--sharenet=<filename>|--tunnet=(<ipaddr>|bridge:<bridgename>)\n" + "|--block=<filename>|--initrd=<filename>]...\n" + "<mem-in-mb> vmlinux [args...]"); +} + +int main(int argc, char *argv[]) +{ + unsigned long mem, pgdir, start, page_offset; + int c, lguest_fd, waker_fd; + struct device_list device_list; + struct lguest_boot_info *boot = (void *)0; + const char *initrd_name = NULL; + + device_list.max_infd = -1; + device_list.dev = NULL; + device_list.lastdev = &device_list.dev; + FD_ZERO(&device_list.infds); + + while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) { + switch (c) { + case 'v': + verbose = true; + break; + case 's': + setup_net_file(optarg, &device_list); + break; + case 't': + setup_tun_net(optarg, &device_list); + break; + case 'b': + setup_block_file(optarg, &device_list); + break; + case 'i': + initrd_name = optarg; + break; + default: + warnx("Unknown argument %s", argv[optind]); + usage(); + } + } + if (optind + 2 > argc) + usage(); + + /* We need a console device */ + setup_console(&device_list); + + /* First we map /dev/zero over all of guest-physical memory. */ + mem = atoi(argv[optind]) * 1024 * 1024; + map_zeroed_pages(0, mem / getpagesize()); + + /* Now we load the kernel */ + start = load_kernel(open_or_die(argv[optind+1], O_RDONLY), + &page_offset); + + /* Write the device descriptors into memory. */ + map_device_descriptors(&device_list, mem); + + /* Map the initrd image if requested */ + if (initrd_name) + boot->initrd_size = load_initrd(initrd_name, mem); + + /* Set up the initial linar pagetables. */ + pgdir = setup_pagetables(mem, boot->initrd_size, page_offset); + + /* Give the guest the boot information it needs. */ + concat(boot->cmdline, argv+optind+2); + boot->max_pfn = mem/getpagesize(); + + lguest_fd = tell_kernel(pgdir, start, page_offset); + waker_fd = setup_waker(&device_list); + + run_guest(lguest_fd, waker_fd, &device_list); +} =================================================================== --- /dev/null +++ b/Documentation/lguest/lguest.txt @@ -0,0 +1,125 @@ +Rusty's Remarkably Unreliable Guide to Lguest + - or, A Young Coder's Illustrated Hypervisor +http://lguest.ozlabs.org + +Lguest is designed to be a minimal hypervisor for the Linux kernel, for +Linux developers and users to experiment with virtualization with the +minimum of complexity. Nonetheless, it should have sufficient +features to make it useful for specific tasks, and, of course, you are +encouraged to fork and enhance it. + +Features: + +- Kernel module which runs in a normal kernel. +- Simple I/O model for communication. +- Simple program to create new guests. +- Logo contains cute puppies: http://lguest.ozlabs.org + +Developer features: + +- Fun to hack on. +- No ABI: being tied to a specific kernel anyway, you can change anything. +- Many opportunities for improvement or feature implementation. + +Running Lguest: + +- You will need to configure your kernel with the following options: + + CONFIG_HIGHMEM64G=n ("High Memory Support" "64GB")[1] + CONFIG_TUN=y/m ("Universal TUN/TAP device driver support") + CONFIG_EXPERIMENTAL=y ("Prompt for development and/or incomplete code/drivers") + CONFIG_PARAVIRT=y ("Paravirtualization support (EXPERIMENTAL)") + CONFIG_LGUEST=y/m ("Linux hypervisor example code") + + and I recommend: + CONFIG_HZ=100 ("Timer frequency")[2] + +- A tool called "lguest" is available in this directory: type "make" + to build it. + +- Create or find a root disk image. There are several useful ones + around, such as the xm-test tiny root image at + http://xm-test.xensource.com/ramdisks/initrd-1.1-i386.img + + For more serious work, I usually use a distribution ISO image and + install it under qemu, then make multiple copies: + + dd if=/dev/zero of=rootfile bs=1M count=2048 + qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d + +- "modprobe lg" if you built it as a module. + +- Run an lguest as root: + + Documentation/lguest/lguest 64m vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/lgba + + Explanation: + 64m: the amount of memory to use. + + vmlinux: the kernel image found in the top of your build directory. You + can also use a standard bzImage. + + --tunnet=192.168.19.1: configures a "tap" device for networking with this + IP address. + + --block=rootfile: a file or block device which becomes /dev/lgba + inside the guest. + + root=/dev/lgba: this (and anything else on the command line) are + kernel boot parameters. + +- Configuring networking. I usually have the host masquerade, using + "iptables -t nat -o eth0 -j MASQUERADE" and "echo 1 > + /proc/sys/net/ipv4/ip_forward". In this example, I would configure + eth0 inside the guest at 192.168.19.2. + + Another method is to bridge the tap device to an external interface + using --tunnet=bridge:<bridgename>, and perhaps run dhcp on the guest + to obtain an IP address. The bridge needs to be configured first: + this option simply adds the tap interface to it. + + A simple example on my system: + + ifconfig eth0 0.0.0.0 + brctl addbr lg0 + ifconfig lg0 up + brctl addif lg0 eth0 + dhclient lg0 + + Then use --tunnet=bridge:lg0 when launching the guest. + + See http://linux-net.osdl.org/index.php/Bridge for general information + on how to get bridging working. + +- You can also create an inter-guest network using + "--sharenet=<filename>": any two guests using the same file are on + the same network. This file is created if it does not exist. + +Lguest I/O model: + +Lguest uses a simplified DMA model plus shared memory for I/O. Guests +can communicate with each other if they share underlying memory +(usually by the lguest program mmaping the same file), but they can +use any non-shared memory to communicate with the lguest process. + +Guests can register DMA buffers at any key (must be a valid physical +address) using the LHCALL_BIND_DMA(key, dmabufs, num<<8|irq) +hypercall. "dmabufs" is the physical address of an array of "num" +"struct lguest_dma": each contains a used_len, and an array of +physical addresses and lengths. When a transfer occurs, the +"used_len" field of one of the buffers which has used_len 0 will be +set to the length transferred and the irq will fire. + +Using an irq value of 0 unbinds the dma buffers. + +To send DMA, the LHCALL_SEND_DMA(key, dma_physaddr) hypercall is used, +and the bytes used is written to the used_len field. This can be 0 if +noone else has bound a DMA buffer to that key or some other error. +DMA buffers bound by the same guest are ignored. + +Cheers! +Rusty Russell rusty@rustcorp.com.au. + +[1] These are on various places on the TODO list, waiting for you to + get annoyed enough at the limitation to fix it. +[2] Lguest is not yet tickless when idle. See [1]. ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: PATCH 7/8] lguest: the block driver [not found] ` <1176203528.26372.40.camel@localhost.localdomain> 2007-04-10 11:12 ` [PATCH 8/8] lguest: the documentation, example launcher Rusty Russell @ 2007-04-10 11:28 ` Pekka Enberg [not found] ` <84144f020704100428r6e35e34cued8758c94728a6a0@mail.gmail.com> 2 siblings, 0 replies; 13+ messages in thread From: Pekka Enberg @ 2007-04-10 11:28 UTC (permalink / raw) To: Rusty Russell Cc: Andi Kleen, Andrew Morton, lkml - Kernel Mailing List, virtualization Hi Rusty, On 4/10/07, Rusty Russell <rusty@rustcorp.com.au> wrote: > +/* Jens gave me this nice helper to end all chunks of a request. */ > +static void end_entire_request(struct request *req, int uptodate) > +{ > + if (end_that_request_first(req, uptodate, req->hard_nr_sectors)) > + BUG(); > + add_disk_randomness(req->rq_disk); > + blkdev_dequeue_request(req); > + end_that_request_last(req, uptodate); > +} Perhaps we should move this to generic code (i.e. block/ll_rw_blk.c)? ^ permalink raw reply [flat|nested] 13+ messages in thread
[parent not found: <84144f020704100428r6e35e34cued8758c94728a6a0@mail.gmail.com>]
* Re: PATCH 7/8] lguest: the block driver [not found] ` <84144f020704100428r6e35e34cued8758c94728a6a0@mail.gmail.com> @ 2007-04-10 11:36 ` Pekka Enberg [not found] ` <84144f020704100436y3405f104x8345c72556d961e9@mail.gmail.com> 1 sibling, 0 replies; 13+ messages in thread From: Pekka Enberg @ 2007-04-10 11:36 UTC (permalink / raw) To: Rusty Russell Cc: Andi Kleen, Andrew Morton, jens.axboe, lkml - Kernel Mailing List, virtualization On 4/10/07, Rusty Russell <rusty@rustcorp.com.au> wrote: > > +/* Jens gave me this nice helper to end all chunks of a request. */ > > +static void end_entire_request(struct request *req, int uptodate) > > +{ > > + if (end_that_request_first(req, uptodate, req->hard_nr_sectors)) > > + BUG(); > > + add_disk_randomness(req->rq_disk); > > + blkdev_dequeue_request(req); > > + end_that_request_last(req, uptodate); > > +} On 4/10/07, Pekka Enberg <penberg@cs.helsinki.fi> wrote: > Perhaps we should move this to generic code (i.e. block/ll_rw_blk.c)? Uhm, I am bit confused now. Why don't you just use end_request() here? ^ permalink raw reply [flat|nested] 13+ messages in thread
[parent not found: <84144f020704100436y3405f104x8345c72556d961e9@mail.gmail.com>]
* Re: PATCH 7/8] lguest: the block driver [not found] ` <84144f020704100436y3405f104x8345c72556d961e9@mail.gmail.com> @ 2007-04-11 4:00 ` Rusty Russell 2007-04-11 5:29 ` Pekka Enberg 0 siblings, 1 reply; 13+ messages in thread From: Rusty Russell @ 2007-04-11 4:00 UTC (permalink / raw) To: Pekka Enberg Cc: Andi Kleen, Andrew Morton, jens.axboe, lkml - Kernel Mailing List, virtualization On Tue, 2007-04-10 at 14:36 +0300, Pekka Enberg wrote: > On 4/10/07, Rusty Russell <rusty@rustcorp.com.au> wrote: > > > +/* Jens gave me this nice helper to end all chunks of a request. */ > > > +static void end_entire_request(struct request *req, int uptodate) > > > +{ > > > + if (end_that_request_first(req, uptodate, req->hard_nr_sectors)) > > > + BUG(); > > > + add_disk_randomness(req->rq_disk); > > > + blkdev_dequeue_request(req); > > > + end_that_request_last(req, uptodate); > > > +} > > On 4/10/07, Pekka Enberg <penberg@cs.helsinki.fi> wrote: > > Perhaps we should move this to generic code (i.e. block/ll_rw_blk.c)? Yeah, Jens said to put it in here and he'd hoist it later. > Uhm, I am bit confused now. Why don't you just use end_request() here? What a question! end_request() doesn't end a request! What a crazy idea! As far as I can tell, every name in the block layer is actually some variant of "fuck off, this is too complicated for you to understand". Hope that clarifies! Rusty. ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: PATCH 7/8] lguest: the block driver 2007-04-11 4:00 ` Rusty Russell @ 2007-04-11 5:29 ` Pekka Enberg 0 siblings, 0 replies; 13+ messages in thread From: Pekka Enberg @ 2007-04-11 5:29 UTC (permalink / raw) To: Rusty Russell Cc: Andi Kleen, Andrew Morton, jens.axboe, lkml - Kernel Mailing List, virtualization On 4/11/07, Rusty Russell <rusty@rustcorp.com.au> wrote: > What a question! end_request() doesn't end a request! What a crazy > idea! Aah, indeed, end_request() uses req->hard_cur_sectors while end_entire_request() uses req->hard_nr_sectors which I missed. On 4/11/07, Rusty Russell <rusty@rustcorp.com.au> wrote: > Hope that clarifies! It does. Thanks Rusty :) Pekka ^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2007-04-11 5:29 UTC | newest]
Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
[not found] <1176203068.26372.21.camel@localhost.localdomain>
2007-04-10 11:05 ` [PATCH 1/8] lguest: the guest code Rusty Russell
[not found] ` <1176203130.26372.23.camel@localhost.localdomain>
2007-04-10 11:06 ` [PATCH 2/8] lguest: the host code Rusty Russell
[not found] ` <1176203176.26372.25.camel@localhost.localdomain>
2007-04-10 11:07 ` [PATCH 3/8] lguest: the asm offsets Rusty Russell
[not found] ` <1176203231.26372.27.camel@localhost.localdomain>
2007-04-10 11:08 ` [PATCH 4/8] lguest: the Makefile and Kconfig Rusty Russell
2007-04-10 11:08 ` [PATCH 5/8] lguest: the console driver Rusty Russell
2007-04-10 11:09 ` [PATCH 5/8] lguest: the net driver Rusty Russell
2007-04-10 11:11 ` [PATCH 6/8] " Rusty Russell
[not found] ` <1176203475.26372.37.camel@localhost.localdomain>
2007-04-10 11:12 ` PATCH 7/8] lguest: the block driver Rusty Russell
[not found] ` <1176203528.26372.40.camel@localhost.localdomain>
2007-04-10 11:12 ` [PATCH 8/8] lguest: the documentation, example launcher Rusty Russell
2007-04-10 11:28 ` PATCH 7/8] lguest: the block driver Pekka Enberg
[not found] ` <84144f020704100428r6e35e34cued8758c94728a6a0@mail.gmail.com>
2007-04-10 11:36 ` Pekka Enberg
[not found] ` <84144f020704100436y3405f104x8345c72556d961e9@mail.gmail.com>
2007-04-11 4:00 ` Rusty Russell
2007-04-11 5:29 ` Pekka Enberg
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).