* [PATCH 1/8] lguest: the guest code
From: Rusty Russell @ 2007-04-10 11:05 UTC (permalink / raw)
To: Andrew Morton; +Cc: Andi Kleen, lkml - Kernel Mailing List, virtualization
This is the code and headers required to make an i386 kernel an lguest guest.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
drivers/lguest/lguest.c | 509 +++++++++++++++++++++++++++++++++++++++++++
drivers/lguest/lguest_asm.S | 59 ++++
drivers/lguest/lguest_bus.c | 148 ++++++++++++
include/linux/lguest.h | 85 +++++++
include/linux/lguest_bus.h | 33 ++
5 files changed, 834 insertions(+)
===================================================================
--- /dev/null
+++ b/drivers/lguest/lguest.c
@@ -0,0 +1,509 @@
+/*
+ * Lguest specific paravirt-ops implementation
+ *
+ * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/kernel.h>
+#include <linux/start_kernel.h>
+#include <linux/string.h>
+#include <linux/console.h>
+#include <linux/screen_info.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/lguest.h>
+#include <linux/lguest_launcher.h>
+#include <asm/paravirt.h>
+#include <asm/param.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/desc.h>
+#include <asm/setup.h>
+#include <asm/e820.h>
+#include <asm/pda.h>
+#include <asm/mce.h>
+
+/* Declarations for definitions in lguest_guest.S */
+extern char lguest_noirq_start[], lguest_noirq_end[];
+extern const char lgstart_cli[], lgend_cli[];
+extern const char lgstart_sti[], lgend_sti[];
+extern const char lgstart_popf[], lgend_popf[];
+extern const char lgstart_pushf[], lgend_pushf[];
+extern const char lgstart_iret[], lgend_iret[];
+extern asmlinkage void lguest_maybe_init(void);
+extern void lguest_iret(void);
+
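+/* The hcall_status ring starts with every slot marked 0xFF (free), and
+ * interrupt 0 (the timer) starts blocked until lguest_time_init() below
+ * unblocks it. */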
+struct lguest_data lguest_data = {
+ .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
+ .noirq_start = (u32)lguest_noirq_start,
+ .noirq_end = (u32)lguest_noirq_end,
+ .blocked_interrupts = { 1 }, /* Block timer interrupts */
+};
+struct lguest_device_desc *lguest_devices;
+static __initdata const struct lguest_boot_info *boot = __va(0);
+
+static enum paravirt_lazy_mode lazy_mode;
+static void lguest_lazy_mode(enum paravirt_lazy_mode mode)
+{
+ lazy_mode = mode;
+ if (mode == PARAVIRT_LAZY_NONE)
+ hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
+}
+
+static void lazy_hcall(unsigned long call,
+ unsigned long arg1,
+ unsigned long arg2,
+ unsigned long arg3)
+{
+ if (lazy_mode == PARAVIRT_LAZY_NONE)
+ hcall(call, arg1, arg2, arg3);
+ else
+ async_hcall(call, arg1, arg2, arg3);
+}
+
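+/* The async ring protocol (see struct lguest_data in lguest.h): a status
+ * byte of 0xFF marks a free slot; we fill in the arguments, then zero the
+ * status to hand the slot to the host, which flips it back to 0xFF once
+ * the call has been consumed. */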
+void async_hcall(unsigned long call,
+ unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+ /* Note: This code assumes we're uniprocessor. */
+ static unsigned int next_call;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ if (lguest_data.hcall_status[next_call] != 0xFF) {
+ /* Table full, so do normal hcall which will flush table. */
+ hcall(call, arg1, arg2, arg3);
+ } else {
+ lguest_data.hcalls[next_call].eax = call;
+ lguest_data.hcalls[next_call].edx = arg1;
+ lguest_data.hcalls[next_call].ebx = arg2;
+ lguest_data.hcalls[next_call].ecx = arg3;
+ /* Make sure host sees arguments before "valid" flag. */
+ wmb();
+ lguest_data.hcall_status[next_call] = 0;
+ if (++next_call == LHCALL_RING_SIZE)
+ next_call = 0;
+ }
+ local_irq_restore(flags);
+}
+
+static unsigned long save_fl(void)
+{
+ return lguest_data.irq_enabled;
+}
+
+static void restore_fl(unsigned long flags)
+{
+ /* FIXME: Check if interrupt pending... */
+ lguest_data.irq_enabled = flags;
+}
+
+static void irq_disable(void)
+{
+ lguest_data.irq_enabled = 0;
+}
+
+static void irq_enable(void)
+{
+ /* FIXME: Check if interrupt pending... */
+ lguest_data.irq_enabled = X86_EFLAGS_IF;
+}
+
+static void lguest_write_idt_entry(struct desc_struct *dt,
+ int entrynum, u32 low, u32 high)
+{
+ write_dt_entry(dt, entrynum, low, high);
+ hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high);
+}
+
+static void lguest_load_idt(const struct Xgt_desc_struct *desc)
+{
+ unsigned int i;
+ struct desc_struct *idt = (void *)desc->address;
+
+ for (i = 0; i < (desc->size+1)/8; i++)
+ hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b);
+}
+
+static void lguest_load_gdt(const struct Xgt_desc_struct *desc)
+{
+ BUG_ON((desc->size+1)/8 != GDT_ENTRIES);
+ hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0);
+}
+
+static void lguest_write_gdt_entry(struct desc_struct *dt,
+ int entrynum, u32 low, u32 high)
+{
+ write_dt_entry(dt, entrynum, low, high);
+ hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0);
+}
+
+static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+ lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0);
+}
+
+static void lguest_set_ldt(const void *addr, unsigned entries)
+{
+}
+
+static void lguest_load_tr_desc(void)
+{
+}
+
+static void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ int function = *eax;
+
+ native_cpuid(eax, ebx, ecx, edx);
+ switch (function) {
+ case 1: /* Basic feature request. */
+ /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
+ *ecx &= 0x00002201;
+ /* Similarly: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */
+ *edx &= 0x07808101;
+ /* Host wants to know when we flush kernel pages: set PGE. */
+ *edx |= 0x00002000;
+ break;
+ case 0x80000000:
+ /* Futureproof this a little: if they ask how much extended
+ * processor information there is, limit it to known fields. */
+ if (*eax > 0x80000008)
+ *eax = 0x80000008;
+ break;
+ }
+}
+
+static unsigned long current_cr0, current_cr3;
+static void lguest_write_cr0(unsigned long val)
+{
+ hcall(LHCALL_TS, val & 8, 0, 0);
+ current_cr0 = val;
+}
+
+static unsigned long lguest_read_cr0(void)
+{
+ return current_cr0;
+}
+
+static void lguest_clts(void)
+{
+ lazy_hcall(LHCALL_TS, 0, 0, 0);
+ current_cr0 &= ~8U;
+}
+
+static unsigned long lguest_read_cr2(void)
+{
+ return lguest_data.cr2;
+}
+
+static void lguest_write_cr3(unsigned long cr3)
+{
+ hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0);
+ current_cr3 = cr3;
+}
+
+static unsigned long lguest_read_cr3(void)
+{
+ return current_cr3;
+}
+
+/* Used to enable/disable PGE, but we don't care. */
+static unsigned long lguest_read_cr4(void)
+{
+ return 0;
+}
+
+static void lguest_write_cr4(unsigned long val)
+{
+}
+
+static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pteval)
+{
+ *ptep = pteval;
+ lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), addr, pteval.pte_low);
+}
+
+/* We only support two-level pagetables at the moment. */
+static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+ *pmdp = pmdval;
+ lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK,
+ (__pa(pmdp)&(PAGE_SIZE-1))/4, 0);
+}
+
+/* FIXME: Eliminate all callers of this. */
+static void lguest_set_pte(pte_t *ptep, pte_t pteval)
+{
+ *ptep = pteval;
+ /* Don't bother with hypercall before initial setup. */
+ if (current_cr3)
+ hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
+}
+
+static void lguest_flush_tlb_single(unsigned long addr)
+{
+ /* Simply set it to zero, and it will fault back in. */
+ lazy_hcall(LHCALL_SET_PTE, current_cr3, addr, 0);
+}
+
+static void lguest_flush_tlb_user(void)
+{
+ lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0);
+}
+
+static void lguest_flush_tlb_kernel(void)
+{
+ lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
+}
+
+static void disable_lguest_irq(unsigned int irq)
+{
+ set_bit(irq, lguest_data.blocked_interrupts);
+}
+
+static void enable_lguest_irq(unsigned int irq)
+{
+ clear_bit(irq, lguest_data.blocked_interrupts);
+ /* FIXME: If it's pending? */
+}
+
+static struct irq_chip lguest_irq_controller = {
+ .name = "lguest",
+ .mask = disable_lguest_irq,
+ .mask_ack = disable_lguest_irq,
+ .unmask = enable_lguest_irq,
+};
+
+static void __init lguest_init_IRQ(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < LGUEST_IRQS; i++) {
+ int vector = FIRST_EXTERNAL_VECTOR + i;
+ if (vector != SYSCALL_VECTOR) {
+ set_intr_gate(vector, interrupt[i]);
+ set_irq_chip_and_handler(i, &lguest_irq_controller,
+ handle_level_irq);
+ }
+ }
+ irq_ctx_init(smp_processor_id());
+}
+
+static unsigned long lguest_get_wallclock(void)
+{
+ return hcall(LHCALL_GET_WALLCLOCK, 0, 0, 0);
+}
+
+static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
+{
+ do_timer(hcall(LHCALL_TIMER_READ, 0, 0, 0));
+ update_process_times(user_mode_vm(get_irq_regs()));
+}
+
+static void lguest_time_init(void)
+{
+ set_irq_handler(0, lguest_time_irq);
+ hcall(LHCALL_TIMER_READ, 0, 0, 0);
+ enable_lguest_irq(0);
+}
+
+static void lguest_load_esp0(struct tss_struct *tss,
+ struct thread_struct *thread)
+{
+ lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->esp0,
+ THREAD_SIZE/PAGE_SIZE);
+}
+
+static void lguest_set_debugreg(int regno, unsigned long value)
+{
+ /* FIXME: Implement */
+}
+
+static void lguest_wbinvd(void)
+{
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static void lguest_apic_write(unsigned long reg, unsigned long v)
+{
+}
+
+static unsigned long lguest_apic_read(unsigned long reg)
+{
+ return 0;
+}
+#endif
+
+static void lguest_safe_halt(void)
+{
+ hcall(LHCALL_HALT, 0, 0, 0);
+}
+
+static void lguest_power_off(void)
+{
+ hcall(LHCALL_CRASH, __pa("Power down"), 0, 0);
+}
+
+static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
+{
+ hcall(LHCALL_CRASH, __pa(p), 0, 0);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block paniced = {
+ .notifier_call = lguest_panic
+};
+
+static __init char *lguest_memory_setup(void)
+{
+ /* We do this here because lock checking barfs if done before start_kernel */
+ atomic_notifier_chain_register(&panic_notifier_list, &paniced);
+
+ e820.nr_map = 0;
+ add_memory_region(0, PFN_PHYS(boot->max_pfn), E820_RAM);
+ return "LGUEST";
+}
+
+static const struct lguest_insns
+{
+ const char *start, *end;
+} lguest_insns[] = {
+ [PARAVIRT_PATCH(irq_disable)] = { lgstart_cli, lgend_cli },
+ [PARAVIRT_PATCH(irq_enable)] = { lgstart_sti, lgend_sti },
+ [PARAVIRT_PATCH(restore_fl)] = { lgstart_popf, lgend_popf },
+ [PARAVIRT_PATCH(save_fl)] = { lgstart_pushf, lgend_pushf },
+ [PARAVIRT_PATCH(iret)] = { lgstart_iret, lgend_iret },
+};
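+/* For example, the irq_disable site gets the lgstart_cli..lgend_cli body
+ * from lguest_asm.S: a single "movl $0, lguest_data+LGUEST_DATA_irq_enabled"
+ * instead of a call through paravirt_ops. */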
+static unsigned lguest_patch(u8 type, u16 clobber, void *insns, unsigned len)
+{
+ unsigned int insn_len;
+
+ /* Don't touch it if we don't have a replacement */
+ if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start)
+ return len;
+
+ insn_len = lguest_insns[type].end - lguest_insns[type].start;
+
+ /* Similarly if we can't fit replacement. */
+ if (len < insn_len)
+ return len;
+
+ memcpy(insns, lguest_insns[type].start, insn_len);
+ if (type == PARAVIRT_PATCH(iret)) {
+ /* Jumps are relative. */
+ u32 off = (u32)lguest_iret - ((u32)insns + insn_len);
+ memcpy(insns+1, &off, sizeof(off));
+ }
+ return insn_len;
+}
+
+/* From head.S */
+extern void setup_pda(void);
+extern struct Xgt_desc_struct early_gdt_descr;
+
+__init void lguest_init(void)
+{
+ paravirt_ops.name = "lguest";
+ paravirt_ops.paravirt_enabled = 1;
+ paravirt_ops.kernel_rpl = 1;
+
+ paravirt_ops.save_fl = save_fl;
+ paravirt_ops.restore_fl = restore_fl;
+ paravirt_ops.irq_disable = irq_disable;
+ paravirt_ops.irq_enable = irq_enable;
+ paravirt_ops.load_gdt = lguest_load_gdt;
+ paravirt_ops.memory_setup = lguest_memory_setup;
+ paravirt_ops.cpuid = lguest_cpuid;
+ paravirt_ops.write_cr3 = lguest_write_cr3;
+ paravirt_ops.flush_tlb_user = lguest_flush_tlb_user;
+ paravirt_ops.flush_tlb_single = lguest_flush_tlb_single;
+ paravirt_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
+ paravirt_ops.set_pte = lguest_set_pte;
+ paravirt_ops.set_pte_at = lguest_set_pte_at;
+ paravirt_ops.set_pmd = lguest_set_pmd;
+#ifdef CONFIG_X86_LOCAL_APIC
+ paravirt_ops.apic_write = lguest_apic_write;
+ paravirt_ops.apic_write_atomic = lguest_apic_write;
+ paravirt_ops.apic_read = lguest_apic_read;
+#endif
+ paravirt_ops.load_idt = lguest_load_idt;
+ paravirt_ops.iret = lguest_iret;
+ paravirt_ops.load_esp0 = lguest_load_esp0;
+ paravirt_ops.load_tr_desc = lguest_load_tr_desc;
+ paravirt_ops.set_ldt = lguest_set_ldt;
+ paravirt_ops.load_tls = lguest_load_tls;
+ paravirt_ops.set_debugreg = lguest_set_debugreg;
+ paravirt_ops.clts = lguest_clts;
+ paravirt_ops.read_cr0 = lguest_read_cr0;
+ paravirt_ops.write_cr0 = lguest_write_cr0;
+ paravirt_ops.init_IRQ = lguest_init_IRQ;
+ paravirt_ops.read_cr2 = lguest_read_cr2;
+ paravirt_ops.read_cr3 = lguest_read_cr3;
+ paravirt_ops.read_cr4 = lguest_read_cr4;
+ paravirt_ops.write_cr4 = lguest_write_cr4;
+ paravirt_ops.write_gdt_entry = lguest_write_gdt_entry;
+ paravirt_ops.write_idt_entry = lguest_write_idt_entry;
+ paravirt_ops.patch = lguest_patch;
+ paravirt_ops.safe_halt = lguest_safe_halt;
+ paravirt_ops.get_wallclock = lguest_get_wallclock;
+ paravirt_ops.time_init = lguest_time_init;
+ paravirt_ops.set_lazy_mode = lguest_lazy_mode;
+ paravirt_ops.wbinvd = lguest_wbinvd;
+
+ hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0);
+ strncpy(boot_command_line, boot->cmdline, COMMAND_LINE_SIZE);
+
+ /* We use top of mem for initial pagetables. */
+ init_pg_tables_end = __pa(pg0);
+
+ /* set up PDA descriptor */
+ setup_pda();
+ load_gdt(&early_gdt_descr);
+ asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory");
+
+ reserve_top_address(lguest_data.reserve_mem);
+
+ cpu_detect(&new_cpu_data);
+ /* Need this before paging_init. */
+ set_bit(X86_FEATURE_PGE, new_cpu_data.x86_capability);
+ /* Math is always hard! */
+ new_cpu_data.hard_math = 1;
+
+#ifdef CONFIG_X86_MCE
+ mce_disabled = 1;
+#endif
+
+#ifdef CONFIG_ACPI
+ acpi_disabled = 1;
+ acpi_ht = 0;
+#endif
+
+ add_preferred_console("hvc", 0, NULL);
+
+ if (boot->initrd_size) {
+ /* We stash this at top of memory. */
+ INITRD_START = boot->max_pfn*PAGE_SIZE - boot->initrd_size;
+ INITRD_SIZE = boot->initrd_size;
+ LOADER_TYPE = 0xFF;
+ }
+
+ pm_power_off = lguest_power_off;
+ start_kernel();
+}
+paravirt_probe(lguest_maybe_init);
===================================================================
--- /dev/null
+++ b/drivers/lguest/lguest_asm.S
@@ -0,0 +1,59 @@
+#include <linux/linkage.h>
+#include <linux/lguest.h>
+#include <asm/asm-offsets.h>
+
+/* FIXME: Once asm/processor-flags.h goes in, include that */
+#define X86_EFLAGS_IF 0x00000200
+
+/*
+ * This is where we begin: head.S notes that paging is already enabled (which
+ * doesn't happen in native boot) and calls the registered paravirt_probe
+ * functions one at a time. Ours is a simple assembler test for magic
+ * registers. If they're correct we jump to lguest_init.
+ *
+ * We put it in .init.text so it will be discarded after boot.
+ */
+.section .init.text, "ax", @progbits
+ENTRY(lguest_maybe_init)
+ cmpl $LGUEST_MAGIC_EBP, %ebp
+ jne out
+ cmpl $LGUEST_MAGIC_EDI, %edi
+ jne out
+ cmpl $LGUEST_MAGIC_ESI, %esi
+ jne out
+ je lguest_init
+out:
+ ret
+
+/* The templates for inline patching. */
+#define LGUEST_PATCH(name, insns...) \
+ lgstart_##name: insns; lgend_##name:; \
+ .globl lgstart_##name; .globl lgend_##name
+
+LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
+LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled)
+LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled)
+LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
+LGUEST_PATCH(iret, .byte 0xE9,0,0,0,0) /* jmp <to-be-patched> */
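+/* The four zero bytes of that jmp are filled in with the relative offset
+ * of lguest_iret by lguest_patch() in lguest.c. */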
+
+.text
+/* These demarcate the EIP range where the host should never deliver interrupts. */
+.global lguest_noirq_start
+.global lguest_noirq_end
+
+/*
+ * We move eflags word to lguest_data.irq_enabled to restore interrupt state.
+ * For page faults, GPFs and virtual interrupts, the hypervisor has saved
+ * eflags manually, otherwise it was delivered directly and so eflags reflects
+ * the real machine IF state, ie. interrupts on. Since the kernel always dies
+ * if it takes such a trap with interrupts disabled anyway, turning interrupts
+ * back on unconditionally here is OK.
+ */
+ENTRY(lguest_iret)
+ pushl %eax
+ movl 12(%esp), %eax
+lguest_noirq_start:
+ movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled
+ popl %eax
+ iret
+lguest_noirq_end:
===================================================================
--- /dev/null
+++ b/drivers/lguest/lguest_bus.c
@@ -0,0 +1,148 @@
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/lguest_bus.h>
+#include <asm/io.h>
+
+static ssize_t type_show(struct device *_dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+ return sprintf(buf, "%hu", lguest_devices[dev->index].type);
+}
+static ssize_t features_show(struct device *_dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+ return sprintf(buf, "%hx", lguest_devices[dev->index].features);
+}
+static ssize_t pfn_show(struct device *_dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+ return sprintf(buf, "%u", lguest_devices[dev->index].pfn);
+}
+static ssize_t status_show(struct device *_dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+ return sprintf(buf, "%hx", lguest_devices[dev->index].status);
+}
+static ssize_t status_store(struct device *_dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+ if (sscanf(buf, "%hi", &lguest_devices[dev->index].status) != 1)
+ return -EINVAL;
+ return count;
+}
+static struct device_attribute lguest_dev_attrs[] = {
+ __ATTR_RO(type),
+ __ATTR_RO(features),
+ __ATTR_RO(pfn),
+ __ATTR(status, 0644, status_show, status_store),
+ __ATTR_NULL
+};
+
+static int lguest_dev_match(struct device *_dev, struct device_driver *_drv)
+{
+ struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+ struct lguest_driver *drv = container_of(_drv,struct lguest_driver,drv);
+
+ return (drv->device_type == lguest_devices[dev->index].type);
+}
+
+struct lguest_bus {
+ struct bus_type bus;
+ struct device dev;
+};
+
+static struct lguest_bus lguest_bus = {
+ .bus = {
+ .name = "lguest",
+ .match = lguest_dev_match,
+ .dev_attrs = lguest_dev_attrs,
+ },
+ .dev = {
+ .parent = NULL,
+ .bus_id = "lguest",
+ }
+};
+
+static int lguest_dev_probe(struct device *_dev)
+{
+ int ret;
+ struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+ struct lguest_driver *drv = container_of(dev->dev.driver,
+ struct lguest_driver, drv);
+
+ lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER;
+ ret = drv->probe(dev);
+ if (ret == 0)
+ lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER_OK;
+ return ret;
+}
+
+int register_lguest_driver(struct lguest_driver *drv)
+{
+ if (!lguest_devices)
+ return 0;
+
+ drv->drv.bus = &lguest_bus.bus;
+ drv->drv.name = drv->name;
+ drv->drv.owner = drv->owner;
+ drv->drv.probe = lguest_dev_probe;
+
+ return driver_register(&drv->drv);
+}
+EXPORT_SYMBOL_GPL(register_lguest_driver);
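+/* A minimal sketch of a client (names here are hypothetical, not part of
+ * this patch series):
+ *
+ *	static struct lguest_driver lgfoo_drv = {
+ *		.name = "lgfoo",
+ *		.owner = THIS_MODULE,
+ *		.device_type = 42,
+ *		.probe = lgfoo_probe,
+ *	};
+ *	...
+ *	register_lguest_driver(&lgfoo_drv);
+ */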
+
+static void add_lguest_device(unsigned int index)
+{
+ struct lguest_device *new;
+
+ lguest_devices[index].status |= LGUEST_DEVICE_S_ACKNOWLEDGE;
+ new = kmalloc(sizeof(struct lguest_device), GFP_KERNEL);
+ if (!new) {
+ printk(KERN_EMERG "Cannot allocate lguest device %u\n", index);
+ lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED;
+ return;
+ }
+
+ new->index = index;
+ new->private = NULL;
+ memset(&new->dev, 0, sizeof(new->dev));
+ new->dev.parent = &lguest_bus.dev;
+ new->dev.bus = &lguest_bus.bus;
+ sprintf(new->dev.bus_id, "%u", index);
+ if (device_register(&new->dev) != 0) {
+ printk(KERN_EMERG "Cannot register lguest device %u\n", index);
+ lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED;
+ kfree(new);
+ }
+}
+
+static void scan_devices(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < LGUEST_MAX_DEVICES; i++)
+ if (lguest_devices[i].type)
+ add_lguest_device(i);
+}
+
+static int __init lguest_bus_init(void)
+{
+ if (strcmp(paravirt_ops.name, "lguest") != 0)
+ return 0;
+
+ /* Devices are in page above top of "normal" mem. */
+ lguest_devices = ioremap(max_pfn << PAGE_SHIFT, PAGE_SIZE);
+
+ if (bus_register(&lguest_bus.bus) != 0
+ || device_register(&lguest_bus.dev) != 0)
+ panic("lguest bus registration failed");
+
+ scan_devices();
+ return 0;
+}
+postcore_initcall(lguest_bus_init);
===================================================================
--- /dev/null
+++ b/include/linux/lguest.h
@@ -0,0 +1,85 @@
+/* Things the lguest guest needs to know. Note: like all lguest interfaces,
+ * this is subject to wild and random change between versions. */
+#ifndef _ASM_LGUEST_H
+#define _ASM_LGUEST_H
+
+/* These are randomly chosen numbers which indicate we're an lguest at boot */
+#define LGUEST_MAGIC_EBP 0x4C687970
+#define LGUEST_MAGIC_EDI 0x652D4D65
+#define LGUEST_MAGIC_ESI 0xFFFFFFFF
+
+#ifndef __ASSEMBLY__
+#include <asm/irq.h>
+
+#define LHCALL_FLUSH_ASYNC 0
+#define LHCALL_LGUEST_INIT 1
+#define LHCALL_CRASH 2
+#define LHCALL_LOAD_GDT 3
+#define LHCALL_NEW_PGTABLE 4
+#define LHCALL_FLUSH_TLB 5
+#define LHCALL_LOAD_IDT_ENTRY 6
+#define LHCALL_SET_STACK 7
+#define LHCALL_TS 8
+#define LHCALL_TIMER_READ 9
+#define LHCALL_HALT 10
+#define LHCALL_GET_WALLCLOCK 11
+#define LHCALL_BIND_DMA 12
+#define LHCALL_SEND_DMA 13
+#define LHCALL_SET_PTE 14
+#define LHCALL_SET_PMD 15
+#define LHCALL_LOAD_TLS 16
+
+#define LGUEST_TRAP_ENTRY 0x1F
+
+static inline unsigned long
+hcall(unsigned long call,
+ unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+ asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
+ : "=a"(call)
+ : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)
+ : "memory");
+ return call;
+}
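+/* For example, the guest powers itself off by crashing with a message
+ * (lguest_power_off() in lguest.c):
+ *
+ *	hcall(LHCALL_CRASH, __pa("Power down"), 0, 0);
+ */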
+
+void async_hcall(unsigned long call,
+ unsigned long arg1, unsigned long arg2, unsigned long arg3);
+
+/* Can't use our min() macro here: needs to be a constant */
+#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
+
+#define LHCALL_RING_SIZE 64
+struct hcall_ring
+{
+ u32 eax, edx, ebx, ecx;
+};
+
+/* All the good stuff happens here: guest registers it with LGUEST_INIT */
+struct lguest_data
+{
+/* Fields which change during running: */
+ /* 512 == enabled (same as eflags) */
+ unsigned int irq_enabled;
+ /* Interrupts blocked by guest. */
+ DECLARE_BITMAP(blocked_interrupts, LGUEST_IRQS);
+
+ /* Virtual address of page fault. */
+ unsigned long cr2;
+
+ /* Async hypercall ring. 0xFF == done, 0 == pending. */
+ u8 hcall_status[LHCALL_RING_SIZE];
+ struct hcall_ring hcalls[LHCALL_RING_SIZE];
+
+/* Fields initialized by the hypervisor at boot: */
+ /* Memory not to try to access */
+ unsigned long reserve_mem;
+ /* ID of this guest (used by network driver to set ethernet address) */
+ u16 guestid;
+
+/* Fields initialized by the guest at boot: */
+ /* Instruction range to suppress interrupts even if enabled */
+ unsigned long noirq_start, noirq_end;
+};
+extern struct lguest_data lguest_data;
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_LGUEST_H */
===================================================================
--- /dev/null
+++ b/include/linux/lguest_bus.h
@@ -0,0 +1,33 @@
+#ifndef _ASM_LGUEST_DEVICE_H
+#define _ASM_LGUEST_DEVICE_H
+/* Everything you need to know about lguest devices. */
+#include <linux/device.h>
+#include <linux/lguest.h>
+#include <linux/lguest_launcher.h>
+
+struct lguest_device {
+ /* Unique busid, and index into lguest_page->devices[] */
+ /* By convention, each device can use irq index+1 if it wants to. */
+ unsigned int index;
+
+ struct device dev;
+
+ /* Driver can hang data off here. */
+ void *private;
+};
+
+struct lguest_driver {
+ const char *name;
+ struct module *owner;
+ u16 device_type;
+ int (*probe)(struct lguest_device *dev);
+ void (*remove)(struct lguest_device *dev);
+
+ struct device_driver drv;
+};
+
+extern int register_lguest_driver(struct lguest_driver *drv);
+extern void unregister_lguest_driver(struct lguest_driver *drv);
+
+extern struct lguest_device_desc *lguest_devices; /* Just past max_pfn */
+#endif /* _ASM_LGUEST_DEVICE_H */
* [PATCH 2/8] lguest: the host code
From: Rusty Russell @ 2007-04-10 11:06 UTC (permalink / raw)
To: Andrew Morton; +Cc: Andi Kleen, lkml - Kernel Mailing List, virtualization
This is the code for the "lg.ko" module, which allows lguest guests to
be launched.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
drivers/lguest/core.c | 444 +++++++++++++++++++++++++++++++++
drivers/lguest/hypercalls.c | 172 ++++++++++++
drivers/lguest/interrupts_and_traps.c | 258 +++++++++++++++++++
drivers/lguest/io.c | 412 ++++++++++++++++++++++++++++++
drivers/lguest/lg.h | 278 ++++++++++++++++++++
drivers/lguest/lguest_user.c | 192 ++++++++++++++
drivers/lguest/page_tables.c | 407 ++++++++++++++++++++++++++++++
drivers/lguest/segments.c | 115 ++++++++
drivers/lguest/switcher.S | 159 +++++++++++
include/linux/lguest_launcher.h | 80 +++++
10 files changed, 2517 insertions(+)
===================================================================
--- /dev/null
+++ b/drivers/lguest/core.c
@@ -0,0 +1,444 @@
+/* World's simplest hypervisor, to test paravirt_ops and show
+ * unbelievers that virtualization is the future. Plus, it's fun! */
+#include <linux/module.h>
+#include <linux/stringify.h>
+#include <linux/stddef.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/cpu.h>
+#include <linux/freezer.h>
+#include <asm/paravirt.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+#include <asm/poll.h>
+#include <asm/highmem.h>
+#include <asm/asm-offsets.h>
+#include <asm/i387.h>
+#include "lg.h"
+
+/* Found in switcher.S */
+extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
+extern unsigned long default_idt_entries[];
+
+/* Every guest maps the core switcher code. */
+#define SHARED_SWITCHER_PAGES \
+ DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE)
+/* Pages for switcher itself, then two pages per cpu */
+#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS)
+
+/* We map at -4M for ease of mapping into the guest (one PTE page). */
+#define SWITCHER_ADDR 0xFFC00000
+
+static struct vm_struct *switcher_vma;
+static struct page **switcher_page;
+
+static int cpu_had_pge;
+static struct {
+ unsigned long offset;
+ unsigned short segment;
+} lguest_entry;
+
+/* This One Big lock protects all inter-guest data structures. */
+DEFINE_MUTEX(lguest_lock);
+static DEFINE_PER_CPU(struct lguest *, last_guest);
+
+/* FIXME: Make dynamic. */
+#define MAX_LGUEST_GUESTS 16
+struct lguest lguests[MAX_LGUEST_GUESTS];
+
+/* Offset from where switcher.S was compiled to where we've copied it */
+static unsigned long switcher_offset(void)
+{
+ return SWITCHER_ADDR - (unsigned long)start_switcher_text;
+}
+
+/* This cpu's struct lguest_pages. */
+static struct lguest_pages *lguest_pages(unsigned int cpu)
+{
+ return &(((struct lguest_pages *)
+ (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
+}
+
+static __init int map_switcher(void)
+{
+ int i, err;
+ struct page **pagep;
+
+ switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES,
+ GFP_KERNEL);
+ if (!switcher_page) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
+ unsigned long addr = get_zeroed_page(GFP_KERNEL);
+ if (!addr) {
+ err = -ENOMEM;
+ goto free_some_pages;
+ }
+ switcher_page[i] = virt_to_page(addr);
+ }
+
+ switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
+ VM_ALLOC, SWITCHER_ADDR, VMALLOC_END);
+ if (!switcher_vma) {
+ err = -ENOMEM;
+ printk("lguest: could not map switcher pages high\n");
+ goto free_pages;
+ }
+
+ pagep = switcher_page;
+ err = map_vm_area(switcher_vma, PAGE_KERNEL, &pagep);
+ if (err) {
+ printk("lguest: map_vm_area failed: %i\n", err);
+ goto free_vma;
+ }
+ memcpy(switcher_vma->addr, start_switcher_text,
+ end_switcher_text - start_switcher_text);
+
+ /* Fix up IDT entries to point into copied text. */
+ for (i = 0; i < IDT_ENTRIES; i++)
+ default_idt_entries[i] += switcher_offset();
+
+ for_each_possible_cpu(i) {
+ struct lguest_pages *pages = lguest_pages(i);
+ struct lguest_ro_state *state = &pages->state;
+
+ /* These fields are static: rest done in copy_in_guest_info */
+ state->host_gdt_desc.size = GDT_SIZE-1;
+ state->host_gdt_desc.address = (long)get_cpu_gdt_table(i);
+ store_idt(&state->host_idt_desc);
+ state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
+ state->guest_idt_desc.address = (long)&state->guest_idt;
+ state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1;
+ state->guest_gdt_desc.address = (long)&state->guest_gdt;
+ state->guest_tss.esp0 = (long)(&pages->regs + 1);
+ state->guest_tss.ss0 = LGUEST_DS;
+ /* No I/O for you! */
+ state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);
+ setup_default_gdt_entries(state);
+ setup_default_idt_entries(state, default_idt_entries);
+
+ /* Setup LGUEST segments on all cpus */
+ get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
+ get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
+ }
+
+ /* Initialize entry point into switcher. */
+ lguest_entry.offset = (long)switch_to_guest + switcher_offset();
+ lguest_entry.segment = LGUEST_CS;
+
+ printk(KERN_INFO "lguest: mapped switcher at %p\n",
+ switcher_vma->addr);
+ return 0;
+
+free_vma:
+ vunmap(switcher_vma->addr);
+free_pages:
+ i = TOTAL_SWITCHER_PAGES;
+free_some_pages:
+ for (--i; i >= 0; i--)
+ __free_pages(switcher_page[i], 0);
+ kfree(switcher_page);
+out:
+ return err;
+}
+
+static void unmap_switcher(void)
+{
+ unsigned int i;
+
+ vunmap(switcher_vma->addr);
+ for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
+ __free_pages(switcher_page[i], 0);
+}
+
+/* IN/OUT insns: enough to get us past boot-time probing. */
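+/* Reads return all-ones (what probing code expects from an empty port) and
+ * writes are simply discarded; either way we just skip the instruction. */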
+static int emulate_insn(struct lguest *lg)
+{
+ u8 insn;
+ unsigned int insnlen = 0, in = 0, shift = 0;
+ unsigned long physaddr = guest_pa(lg, lg->regs->eip);
+
+ /* This only works for addresses in linear mapping... */
+ if (lg->regs->eip < lg->page_offset)
+ return 0;
+ lgread(lg, &insn, physaddr, 1);
+
+ /* Operand size prefix means it's actually for ax. */
+ if (insn == 0x66) {
+ shift = 16;
+ insnlen = 1;
+ lgread(lg, &insn, physaddr + insnlen, 1);
+ }
+
+ switch (insn & 0xFE) {
+ case 0xE4: /* in <next byte>,%al */
+ insnlen += 2;
+ in = 1;
+ break;
+ case 0xEC: /* in (%dx),%al */
+ insnlen += 1;
+ in = 1;
+ break;
+ case 0xE6: /* out %al,<next byte> */
+ insnlen += 2;
+ break;
+ case 0xEE: /* out %al,(%dx) */
+ insnlen += 1;
+ break;
+ default:
+ return 0;
+ }
+
+ if (in) {
+ /* The lower bit tells us whether it's a 16 or 32 bit access */
+ if (insn & 0x1)
+ lg->regs->eax = 0xFFFFFFFF;
+ else
+ lg->regs->eax |= (0xFFFF << shift);
+ }
+ lg->regs->eip += insnlen;
+ return 1;
+}
+
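+/* The guest may only touch memory up to its pfn_limit; the second test
+ * catches addr+len wrapping around. */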
+int lguest_address_ok(const struct lguest *lg,
+ unsigned long addr, unsigned long len)
+{
+ return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr);
+}
+
+/* Just like get_user, but don't let guest access lguest binary. */
+u32 lgread_u32(struct lguest *lg, u32 addr)
+{
+ u32 val = 0;
+
+ /* Don't let them access lguest binary */
+ if (!lguest_address_ok(lg, addr, sizeof(val))
+ || get_user(val, (u32 __user *)addr) != 0)
+ kill_guest(lg, "bad read address %u", addr);
+ return val;
+}
+
+void lgwrite_u32(struct lguest *lg, u32 addr, u32 val)
+{
+ if (!lguest_address_ok(lg, addr, sizeof(val))
+ || put_user(val, (u32 __user *)addr) != 0)
+ kill_guest(lg, "bad write address %u", addr);
+}
+
+void lgread(struct lguest *lg, void *b, u32 addr, unsigned bytes)
+{
+ if (!lguest_address_ok(lg, addr, bytes)
+ || copy_from_user(b, (void __user *)addr, bytes) != 0) {
+ /* copy_from_user should do this, but as we rely on it... */
+ memset(b, 0, bytes);
+ kill_guest(lg, "bad read address %u len %u", addr, bytes);
+ }
+}
+
+void lgwrite(struct lguest *lg, u32 addr, const void *b, unsigned bytes)
+{
+ if (!lguest_address_ok(lg, addr, bytes)
+ || copy_to_user((void __user *)addr, b, bytes) != 0)
+ kill_guest(lg, "bad write address %u len %u", addr, bytes);
+}
+
+static void set_ts(void)
+{
+ u32 cr0;
+
+ cr0 = read_cr0();
+ if (!(cr0 & 8))
+ write_cr0(cr0|8);
+}
+
+static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages)
+{
+ if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) {
+ __get_cpu_var(last_guest) = lg;
+ lg->last_pages = pages;
+ lg->changed = CHANGED_ALL;
+ }
+
+ /* These are pretty cheap, so we do them unconditionally. */
+ pages->state.host_cr3 = __pa(current->mm->pgd);
+ map_switcher_in_guest(lg, pages);
+ pages->state.guest_tss.esp1 = lg->esp1;
+ pages->state.guest_tss.ss1 = lg->ss1;
+
+ /* Copy direct trap entries. */
+ if (lg->changed & CHANGED_IDT)
+ copy_traps(lg, pages->state.guest_idt, default_idt_entries);
+
+ /* Copy all GDT entries but the TSS. */
+ if (lg->changed & CHANGED_GDT)
+ copy_gdt(lg, pages->state.guest_gdt);
+
+ lg->changed = 0;
+}
+
+static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
+{
+ unsigned int clobber;
+
+ copy_in_guest_info(lg, pages);
+
+ /* Put eflags on stack, lcall does rest: suitable for iret return. */
+ asm volatile("pushf; lcall *lguest_entry"
+ : "=a"(clobber), "=b"(clobber)
+ : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir))
+ : "memory", "%edx", "%ecx", "%edi", "%esi");
+}
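+/* lguest_entry is a far pointer (offset plus LGUEST_CS segment), so the
+ * indirect lcall switches to the switcher's own code segment as it jumps
+ * to the copied switch_to_guest. */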
+
+int run_guest(struct lguest *lg, char *__user user)
+{
+ while (!lg->dead) {
+ unsigned int cr2 = 0; /* Damn gcc */
+
+ /* Hypercalls first: we might have been out to userspace */
+ do_hypercalls(lg);
+ if (lg->dma_is_pending) {
+ put_user(lg->pending_dma, (unsigned long *)user);
+ put_user(lg->pending_key, (unsigned long *)user+1);
+ return sizeof(unsigned long)*2;
+ }
+
+ if (signal_pending(current))
+ return -EINTR;
+ maybe_do_interrupt(lg);
+
+ try_to_freeze();
+
+ if (lg->dead)
+ break;
+
+ if (lg->halted) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(1);
+ continue;
+ }
+
+ local_irq_disable();
+
+ /* Even if *we* don't want FPU trap, guest might... */
+ if (lg->ts)
+ set_ts();
+
+ run_guest_once(lg, lguest_pages(raw_smp_processor_id()));
+
+ /* Save cr2 now if we page-faulted. */
+ if (lg->regs->trapnum == 14)
+ cr2 = read_cr2();
+ else if (lg->regs->trapnum == 7)
+ math_state_restore();
+ local_irq_enable();
+
+ switch (lg->regs->trapnum) {
+ case 13: /* We've intercepted a GPF. */
+ if (lg->regs->errcode == 0) {
+ if (emulate_insn(lg))
+ continue;
+ }
+ break;
+ case 14: /* We've intercepted a page fault. */
+ if (demand_page(lg, cr2, lg->regs->errcode & 2))
+ continue;
+
+ /* If lguest_data is NULL, this won't hurt. */
+ put_user(cr2, &lg->lguest_data->cr2);
+ break;
+ case 7: /* We've intercepted a Device Not Available fault. */
+ /* If they don't want to know, just absorb it. */
+ if (!lg->ts)
+ continue;
+ break;
+ case 32 ... 255: /* Real interrupt, fall thru */
+ cond_resched();
+ case LGUEST_TRAP_ENTRY: /* Handled at top of loop */
+ continue;
+ }
+
+ if (deliver_trap(lg, lg->regs->trapnum))
+ continue;
+
+ kill_guest(lg, "unhandled trap %i at %#x (%#x)",
+ lg->regs->trapnum, lg->regs->eip,
+ lg->regs->trapnum == 14 ? cr2 : lg->regs->errcode);
+ }
+ return -ENOENT;
+}
+
+int find_free_guest(void)
+{
+ unsigned int i;
+ for (i = 0; i < MAX_LGUEST_GUESTS; i++)
+ if (!lguests[i].tsk)
+ return i;
+ return -1;
+}
+
+static void adjust_pge(void *on)
+{
+ if (on)
+ write_cr4(read_cr4() | X86_CR4_PGE);
+ else
+ write_cr4(read_cr4() & ~X86_CR4_PGE);
+}
+
+static int __init init(void)
+{
+ int err;
+
+ if (paravirt_enabled()) {
+ printk("lguest is afraid of %s\n", paravirt_ops.name);
+ return -EPERM;
+ }
+
+ err = map_switcher();
+ if (err)
+ return err;
+
+ err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES);
+ if (err) {
+ unmap_switcher();
+ return err;
+ }
+ lguest_io_init();
+
+ err = lguest_device_init();
+ if (err) {
+ free_pagetables();
+ unmap_switcher();
+ return err;
+ }
+ lock_cpu_hotplug();
+ if (cpu_has_pge) { /* We have a broader idea of "global". */
+ cpu_had_pge = 1;
+ on_each_cpu(adjust_pge, 0, 0, 1);
+ clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+ }
+ unlock_cpu_hotplug();
+ return 0;
+}
+
+static void __exit fini(void)
+{
+ lguest_device_remove();
+ free_pagetables();
+ unmap_switcher();
+ lock_cpu_hotplug();
+ if (cpu_had_pge) {
+ set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+ on_each_cpu(adjust_pge, (void *)1, 0, 1);
+ }
+ unlock_cpu_hotplug();
+}
+
+module_init(init);
+module_exit(fini);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
===================================================================
--- /dev/null
+++ b/drivers/lguest/hypercalls.c
@@ -0,0 +1,172 @@
+/* Actual hypercalls, which allow guests to actually do something.
+ Copyright (C) 2006 Rusty Russell IBM Corporation
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/mm.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <irq_vectors.h>
+#include "lg.h"
+
+static void do_hcall(struct lguest *lg, struct lguest_regs *regs)
+{
+ switch (regs->eax) {
+ case LHCALL_FLUSH_ASYNC:
+ break;
+ case LHCALL_LGUEST_INIT:
+ kill_guest(lg, "already have lguest_data");
+ break;
+ case LHCALL_CRASH: {
+ char msg[128];
+ lgread(lg, msg, regs->edx, sizeof(msg));
+ msg[sizeof(msg)-1] = '\0';
+ kill_guest(lg, "CRASH: %s", msg);
+ break;
+ }
+ case LHCALL_FLUSH_TLB:
+ if (regs->edx)
+ guest_pagetable_clear_all(lg);
+ else
+ guest_pagetable_flush_user(lg);
+ break;
+ case LHCALL_TIMER_READ: {
+ u32 now = jiffies;
+ mb();
+ regs->eax = now - lg->last_timer;
+ lg->last_timer = now;
+ break;
+ }
+ case LHCALL_GET_WALLCLOCK: {
+ struct timespec ts;
+ ktime_get_real_ts(&ts);
+ regs->eax = ts.tv_sec;
+ break;
+ }
+ case LHCALL_BIND_DMA:
+ regs->eax = bind_dma(lg, regs->edx, regs->ebx,
+ regs->ecx >> 8, regs->ecx & 0xFF);
+ break;
+ case LHCALL_SEND_DMA:
+ send_dma(lg, regs->edx, regs->ebx);
+ break;
+ case LHCALL_LOAD_GDT:
+ load_guest_gdt(lg, regs->edx, regs->ebx);
+ break;
+ case LHCALL_LOAD_IDT_ENTRY:
+ load_guest_idt_entry(lg, regs->edx, regs->ebx, regs->ecx);
+ break;
+ case LHCALL_NEW_PGTABLE:
+ guest_new_pagetable(lg, regs->edx);
+ break;
+ case LHCALL_SET_STACK:
+ guest_set_stack(lg, regs->edx, regs->ebx, regs->ecx);
+ break;
+ case LHCALL_SET_PTE:
+ guest_set_pte(lg, regs->edx, regs->ebx, mkgpte(regs->ecx));
+ break;
+ case LHCALL_SET_PMD:
+ guest_set_pmd(lg, regs->edx, regs->ebx);
+ break;
+ case LHCALL_LOAD_TLS:
+ guest_load_tls(lg, (struct desc_struct __user*)regs->edx);
+ break;
+ case LHCALL_TS:
+ lg->ts = regs->edx;
+ break;
+ case LHCALL_HALT:
+ lg->halted = 1;
+ break;
+ default:
+ kill_guest(lg, "Bad hypercall %i\n", regs->eax);
+ }
+}
+
+/* We always do queued calls before actual hypercall. */
+static void do_async_hcalls(struct lguest *lg)
+{
+ unsigned int i;
+ u8 st[LHCALL_RING_SIZE];
+
+ copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st));
+ for (i = 0; i < ARRAY_SIZE(st); i++) {
+ struct lguest_regs regs;
+ unsigned int n = lg->next_hcall;
+
+ if (st[n] == 0xFF)
+ break;
+
+ if (++lg->next_hcall == LHCALL_RING_SIZE)
+ lg->next_hcall = 0;
+
+ get_user(regs.eax, &lg->lguest_data->hcalls[n].eax);
+ get_user(regs.edx, &lg->lguest_data->hcalls[n].edx);
+ get_user(regs.ecx, &lg->lguest_data->hcalls[n].ecx);
+ get_user(regs.ebx, &lg->lguest_data->hcalls[n].ebx);
+ do_hcall(lg, &regs);
+ put_user(0xFF, &lg->lguest_data->hcall_status[n]);
+ if (lg->dma_is_pending)
+ break;
+ }
+}
+
+static void initialize(struct lguest *lg)
+{
+ if (lg->regs->eax != LHCALL_LGUEST_INIT) {
+ kill_guest(lg, "hypercall %i before LGUEST_INIT",
+ lg->regs->eax);
+ return;
+ }
+
+ lg->lguest_data = (struct lguest_data __user *)lg->regs->edx;
+ /* We check here so we can simply copy_to_user/from_user */
+ if (!lguest_address_ok(lg, (long)lg->lguest_data,
+ sizeof(*lg->lguest_data))) {
+ kill_guest(lg, "bad guest page %p", lg->lguest_data);
+ return;
+ }
+ get_user(lg->noirq_start, &lg->lguest_data->noirq_start);
+ get_user(lg->noirq_end, &lg->lguest_data->noirq_end);
+ /* We reserve the top pgd entry. */
+ put_user(4U*1024*1024, &lg->lguest_data->reserve_mem);
+ put_user(lg->guestid, &lg->lguest_data->guestid);
+}
+
+/* Even if we go out to userspace and come back, we don't want to do
+ * the hypercall again. */
+static void clear_hcall(struct lguest *lg)
+{
+ lg->regs->trapnum = 255;
+}
+
+void do_hypercalls(struct lguest *lg)
+{
+ if (unlikely(!lg->lguest_data)) {
+ if (lg->regs->trapnum == LGUEST_TRAP_ENTRY) {
+ initialize(lg);
+ clear_hcall(lg);
+ }
+ return;
+ }
+
+ do_async_hcalls(lg);
+ if (!lg->dma_is_pending && lg->regs->trapnum == LGUEST_TRAP_ENTRY) {
+ do_hcall(lg, lg->regs);
+ clear_hcall(lg);
+ }
+ set_wakeup_process(lg, NULL);
+}
===================================================================
--- /dev/null
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -0,0 +1,258 @@
+#include <linux/uaccess.h>
+#include "lg.h"
+
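+/* IDT entries are a pair of 32-bit words: the handler address is split
+ * between the low 16 bits of the first word and the high 16 bits of the
+ * second, which also holds the type (bits 8-11) and present (bit 15)
+ * flags. These helpers pick the pieces apart. */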
+static unsigned long idt_address(u32 lo, u32 hi)
+{
+ return (lo & 0x0000FFFF) | (hi & 0xFFFF0000);
+}
+
+static int idt_type(u32 lo, u32 hi)
+{
+ return (hi >> 8) & 0xF;
+}
+
+static int idt_present(u32 lo, u32 hi)
+{
+ return (hi & 0x8000);
+}
+
+static void push_guest_stack(struct lguest *lg, u32 __user **gstack, u32 val)
+{
+ lgwrite_u32(lg, (u32)--(*gstack), val);
+}
+
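+/* Reflecting a trap into the guest means building the stack frame the
+ * CPU itself would have built: old ss/esp if the trap changes rings,
+ * then eflags, cs, eip, and the error code for traps which push one. */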
+static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
+{
+ u32 __user *gstack;
+ u32 eflags, ss, irq_enable;
+
+ /* If they want a ring change, we use new stack and push old ss/esp */
+ if ((lg->regs->ss&0x3) != GUEST_PL) {
+ gstack = (u32 __user *)guest_pa(lg, lg->esp1);
+ ss = lg->ss1;
+ push_guest_stack(lg, &gstack, lg->regs->ss);
+ push_guest_stack(lg, &gstack, lg->regs->esp);
+ } else {
+ gstack = (u32 __user *)guest_pa(lg, lg->regs->esp);
+ ss = lg->regs->ss;
+ }
+
+ /* We use the IF bit in eflags to indicate whether irqs were disabled
+ (it's always 0, since irqs are enabled when the guest is running). */
+ eflags = lg->regs->eflags;
+ get_user(irq_enable, &lg->lguest_data->irq_enabled);
+ eflags |= (irq_enable & X86_EFLAGS_IF);
+
+ push_guest_stack(lg, &gstack, eflags);
+ push_guest_stack(lg, &gstack, lg->regs->cs);
+ push_guest_stack(lg, &gstack, lg->regs->eip);
+
+ if (has_err)
+ push_guest_stack(lg, &gstack, lg->regs->errcode);
+
+ /* Change the real stack so switcher returns to trap handler */
+ lg->regs->ss = ss;
+ lg->regs->esp = (u32)gstack + lg->page_offset;
+ lg->regs->cs = (__KERNEL_CS|GUEST_PL);
+ lg->regs->eip = idt_address(lo, hi);
+
+ /* Disable interrupts for an interrupt gate. */
+ if (idt_type(lo, hi) == 0xE)
+ put_user(0, &lg->lguest_data->irq_enabled);
+}
+
+void maybe_do_interrupt(struct lguest *lg)
+{
+ unsigned int irq;
+ DECLARE_BITMAP(blk, LGUEST_IRQS);
+ struct desc_struct *idt;
+
+ if (!lg->lguest_data)
+ return;
+
+ /* If timer has changed, set timer interrupt. */
+ if (jiffies != lg->last_timer)
+ set_bit(0, lg->irqs_pending);
+
+ /* Mask out any interrupts they have blocked. */
+ copy_from_user(&blk, lg->lguest_data->blocked_interrupts, sizeof(blk));
+ bitmap_andnot(blk, lg->irqs_pending, blk, LGUEST_IRQS);
+
+ irq = find_first_bit(blk, LGUEST_IRQS);
+ if (irq >= LGUEST_IRQS)
+ return;
+
+ if (lg->regs->eip >= lg->noirq_start && lg->regs->eip < lg->noirq_end)
+ return;
+
+ /* If they're halted, we re-enable interrupts. */
+ if (lg->halted) {
+ /* Re-enable interrupts. */
+ put_user(X86_EFLAGS_IF, &lg->lguest_data->irq_enabled);
+ lg->halted = 0;
+ } else {
+ /* Maybe they have interrupts disabled? */
+ u32 irq_enabled;
+ get_user(irq_enabled, &lg->lguest_data->irq_enabled);
+ if (!irq_enabled)
+ return;
+ }
+
+ idt = &lg->idt[FIRST_EXTERNAL_VECTOR+irq];
+ if (idt_present(idt->a, idt->b)) {
+ clear_bit(irq, lg->irqs_pending);
+ set_guest_interrupt(lg, idt->a, idt->b, 0);
+ }
+}
+
+static int has_err(unsigned int trap)
+{
+ return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17);
+}
+
+int deliver_trap(struct lguest *lg, unsigned int num)
+{
+ u32 lo = lg->idt[num].a, hi = lg->idt[num].b;
+
+ if (!idt_present(lo, hi))
+ return 0;
+ set_guest_interrupt(lg, lo, hi, has_err(num));
+ return 1;
+}
+
+static int direct_trap(const struct lguest *lg,
+ const struct desc_struct *trap,
+ unsigned int num)
+{
+ /* Hardware interrupts don't go direct to the guest (except syscall). */
+ if (num >= FIRST_EXTERNAL_VECTOR && num != SYSCALL_VECTOR)
+ return 0;
+
+ /* We intercept page fault (demand shadow paging & cr2 saving),
+ protection fault (in/out emulation), device not
+ available (TS handling), and the hypercall trap. */
+ if (num == 14 || num == 13 || num == 7 || num == LGUEST_TRAP_ENTRY)
+ return 0;
+
+ /* Interrupt gates (0xE) or not present (0x0) can't go direct. */
+ return idt_type(trap->a, trap->b) == 0xF;
+}
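+/* (An interrupt gate would make real hardware clear the IF flag on entry,
+ * but the guest's interrupt state is the virtual one in lguest_data, so
+ * those traps must bounce through the host.) */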
+
+static void pin_stack_pages(struct lguest *lg)
+{
+ unsigned int i;
+
+ for (i = 0; i < lg->stack_pages; i++)
+ pin_page(lg, lg->esp1 - i * PAGE_SIZE);
+}
+
+/* We need to ensure all the direct trap pages are mapped after we
+ * clear shadow mappings. */
+void pin_trap_pages(struct lguest *lg)
+{
+ unsigned int i;
+ struct desc_struct *trap;
+
+ for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) {
+ trap = &lg->idt[i];
+ if (direct_trap(lg, trap, i))
+ pin_page(lg, idt_address(trap->a, trap->b));
+ }
+
+ trap = &lg->syscall_idt;
+ if (direct_trap(lg, trap, SYSCALL_VECTOR))
+ pin_page(lg, idt_address(trap->a, trap->b));
+ pin_stack_pages(lg);
+}
+
+void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages)
+{
+ /* You cannot have a stack segment with priv level 0. */
+ if ((seg & 0x3) != GUEST_PL)
+ kill_guest(lg, "bad stack segment %i", seg);
+ if (pages > 2)
+ kill_guest(lg, "bad stack pages %u", pages);
+ lg->ss1 = seg;
+ lg->esp1 = esp;
+ lg->stack_pages = pages;
+ pin_stack_pages(lg);
+}
+
+/* Set up trap in IDT. */
+static void set_trap(struct lguest *lg, struct desc_struct *trap,
+ unsigned int num, u32 lo, u32 hi)
+{
+ u8 type = idt_type(lo, hi);
+
+ if (!idt_present(lo, hi)) {
+ trap->a = trap->b = 0;
+ return;
+ }
+
+ if (type != 0xE && type != 0xF)
+ kill_guest(lg, "bad IDT type %i", type);
+
+ trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF);
+ trap->b = (hi&0xFFFFEF00);
+
+ /* Make sure trap address is available so we don't fault. In
+ * theory, it could overlap two pages, in practice it's aligned. */
+ if (direct_trap(lg, trap, num))
+ pin_page(lg, idt_address(lo, hi));
+}
+
+void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi)
+{
+ /* Guest never handles: NMI, doublefault, hypercall, spurious irq. */
+ if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY)
+ return;
+
+ lg->changed |= CHANGED_IDT;
+ if (num < ARRAY_SIZE(lg->idt))
+ set_trap(lg, &lg->idt[num], num, lo, hi);
+ else if (num == SYSCALL_VECTOR)
+ set_trap(lg, &lg->syscall_idt, num, lo, hi);
+}
+
+static void default_idt_entry(struct desc_struct *idt,
+ int trap,
+ const unsigned long handler)
+{
+ u32 flags = 0x8e00;
+
+ /* They can't "int" into any of them except hypercall. */
+ if (trap == LGUEST_TRAP_ENTRY)
+ flags |= (GUEST_PL << 13);
+
+ idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF);
+ idt->b = (handler&0xFFFF0000) | flags;
+}
+
+void setup_default_idt_entries(struct lguest_ro_state *state,
+ const unsigned long *def)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(state->guest_idt); i++)
+ default_idt_entry(&state->guest_idt[i], i, def[i]);
+}
+
+void copy_traps(const struct lguest *lg, struct desc_struct *idt,
+ const unsigned long *def)
+{
+ unsigned int i;
+
+ /* All hardware interrupts are the same whatever the guest: only the
+ * traps might be different. */
+ for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) {
+ if (direct_trap(lg, &lg->idt[i], i))
+ idt[i] = lg->idt[i];
+ else
+ default_idt_entry(&idt[i], i, def[i]);
+ }
+ i = SYSCALL_VECTOR;
+ if (direct_trap(lg, &lg->syscall_idt, i))
+ idt[i] = lg->syscall_idt;
+ else
+ default_idt_entry(&idt[i], i, def[i]);
+}
===================================================================
--- /dev/null
+++ b/drivers/lguest/io.c
@@ -0,0 +1,412 @@
+/* Simple I/O model for guests, based on shared memory.
+ * Copyright (C) 2006 Rusty Russell IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include <linux/types.h>
+#include <linux/futex.h>
+#include <linux/jhash.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/uaccess.h>
+#include "lg.h"
+
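+/* DMA buffers are registered against futex keys: two guests sharing a page
+ * derive the same key from it, so hashing the key finds the other end. */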
+static struct list_head dma_hash[61];
+
+void lguest_io_init(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(dma_hash); i++)
+ INIT_LIST_HEAD(&dma_hash[i]);
+}
+
+/* FIXME: allow multi-page lengths. */
+static int check_dma_list(struct lguest *lg, const struct lguest_dma *dma)
+{
+ unsigned int i;
+
+ for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
+ if (!dma->len[i])
+ return 1;
+ if (!lguest_address_ok(lg, dma->addr[i], dma->len[i]))
+ goto kill;
+ if (dma->len[i] > PAGE_SIZE)
+ goto kill;
+ /* We could do over a page, but is it worth it? */
+ if ((dma->addr[i] % PAGE_SIZE) + dma->len[i] > PAGE_SIZE)
+ goto kill;
+ }
+ return 1;
+
+kill:
+ kill_guest(lg, "bad DMA entry: %u@%#x", dma->len[i], dma->addr[i]);
+ return 0;
+}
+
+static unsigned int hash(const union futex_key *key)
+{
+ return jhash2((u32*)&key->both.word,
+ (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
+ key->both.offset)
+ % ARRAY_SIZE(dma_hash);
+}
+
+static inline int key_eq(const union futex_key *a, const union futex_key *b)
+{
+ return (a->both.word == b->both.word
+ && a->both.ptr == b->both.ptr
+ && a->both.offset == b->both.offset);
+}
+
+/* Must hold read lock on dmainfo owner's current->mm->mmap_sem */
+static void unlink_dma(struct lguest_dma_info *dmainfo)
+{
+ BUG_ON(!mutex_is_locked(&lguest_lock));
+ dmainfo->interrupt = 0;
+ list_del(&dmainfo->list);
+ drop_futex_key_refs(&dmainfo->key);
+}
+
+static int unbind_dma(struct lguest *lg,
+ const union futex_key *key,
+ unsigned long dmas)
+{
+ int i, ret = 0;
+
+ for (i = 0; i < LGUEST_MAX_DMA; i++) {
+ if (key_eq(key, &lg->dma[i].key) && dmas == lg->dma[i].dmas) {
+ unlink_dma(&lg->dma[i]);
+ ret = 1;
+ break;
+ }
+ }
+ return ret;
+}
+
+int bind_dma(struct lguest *lg,
+ unsigned long ukey, unsigned long dmas, u16 numdmas, u8 interrupt)
+{
+ unsigned int i;
+ int ret = 0;
+ union futex_key key;
+
+ if (interrupt >= LGUEST_IRQS)
+ return 0;
+
+ mutex_lock(&lguest_lock);
+ down_read(&current->mm->mmap_sem);
+ if (get_futex_key((u32 __user *)ukey, &key) != 0) {
+ kill_guest(lg, "bad dma key %#lx", ukey);
+ goto unlock;
+ }
+ get_futex_key_refs(&key);
+
+ if (interrupt == 0)
+ ret = unbind_dma(lg, &key, dmas);
+ else {
+ for (i = 0; i < LGUEST_MAX_DMA; i++) {
+ if (lg->dma[i].interrupt)
+ continue;
+
+ lg->dma[i].dmas = dmas;
+ lg->dma[i].num_dmas = numdmas;
+ lg->dma[i].next_dma = 0;
+ lg->dma[i].key = key;
+ lg->dma[i].guestid = lg->guestid;
+ lg->dma[i].interrupt = interrupt;
+ list_add(&lg->dma[i].list, &dma_hash[hash(&key)]);
+ ret = 1;
+ goto unlock;
+ }
+ }
+ drop_futex_key_refs(&key);
+unlock:
+ up_read(&current->mm->mmap_sem);
+ mutex_unlock(&lguest_lock);
+ return ret;
+}
+
+/* lgread from another guest */
+static int lgread_other(struct lguest *lg,
+ void *buf, u32 addr, unsigned bytes)
+{
+ if (!lguest_address_ok(lg, addr, bytes)
+ || access_process_vm(lg->tsk, addr, buf, bytes, 0) != bytes) {
+ memset(buf, 0, bytes);
+ kill_guest(lg, "bad address in registered DMA struct");
+ return 0;
+ }
+ return 1;
+}
+
+/* lgwrite to another guest */
+static int lgwrite_other(struct lguest *lg, u32 addr,
+ const void *buf, unsigned bytes)
+{
+ if (!lguest_address_ok(lg, addr, bytes)
+ || (access_process_vm(lg->tsk, addr, (void *)buf, bytes, 1)
+ != bytes)) {
+ kill_guest(lg, "bad address writing to registered DMA");
+ return 0;
+ }
+ return 1;
+}
+
+static u32 copy_data(struct lguest *srclg,
+ const struct lguest_dma *src,
+ const struct lguest_dma *dst,
+ struct page *pages[])
+{
+ unsigned int totlen, si, di, srcoff, dstoff;
+ void *maddr = NULL;
+
+ totlen = 0;
+ si = di = 0;
+ srcoff = dstoff = 0;
+ while (si < LGUEST_MAX_DMA_SECTIONS && src->len[si]
+ && di < LGUEST_MAX_DMA_SECTIONS && dst->len[di]) {
+ u32 len = min(src->len[si] - srcoff, dst->len[di] - dstoff);
+
+ if (!maddr)
+ maddr = kmap(pages[di]);
+
+ /* FIXME: This is not completely portable, since
+ archs do different things for copy_to_user_page. */
+ if (copy_from_user(maddr + (dst->addr[di] + dstoff)%PAGE_SIZE,
+ (void __user *)src->addr[si], len) != 0) {
+ kill_guest(srclg, "bad address in sending DMA");
+ totlen = 0;
+ break;
+ }
+
+ totlen += len;
+ srcoff += len;
+ dstoff += len;
+ if (srcoff == src->len[si]) {
+ si++;
+ srcoff = 0;
+ }
+ if (dstoff == dst->len[di]) {
+ kunmap(pages[di]);
+ maddr = NULL;
+ di++;
+ dstoff = 0;
+ }
+ }
+
+ if (maddr)
+ kunmap(pages[di]);
+
+ return totlen;
+}
+
+/* Src is us, ie. current. */
+static u32 do_dma(struct lguest *srclg, const struct lguest_dma *src,
+ struct lguest *dstlg, const struct lguest_dma *dst)
+{
+ int i;
+ u32 ret;
+ struct page *pages[LGUEST_MAX_DMA_SECTIONS];
+
+ if (!check_dma_list(dstlg, dst) || !check_dma_list(srclg, src))
+ return 0;
+
+ /* First get the destination pages */
+ for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
+ if (dst->len[i] == 0)
+ break;
+ if (get_user_pages(dstlg->tsk, dstlg->mm,
+ dst->addr[i], 1, 1, 1, pages+i, NULL)
+ != 1) {
+ kill_guest(dstlg, "Error mapping DMA pages");
+ ret = 0;
+ goto drop_pages;
+ }
+ }
+
+ /* Now copy until we run out of src or dst. */
+ ret = copy_data(srclg, src, dst, pages);
+
+drop_pages:
+ while (--i >= 0)
+ put_page(pages[i]);
+ return ret;
+}
+
+static int dma_transfer(struct lguest *srclg,
+ unsigned long udma,
+ struct lguest_dma_info *dst)
+{
+ struct lguest_dma dst_dma, src_dma;
+ struct lguest *dstlg;
+ u32 i, dma = 0;
+
+ dstlg = &lguests[dst->guestid];
+ /* Get our dma list. */
+ lgread(srclg, &src_dma, udma, sizeof(src_dma));
+
+ /* We can't deadlock against them dmaing to us, because this
+ * is all under the lguest_lock. */
+ down_read(&dstlg->mm->mmap_sem);
+
+ for (i = 0; i < dst->num_dmas; i++) {
+ dma = (dst->next_dma + i) % dst->num_dmas;
+ if (!lgread_other(dstlg, &dst_dma,
+ dst->dmas + dma * sizeof(struct lguest_dma),
+ sizeof(dst_dma))) {
+ goto fail;
+ }
+ if (!dst_dma.used_len)
+ break;
+ }
+ if (i != dst->num_dmas) {
+ unsigned long used_lenp;
+ unsigned int ret;
+
+ ret = do_dma(srclg, &src_dma, dstlg, &dst_dma);
+ /* Put used length in src. */
+ lgwrite_u32(srclg,
+ udma+offsetof(struct lguest_dma, used_len), ret);
+ if (ret == 0 && src_dma.len[0] != 0)
+ goto fail;
+
+ /* Make sure destination sees contents before length. */
+ wmb();
+ used_lenp = dst->dmas
+ + dma * sizeof(struct lguest_dma)
+ + offsetof(struct lguest_dma, used_len);
+ lgwrite_other(dstlg, used_lenp, &ret, sizeof(ret));
+ dst->next_dma++;
+ }
+ up_read(&dstlg->mm->mmap_sem);
+
+ /* Do this last so dst doesn't simply sleep on lock. */
+ set_bit(dst->interrupt, dstlg->irqs_pending);
+ set_wakeup_process(srclg, dstlg->tsk);
+ return i == dst->num_dmas;
+
+fail:
+ up_read(&dstlg->mm->mmap_sem);
+ return 0;
+}
+
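+/* The SEND_DMA hypercall: the key (a physical address) is turned into a
+ * futex key. If it lies in a shared mapping, another guest may have bound
+ * buffers to it, so we look it up in dma_hash and copy guest-to-guest. If
+ * the mapping is private only our own userspace can be listening, so we
+ * just record the pending DMA for run_guest() to hand up. */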
+void send_dma(struct lguest *lg, unsigned long ukey, unsigned long udma)
+{
+ union futex_key key;
+ int empty = 0;
+
+again:
+ mutex_lock(&lguest_lock);
+ down_read(¤t->mm->mmap_sem);
+ if (get_futex_key((u32 __user *)ukey, &key) != 0) {
+ kill_guest(lg, "bad sending DMA key");
+ goto unlock;
+ }
+ /* Shared mapping? Look for other guests... */
+ if (key.shared.offset & 1) {
+ struct lguest_dma_info *i;
+ list_for_each_entry(i, &dma_hash[hash(&key)], list) {
+ if (i->guestid == lg->guestid)
+ continue;
+ if (!key_eq(&key, &i->key))
+ continue;
+
+ empty += dma_transfer(lg, udma, i);
+ break;
+ }
+ if (empty == 1) {
+ /* Give any recipients one chance to restock. */
+ up_read(¤t->mm->mmap_sem);
+ mutex_unlock(&lguest_lock);
+ set_wakeup_process(lg, NULL);
+ empty++;
+ goto again;
+ }
+ } else {
+ /* Private mapping: tell our userspace. */
+ lg->dma_is_pending = 1;
+ lg->pending_dma = udma;
+ lg->pending_key = ukey;
+ }
+unlock:
+ up_read(¤t->mm->mmap_sem);
+ mutex_unlock(&lguest_lock);
+}
+
+void release_all_dma(struct lguest *lg)
+{
+ unsigned int i;
+
+ BUG_ON(!mutex_is_locked(&lguest_lock));
+
+ down_read(&lg->mm->mmap_sem);
+ for (i = 0; i < LGUEST_MAX_DMA; i++) {
+ if (lg->dma[i].interrupt)
+ unlink_dma(&lg->dma[i]);
+ }
+ up_read(&lg->mm->mmap_sem);
+}
+
+/* We cache one process to wakeup: helps for batching & wakes outside locks. */
+void set_wakeup_process(struct lguest *lg, struct task_struct *p)
+{
+ if (p == lg->wake)
+ return;
+
+ if (lg->wake) {
+ wake_up_process(lg->wake);
+ put_task_struct(lg->wake);
+ }
+ lg->wake = p;
+ if (lg->wake)
+ get_task_struct(lg->wake);
+}
+
+/* Userspace wants a dma buffer from this guest. */
+unsigned long get_dma_buffer(struct lguest *lg,
+ unsigned long ukey, unsigned long *interrupt)
+{
+ unsigned long ret = 0;
+ union futex_key key;
+ struct lguest_dma_info *i;
+
+ mutex_lock(&lguest_lock);
+ down_read(¤t->mm->mmap_sem);
+ if (get_futex_key((u32 __user *)ukey, &key) != 0) {
+ kill_guest(lg, "bad registered DMA buffer");
+ goto unlock;
+ }
+ list_for_each_entry(i, &dma_hash[hash(&key)], list) {
+ if (key_eq(&key, &i->key) && i->guestid == lg->guestid) {
+ unsigned int j;
+ for (j = 0; j < i->num_dmas; j++) {
+ struct lguest_dma dma;
+
+ ret = i->dmas + j * sizeof(struct lguest_dma);
+ lgread(lg, &dma, ret, sizeof(dma));
+ if (dma.used_len == 0)
+ break;
+ }
+ *interrupt = i->interrupt;
+ break;
+ }
+ }
+unlock:
+ up_read(¤t->mm->mmap_sem);
+ mutex_unlock(&lguest_lock);
+ return ret;
+}
+
===================================================================
--- /dev/null
+++ b/drivers/lguest/lg.h
@@ -0,0 +1,278 @@
+#ifndef _LGUEST_H
+#define _LGUEST_H
+
+#include <asm/desc.h>
+
+#define GDT_ENTRY_LGUEST_CS 10
+#define GDT_ENTRY_LGUEST_DS 11
+#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8)
+#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8)
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/stringify.h>
+#include <linux/binfmts.h>
+#include <linux/futex.h>
+#include <linux/lguest.h>
+#include <linux/lguest_launcher.h>
+#include <linux/err.h>
+#include <asm/semaphore.h>
+#include "irq_vectors.h"
+
+#define GUEST_PL 1
+
+struct lguest_regs
+{
+ /* Manually saved part. */
+ u32 ebx, ecx, edx;
+ u32 esi, edi, ebp;
+ u32 gs;
+ u32 eax;
+ u32 fs, ds, es;
+ u32 trapnum, errcode;
+ /* Trap pushed part */
+ u32 eip;
+ u32 cs;
+ u32 eflags;
+ u32 esp;
+ u32 ss;
+};
+
+void free_pagetables(void);
+int init_pagetables(struct page **switcher_page, unsigned int pages);
+
+/* Full 4G segment descriptors, suitable for CS and DS. */
+#define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00})
+#define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300})
+
+struct lguest_dma_info
+{
+ struct list_head list;
+ union futex_key key;
+ unsigned long dmas;
+ u16 next_dma;
+ u16 num_dmas;
+ u16 guestid;
+ u8 interrupt; /* 0 when not registered */
+};
+
+/* We have separate types for the guest's ptes & pgds and the shadow ptes &
+ * pgds. Since this host might use three-level pagetables and the guest and
+ * shadow pagetables don't, we can't use the normal pte_t/pgd_t. */
+typedef union {
+ struct { unsigned flags:12, pfn:20; };
+ struct { unsigned long val; } raw;
+} spgd_t;
+typedef union {
+ struct { unsigned flags:12, pfn:20; };
+ struct { unsigned long val; } raw;
+} spte_t;
+typedef union {
+ struct { unsigned flags:12, pfn:20; };
+ struct { unsigned long val; } raw;
+} gpgd_t;
+typedef union {
+ struct { unsigned flags:12, pfn:20; };
+ struct { unsigned long val; } raw;
+} gpte_t;
+#define mkgpte(_val) ((gpte_t){.raw.val = _val})
+#define mkgpgd(_val) ((gpgd_t){.raw.val = _val})
+
+struct pgdir
+{
+ unsigned long cr3;
+ spgd_t *pgdir;
+};
+
+/* FIXME: When this is defined in processor.h, delete this definition. */
+struct i386_hw_tss {
+ unsigned short back_link,__blh;
+ unsigned long esp0;
+ unsigned short ss0,__ss0h;
+ unsigned long esp1;
+ unsigned short ss1,__ss1h; /* ss1 is used to cache MSR_IA32_SYSENTER_CS */
+ unsigned long esp2;
+ unsigned short ss2,__ss2h;
+ unsigned long __cr3;
+ unsigned long eip;
+ unsigned long eflags;
+ unsigned long eax,ecx,edx,ebx;
+ unsigned long esp;
+ unsigned long ebp;
+ unsigned long esi;
+ unsigned long edi;
+ unsigned short es, __esh;
+ unsigned short cs, __csh;
+ unsigned short ss, __ssh;
+ unsigned short ds, __dsh;
+ unsigned short fs, __fsh;
+ unsigned short gs, __gsh;
+ unsigned short ldt, __ldth;
+ unsigned short trace, io_bitmap_base;
+};
+
+/* This is a guest-specific page (mapped ro) into the guest. */
+struct lguest_ro_state
+{
+ /* Host information we need to restore when we switch back. */
+ u32 host_cr3;
+ struct Xgt_desc_struct host_idt_desc;
+ struct Xgt_desc_struct host_gdt_desc;
+ u32 host_sp;
+
+ /* Fields which are used when guest is running. */
+ struct Xgt_desc_struct guest_idt_desc;
+ struct Xgt_desc_struct guest_gdt_desc;
+ struct i386_hw_tss guest_tss;
+ struct desc_struct guest_idt[IDT_ENTRIES];
+ struct desc_struct guest_gdt[GDT_ENTRIES];
+};
+
+/* We have two pages shared with guests, per cpu. */
+struct lguest_pages
+{
+ /* This is the stack page mapped rw in guest */
+ char spare[PAGE_SIZE - sizeof(struct lguest_regs)];
+ struct lguest_regs regs;
+
+ /* This is the host state & guest descriptor page, ro in guest */
+ struct lguest_ro_state state;
+} __attribute__((aligned(PAGE_SIZE)));
+
+#define CHANGED_IDT 1
+#define CHANGED_GDT 2
+#define CHANGED_ALL 3
+
+/* The private info the thread maintains about the guest. */
+struct lguest
+{
+ /* The regs live at the end of this page, which is mapped over
+ * the stack page of lguest_pages in the guest. */
+ unsigned long regs_page;
+ struct lguest_regs *regs;
+ struct lguest_data __user *lguest_data;
+ struct task_struct *tsk;
+ struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */
+ u16 guestid;
+ u32 pfn_limit;
+ u32 page_offset;
+ u32 cr2;
+ int halted;
+ int ts;
+ u32 last_timer;
+ u32 next_hcall;
+ u32 esp1;
+ u8 ss1;
+
+ /* Bitmap of what has changed: see CHANGED_* above. */
+ int changed;
+ struct lguest_pages *last_pages;
+
+ /* We keep a small number of these. */
+ u32 pgdidx;
+ struct pgdir pgdirs[4];
+
+ /* Cached wakeup: we hold a reference to this task. */
+ struct task_struct *wake;
+
+ unsigned long noirq_start, noirq_end;
+ int dma_is_pending;
+ unsigned long pending_dma; /* struct lguest_dma */
+ unsigned long pending_key; /* address they're sending to */
+
+ unsigned int stack_pages;
+
+ struct lguest_dma_info dma[LGUEST_MAX_DMA];
+
+ /* Dead? */
+ const char *dead;
+
+ /* The GDT entries copied into lguest_ro_state when running. */
+ struct desc_struct gdt[GDT_ENTRIES];
+
+ /* The IDT entries: some copied into lguest_ro_state when running. */
+ struct desc_struct idt[FIRST_EXTERNAL_VECTOR+LGUEST_IRQS];
+ struct desc_struct syscall_idt;
+
+ /* Pending virtual interrupts */
+ DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
+};
+
+extern struct lguest lguests[];
+extern struct mutex lguest_lock;
+
+/* core.c: */
+u32 lgread_u32(struct lguest *lg, u32 addr);
+void lgwrite_u32(struct lguest *lg, u32 addr, u32 val);
+void lgread(struct lguest *lg, void *buf, u32 addr, unsigned bytes);
+void lgwrite(struct lguest *lg, u32 addr, const void *buf, unsigned bytes);
+int find_free_guest(void);
+int lguest_address_ok(const struct lguest *lg,
+ unsigned long addr, unsigned long len);
+int run_guest(struct lguest *lg, char __user *user);
+
+
+/* interrupts_and_traps.c: */
+void maybe_do_interrupt(struct lguest *lg);
+int deliver_trap(struct lguest *lg, unsigned int num);
+void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 hi);
+void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages);
+void pin_trap_pages(struct lguest *lg);
+void setup_default_idt_entries(struct lguest_ro_state *state,
+ const unsigned long *def);
+void copy_traps(const struct lguest *lg, struct desc_struct *idt,
+ const unsigned long *def);
+
+/* segments.c: */
+void setup_default_gdt_entries(struct lguest_ro_state *state);
+void setup_guest_gdt(struct lguest *lg);
+void load_guest_gdt(struct lguest *lg, u32 table, u32 num);
+void guest_load_tls(struct lguest *lg,
+ const struct desc_struct __user *tls_array);
+void copy_gdt(const struct lguest *lg, struct desc_struct *gdt);
+
+/* page_tables.c: */
+int init_guest_pagetable(struct lguest *lg, unsigned long pgtable);
+void free_guest_pagetable(struct lguest *lg);
+void guest_new_pagetable(struct lguest *lg, unsigned long pgtable);
+void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 i);
+void guest_pagetable_clear_all(struct lguest *lg);
+void guest_pagetable_flush_user(struct lguest *lg);
+void guest_set_pte(struct lguest *lg, unsigned long cr3,
+ unsigned long vaddr, gpte_t val);
+void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages);
+int demand_page(struct lguest *lg, unsigned long vaddr, int write);
+void pin_page(struct lguest *lg, unsigned long vaddr);
+
+/* lguest_user.c: */
+int lguest_device_init(void);
+void lguest_device_remove(void);
+
+/* io.c: */
+void lguest_io_init(void);
+int bind_dma(struct lguest *lg,
+ unsigned long key, unsigned long udma, u16 numdmas, u8 interrupt);
+void send_dma(struct lguest *lg, unsigned long ukey, unsigned long udma);
+void release_all_dma(struct lguest *lg);
+unsigned long get_dma_buffer(struct lguest *lg, unsigned long key,
+ unsigned long *interrupt);
+void set_wakeup_process(struct lguest *lg, struct task_struct *p);
+
+/* hypercalls.c: */
+void do_hypercalls(struct lguest *lg);
+
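+/* Mark the guest dead with an explanation, which userspace picks up via
+ * read() on /dev/lguest. kasprintf() can fail here (GFP_ATOMIC), in which
+ * case we store ERR_PTR(-ENOMEM) and read() returns the errno instead. */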
+#define kill_guest(lg, fmt...) \
+do { \
+ if (!(lg)->dead) { \
+ (lg)->dead = kasprintf(GFP_ATOMIC, fmt); \
+ if (!(lg)->dead) \
+ (lg)->dead = ERR_PTR(-ENOMEM); \
+ } \
+} while(0)
+
+static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr)
+{
+ return vaddr - lg->page_offset;
+}
+#endif /* __ASSEMBLY__ */
+#endif /* _LGUEST_H */
===================================================================
--- /dev/null
+++ b/drivers/lguest/lguest_user.c
@@ -0,0 +1,192 @@
+/* Userspace control of the guest, via /dev/lguest. */
+#include <linux/uaccess.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include "lg.h"
+
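+/* The magic %edi/%ebp/%esi values below are, presumably, what the guest's
+ * early boot code looks for to recognize an lguest start; the exact
+ * handshake lives in the guest-side assembler, not in this file. */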
+static void setup_regs(struct lguest_regs *regs, unsigned long start)
+{
+ /* Write out stack in format lguest expects, so we can switch to it. */
+ regs->edi = LGUEST_MAGIC_EDI;
+ regs->ebp = LGUEST_MAGIC_EBP;
+ regs->esi = LGUEST_MAGIC_ESI;
+ regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL;
+ regs->cs = __KERNEL_CS|GUEST_PL;
+ regs->eflags = 0x202; /* Interrupts enabled. */
+ regs->eip = start;
+}
+
+/* + addr */
+static long user_get_dma(struct lguest *lg, const u32 __user *input)
+{
+ unsigned long key, udma, irq;
+
+ if (get_user(key, input) != 0)
+ return -EFAULT;
+ udma = get_dma_buffer(lg, key, &irq);
+ if (!udma)
+ return -ENOENT;
+
+ /* We put irq number in udma->used_len. */
+ lgwrite_u32(lg, udma + offsetof(struct lguest_dma, used_len), irq);
+ return udma;
+}
+
+/* + irq */
+static int user_send_irq(struct lguest *lg, const u32 __user *input)
+{
+ u32 irq;
+
+ if (get_user(irq, input) != 0)
+ return -EFAULT;
+ if (irq >= LGUEST_IRQS)
+ return -EINVAL;
+ set_bit(irq, lg->irqs_pending);
+ return 0;
+}
+
+static ssize_t read(struct file *file, char __user *user,
+ size_t size, loff_t *o)
+{
+ struct lguest *lg = file->private_data;
+
+ if (!lg)
+ return -EINVAL;
+
+ if (lg->dead) {
+ size_t len;
+
+ if (IS_ERR(lg->dead))
+ return PTR_ERR(lg->dead);
+
+ len = min(size, strlen(lg->dead)+1);
+ if (copy_to_user(user, lg->dead, len) != 0)
+ return -EFAULT;
+ return len;
+ }
+
+ if (lg->dma_is_pending)
+ lg->dma_is_pending = 0;
+
+ return run_guest(lg, user);
+}
+
+/* Take: pfnlimit, pgdir, start, pageoffset. */
+static int initialize(struct file *file, const u32 __user *input)
+{
+ struct lguest *lg;
+ int err, i;
+ u32 args[4];
+
+ if (file->private_data)
+ return -EBUSY;
+
+ if (copy_from_user(args, input, sizeof(args)) != 0)
+ return -EFAULT;
+
+ mutex_lock(&lguest_lock);
+ i = find_free_guest();
+ if (i < 0) {
+ err = -ENOSPC;
+ goto unlock;
+ }
+ lg = &lguests[i];
+ lg->guestid = i;
+ lg->pfn_limit = args[0];
+ lg->page_offset = args[3];
+ lg->regs_page = get_zeroed_page(GFP_KERNEL);
+ if (!lg->regs_page) {
+ err = -ENOMEM;
+ goto release_guest;
+ }
+ lg->regs = (void *)lg->regs_page + PAGE_SIZE - sizeof(*lg->regs);
+
+ err = init_guest_pagetable(lg, args[1]);
+ if (err)
+ goto free_regs;
+
+ setup_regs(lg->regs, args[2]);
+ setup_guest_gdt(lg);
+ lg->tsk = current;
+ lg->mm = get_task_mm(current);
+ lg->last_pages = NULL;
+ mutex_unlock(&lguest_lock);
+
+ file->private_data = lg;
+ return sizeof(args);
+
+free_regs:
+ free_page(lg->regs_page);
+release_guest:
+ memset(lg, 0, sizeof(*lg));
+unlock:
+ mutex_unlock(&lguest_lock);
+ return err;
+}
+
+static ssize_t write(struct file *file, const char __user *input,
+ size_t size, loff_t *off)
+{
+ struct lguest *lg = file->private_data;
+ u32 req;
+
+ if (get_user(req, input) != 0)
+ return -EFAULT;
+ input += sizeof(req);
+
+ if (req != LHREQ_INITIALIZE && !lg)
+ return -EINVAL;
+ if (lg && lg->dead)
+ return -ENOENT;
+
+ switch (req) {
+ case LHREQ_INITIALIZE:
+ return initialize(file, (const u32 __user *)input);
+ case LHREQ_GETDMA:
+ return user_get_dma(lg, (const u32 __user *)input);
+ case LHREQ_IRQ:
+ return user_send_irq(lg, (const u32 __user *)input);
+ default:
+ return -EINVAL;
+ }
+}
+
+static int close(struct inode *inode, struct file *file)
+{
+ struct lguest *lg = file->private_data;
+
+ if (!lg)
+ return 0;
+
+ mutex_lock(&lguest_lock);
+ release_all_dma(lg);
+ free_guest_pagetable(lg);
+ mmput(lg->mm);
+ if (!IS_ERR(lg->dead))
+ kfree(lg->dead);
+ free_page(lg->regs_page);
+ memset(lg, 0, sizeof(*lg));
+ mutex_unlock(&lguest_lock);
+ return 0;
+}
+
+static struct file_operations lguest_fops = {
+ .owner = THIS_MODULE,
+ .release = close,
+ .write = write,
+ .read = read,
+};
+static struct miscdevice lguest_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "lguest",
+ .fops = &lguest_fops,
+};
+
+int __init lguest_device_init(void)
+{
+ return misc_register(&lguest_dev);
+}
+
+void __exit lguest_device_remove(void)
+{
+ misc_deregister(&lguest_dev);
+}
===================================================================
--- /dev/null
+++ b/drivers/lguest/page_tables.c
@@ -0,0 +1,407 @@
+/* Shadow page table operations.
+ * Copyright (C) Rusty Russell IBM Corporation 2006.
+ * GPL v2 and any later version */
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/random.h>
+#include <linux/percpu.h>
+#include <asm/tlbflush.h>
+#include "lg.h"
+
+#define PTES_PER_PAGE_SHIFT 10
+#define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT)
+#define SWITCHER_PGD_INDEX (PTES_PER_PAGE - 1)
+
+static DEFINE_PER_CPU(spte_t *, switcher_pte_pages) = { NULL };
+#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
+
+static unsigned vaddr_to_pgd_index(unsigned long vaddr)
+{
+ return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
+}
+
+/* These access the shadow versions (ie. the ones used by the CPU). */
+static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
+{
+ unsigned int index = vaddr_to_pgd_index(vaddr);
+
+ if (index >= SWITCHER_PGD_INDEX) {
+ kill_guest(lg, "attempt to access switcher pages");
+ index = 0;
+ }
+ return &lg->pgdirs[i].pgdir[index];
+}
+
+static spte_t *spte_addr(struct lguest *lg, spgd_t spgd, unsigned long vaddr)
+{
+ spte_t *page = __va(spgd.pfn << PAGE_SHIFT);
+ BUG_ON(!(spgd.flags & _PAGE_PRESENT));
+ return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE];
+}
+
+/* These access the guest versions. */
+static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr)
+{
+ unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
+ return lg->pgdirs[lg->pgdidx].cr3 + index * sizeof(gpgd_t);
+}
+
+static unsigned long gpte_addr(struct lguest *lg,
+ gpgd_t gpgd, unsigned long vaddr)
+{
+ unsigned long gpage = gpgd.pfn << PAGE_SHIFT;
+ BUG_ON(!(gpgd.flags & _PAGE_PRESENT));
+ return gpage + ((vaddr>>PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(gpte_t);
+}
+
+/* Do a virtual -> physical mapping on a user page. */
+static unsigned long get_pfn(unsigned long virtpfn, int write)
+{
+ struct page *page;
+ unsigned long ret = -1UL;
+
+ down_read(¤t->mm->mmap_sem);
+ if (get_user_pages(current, current->mm, virtpfn << PAGE_SHIFT,
+ 1, write, 1, &page, NULL) == 1)
+ ret = page_to_pfn(page);
+ up_read(¤t->mm->mmap_sem);
+ return ret;
+}
+
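+/* Convert a guest pte into a shadow (ie. real) pte: a guest "physical"
+ * page number is really an offset into the launching process's memory, so
+ * we pin the underlying page with get_user_pages() and substitute the real
+ * pfn in the shadow entry. */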
+static spte_t gpte_to_spte(struct lguest *lg, gpte_t gpte, int write)
+{
+ spte_t spte;
+ unsigned long pfn;
+
+ /* We ignore the global flag. */
+ spte.flags = (gpte.flags & ~_PAGE_GLOBAL);
+ pfn = get_pfn(gpte.pfn, write);
+ if (pfn == -1UL) {
+ kill_guest(lg, "failed to get page %u", gpte.pfn);
+ /* Must not put_page() bogus page on cleanup. */
+ spte.flags = 0;
+ }
+ spte.pfn = pfn;
+ return spte;
+}
+
+static void release_pte(spte_t pte)
+{
+ if (pte.flags & _PAGE_PRESENT)
+ put_page(pfn_to_page(pte.pfn));
+}
+
+static void check_gpte(struct lguest *lg, gpte_t gpte)
+{
+ if ((gpte.flags & (_PAGE_PWT|_PAGE_PSE)) || gpte.pfn >= lg->pfn_limit)
+ kill_guest(lg, "bad page table entry");
+}
+
+static void check_gpgd(struct lguest *lg, gpgd_t gpgd)
+{
+ if ((gpgd.flags & ~_PAGE_TABLE) || gpgd.pfn >= lg->pfn_limit)
+ kill_guest(lg, "bad page directory entry");
+}
+
+/* FIXME: We hold references to pages, which prevents them from being
+ swapped. It'd be nice to have a callback when Linux wants to swap out. */
+
+/* We fault pages in, which allows us to update accessed/dirty bits.
+ * Return true if we got the page. */
+int demand_page(struct lguest *lg, unsigned long vaddr, int write)
+{
+ gpgd_t gpgd;
+ spgd_t *spgd;
+ unsigned long gpte_ptr;
+ gpte_t gpte;
+ spte_t *spte;
+
+ gpgd = mkgpgd(lgread_u32(lg, gpgd_addr(lg, vaddr)));
+ if (!(gpgd.flags & _PAGE_PRESENT))
+ return 0;
+
+ spgd = spgd_addr(lg, lg->pgdidx, vaddr);
+ if (!(spgd->flags & _PAGE_PRESENT)) {
+ /* Get a page of PTEs for them. */
+ unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
+ /* FIXME: Steal from self in this case? */
+ if (!ptepage) {
+ kill_guest(lg, "out of memory allocating pte page");
+ return 0;
+ }
+ check_gpgd(lg, gpgd);
+ spgd->raw.val = (__pa(ptepage) | gpgd.flags);
+ }
+
+ gpte_ptr = gpte_addr(lg, gpgd, vaddr);
+ gpte = mkgpte(lgread_u32(lg, gpte_ptr));
+
+ /* No page? */
+ if (!(gpte.flags & _PAGE_PRESENT))
+ return 0;
+
+ /* Write to read-only page? */
+ if (write && !(gpte.flags & _PAGE_RW))
+ return 0;
+
+ check_gpte(lg, gpte);
+ gpte.flags |= _PAGE_ACCESSED;
+ if (write)
+ gpte.flags |= _PAGE_DIRTY;
+
+ /* We're done with the old pte. */
+ spte = spte_addr(lg, *spgd, vaddr);
+ release_pte(*spte);
+
+ /* We don't make it writable if this isn't a write: later
+ * write will fault so we can set dirty bit in guest. */
+ if (gpte.flags & _PAGE_DIRTY)
+ *spte = gpte_to_spte(lg, gpte, 1);
+ else {
+ gpte_t ro_gpte = gpte;
+ ro_gpte.flags &= ~_PAGE_RW;
+ *spte = gpte_to_spte(lg, ro_gpte, 0);
+ }
+
+ /* Now we update dirty/accessed on guest. */
+ lgwrite_u32(lg, gpte_ptr, gpte.raw.val);
+ return 1;
+}
+
+/* This is much faster than the full demand_page logic. */
+static int page_writable(struct lguest *lg, unsigned long vaddr)
+{
+ spgd_t *spgd;
+ unsigned long flags;
+
+ spgd = spgd_addr(lg, lg->pgdidx, vaddr);
+ if (!(spgd->flags & _PAGE_PRESENT))
+ return 0;
+
+ flags = spte_addr(lg, *spgd, vaddr)->flags;
+ return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
+}
+
+void pin_page(struct lguest *lg, unsigned long vaddr)
+{
+ if (!page_writable(lg, vaddr) && !demand_page(lg, vaddr, 0))
+ kill_guest(lg, "bad trap page %#lx", vaddr);
+}
+
+static void release_pgd(struct lguest *lg, spgd_t *spgd)
+{
+ if (spgd->flags & _PAGE_PRESENT) {
+ unsigned int i;
+ spte_t *ptepage = __va(spgd->pfn << PAGE_SHIFT);
+ for (i = 0; i < PTES_PER_PAGE; i++)
+ release_pte(ptepage[i]);
+ free_page((long)ptepage);
+ spgd->raw.val = 0;
+ }
+}
+
+static void flush_user_mappings(struct lguest *lg, int idx)
+{
+ unsigned int i;
+ for (i = 0; i < vaddr_to_pgd_index(lg->page_offset); i++)
+ release_pgd(lg, lg->pgdirs[idx].pgdir + i);
+}
+
+void guest_pagetable_flush_user(struct lguest *lg)
+{
+ flush_user_mappings(lg, lg->pgdidx);
+}
+
+static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
+{
+ unsigned int i;
+ for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
+ if (lg->pgdirs[i].cr3 == pgtable)
+ break;
+ return i;
+}
+
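+/* We only shadow a handful of the guest's pagetables: if this cr3 isn't one
+ * of them, we evict a slot at random rather than track LRU. A brand new
+ * (blank) shadow contains no mappings at all, so the caller must re-pin the
+ * trap pages afterwards. */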
+static unsigned int new_pgdir(struct lguest *lg,
+ unsigned long cr3,
+ int *blank_pgdir)
+{
+ unsigned int next;
+
+ next = random32() % ARRAY_SIZE(lg->pgdirs);
+ if (!lg->pgdirs[next].pgdir) {
+ lg->pgdirs[next].pgdir = (spgd_t *)get_zeroed_page(GFP_KERNEL);
+ if (!lg->pgdirs[next].pgdir)
+ next = lg->pgdidx;
+ else
+ /* There are no mappings: you'll need to re-pin */
+ *blank_pgdir = 1;
+ }
+ lg->pgdirs[next].cr3 = cr3;
+ /* Release all the non-kernel mappings. */
+ flush_user_mappings(lg, next);
+
+ return next;
+}
+
+void guest_new_pagetable(struct lguest *lg, unsigned long pgtable)
+{
+ int newpgdir, repin = 0;
+
+ newpgdir = find_pgdir(lg, pgtable);
+ if (newpgdir == ARRAY_SIZE(lg->pgdirs))
+ newpgdir = new_pgdir(lg, pgtable, &repin);
+ lg->pgdidx = newpgdir;
+ if (repin)
+ pin_trap_pages(lg);
+}
+
+static void release_all_pagetables(struct lguest *lg)
+{
+ unsigned int i, j;
+
+ for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
+ if (lg->pgdirs[i].pgdir)
+ for (j = 0; j < SWITCHER_PGD_INDEX; j++)
+ release_pgd(lg, lg->pgdirs[i].pgdir + j);
+}
+
+void guest_pagetable_clear_all(struct lguest *lg)
+{
+ release_all_pagetables(lg);
+ pin_trap_pages(lg);
+}
+
+static void do_set_pte(struct lguest *lg, int idx,
+ unsigned long vaddr, gpte_t gpte)
+{
+ spgd_t *spgd = spgd_addr(lg, idx, vaddr);
+ if (spgd->flags & _PAGE_PRESENT) {
+ spte_t *spte = spte_addr(lg, *spgd, vaddr);
+ release_pte(*spte);
+ if (gpte.flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
+ check_gpte(lg, gpte);
+ *spte = gpte_to_spte(lg, gpte, gpte.flags&_PAGE_DIRTY);
+ } else
+ spte->raw.val = 0;
+ }
+}
+
+void guest_set_pte(struct lguest *lg,
+ unsigned long cr3, unsigned long vaddr, gpte_t gpte)
+{
+ /* Kernel mappings must be changed on all top levels. */
+ if (vaddr >= lg->page_offset) {
+ unsigned int i;
+ for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
+ if (lg->pgdirs[i].pgdir)
+ do_set_pte(lg, i, vaddr, gpte);
+ } else {
+ int pgdir = find_pgdir(lg, cr3);
+ if (pgdir != ARRAY_SIZE(lg->pgdirs))
+ do_set_pte(lg, pgdir, vaddr, gpte);
+ }
+}
+
+void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx)
+{
+ int pgdir;
+
+ if (idx >= SWITCHER_PGD_INDEX)
+ return;
+
+ pgdir = find_pgdir(lg, cr3);
+ if (pgdir < ARRAY_SIZE(lg->pgdirs))
+ release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx);
+}
+
+int init_guest_pagetable(struct lguest *lg, unsigned long pgtable)
+{
+ /* We assume this in flush_user_mappings, so check now */
+ if (vaddr_to_pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX)
+ return -EINVAL;
+ lg->pgdidx = 0;
+ lg->pgdirs[lg->pgdidx].cr3 = pgtable;
+ lg->pgdirs[lg->pgdidx].pgdir = (spgd_t*)get_zeroed_page(GFP_KERNEL);
+ if (!lg->pgdirs[lg->pgdidx].pgdir)
+ return -ENOMEM;
+ return 0;
+}
+
+void free_guest_pagetable(struct lguest *lg)
+{
+ unsigned int i;
+
+ release_all_pagetables(lg);
+ for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
+ free_page((long)lg->pgdirs[i].pgdir);
+}
+
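+/* Point the top pgd entry of the current shadow pagetable at this CPU's
+ * switcher pte page, and map this guest's own register page where the
+ * switcher expects its stack page, so the switch code and state are visible
+ * at the same high address inside the guest. */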
+/* Caller must be preempt-safe */
+void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)
+{
+ spte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
+ spgd_t switcher_pgd;
+ spte_t regs_pte;
+
+ /* Since the switcher is less than 4MB, we simply mug the top pte page. */
+ switcher_pgd.pfn = __pa(switcher_pte_page) >> PAGE_SHIFT;
+ switcher_pgd.flags = _PAGE_KERNEL;
+ lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
+
+ /* Map our regs page over stack page. */
+ regs_pte.pfn = __pa(lg->regs_page) >> PAGE_SHIFT;
+ regs_pte.flags = _PAGE_KERNEL;
+ switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTES_PER_PAGE]
+ = regs_pte;
+}
+
+static void free_switcher_pte_pages(void)
+{
+ unsigned int i;
+
+ for_each_possible_cpu(i)
+ free_page((long)switcher_pte_page(i));
+}
+
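+/* Build one CPU's switcher pte page: the switcher text pages are mapped
+ * read-only for everyone, then that CPU's two lguest_pages follow (regs
+ * writable, state read-only), so a guest never sees another CPU's pages. */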
+static __init void populate_switcher_pte_page(unsigned int cpu,
+ struct page *switcher_page[],
+ unsigned int pages)
+{
+ unsigned int i;
+ spte_t *pte = switcher_pte_page(cpu);
+
+ for (i = 0; i < pages; i++) {
+ pte[i].pfn = page_to_pfn(switcher_page[i]);
+ pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED;
+ }
+
+ /* We only map this CPU's pages, so guest can't see others. */
+ i = pages + cpu*2;
+
+ /* First page (regs) is rw, second (state) is ro. */
+ pte[i].pfn = page_to_pfn(switcher_page[i]);
+ pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW;
+ pte[i+1].pfn = page_to_pfn(switcher_page[i+1]);
+ pte[i+1].flags = _PAGE_PRESENT|_PAGE_ACCESSED;
+}
+
+__init int init_pagetables(struct page **switcher_page, unsigned int pages)
+{
+ unsigned int i;
+
+ for_each_possible_cpu(i) {
+ switcher_pte_page(i) = (spte_t *)get_zeroed_page(GFP_KERNEL);
+ if (!switcher_pte_page(i)) {
+ free_switcher_pte_pages();
+ return -ENOMEM;
+ }
+ populate_switcher_pte_page(i, switcher_page, pages);
+ }
+ return 0;
+}
+
+void free_pagetables(void)
+{
+ free_switcher_pte_pages();
+}
===================================================================
--- /dev/null
+++ b/drivers/lguest/segments.c
@@ -0,0 +1,115 @@
+#include "lg.h"
+
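+/* Check a descriptor the guest gave us: 0x00200000 is the must-be-zero bit,
+ * 0x00008000 is the present bit and 0x00001000 distinguishes a normal
+ * code/data segment from a system segment. */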
+static int desc_ok(const struct desc_struct *gdt)
+{
+ /* MBZ=0, P=1, DT=1 */
+ return ((gdt->b & 0x00209000) == 0x00009000);
+}
+
+static int segment_present(const struct desc_struct *gdt)
+{
+ return gdt->b & 0x8000;
+}
+
+static int ignored_gdt(unsigned int num)
+{
+ return (num == GDT_ENTRY_TSS
+ || num == GDT_ENTRY_LGUEST_CS
+ || num == GDT_ENTRY_LGUEST_DS
+ || num == GDT_ENTRY_DOUBLEFAULT_TSS);
+}
+
+/* We don't allow removal of CS, DS or SS; it doesn't make sense. */
+static void check_segment_use(struct lguest *lg, unsigned int desc)
+{
+ if (lg->regs->gs / 8 == desc)
+ lg->regs->gs = 0;
+ if (lg->regs->fs / 8 == desc)
+ lg->regs->fs = 0;
+ if (lg->regs->es / 8 == desc)
+ lg->regs->es = 0;
+ if (lg->regs->ds / 8 == desc
+ || lg->regs->cs / 8 == desc
+ || lg->regs->ss / 8 == desc)
+ kill_guest(lg, "Removed live GDT entry %u", desc);
+}
+
+static void fixup_gdt_table(struct lguest *lg)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(lg->gdt); i++) {
+ /* We never copy these ones to real gdt */
+ if (ignored_gdt(i))
+ continue;
+
+ /* We could fault in switch_to_guest if they are using
+ * a removed segment. */
+ if (!segment_present(&lg->gdt[i])) {
+ check_segment_use(lg, i);
+ continue;
+ }
+
+ if (!desc_ok(&lg->gdt[i]))
+ kill_guest(lg, "Bad GDT descriptor %i", i);
+
+ /* DPL 0 presumably means "for use by guest". */
+ if ((lg->gdt[i].b & 0x00006000) == 0)
+ lg->gdt[i].b |= (GUEST_PL << 13);
+
+ /* Set accessed bit, since gdt isn't writable. */
+ lg->gdt[i].b |= 0x00000100;
+ }
+}
+
+void setup_default_gdt_entries(struct lguest_ro_state *state)
+{
+ struct desc_struct *gdt = state->guest_gdt;
+ unsigned long tss = (unsigned long)&state->guest_tss;
+
+ /* Hypervisor segments. */
+ gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
+ gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
+
+ /* This is the one which we *cannot* copy from the guest, since the
+ tss depends on this lguest_ro_state, ie. this cpu. */
+ gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16);
+ gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000)
+ | ((tss >> 16) & 0x000000FF);
+}
+
+void setup_guest_gdt(struct lguest *lg)
+{
+ lg->gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
+ lg->gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
+ lg->gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13);
+ lg->gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13);
+}
+
+void copy_gdt(const struct lguest *lg, struct desc_struct *gdt)
+{
+ unsigned int i;
+
+ for (i = 0; i < GDT_ENTRIES; i++)
+ if (!ignored_gdt(i))
+ gdt[i] = lg->gdt[i];
+}
+
+void load_guest_gdt(struct lguest *lg, u32 table, u32 num)
+{
+ if (num > ARRAY_SIZE(lg->gdt))
+ kill_guest(lg, "too many gdt entries %i", num);
+
+ lgread(lg, lg->gdt, table, num * sizeof(lg->gdt[0]));
+ fixup_gdt_table(lg);
+ lg->changed |= CHANGED_GDT;
+}
+
+void guest_load_tls(struct lguest *lg, const struct desc_struct __user *gtls)
+{
+ struct desc_struct *tls = &lg->gdt[GDT_ENTRY_TLS_MIN];
+
+ lgread(lg, tls, (u32)gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
+ fixup_gdt_table(lg);
+ lg->changed |= CHANGED_GDT;
+}
===================================================================
--- /dev/null
+++ b/drivers/lguest/switcher.S
@@ -0,0 +1,159 @@
+/* This code sits at 0xFFC00000 to do the low-level guest<->host switch.
+
+ There are two pages above us for this CPU (struct lguest_pages).
+ The second page (struct lguest_ro_state) becomes read-only after the
+ context switch. The first page (the stack for traps) remains writable,
+ but while we're in here, the guest cannot be running.
+*/
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+#include "lg.h"
+
+.text
+ENTRY(start_switcher_text)
+
+/* %eax points to lguest pages for this CPU. %ebx contains cr3 value.
+ All normal registers can be clobbered! */
+ENTRY(switch_to_guest)
+ /* Save host segments on host stack. */
+ pushl %es
+ pushl %ds
+ pushl %gs
+ pushl %fs
+ /* With CONFIG_FRAME_POINTER, gcc doesn't let us clobber this! */
+ pushl %ebp
+ /* Save host stack. */
+ movl %esp, LGUEST_PAGES_host_sp(%eax)
+ /* Switch to guest stack: if we get NMI we expect to be there. */
+ movl %eax, %edx
+ addl $LGUEST_PAGES_regs, %edx
+ movl %edx, %esp
+ /* Switch to guest's GDT, IDT. */
+ lgdt LGUEST_PAGES_guest_gdt_desc(%eax)
+ lidt LGUEST_PAGES_guest_idt_desc(%eax)
+ /* Switch to guest's TSS while GDT still writable. */
+ movl $(GDT_ENTRY_TSS*8), %edx
+ ltr %dx
+ /* Set host's TSS GDT entry to available (clear byte 5 bit 2). */
+ movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx
+ andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx)
+ /* Switch to guest page tables: lguest_pages->state now read-only. */
+ movl %ebx, %cr3
+ /* Restore guest regs */
+ popl %ebx
+ popl %ecx
+ popl %edx
+ popl %esi
+ popl %edi
+ popl %ebp
+ popl %gs
+ popl %eax
+ popl %fs
+ popl %ds
+ popl %es
+ /* Skip error code and trap number */
+ addl $8, %esp
+ iret
+
+#define SWITCH_TO_HOST \
+ /* Save guest state */ \
+ pushl %es; \
+ pushl %ds; \
+ pushl %fs; \
+ pushl %eax; \
+ pushl %gs; \
+ pushl %ebp; \
+ pushl %edi; \
+ pushl %esi; \
+ pushl %edx; \
+ pushl %ecx; \
+ pushl %ebx; \
+ /* Load lguest ds segment for convenience. */ \
+ movl $(LGUEST_DS), %eax; \
+ movl %eax, %ds; \
+ /* Figure out where we are, based on stack (at top of regs). */ \
+ movl %esp, %eax; \
+ subl $LGUEST_PAGES_regs, %eax; \
+ /* Put trap number in %ebx before we switch cr3 and lose it. */ \
+ movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \
+ /* Switch to host page tables (host GDT, IDT and stack are in host \
+ mem, so need this first) */ \
+ movl LGUEST_PAGES_host_cr3(%eax), %edx; \
+ movl %edx, %cr3; \
+ /* Set guest's TSS to available (clear byte 5 bit 2). */ \
+ andb $0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \
+ /* Switch to host's GDT & IDT. */ \
+ lgdt LGUEST_PAGES_host_gdt_desc(%eax); \
+ lidt LGUEST_PAGES_host_idt_desc(%eax); \
+ /* Switch to host's stack. */ \
+ movl LGUEST_PAGES_host_sp(%eax), %esp; \
+ /* Switch to host's TSS */ \
+ movl $(GDT_ENTRY_TSS*8), %edx; \
+ ltr %dx; \
+ popl %ebp; \
+ popl %fs; \
+ popl %gs; \
+ popl %ds; \
+ popl %es
+
+/* Return to run_guest_once. */
+return_to_host:
+ SWITCH_TO_HOST
+ iret
+
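+/* An interrupt the host needs to see: %ebx holds the trap number, and we
+ * assemble the handler address by hand from the host IDT gate below, since
+ * offset bits 0-15 sit in the gate's first word and bits 16-31 in the top
+ * half of its second. */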
+deliver_to_host:
+ SWITCH_TO_HOST
+ /* Decode IDT and jump to host's irq handler. When that does iret, it
+ * will return to run_guest_once. This is a feature. */
+ movl (LGUEST_PAGES_host_idt_desc+2)(%eax), %edx
+ leal (%edx,%ebx,8), %eax
+ movzwl (%eax),%edx
+ movl 4(%eax), %eax
+ xorw %ax, %ax
+ orl %eax, %edx
+ jmp *%edx
+
+/* Real hardware interrupts are delivered straight to the host. Others
+ cause us to return to run_guest_once so it can decide what to do. Note
+ that some of these are overridden by the guest to deliver directly, and
+ never enter here (see load_guest_idt_entry). */
+.macro IRQ_STUB N TARGET
+ .data; .long 1f; .text; 1:
+ /* Make an error number for most traps, which don't have one. */
+ .if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17)
+ pushl $0
+ .endif
+ pushl $\N
+ jmp \TARGET
+ ALIGN
+.endm
+
+.macro IRQ_STUBS FIRST LAST TARGET
+ irq=\FIRST
+ .rept \LAST-\FIRST+1
+ IRQ_STUB irq \TARGET
+ irq=irq+1
+ .endr
+.endm
+
+/* We intercept every interrupt, because we may need to switch back to
+ * host. Unfortunately we can't tell them apart except by entry
+ * point, so we need 256 entry points.
+ */
+.data
+.global default_idt_entries
+default_idt_entries:
+.text
+ IRQ_STUBS 0 1 return_to_host /* First two traps */
+ IRQ_STUB 2 handle_nmi /* NMI */
+ IRQ_STUBS 3 31 return_to_host /* Rest of traps */
+ IRQ_STUBS 32 127 deliver_to_host /* Real interrupts */
+ IRQ_STUB 128 return_to_host /* System call (overridden) */
+ IRQ_STUBS 129 255 deliver_to_host /* Other real interrupts */
+
+/* We ignore NMI and return. */
+handle_nmi:
+ addl $8, %esp
+ iret
+
+ENTRY(end_switcher_text)
===================================================================
--- /dev/null
+++ b/include/linux/lguest_launcher.h
@@ -0,0 +1,80 @@
+#ifndef _ASM_LGUEST_USER
+#define _ASM_LGUEST_USER
+/* Everything the "lguest" userspace program needs to know. */
+/* They can register up to 32 arrays of lguest_dma. */
+#define LGUEST_MAX_DMA 32
+/* At most we can dma 16 lguest_dma in one op. */
+#define LGUEST_MAX_DMA_SECTIONS 16
+
+/* How many devices? Assume each device wants up to two dma arrays. */
+#define LGUEST_MAX_DEVICES (LGUEST_MAX_DMA/2)
+
+struct lguest_dma
+{
+ /* 0 if free to be used, filled by hypervisor. */
+ u32 used_len;
+ u32 addr[LGUEST_MAX_DMA_SECTIONS];
+ u16 len[LGUEST_MAX_DMA_SECTIONS];
+};
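+
+/* For illustration only (the console driver later in this series does
+ * exactly this): to send one buffer, fill in a single section and
+ * terminate the list with a zero length:
+ *
+ * struct lguest_dma dma;
+ * dma.addr[0] = __pa(buf);
+ * dma.len[0] = buflen;
+ * dma.len[1] = 0;
+ * hcall(LHCALL_SEND_DMA, key, __pa(&dma), 0);
+ *
+ * The other end fills in used_len with the bytes transferred. */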
+
+/* This is found at address 0. */
+struct lguest_boot_info
+{
+ u32 max_pfn;
+ u32 initrd_size;
+ char cmdline[256];
+};
+
+struct lguest_block_page
+{
+ /* 0 is a read, 1 is a write. */
+ int type;
+ u32 sector; /* Offset in device = sector * 512. */
+ u32 bytes; /* Length expected to be read/written in bytes */
+ /* 0 = pending, 1 = done, 2 = done, error */
+ int result;
+ u32 num_sectors; /* Disk length = num_sectors * 512 */
+};
+
+/* There is a shared page of these. */
+struct lguest_net
+{
+ /* Simply the mac address (with multicast bit meaning promisc). */
+ unsigned char mac[6];
+};
+
+/* Where the Host expects the Guest to SEND_DMA console output to. */
+#define LGUEST_CONSOLE_DMA_KEY 0
+
+/* We have a page of these descriptors in the lguest_device page. */
+struct lguest_device_desc {
+ u16 type;
+#define LGUEST_DEVICE_T_CONSOLE 1
+#define LGUEST_DEVICE_T_NET 2
+#define LGUEST_DEVICE_T_BLOCK 3
+
+ u16 features;
+#define LGUEST_NET_F_NOCSUM 0x4000 /* Don't bother checksumming */
+#define LGUEST_DEVICE_F_RANDOMNESS 0x8000 /* IRQ is fairly random */
+
+ u16 status;
+/* 256 and above are device specific. */
+#define LGUEST_DEVICE_S_ACKNOWLEDGE 1 /* We have seen device. */
+#define LGUEST_DEVICE_S_DRIVER 2 /* We have found a driver */
+#define LGUEST_DEVICE_S_DRIVER_OK 4 /* Driver says OK! */
+#define LGUEST_DEVICE_S_REMOVED 8 /* Device has gone away. */
+#define LGUEST_DEVICE_S_REMOVED_ACK 16 /* Driver has been told. */
+#define LGUEST_DEVICE_S_FAILED 128 /* Something actually failed */
+
+ u16 num_pages;
+ u32 pfn;
+};
+
+/* The first word of a write to /dev/lguest is one of these requests. */
+enum lguest_req
+{
+ LHREQ_INITIALIZE, /* + pfnlimit, pgdir, start, pageoffset */
+ LHREQ_GETDMA, /* + addr (returns &lguest_dma, irq in ->used_len) */
+ LHREQ_IRQ, /* + irq */
+};
+#endif /* _ASM_LGUEST_USER */
* [PATCH 3/8] lguest: the asm offsets
[not found] ` <1176203176.26372.25.camel@localhost.localdomain>
@ 2007-04-10 11:07 ` Rusty Russell
[not found] ` <1176203231.26372.27.camel@localhost.localdomain>
1 sibling, 0 replies; 13+ messages in thread
From: Rusty Russell @ 2007-04-10 11:07 UTC (permalink / raw)
To: Andrew Morton; +Cc: Andi Kleen, lkml - Kernel Mailing List, virtualization
These are the structure offsets required by lg.ko's switcher.S.
Unfortunately we don't have infrastructure for private asm-offsets
creation.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
arch/i386/kernel/asm-offsets.c | 19 +++++++++++++++++++
1 file changed, 19 insertions(+)
===================================================================
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -16,6 +16,10 @@
#include <asm/thread_info.h>
#include <asm/elf.h>
#include <asm/pda.h>
+#ifdef CONFIG_LGUEST_GUEST
+#include <linux/lguest.h>
+#include "../../../drivers/lguest/lg.h"
+#endif
#define DEFINE(sym, val) \
asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -114,4 +118,19 @@ void foo(void)
OFFSET(PARAVIRT_iret, paravirt_ops, iret);
OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
#endif
+
+#ifdef CONFIG_LGUEST_GUEST
+ BLANK();
+ OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
+ OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
+ OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
+ OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
+ OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp);
+ OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc);
+ OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc);
+ OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt);
+ OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum);
+ OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
+ OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
+#endif
}
* [PATCH 4/8] lguest: the Makefile and Kconfig
[not found] ` <1176203231.26372.27.camel@localhost.localdomain>
@ 2007-04-10 11:08 ` Rusty Russell
2007-04-10 11:08 ` [PATCH 5/8] lguest: the console driver Rusty Russell
0 siblings, 1 reply; 13+ messages in thread
From: Rusty Russell @ 2007-04-10 11:08 UTC (permalink / raw)
To: Andrew Morton; +Cc: Andi Kleen, lkml - Kernel Mailing List, virtualization
This is the Kconfig and Makefile to allow lguest to actually be
compiled.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
drivers/Kconfig | 2 ++
drivers/Makefile | 1 +
drivers/lguest/Kconfig | 20 ++++++++++++++++++++
drivers/lguest/Makefile | 7 +++++++
4 files changed, 30 insertions(+)
===================================================================
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -87,4 +87,6 @@ source "drivers/kvm/Kconfig"
source "drivers/kvm/Kconfig"
source "drivers/uio/Kconfig"
+
+source "drivers/lguest/Kconfig"
endmenu
===================================================================
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -86,3 +86,4 @@ obj-$(CONFIG_HID) += hid/
obj-$(CONFIG_HID) += hid/
obj-$(CONFIG_PPC_PS3) += ps3/
obj-$(CONFIG_SSB) += ssb/
+obj-$(CONFIG_LGUEST_GUEST) += lguest/
===================================================================
--- /dev/null
+++ b/drivers/lguest/Kconfig
@@ -0,0 +1,20 @@
+config LGUEST
+ tristate "Linux hypervisor example code"
+ depends on X86 && PARAVIRT && NET && EXPERIMENTAL && !X86_PAE
+ select LGUEST_GUEST
+ select HVC_DRIVER
+ ---help---
+ This is a very simple module which allows you to run
+ multiple instances of the same Linux kernel, using the
+ "lguest" command found in the Documentation/lguest directory.
+ Note that "lguest" is pronounced to rhyme with "fell quest",
+ not "rustyvisor". See Documentation/lguest/lguest.txt.
+
+ If unsure, say N. If curious, say M. If masochistic, say Y.
+
+config LGUEST_GUEST
+ bool
+ help
+ The guest needs code built-in, even if the host has lguest
+ support as a module. The drivers are tiny, so we build them
+ in too.
===================================================================
--- /dev/null
+++ b/drivers/lguest/Makefile
@@ -0,0 +1,7 @@
+# Guest requires the paravirt_ops replacement and the bus driver.
+obj-$(CONFIG_LGUEST_GUEST) += lguest.o lguest_asm.o lguest_bus.o
+
+# Host requires the other files, which can be a module.
+obj-$(CONFIG_LGUEST) += lg.o
+lg-objs := core.o hypercalls.o page_tables.o interrupts_and_traps.o \
+ segments.o io.o lguest_user.o switcher.o
* [PATCH 5/8] lguest: the console driver
2007-04-10 11:08 ` [PATCH 4/8] lguest: the Makefile and Kconfig Rusty Russell
@ 2007-04-10 11:08 ` Rusty Russell
2007-04-10 11:09 ` [PATCH 5/8] lguest: the net driver Rusty Russell
` (2 more replies)
0 siblings, 3 replies; 13+ messages in thread
From: Rusty Russell @ 2007-04-10 11:08 UTC (permalink / raw)
To: Andrew Morton; +Cc: Andi Kleen, lkml - Kernel Mailing List, virtualization
A simple console driver for lguest.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
drivers/char/Makefile | 1
drivers/char/hvc_lguest.c | 99 +++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 100 insertions(+)
===================================================================
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -44,6 +44,7 @@ obj-$(CONFIG_SX) += sx.o generic_serial
obj-$(CONFIG_SX) += sx.o generic_serial.o
obj-$(CONFIG_RIO) += rio/ generic_serial.o
obj-$(CONFIG_HVC_CONSOLE) += hvc_vio.o hvsi.o
+obj-$(CONFIG_LGUEST_GUEST) += hvc_lguest.o
obj-$(CONFIG_HVC_ISERIES) += hvc_iseries.o
obj-$(CONFIG_HVC_RTAS) += hvc_rtas.o
obj-$(CONFIG_HVC_BEAT) += hvc_beat.o
===================================================================
--- /dev/null
+++ b/drivers/char/hvc_lguest.c
@@ -0,0 +1,99 @@
+/* Simple console for lguest.
+ *
+ * Copyright (C) 2006 Rusty Russell, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/lguest_bus.h>
+#include "hvc_console.h"
+
+static char inbuf[256];
+static struct lguest_dma cons_input = { .used_len = 0,
+ .addr[0] = __pa(inbuf),
+ .len[0] = sizeof(inbuf),
+ .len[1] = 0 };
+
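+/* Console output is one synchronous SEND_DMA hypercall on the well-known
+ * console key: the bytes are described as a one-section DMA list, and the
+ * other end consumes them before the guest runs again. */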
+static int put_chars(u32 vtermno, const char *buf, int count)
+{
+ struct lguest_dma dma;
+
+ /* FIXME: what if it's over a page boundary? */
+ dma.len[0] = count;
+ dma.len[1] = 0;
+ dma.addr[0] = __pa(buf);
+
+ hcall(LHCALL_SEND_DMA, LGUEST_CONSOLE_DMA_KEY, __pa(&dma), 0);
+ return count;
+}
+
+static int get_chars(u32 vtermno, char *buf, int count)
+{
+ static int cons_offset;
+
+ if (!cons_input.used_len)
+ return 0;
+
+ if (cons_input.used_len - cons_offset < count)
+ count = cons_input.used_len - cons_offset;
+
+ memcpy(buf, inbuf + cons_offset, count);
+ cons_offset += count;
+ if (cons_offset == cons_input.used_len) {
+ cons_offset = 0;
+ cons_input.used_len = 0;
+ }
+ return count;
+}
+
+struct hv_ops lguest_cons = {
+ .get_chars = get_chars,
+ .put_chars = put_chars,
+};
+
+static int __init cons_init(void)
+{
+ if (strcmp(paravirt_ops.name, "lguest") != 0)
+ return 0;
+
+ return hvc_instantiate(0, 0, &lguest_cons);
+}
+console_initcall(cons_init);
+
+static int lguestcons_probe(struct lguest_device *lgdev)
+{
+ lgdev->private = hvc_alloc(0, lgdev->index+1, &lguest_cons, 256);
+ if (IS_ERR(lgdev->private))
+ return PTR_ERR(lgdev->private);
+
+ if (!hcall(LHCALL_BIND_DMA, LGUEST_CONSOLE_DMA_KEY, __pa(&cons_input),
+ (1<<8) + lgdev->index+1))
+ printk("lguest console: failed to bind buffer.\n");
+ return 0;
+}
+
+static struct lguest_driver lguestcons_drv = {
+ .name = "lguestcons",
+ .owner = THIS_MODULE,
+ .device_type = LGUEST_DEVICE_T_CONSOLE,
+ .probe = lguestcons_probe,
+};
+
+static int __init hvc_lguest_init(void)
+{
+ return register_lguest_driver(&lguestcons_drv);
+}
+module_init(hvc_lguest_init);
* Re: [PATCH 5/8] lguest: the net driver
2007-04-10 11:08 ` [PATCH 5/8] lguest: the console driver Rusty Russell
@ 2007-04-10 11:09 ` Rusty Russell
2007-04-10 11:11 ` [PATCH 6/8] " Rusty Russell
[not found] ` <1176203475.26372.37.camel@localhost.localdomain>
2 siblings, 0 replies; 13+ messages in thread
From: Rusty Russell @ 2007-04-10 11:09 UTC (permalink / raw)
To: Andrew Morton; +Cc: Andi Kleen, lkml - Kernel Mailing List, virtualization
A simple net driver for lguest.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
drivers/net/Makefile | 1
drivers/net/lguest_net.c | 355 ++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 356 insertions(+)
===================================================================
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -220,3 +220,4 @@ obj-$(CONFIG_FS_ENET) += fs_enet/
obj-$(CONFIG_FS_ENET) += fs_enet/
obj-$(CONFIG_NETXEN_NIC) += netxen/
+obj-$(CONFIG_LGUEST_GUEST) += lguest_net.o
===================================================================
--- /dev/null
+++ b/drivers/net/lguest_net.c
@@ -0,0 +1,355 @@
+/* A simple network driver for lguest.
+ *
+ * Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+//#define DEBUG
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/module.h>
+#include <linux/mm_types.h>
+#include <linux/lguest_bus.h>
+#include <asm/io.h>
+
+#define SHARED_SIZE PAGE_SIZE
+#define MAX_LANS 4
+#define NUM_SKBS 8
+
+struct lguestnet_info
+{
+ /* The shared page(s). */
+ struct lguest_net *peer;
+ unsigned long peer_phys;
+ unsigned long mapsize;
+
+ /* My peerid. */
+ unsigned int me;
+
+ struct net_device_stats stats;
+
+ /* Receive queue. */
+ struct sk_buff *skb[NUM_SKBS];
+ struct lguest_dma dma[NUM_SKBS];
+};
+
+/* How many bytes left in this page. */
+static unsigned int rest_of_page(void *data)
+{
+ return PAGE_SIZE - ((unsigned long)data % PAGE_SIZE);
+}
+
+/* Simple convention: offset 4 * peernum. */
+static unsigned long peer_key(struct lguestnet_info *info, unsigned peernum)
+{
+ return info->peer_phys + 4 * peernum;
+}
+
+static void skb_to_dma(const struct sk_buff *skb, unsigned int headlen,
+ struct lguest_dma *dma)
+{
+ unsigned int i, seg;
+
+ for (i = seg = 0; i < headlen; seg++, i += rest_of_page(skb->data+i)) {
+ dma->addr[seg] = virt_to_phys(skb->data + i);
+ dma->len[seg] = min((unsigned)(headlen - i),
+ rest_of_page(skb->data + i));
+ }
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, seg++) {
+ const skb_frag_t *f = &skb_shinfo(skb)->frags[i];
+ /* Should not happen with MTU less than 64k - 2 * PAGE_SIZE. */
+ if (seg == LGUEST_MAX_DMA_SECTIONS) {
+ printk("Woah dude! Megapacket!\n");
+ break;
+ }
+ dma->addr[seg] = page_to_phys(f->page) + f->page_offset;
+ dma->len[seg] = f->size;
+ }
+ if (seg < LGUEST_MAX_DMA_SECTIONS)
+ dma->len[seg] = 0;
+}
+
+/* We overload multicast bit to show promiscuous mode. */
+#define PROMISC_BIT 0x01
+
+static void lguestnet_set_multicast(struct net_device *dev)
+{
+ struct lguestnet_info *info = dev->priv;
+
+ if ((dev->flags & (IFF_PROMISC|IFF_ALLMULTI)) || dev->mc_count)
+ info->peer[info->me].mac[0] |= PROMISC_BIT;
+ else
+ info->peer[info->me].mac[0] &= ~PROMISC_BIT;
+}
+
+static int promisc(struct lguestnet_info *info, unsigned int peer)
+{
+ return info->peer[peer].mac[0] & PROMISC_BIT;
+}
+
+static int mac_eq(const unsigned char mac[ETH_ALEN],
+ struct lguestnet_info *info, unsigned int peer)
+{
+ /* Ignore multicast bit, which peer turns on to mean promisc. */
+ if ((info->peer[peer].mac[0] & (~PROMISC_BIT)) != mac[0])
+ return 0;
+ return memcmp(mac+1, info->peer[peer].mac+1, ETH_ALEN-1) == 0;
+}
+
+static void transfer_packet(struct net_device *dev,
+ struct sk_buff *skb,
+ unsigned int peernum)
+{
+ struct lguestnet_info *info = dev->priv;
+ struct lguest_dma dma;
+
+ skb_to_dma(skb, skb_headlen(skb), &dma);
+ pr_debug("xfer length %04x (%u)\n", htons(skb->len), skb->len);
+
+ hcall(LHCALL_SEND_DMA, peer_key(info,peernum), __pa(&dma), 0);
+ if (dma.used_len != skb->len) {
+ info->stats.tx_carrier_errors++;
+ pr_debug("Bad xfer to peer %i: %i of %i (dma %p/%i)\n",
+ peernum, dma.used_len, skb->len,
+ (void *)dma.addr[0], dma.len[0]);
+ } else {
+ info->stats.tx_bytes += skb->len;
+ info->stats.tx_packets++;
+ }
+}
+
+static int unused_peer(const struct lguest_net peer[], unsigned int num)
+{
+ return peer[num].mac[0] == 0;
+}
+
+static int lguestnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ unsigned int i;
+ int broadcast;
+ struct lguestnet_info *info = dev->priv;
+ const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
+
+ pr_debug("%s: xmit %02x:%02x:%02x:%02x:%02x:%02x\n",
+ dev->name, dest[0],dest[1],dest[2],dest[3],dest[4],dest[5]);
+
+ broadcast = is_multicast_ether_addr(dest);
+ for (i = 0; i < info->mapsize/sizeof(struct lguest_net); i++) {
+ if (i == info->me || unused_peer(info->peer, i))
+ continue;
+
+ if (!broadcast && !promisc(info, i) && !mac_eq(dest, info, i))
+ continue;
+
+ pr_debug("lguestnet %s: sending from %i to %i\n",
+ dev->name, info->me, i);
+ transfer_packet(dev, skb, i);
+ }
+ dev_kfree_skb(skb);
+ return 0;
+}
+
+/* Find a new skb to put in this slot in shared mem. */
+static int fill_slot(struct net_device *dev, unsigned int slot)
+{
+ struct lguestnet_info *info = dev->priv;
+ /* Try to create and register a new one. */
+ info->skb[slot] = netdev_alloc_skb(dev, ETH_HLEN + ETH_DATA_LEN);
+ if (!info->skb[slot]) {
+ printk("%s: could not fill slot %i\n", dev->name, slot);
+ return -ENOMEM;
+ }
+
+ skb_to_dma(info->skb[slot], ETH_HLEN + ETH_DATA_LEN, &info->dma[slot]);
+ wmb();
+ /* Now we tell hypervisor it can use the slot. */
+ info->dma[slot].used_len = 0;
+ return 0;
+}
+
+static irqreturn_t lguestnet_rcv(int irq, void *dev_id)
+{
+ struct net_device *dev = dev_id;
+ struct lguestnet_info *info = dev->priv;
+ unsigned int i, done = 0;
+
+ for (i = 0; i < ARRAY_SIZE(info->dma); i++) {
+ unsigned int length;
+ struct sk_buff *skb;
+
+ length = info->dma[i].used_len;
+ if (length == 0)
+ continue;
+
+ done++;
+ skb = info->skb[i];
+ fill_slot(dev, i);
+
+ if (length < ETH_HLEN || length > ETH_HLEN + ETH_DATA_LEN) {
+ pr_debug(KERN_WARNING "%s: unbelievable skb len: %i\n",
+ dev->name, length);
+ dev_kfree_skb(skb);
+ continue;
+ }
+
+ skb_put(skb, length);
+ skb->protocol = eth_type_trans(skb, dev);
+ /* This is a reliable transport. */
+ if (dev->features & NETIF_F_NO_CSUM)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
+ ntohs(skb->protocol), skb->len, skb->pkt_type);
+
+ info->stats.rx_bytes += skb->len;
+ info->stats.rx_packets++;
+ netif_rx(skb);
+ }
+ return done ? IRQ_HANDLED : IRQ_NONE;
+}
+
+static int lguestnet_open(struct net_device *dev)
+{
+ int i;
+ struct lguestnet_info *info = dev->priv;
+
+ /* Set up our MAC address */
+ memcpy(info->peer[info->me].mac, dev->dev_addr, ETH_ALEN);
+
+ /* Turn on promisc mode if needed */
+ lguestnet_set_multicast(dev);
+
+ for (i = 0; i < ARRAY_SIZE(info->dma); i++) {
+ if (fill_slot(dev, i) != 0)
+ goto cleanup;
+ }
+ if (!hcall(LHCALL_BIND_DMA, peer_key(info, info->me), __pa(info->dma),
+ (NUM_SKBS << 8) | dev->irq))
+ goto cleanup;
+ return 0;
+
+cleanup:
+ while (--i >= 0)
+ dev_kfree_skb(info->skb[i]);
+ return -ENOMEM;
+}
+
+static int lguestnet_close(struct net_device *dev)
+{
+ unsigned int i;
+ struct lguestnet_info *info = dev->priv;
+
+ /* Clear all trace: others might deliver packets, we'll ignore it. */
+ memset(&info->peer[info->me], 0xFF, sizeof(info->peer[info->me]));
+
+ /* Deregister sg lists. */
+ hcall(LHCALL_BIND_DMA, peer_key(info, info->me), __pa(info->dma), 0);
+ for (i = 0; i < ARRAY_SIZE(info->dma); i++)
+ dev_kfree_skb(info->skb[i]);
+ return 0;
+}
+
+static struct net_device_stats *lguestnet_get_stats(struct net_device *dev)
+{
+ struct lguestnet_info *info = dev->priv;
+
+ return &info->stats;
+}
+
+static int lguestnet_probe(struct lguest_device *lgdev)
+{
+ int err, irqf = IRQF_SHARED;
+ struct net_device *dev;
+ struct lguestnet_info *info;
+ struct lguest_device_desc *desc = &lguest_devices[lgdev->index];
+
+ pr_debug("lguest_net: probing for device %i\n", lgdev->index);
+
+ dev = alloc_etherdev(sizeof(struct lguestnet_info));
+ if (!dev)
+ return -ENOMEM;
+
+ SET_MODULE_OWNER(dev);
+
+ /* Ethernet defaults with some changes */
+ ether_setup(dev);
+ dev->set_mac_address = NULL;
+
+ dev->dev_addr[0] = 0x02; /* set local assignment bit (IEEE802) */
+ dev->dev_addr[1] = 0x00;
+ memcpy(&dev->dev_addr[2], &lguest_data.guestid, 2);
+ dev->dev_addr[4] = 0x00;
+ dev->dev_addr[5] = 0x00;
+
+ dev->open = lguestnet_open;
+ dev->stop = lguestnet_close;
+ dev->hard_start_xmit = lguestnet_start_xmit;
+ dev->get_stats = lguestnet_get_stats;
+
+ /* Turning on/off promisc will call dev->set_multicast_list.
+ * We don't actually support multicast yet */
+ dev->set_multicast_list = lguestnet_set_multicast;
+ dev->mem_start = ((unsigned long)desc->pfn << PAGE_SHIFT);
+ dev->mem_end = dev->mem_start + PAGE_SIZE * desc->num_pages;
+ dev->irq = lgdev->index+1;
+ dev->features = NETIF_F_SG;
+ if (desc->features & LGUEST_NET_F_NOCSUM)
+ dev->features |= NETIF_F_NO_CSUM;
+
+ info = dev->priv;
+ info->mapsize = PAGE_SIZE * desc->num_pages;
+ info->peer_phys = ((unsigned long)desc->pfn << PAGE_SHIFT);
+ info->peer = (void *)ioremap(info->peer_phys, info->mapsize);
+ /* This stores our peerid (upper bits reserved for future). */
+ info->me = (desc->features & (info->mapsize-1));
+
+ err = register_netdev(dev);
+ if (err) {
+ pr_debug("lguestnet: registering device failed\n");
+ goto free;
+ }
+
+ if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS)
+ irqf |= IRQF_SAMPLE_RANDOM;
+ if (request_irq(dev->irq, lguestnet_rcv, irqf, "lguestnet", dev) != 0) {
+ pr_debug("lguestnet: could not get net irq %i\n", dev->irq);
+ goto unregister;
+ }
+
+ pr_debug("lguestnet: registered device %s\n", dev->name);
+ lgdev->private = dev;
+ return 0;
+
+unregister:
+ unregister_netdev(dev);
+free:
+ free_netdev(dev);
+ return err;
+}
+
+static struct lguest_driver lguestnet_drv = {
+ .name = "lguestnet",
+ .owner = THIS_MODULE,
+ .device_type = LGUEST_DEVICE_T_NET,
+ .probe = lguestnet_probe,
+};
+
+static __init int lguestnet_init(void)
+{
+ return register_lguest_driver(&lguestnet_drv);
+}
+module_init(lguestnet_init);
+
+MODULE_DESCRIPTION("Lguest network driver");
+MODULE_LICENSE("GPL");
* [PATCH 6/8] lguest: the net driver
2007-04-10 11:08 ` [PATCH 5/8] lguest: the console driver Rusty Russell
2007-04-10 11:09 ` [PATCH 5/8] lguest: the net driver Rusty Russell
@ 2007-04-10 11:11 ` Rusty Russell
[not found] ` <1176203475.26372.37.camel@localhost.localdomain>
2 siblings, 0 replies; 13+ messages in thread
From: Rusty Russell @ 2007-04-10 11:11 UTC (permalink / raw)
To: Andrew Morton; +Cc: Andi Kleen, lkml - Kernel Mailing List, virtualization
Lguest net driver
A simple net driver for lguest.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
drivers/net/Makefile | 1
drivers/net/lguest_net.c | 355 ++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 356 insertions(+)
===================================================================
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -220,3 +220,4 @@ obj-$(CONFIG_FS_ENET) += fs_enet/
obj-$(CONFIG_FS_ENET) += fs_enet/
obj-$(CONFIG_NETXEN_NIC) += netxen/
+obj-$(CONFIG_LGUEST_GUEST) += lguest_net.o
===================================================================
--- /dev/null
+++ b/drivers/net/lguest_net.c
@@ -0,0 +1,355 @@
+/* A simple network driver for lguest.
+ *
+ * Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+//#define DEBUG
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/module.h>
+#include <linux/mm_types.h>
+#include <linux/lguest_bus.h>
+#include <asm/io.h>
+
+#define SHARED_SIZE PAGE_SIZE
+#define MAX_LANS 4
+#define NUM_SKBS 8
+
+struct lguestnet_info
+{
+ /* The shared page(s). */
+ struct lguest_net *peer;
+ unsigned long peer_phys;
+ unsigned long mapsize;
+
+ /* My peerid. */
+ unsigned int me;
+
+ struct net_device_stats stats;
+
+ /* Receive queue. */
+ struct sk_buff *skb[NUM_SKBS];
+ struct lguest_dma dma[NUM_SKBS];
+};
+
+/* How many bytes left in this page. */
+static unsigned int rest_of_page(void *data)
+{
+ return PAGE_SIZE - ((unsigned long)data % PAGE_SIZE);
+}
+
+/* Simple convention: offset 4 * peernum. */
+static unsigned long peer_key(struct lguestnet_info *info, unsigned peernum)
+{
+ return info->peer_phys + 4 * peernum;
+}
+
+static void skb_to_dma(const struct sk_buff *skb, unsigned int headlen,
+ struct lguest_dma *dma)
+{
+ unsigned int i, seg;
+
+ for (i = seg = 0; i < headlen; seg++, i += rest_of_page(skb->data+i)) {
+ dma->addr[seg] = virt_to_phys(skb->data + i);
+ dma->len[seg] = min((unsigned)(headlen - i),
+ rest_of_page(skb->data + i));
+ }
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, seg++) {
+ const skb_frag_t *f = &skb_shinfo(skb)->frags[i];
+ /* Should not happen with MTU less than 64k - 2 * PAGE_SIZE. */
+ if (seg == LGUEST_MAX_DMA_SECTIONS) {
+ printk("Woah dude! Megapacket!\n");
+ break;
+ }
+ dma->addr[seg] = page_to_phys(f->page) + f->page_offset;
+ dma->len[seg] = f->size;
+ }
+ if (seg < LGUEST_MAX_DMA_SECTIONS)
+ dma->len[seg] = 0;
+}
+
+/* We overload multicast bit to show promiscuous mode. */
+#define PROMISC_BIT 0x01
+
+static void lguestnet_set_multicast(struct net_device *dev)
+{
+ struct lguestnet_info *info = dev->priv;
+
+ if ((dev->flags & (IFF_PROMISC|IFF_ALLMULTI)) || dev->mc_count)
+ info->peer[info->me].mac[0] |= PROMISC_BIT;
+ else
+ info->peer[info->me].mac[0] &= ~PROMISC_BIT;
+}
+
+static int promisc(struct lguestnet_info *info, unsigned int peer)
+{
+ return info->peer[peer].mac[0] & PROMISC_BIT;
+}
+
+static int mac_eq(const unsigned char mac[ETH_ALEN],
+ struct lguestnet_info *info, unsigned int peer)
+{
+ /* Ignore multicast bit, which peer turns on to mean promisc. */
+ if ((info->peer[peer].mac[0] & (~PROMISC_BIT)) != mac[0])
+ return 0;
+ return memcmp(mac+1, info->peer[peer].mac+1, ETH_ALEN-1) == 0;
+}
+
+static void transfer_packet(struct net_device *dev,
+ struct sk_buff *skb,
+ unsigned int peernum)
+{
+ struct lguestnet_info *info = dev->priv;
+ struct lguest_dma dma;
+
+ skb_to_dma(skb, skb_headlen(skb), &dma);
+ pr_debug("xfer length %04x (%u)\n", htons(skb->len), skb->len);
+
+ hcall(LHCALL_SEND_DMA, peer_key(info,peernum), __pa(&dma), 0);
+ if (dma.used_len != skb->len) {
+ info->stats.tx_carrier_errors++;
+ pr_debug("Bad xfer to peer %i: %i of %i (dma %p/%i)\n",
+ peernum, dma.used_len, skb->len,
+ (void *)dma.addr[0], dma.len[0]);
+ } else {
+ info->stats.tx_bytes += skb->len;
+ info->stats.tx_packets++;
+ }
+}
+
+static int unused_peer(const struct lguest_net peer[], unsigned int num)
+{
+ return peer[num].mac[0] == 0;
+}
+
+static int lguestnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ unsigned int i;
+ int broadcast;
+ struct lguestnet_info *info = dev->priv;
+ const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
+
+ pr_debug("%s: xmit %02x:%02x:%02x:%02x:%02x:%02x\n",
+ dev->name, dest[0],dest[1],dest[2],dest[3],dest[4],dest[5]);
+
+ broadcast = is_multicast_ether_addr(dest);
+ for (i = 0; i < info->mapsize/sizeof(struct lguest_net); i++) {
+ if (i == info->me || unused_peer(info->peer, i))
+ continue;
+
+ if (!broadcast && !promisc(info, i) && !mac_eq(dest, info, i))
+ continue;
+
+ pr_debug("lguestnet %s: sending from %i to %i\n",
+ dev->name, info->me, i);
+ transfer_packet(dev, skb, i);
+ }
+ dev_kfree_skb(skb);
+ return 0;
+}
+
+/* Find a new skb to put in this slot in shared mem. */
+static int fill_slot(struct net_device *dev, unsigned int slot)
+{
+ struct lguestnet_info *info = dev->priv;
+ /* Try to create and register a new one. */
+ info->skb[slot] = netdev_alloc_skb(dev, ETH_HLEN + ETH_DATA_LEN);
+ if (!info->skb[slot]) {
+ printk("%s: could not fill slot %i\n", dev->name, slot);
+ return -ENOMEM;
+ }
+
+ skb_to_dma(info->skb[slot], ETH_HLEN + ETH_DATA_LEN, &info->dma[slot]);
+ wmb();
+ /* Now we tell hypervisor it can use the slot. */
+ info->dma[slot].used_len = 0;
+ return 0;
+}
+
+static irqreturn_t lguestnet_rcv(int irq, void *dev_id)
+{
+ struct net_device *dev = dev_id;
+ struct lguestnet_info *info = dev->priv;
+ unsigned int i, done = 0;
+
+ for (i = 0; i < ARRAY_SIZE(info->dma); i++) {
+ unsigned int length;
+ struct sk_buff *skb;
+
+ length = info->dma[i].used_len;
+ if (length == 0)
+ continue;
+
+ done++;
+ skb = info->skb[i];
+ fill_slot(dev, i);
+
+ if (length < ETH_HLEN || length > ETH_HLEN + ETH_DATA_LEN) {
+ pr_debug("%s: unbelievable skb len: %i\n",
+ dev->name, length);
+ dev_kfree_skb(skb);
+ continue;
+ }
+
+ skb_put(skb, length);
+ skb->protocol = eth_type_trans(skb, dev);
+ /* This is a reliable transport. */
+ if (dev->features & NETIF_F_NO_CSUM)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
+ ntohs(skb->protocol), skb->len, skb->pkt_type);
+
+ info->stats.rx_bytes += skb->len;
+ info->stats.rx_packets++;
+ netif_rx(skb);
+ }
+ return done ? IRQ_HANDLED : IRQ_NONE;
+}
+
+static int lguestnet_open(struct net_device *dev)
+{
+ int i;
+ struct lguestnet_info *info = dev->priv;
+
+ /* Set up our MAC address */
+ memcpy(info->peer[info->me].mac, dev->dev_addr, ETH_ALEN);
+
+ /* Turn on promisc mode if needed */
+ lguestnet_set_multicast(dev);
+
+ for (i = 0; i < ARRAY_SIZE(info->dma); i++) {
+ if (fill_slot(dev, i) != 0)
+ goto cleanup;
+ }
+ if (!hcall(LHCALL_BIND_DMA, peer_key(info, info->me), __pa(info->dma),
+ (NUM_SKBS << 8) | dev->irq))
+ goto cleanup;
+ return 0;
+
+cleanup:
+ while (--i >= 0)
+ dev_kfree_skb(info->skb[i]);
+ return -ENOMEM;
+}
+
+static int lguestnet_close(struct net_device *dev)
+{
+ unsigned int i;
+ struct lguestnet_info *info = dev->priv;
+
+ /* Clear all trace: others might deliver packets, we'll ignore it. */
+ memset(&info->peer[info->me], 0xFF, sizeof(info->peer[info->me]));
+
+ /* Deregister sg lists. */
+ hcall(LHCALL_BIND_DMA, peer_key(info, info->me), __pa(info->dma), 0);
+ for (i = 0; i < ARRAY_SIZE(info->dma); i++)
+ dev_kfree_skb(info->skb[i]);
+ return 0;
+}
+
+static struct net_device_stats *lguestnet_get_stats(struct net_device *dev)
+{
+ struct lguestnet_info *info = dev->priv;
+
+ return &info->stats;
+}
+
+static int lguestnet_probe(struct lguest_device *lgdev)
+{
+ int err, irqf = IRQF_SHARED;
+ struct net_device *dev;
+ struct lguestnet_info *info;
+ struct lguest_device_desc *desc = &lguest_devices[lgdev->index];
+
+ pr_debug("lguest_net: probing for device %i\n", lgdev->index);
+
+ dev = alloc_etherdev(sizeof(struct lguestnet_info));
+ if (!dev)
+ return -ENOMEM;
+
+ SET_MODULE_OWNER(dev);
+
+ /* Ethernet defaults with some changes */
+ ether_setup(dev);
+ dev->set_mac_address = NULL;
+
+ dev->dev_addr[0] = 0x02; /* set local assignment bit (IEEE802) */
+ dev->dev_addr[1] = 0x00;
+ memcpy(&dev->dev_addr[2], &lguest_data.guestid, 2);
+ dev->dev_addr[4] = 0x00;
+ dev->dev_addr[5] = 0x00;
+
+ dev->open = lguestnet_open;
+ dev->stop = lguestnet_close;
+ dev->hard_start_xmit = lguestnet_start_xmit;
+ dev->get_stats = lguestnet_get_stats;
+
+ /* Turning on/off promisc will call dev->set_multicast_list.
+ * We don't actually support multicast yet */
+ dev->set_multicast_list = lguestnet_set_multicast;
+ dev->mem_start = ((unsigned long)desc->pfn << PAGE_SHIFT);
+ dev->mem_end = dev->mem_start + PAGE_SIZE * desc->num_pages;
+ dev->irq = lgdev->index+1;
+ dev->features = NETIF_F_SG;
+ if (desc->features & LGUEST_NET_F_NOCSUM)
+ dev->features |= NETIF_F_NO_CSUM;
+
+ info = dev->priv;
+ info->mapsize = PAGE_SIZE * desc->num_pages;
+ info->peer_phys = ((unsigned long)desc->pfn << PAGE_SHIFT);
+ info->peer = (void *)ioremap(info->peer_phys, info->mapsize);
+ /* This stores our peerid (upper bits reserved for future). */
+ info->me = (desc->features & (info->mapsize-1));
+
+ err = register_netdev(dev);
+ if (err) {
+ pr_debug("lguestnet: registering device failed\n");
+ goto free;
+ }
+
+ if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS)
+ irqf |= IRQF_SAMPLE_RANDOM;
+ if (request_irq(dev->irq, lguestnet_rcv, irqf, "lguestnet", dev) != 0) {
+ pr_debug("lguestnet: could not get net irq %i\n", dev->irq);
+ goto unregister;
+ }
+
+ pr_debug("lguestnet: registered device %s\n", dev->name);
+ lgdev->private = dev;
+ return 0;
+
+unregister:
+ unregister_netdev(dev);
+free:
+ free_netdev(dev);
+ return err;
+}
+
+static struct lguest_driver lguestnet_drv = {
+ .name = "lguestnet",
+ .owner = THIS_MODULE,
+ .device_type = LGUEST_DEVICE_T_NET,
+ .probe = lguestnet_probe,
+};
+
+static __init int lguestnet_init(void)
+{
+ return register_lguest_driver(&lguestnet_drv);
+}
+module_init(lguestnet_init);
+
+MODULE_DESCRIPTION("Lguest network driver");
+MODULE_LICENSE("GPL");
* [PATCH 7/8] lguest: the block driver
[not found] ` <1176203475.26372.37.camel@localhost.localdomain>
@ 2007-04-10 11:12 ` Rusty Russell
[not found] ` <1176203528.26372.40.camel@localhost.localdomain>
1 sibling, 0 replies; 13+ messages in thread
From: Rusty Russell @ 2007-04-10 11:12 UTC (permalink / raw)
To: Andrew Morton; +Cc: Andi Kleen, lkml - Kernel Mailing List, virtualization
Lguest block driver
A simple block driver for lguest.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
drivers/block/Makefile | 1
drivers/block/lguest_blk.c | 271 ++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 272 insertions(+)
===================================================================
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -28,4 +28,5 @@ obj-$(CONFIG_VIODASD) += viodasd.o
obj-$(CONFIG_VIODASD) += viodasd.o
obj-$(CONFIG_BLK_DEV_SX8) += sx8.o
obj-$(CONFIG_BLK_DEV_UB) += ub.o
+obj-$(CONFIG_LGUEST_GUEST) += lguest_blk.o
===================================================================
--- /dev/null
+++ b/drivers/block/lguest_blk.c
@@ -0,0 +1,271 @@
+/* A simple block driver for lguest.
+ *
+ * Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+//#define DEBUG
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/blkdev.h>
+#include <linux/interrupt.h>
+#include <linux/lguest_bus.h>
+
+static char next_block_index = 'a';
+
+struct blockdev
+{
+ spinlock_t lock;
+
+ /* The disk structure for the kernel. */
+ struct gendisk *disk;
+
+ /* The major number for this disk. */
+ int major;
+ int irq;
+
+ unsigned long phys_addr;
+ /* The ioremap'ed block page. */
+ struct lguest_block_page *lb_page;
+
+ /* We only have a single request outstanding at a time. */
+ struct lguest_dma dma;
+ struct request *req;
+};
+
+/* Jens gave me this nice helper to end all chunks of a request. */
+static void end_entire_request(struct request *req, int uptodate)
+{
+ if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
+ BUG();
+ add_disk_randomness(req->rq_disk);
+ blkdev_dequeue_request(req);
+ end_that_request_last(req, uptodate);
+}
+
+static irqreturn_t lgb_irq(int irq, void *_bd)
+{
+ struct blockdev *bd = _bd;
+ unsigned long flags;
+
+ if (!bd->req) {
+ pr_debug("No work!\n");
+ return IRQ_NONE;
+ }
+
+ if (!bd->lb_page->result) {
+ pr_debug("No result!\n");
+ return IRQ_NONE;
+ }
+
+ spin_lock_irqsave(&bd->lock, flags);
+ end_entire_request(bd->req, bd->lb_page->result == 1);
+ bd->req = NULL;
+ bd->dma.used_len = 0;
+ blk_start_queue(bd->disk->queue);
+ spin_unlock_irqrestore(&bd->lock, flags);
+ return IRQ_HANDLED;
+}
+
+static unsigned int req_to_dma(struct request *req, struct lguest_dma *dma)
+{
+ unsigned int i = 0, idx, len = 0;
+ struct bio *bio;
+
+ rq_for_each_bio(bio, req) {
+ struct bio_vec *bvec;
+ bio_for_each_segment(bvec, bio, idx) {
+ BUG_ON(i == LGUEST_MAX_DMA_SECTIONS);
+ BUG_ON(!bvec->bv_len);
+ dma->addr[i] = page_to_phys(bvec->bv_page)
+ + bvec->bv_offset;
+ dma->len[i] = bvec->bv_len;
+ len += bvec->bv_len;
+ i++;
+ }
+ }
+ if (i < LGUEST_MAX_DMA_SECTIONS)
+ dma->len[i] = 0;
+ return len;
+}
+
+static void empty_dma(struct lguest_dma *dma)
+{
+ dma->len[0] = 0;
+}
+
+static void setup_req(struct blockdev *bd,
+ int type, struct request *req, struct lguest_dma *dma)
+{
+ bd->lb_page->type = type;
+ bd->lb_page->sector = req->sector;
+ bd->lb_page->result = 0;
+ bd->req = req;
+ bd->lb_page->bytes = req_to_dma(req, dma);
+}
+
+static void do_write(struct blockdev *bd, struct request *req)
+{
+ struct lguest_dma send;
+
+ pr_debug("lgb: WRITE sector %li\n", (long)req->sector);
+ setup_req(bd, 1, req, &send);
+
+ hcall(LHCALL_SEND_DMA, bd->phys_addr, __pa(&send), 0);
+}
+
+static void do_read(struct blockdev *bd, struct request *req)
+{
+ struct lguest_dma ping;
+
+ pr_debug("lgb: READ sector %li\n", (long)req->sector);
+ setup_req(bd, 0, req, &bd->dma);
+
+ empty_dma(&ping);
+ hcall(LHCALL_SEND_DMA, bd->phys_addr, __pa(&ping), 0);
+}
+
+static void do_lgb_request(request_queue_t *q)
+{
+ struct blockdev *bd;
+ struct request *req;
+
+again:
+ req = elv_next_request(q);
+ if (!req)
+ return;
+
+ bd = req->rq_disk->private_data;
+ /* Sometimes we get repeated requests after blk_stop_queue. */
+ if (bd->req)
+ return;
+
+ if (!blk_fs_request(req)) {
+ pr_debug("Got non-command 0x%08x\n", req->cmd_type);
+ req->errors++;
+ end_entire_request(req, 0);
+ goto again;
+ }
+
+ if (rq_data_dir(req) == WRITE)
+ do_write(bd, req);
+ else
+ do_read(bd, req);
+
+ /* Wait for interrupt to tell us it's done. */
+ blk_stop_queue(q);
+}
+
+static struct block_device_operations lguestblk_fops = {
+ .owner = THIS_MODULE,
+};
+
+static int lguestblk_probe(struct lguest_device *lgdev)
+{
+ struct blockdev *bd;
+ int err;
+ int irqflags = IRQF_SHARED;
+
+ bd = kmalloc(sizeof(*bd), GFP_KERNEL);
+ if (!bd)
+ return -ENOMEM;
+
+ spin_lock_init(&bd->lock);
+ bd->irq = lgdev->index+1;
+ bd->req = NULL;
+ bd->dma.used_len = 0;
+ bd->dma.len[0] = 0;
+ bd->phys_addr = (lguest_devices[lgdev->index].pfn << PAGE_SHIFT);
+
+ bd->lb_page = (void *)ioremap(bd->phys_addr, PAGE_SIZE);
+ if (!bd->lb_page) {
+ err = -ENOMEM;
+ goto out_free_bd;
+ }
+
+ bd->major = register_blkdev(0, "lguestblk");
+ if (bd->major < 0) {
+ err = bd->major;
+ goto out_unmap;
+ }
+
+ bd->disk = alloc_disk(1);
+ if (!bd->disk) {
+ err = -ENOMEM;
+ goto out_unregister_blkdev;
+ }
+
+ bd->disk->queue = blk_init_queue(do_lgb_request, &bd->lock);
+ if (!bd->disk->queue) {
+ err = -ENOMEM;
+ goto out_put_disk;
+ }
+
+ /* We can only handle a certain number of sg entries */
+ blk_queue_max_hw_segments(bd->disk->queue, LGUEST_MAX_DMA_SECTIONS);
+ /* Buffers must not cross page boundaries */
+ blk_queue_segment_boundary(bd->disk->queue, PAGE_SIZE-1);
+
+ sprintf(bd->disk->disk_name, "lgb%c", next_block_index++);
+ if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS)
+ irqflags |= IRQF_SAMPLE_RANDOM;
+ err = request_irq(bd->irq, lgb_irq, irqflags, bd->disk->disk_name, bd);
+ if (err)
+ goto out_cleanup_queue;
+
+ hcall(LHCALL_BIND_DMA, bd->phys_addr, __pa(&bd->dma), (1<<8)+bd->irq);
+
+ bd->disk->major = bd->major;
+ bd->disk->first_minor = 0;
+ bd->disk->private_data = bd;
+ bd->disk->fops = &lguestblk_fops;
+ /* This is initialized to the disk size by the other end. */
+ set_capacity(bd->disk, bd->lb_page->num_sectors);
+ add_disk(bd->disk);
+
+ printk(KERN_INFO "%s: device %i at major %d\n",
+ bd->disk->disk_name, lgdev->index, bd->major);
+
+ lgdev->private = bd;
+ return 0;
+
+out_cleanup_queue:
+ blk_cleanup_queue(bd->disk->queue);
+out_put_disk:
+ put_disk(bd->disk);
+out_unregister_blkdev:
+ unregister_blkdev(bd->major, "lguestblk");
+out_unmap:
+ iounmap(bd->lb_page);
+out_free_bd:
+ kfree(bd);
+ return err;
+}
+
+static struct lguest_driver lguestblk_drv = {
+ .name = "lguestblk",
+ .owner = THIS_MODULE,
+ .device_type = LGUEST_DEVICE_T_BLOCK,
+ .probe = lguestblk_probe,
+};
+
+static __init int lguestblk_init(void)
+{
+ return register_lguest_driver(&lguestblk_drv);
+}
+module_init(lguestblk_init);
+
+MODULE_DESCRIPTION("Lguest block driver");
+MODULE_LICENSE("GPL");
* [PATCH 8/8] lguest: the documentation, example launcher
[not found] ` <1176203528.26372.40.camel@localhost.localdomain>
@ 2007-04-10 11:12 ` Rusty Russell
2007-04-10 11:28 ` [PATCH 7/8] lguest: the block driver Pekka Enberg
[not found] ` <84144f020704100428r6e35e34cued8758c94728a6a0@mail.gmail.com>
2 siblings, 0 replies; 13+ messages in thread
From: Rusty Russell @ 2007-04-10 11:12 UTC (permalink / raw)
To: Andrew Morton; +Cc: Andi Kleen, lkml - Kernel Mailing List, virtualization
A brief document describing how to use lguest. Because lguest doesn't
have an ABI, we also include an example launcher in the Documentation
directory.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
Documentation/lguest/Makefile | 20
Documentation/lguest/lguest.c | 982 +++++++++++++++++++++++++++++++++++++++
Documentation/lguest/lguest.txt | 125 ++++
3 files changed, 1127 insertions(+)
===================================================================
--- /dev/null
+++ b/Documentation/lguest/Makefile
@@ -0,0 +1,20 @@
+# This creates the demonstration utility "lguest" which runs a Linux guest.
+
+# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary.
+include ../../.config
+LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)
+
+CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 \
+ -static -DLGUEST_GUEST_TOP="$(LGUEST_GUEST_TOP)" -Wl,-T,lguest.lds
+LDLIBS:=-lz
+
+all: lguest.lds lguest
+
+# The linker script on x86 is so complex the only way of creating one
+# which will link our binary in the right place is to mangle the
+# default one.
+lguest.lds:
+ $(LD) --verbose | awk '/^==========/ { PRINT=1; next; } /SIZEOF_HEADERS/ { gsub(/0x[0-9A-F]*/, "$(LGUEST_GUEST_TOP)") } { if (PRINT) print $$0; }' > $@
+
+clean:
+ rm -f lguest.lds lguest
===================================================================
--- /dev/null
+++ b/Documentation/lguest/lguest.c
@@ -0,0 +1,982 @@
+/* Simple program to lay out "physical" memory for new lguest guest.
+ * Linked high to avoid likely physical memory. */
+#define _LARGEFILE64_SOURCE
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <err.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <elf.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <errno.h>
+#include <signal.h>
+#include <ctype.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <time.h>
+#include <netinet/in.h>
+#include <net/if.h>
+#include <linux/sockios.h>
+#include <linux/if_tun.h>
+#include <sys/uio.h>
+#include <termios.h>
+#include <getopt.h>
+#include <zlib.h>
+typedef uint32_t u32;
+typedef uint16_t u16;
+typedef uint8_t u8;
+#include "../../include/linux/lguest_launcher.h"
+
+#define PAGE_PRESENT 0x7 /* Present, RW, Execute */
+#define NET_PEERNUM 1
+#define BRIDGE_PFX "bridge:"
+
+static bool verbose;
+#define verbose(args...) \
+ do { if (verbose) printf(args); } while(0)
+
+struct device_list
+{
+ fd_set infds;
+ int max_infd;
+
+ struct device *dev;
+ struct device **lastdev;
+};
+
+struct device
+{
+ struct device *next;
+ struct lguest_device_desc *desc;
+ void *mem;
+
+ /* Watch this fd if handle_input non-NULL. */
+ int fd;
+ bool (*handle_input)(int fd, struct device *me);
+
+ /* Watch DMA to this key if handle_input non-NULL. */
+ unsigned long watch_key;
+ u32 (*handle_output)(int fd, const struct iovec *iov,
+ unsigned int num, struct device *me);
+
+ /* Device-specific data. */
+ void *priv;
+};
+
+static int open_or_die(const char *name, int flags)
+{
+ int fd = open(name, flags);
+ if (fd < 0)
+ err(1, "Failed to open %s", name);
+ return fd;
+}
+
+static void *map_zeroed_pages(unsigned long addr, unsigned int num)
+{
+ static int fd = -1;
+
+ if (fd == -1)
+ fd = open_or_die("/dev/zero", O_RDONLY);
+
+ if (mmap((void *)addr, getpagesize() * num,
+ PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE, fd, 0)
+ != (void *)addr)
+ err(1, "Mmaping %u pages of /dev/zero @%p", num, (void *)addr);
+ return (void *)addr;
+}
+
+/* Returns the entry point */
+static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
+ unsigned long *page_offset)
+{
+ void *addr;
+ Elf32_Phdr phdr[ehdr->e_phnum];
+ unsigned int i;
+
+ /* Sanity checks. */
+ if (ehdr->e_type != ET_EXEC
+ || ehdr->e_machine != EM_386
+ || ehdr->e_phentsize != sizeof(Elf32_Phdr)
+ || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
+ errx(1, "Malformed elf header");
+
+ if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
+ err(1, "Seeking to program headers");
+ if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
+ err(1, "Reading program headers");
+
+ *page_offset = 0;
+ /* We map the loadable segments at virtual addresses corresponding
+ * to their physical addresses (our virtual == guest physical). */
+ for (i = 0; i < ehdr->e_phnum; i++) {
+ if (phdr[i].p_type != PT_LOAD)
+ continue;
+
+ verbose("Section %i: size %i addr %p\n",
+ i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
+
+ /* We expect linear address space. */
+ if (!*page_offset)
+ *page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
+ else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
+ errx(1, "Page offset of section %i different", i);
+
+ /* We map everything private, writable. */
+ addr = mmap((void *)phdr[i].p_paddr,
+ phdr[i].p_filesz,
+ PROT_READ|PROT_WRITE|PROT_EXEC,
+ MAP_FIXED|MAP_PRIVATE,
+ elf_fd, phdr[i].p_offset);
+ if (addr != (void *)phdr[i].p_paddr)
+ err(1, "Mmaping vmlinux seg %i gave %p not %p",
+ i, addr, (void *)phdr[i].p_paddr);
+ }
+
+ /* Entry is physical address: convert to virtual */
+ return ehdr->e_entry + *page_offset;
+}
+
+/* This is amazingly reliable. */
+static unsigned long intuit_page_offset(unsigned char *img, unsigned long len)
+{
+ unsigned int i, possibilities[256] = { 0 };
+
+ for (i = 0; i + 4 < len; i++) {
+ /* mov 0xXXXXXXXX,%eax */
+ if (img[i] == 0xA1 && ++possibilities[img[i+4]] > 3)
+ return (unsigned long)img[i+4] << 24;
+ }
+ errx(1, "could not determine page offset");
+}
+
+static unsigned long unpack_bzimage(int fd, unsigned long *page_offset)
+{
+ gzFile f;
+ int ret, len = 0;
+ void *img = (void *)0x100000;
+
+ f = gzdopen(fd, "rb");
+ if (gzdirect(f))
+ errx(1, "did not find correct gzip header");
+ while ((ret = gzread(f, img + len, 65536)) > 0)
+ len += ret;
+ if (ret < 0)
+ err(1, "reading image from bzImage");
+
+ verbose("Unpacked size %i addr %p\n", len, img);
+ *page_offset = intuit_page_offset(img, len);
+
+ /* Entry is physical address: convert to virtual */
+ return (unsigned long)img + *page_offset;
+}
+
+static unsigned long load_bzimage(int fd, unsigned long *page_offset)
+{
+ unsigned char c;
+ int state = 0;
+
+ /* Ugly brute force search for gzip header. */
+ while (read(fd, &c, 1) == 1) {
+ switch (state) {
+ case 0:
+ if (c == 0x1F)
+ state++;
+ break;
+ case 1:
+ if (c == 0x8B)
+ state++;
+ else
+ state = 0;
+ break;
+ case 2 ... 8:
+ state++;
+ break;
+ case 9:
+ lseek(fd, -10, SEEK_CUR);
+ if (c != 0x03) /* Compressed under UNIX. */
+ state = -1;
+ else
+ return unpack_bzimage(fd, page_offset);
+ }
+ }
+ errx(1, "Could not find kernel in bzImage");
+}
+
+static unsigned long load_kernel(int fd, unsigned long *page_offset)
+{
+ Elf32_Ehdr hdr;
+
+ if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
+ err(1, "Reading kernel");
+
+ if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
+ return map_elf(fd, &hdr, page_offset);
+
+ return load_bzimage(fd, page_offset);
+}
+
+/* initrd gets loaded at top of memory: return length. */
+static unsigned long load_initrd(const char *name, unsigned long mem)
+{
+ int ifd;
+ struct stat st;
+ void *iaddr;
+
+ ifd = open_or_die(name, O_RDONLY);
+ if (fstat(ifd, &st) < 0)
+ err(1, "fstat() on initrd '%s'", name);
+
+ iaddr = mmap((void *)mem - st.st_size, st.st_size,
+ PROT_READ|PROT_EXEC|PROT_WRITE,
+ MAP_FIXED|MAP_PRIVATE, ifd, 0);
+ if (iaddr != (void *)mem - st.st_size)
+ err(1, "Mmaping initrd '%s' returned %p not %p",
+ name, iaddr, (void *)mem - st.st_size);
+ close(ifd);
+ verbose("mapped initrd %s size=%lu @ %p\n", name, st.st_size, iaddr);
+ return st.st_size;
+}
+
+static inline unsigned long page_align(unsigned long addr)
+{
+ return ((addr + getpagesize()-1) & ~(getpagesize()-1));
+}
+
+static unsigned long setup_pagetables(unsigned long mem,
+ unsigned long initrd_size,
+ unsigned long page_offset)
+{
+ u32 *pgdir, *linear;
+ unsigned int mapped_pages, i, linear_pages;
+ unsigned int ptes_per_page = getpagesize()/sizeof(u32);
+
+ /* If we can map all of memory above page_offset, we do so. */
+ if (mem > -page_offset)
+ mapped_pages = mem/getpagesize();
+ else
+ mapped_pages = -page_offset/getpagesize();
+
+ /* Each linear PTE page can map ptes_per_page pages. */
+ linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page;
+
+ /* We lay out top-level then linear mapping immediately below initrd */
+ pgdir = (void *)mem - initrd_size - getpagesize();
+ linear = (void *)pgdir - linear_pages*getpagesize();
+
+ for (i = 0; i < mapped_pages; i++)
+ linear[i] = ((i * getpagesize()) | PAGE_PRESENT);
+
+ /* Now set up pgd so that this memory is at page_offset */
+ for (i = 0; i < mapped_pages; i += ptes_per_page) {
+ pgdir[(i + page_offset/getpagesize())/ptes_per_page]
+ = (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT);
+ }
+
+ verbose("Linear mapping of %u pages in %u pte pages at %p\n",
+ mapped_pages, linear_pages, linear);
+
+ return (unsigned long)pgdir;
+}
+
+static void concat(char *dst, char *args[])
+{
+ unsigned int i, len = 0;
+
+ for (i = 0; args[i]; i++) {
+ strcpy(dst+len, args[i]);
+ strcat(dst+len, " ");
+ len += strlen(args[i]) + 1;
+ }
+ /* In case it's empty. */
+ dst[len] = '\0';
+}
+
+static int tell_kernel(u32 pgdir, u32 start, u32 page_offset)
+{
+ u32 args[] = { LHREQ_INITIALIZE,
+ LGUEST_GUEST_TOP/getpagesize(), /* Just below us */
+ pgdir, start, page_offset };
+ int fd;
+
+ fd = open_or_die("/dev/lguest", O_RDWR);
+ if (write(fd, args, sizeof(args)) < 0)
+ err(1, "Writing to /dev/lguest");
+ return fd;
+}
+
+static void set_fd(int fd, struct device_list *devices)
+{
+ FD_SET(fd, &devices->infds);
+ if (fd > devices->max_infd)
+ devices->max_infd = fd;
+}
+
+/* We send SIGUSR1 signals while input is pending: avoids races. */
+static void wake_parent(int pipefd, struct device_list *devices)
+{
+ nice(19);
+
+ set_fd(pipefd, devices);
+
+ for (;;) {
+ fd_set rfds = devices->infds;
+
+ select(devices->max_infd+1, &rfds, NULL, NULL, NULL);
+ if (FD_ISSET(pipefd, &rfds)) {
+ int ignorefd;
+ if (read(pipefd, &ignorefd, sizeof(ignorefd)) == 0)
+ exit(0);
+ FD_CLR(ignorefd, &devices->infds);
+ }
+ kill(getppid(), SIGUSR1);
+ }
+}
+
+/* We don't want signal to kill us, just jerk us out of kernel. */
+static void wakeup(int signo)
+{
+}
+
+static int setup_waker(struct device_list *device_list)
+{
+ int pipefd[2], child;
+ struct sigaction act;
+
+ act.sa_handler = wakeup;
+ sigemptyset(&act.sa_mask);
+ act.sa_flags = 0;
+ sigaction(SIGUSR1, &act, NULL);
+
+ pipe(pipefd);
+ child = fork();
+ if (child == -1)
+ err(1, "forking");
+
+ if (child == 0) {
+ close(pipefd[1]);
+ wake_parent(pipefd[0], device_list);
+ }
+ close(pipefd[0]);
+
+ return pipefd[1];
+}
+
+static void *_check_pointer(unsigned long addr, unsigned int size,
+ unsigned int line)
+{
+ if (addr >= LGUEST_GUEST_TOP || addr + size >= LGUEST_GUEST_TOP)
+ errx(1, "%s:%i: Invalid address %li", __FILE__, line, addr);
+ return (void *)addr;
+}
+#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
+
+/* Returns pointer to dma->used_len */
+static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
+{
+ unsigned int i;
+ struct lguest_dma *udma;
+
+ udma = check_pointer(dma, sizeof(*udma));
+ for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
+ if (!udma->len[i])
+ break;
+
+ iov[i].iov_base = check_pointer(udma->addr[i], udma->len[i]);
+ iov[i].iov_len = udma->len[i];
+ }
+ *num = i;
+ return &udma->used_len;
+}
+
+static u32 *get_dma_buffer(int fd, void *key,
+ struct iovec iov[], unsigned int *num, u32 *irq)
+{
+ u32 buf[] = { LHREQ_GETDMA, (u32)key };
+ unsigned long udma;
+ u32 *res;
+
+ udma = write(fd, buf, sizeof(buf));
+ if (udma == (unsigned long)-1)
+ return NULL;
+
+ /* Kernel stashes irq in ->used_len. */
+ res = dma2iov(udma, iov, num);
+ *irq = *res;
+ return res;
+}
+
+static void trigger_irq(int fd, u32 irq)
+{
+ u32 buf[] = { LHREQ_IRQ, irq };
+ if (write(fd, buf, sizeof(buf)) != 0)
+ err(1, "Triggering irq %i", irq);
+}
+
+static void discard_iovec(struct iovec *iov, unsigned int *num)
+{
+ static char discard_buf[1024];
+ *num = 1;
+ iov->iov_base = discard_buf;
+ iov->iov_len = sizeof(discard_buf);
+}
+
+static struct termios orig_term;
+static void restore_term(void)
+{
+ tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
+}
+
+struct console_abort
+{
+ int count;
+ struct timeval start;
+};
+
+/* We DMA input to buffer bound at start of console page. */
+static bool handle_console_input(int fd, struct device *dev)
+{
+ u32 irq = 0, *lenp;
+ int len;
+ unsigned int num;
+ struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
+ struct console_abort *abort = dev->priv;
+
+ lenp = get_dma_buffer(fd, dev->mem, iov, &num, &irq);
+ if (!lenp) {
+ warn("console: no dma buffer!");
+ discard_iovec(iov, &num);
+ }
+
+ len = readv(dev->fd, iov, num);
+ if (len <= 0) {
+ warnx("Failed to get console input, ignoring console.");
+ len = 0;
+ }
+
+ if (lenp) {
+ *lenp = len;
+ trigger_irq(fd, irq);
+ }
+
+ /* Three ^C within one second? Exit. */
+ if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) {
+ if (!abort->count++)
+ gettimeofday(&abort->start, NULL);
+ else if (abort->count == 3) {
+ struct timeval now;
+ gettimeofday(&now, NULL);
+ if (now.tv_sec <= abort->start.tv_sec+1)
+ exit(2);
+ abort->count = 0;
+ }
+ } else
+ abort->count = 0;
+
+ if (!len) {
+ restore_term();
+ return false;
+ }
+ return true;
+}
+
+static u32 handle_console_output(int fd, const struct iovec *iov,
+ unsigned num, struct device*dev)
+{
+ return writev(STDOUT_FILENO, iov, num);
+}
+
+static u32 handle_tun_output(int fd, const struct iovec *iov,
+ unsigned num, struct device *dev)
+{
+ /* Now we've seen output, we should warn if we can't get buffers. */
+ *(bool *)dev->priv = true;
+ return writev(dev->fd, iov, num);
+}
+
+static unsigned long peer_offset(unsigned int peernum)
+{
+ return 4 * peernum;
+}
+
+static bool handle_tun_input(int fd, struct device *dev)
+{
+ u32 irq = 0, *lenp;
+ int len;
+ unsigned num;
+ struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
+
+ lenp = get_dma_buffer(fd, dev->mem+peer_offset(NET_PEERNUM), iov, &num,
+ &irq);
+ if (!lenp) {
+ if (*(bool *)dev->priv)
+ warn("network: no dma buffer!");
+ discard_iovec(iov, &num);
+ }
+
+ len = readv(dev->fd, iov, num);
+ if (len <= 0)
+ err(1, "reading network");
+ if (lenp) {
+ *lenp = len;
+ trigger_irq(fd, irq);
+ }
+ verbose("tun input packet len %i [%02x %02x] (%s)\n", len,
+ ((u8 *)iov[0].iov_base)[0], ((u8 *)iov[0].iov_base)[1],
+ lenp ? "sent" : "discarded");
+ return true;
+}
+
+static u32 handle_block_output(int fd, const struct iovec *iov,
+ unsigned num, struct device *dev)
+{
+ struct lguest_block_page *p = dev->mem;
+ u32 irq, *lenp;
+ unsigned int len, reply_num;
+ struct iovec reply[LGUEST_MAX_DMA_SECTIONS];
+ off64_t device_len, off = (off64_t)p->sector * 512;
+
+ device_len = *(off64_t *)dev->priv;
+
+ if (off >= device_len)
+ err(1, "Bad offset %llu vs %llu", off, device_len);
+ if (lseek64(dev->fd, off, SEEK_SET) != off)
+ err(1, "Bad seek to sector %i", p->sector);
+
+ verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off);
+
+ lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq);
+ if (!lenp)
+ err(1, "Block request didn't give us a dma buffer");
+
+ if (p->type) {
+ len = writev(dev->fd, iov, num);
+ if (off + len > device_len) {
+ ftruncate(dev->fd, device_len);
+ errx(1, "Write past end %llu+%u", off, len);
+ }
+ *lenp = 0;
+ } else {
+ len = readv(dev->fd, reply, reply_num);
+ *lenp = len;
+ }
+
+ p->result = 1 + (p->bytes != len);
+ trigger_irq(fd, irq);
+ return 0;
+}
+
+static void handle_output(int fd, unsigned long dma, unsigned long key,
+ struct device_list *devices)
+{
+ struct device *i;
+ u32 *lenp;
+ struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
+ unsigned num = 0;
+
+ lenp = dma2iov(dma, iov, &num);
+ for (i = devices->dev; i; i = i->next) {
+ if (i->handle_output && key == i->watch_key) {
+ *lenp = i->handle_output(fd, iov, num, i);
+ return;
+ }
+ }
+ warnx("Pending dma %p, key %p", (void *)dma, (void *)key);
+}
+
+static void handle_input(int fd, int childfd, struct device_list *devices)
+{
+ struct timeval poll = { .tv_sec = 0, .tv_usec = 0 };
+
+ for (;;) {
+ struct device *i;
+ fd_set fds = devices->infds;
+
+ if (select(devices->max_infd+1, &fds, NULL, NULL, &poll) == 0)
+ break;
+
+ for (i = devices->dev; i; i = i->next) {
+ if (i->handle_input && FD_ISSET(i->fd, &fds)) {
+ if (!i->handle_input(fd, i)) {
+ FD_CLR(i->fd, &devices->infds);
+ /* Tell child to ignore it too... */
+ write(childfd, &i->fd, sizeof(i->fd));
+ }
+ }
+ }
+ }
+}
+
+static struct lguest_device_desc *new_dev_desc(u16 type, u16 features,
+ u16 num_pages)
+{
+ static unsigned long top = LGUEST_GUEST_TOP;
+ struct lguest_device_desc *desc;
+
+ desc = malloc(sizeof(*desc));
+ desc->type = type;
+ desc->num_pages = num_pages;
+ desc->features = features;
+ desc->status = 0;
+ if (num_pages) {
+ top -= num_pages*getpagesize();
+ map_zeroed_pages(top, num_pages);
+ desc->pfn = top / getpagesize();
+ } else
+ desc->pfn = 0;
+ return desc;
+}
+
+static struct device *new_device(struct device_list *devices,
+ u16 type, u16 num_pages, u16 features,
+ int fd,
+ bool (*handle_input)(int, struct device *),
+ unsigned long watch_off,
+ u32 (*handle_output)(int,
+ const struct iovec *,
+ unsigned,
+ struct device *))
+{
+ struct device *dev = malloc(sizeof(*dev));
+
+ /* Append to device list. */
+ *devices->lastdev = dev;
+ dev->next = NULL;
+ devices->lastdev = &dev->next;
+
+ dev->fd = fd;
+ if (handle_input)
+ set_fd(dev->fd, devices);
+ dev->desc = new_dev_desc(type, features, num_pages);
+ dev->mem = (void *)(dev->desc->pfn * getpagesize());
+ dev->handle_input = handle_input;
+ dev->watch_key = (unsigned long)dev->mem + watch_off;
+ dev->handle_output = handle_output;
+ return dev;
+}
+
+static void setup_console(struct device_list *devices)
+{
+ struct device *dev;
+
+ if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
+ struct termios term = orig_term;
+ term.c_lflag &= ~(ISIG|ICANON|ECHO);
+ tcsetattr(STDIN_FILENO, TCSANOW, &term);
+ atexit(restore_term);
+ }
+
+ /* We don't currently require a page for the console. */
+ dev = new_device(devices, LGUEST_DEVICE_T_CONSOLE, 0, 0,
+ STDIN_FILENO, handle_console_input,
+ LGUEST_CONSOLE_DMA_KEY, handle_console_output);
+ dev->priv = malloc(sizeof(struct console_abort));
+ ((struct console_abort *)dev->priv)->count = 0;
+ verbose("device %p: console\n",
+ (void *)(dev->desc->pfn * getpagesize()));
+}
+
+static void setup_block_file(const char *filename, struct device_list *devices)
+{
+ int fd;
+ struct device *dev;
+ off64_t *device_len;
+ struct lguest_block_page *p;
+
+ fd = open_or_die(filename, O_RDWR|O_LARGEFILE|O_DIRECT);
+ dev = new_device(devices, LGUEST_DEVICE_T_BLOCK, 1,
+ LGUEST_DEVICE_F_RANDOMNESS,
+ fd, NULL, 0, handle_block_output);
+ device_len = dev->priv = malloc(sizeof(*device_len));
+ *device_len = lseek64(fd, 0, SEEK_END);
+ p = dev->mem;
+
+ p->num_sectors = *device_len/512;
+ verbose("device %p: block %i sectors\n",
+ (void *)(dev->desc->pfn * getpagesize()), p->num_sectors);
+}
+
+/* We use fcntl locks to reserve network slots (autocleanup!) */
+static unsigned int find_slot(int netfd, const char *filename)
+{
+ struct flock fl;
+
+ fl.l_type = F_WRLCK;
+ fl.l_whence = SEEK_SET;
+ fl.l_len = 1;
+ for (fl.l_start = 0;
+ fl.l_start < getpagesize()/sizeof(struct lguest_net);
+ fl.l_start++) {
+ if (fcntl(netfd, F_SETLK, &fl) == 0)
+ return fl.l_start;
+ }
+ errx(1, "No free slots in network file %s", filename);
+}
+
+static void setup_net_file(const char *filename,
+ struct device_list *devices)
+{
+ int netfd;
+ struct device *dev;
+
+ netfd = open(filename, O_RDWR, 0);
+ if (netfd < 0) {
+ if (errno == ENOENT) {
+ netfd = open(filename, O_RDWR|O_CREAT, 0600);
+ if (netfd >= 0) {
+ char page[getpagesize()];
+ memset(page, 0, sizeof(page));
+ write(netfd, page, sizeof(page));
+ }
+ }
+ if (netfd < 0)
+ err(1, "cannot open net file '%s'", filename);
+ }
+
+ dev = new_device(devices, LGUEST_DEVICE_T_NET, 1,
+ find_slot(netfd, filename)|LGUEST_NET_F_NOCSUM,
+ -1, NULL, 0, NULL);
+
+ /* We overwrite the /dev/zero mapping with the actual file. */
+ if (mmap(dev->mem, getpagesize(), PROT_READ|PROT_WRITE,
+ MAP_FIXED|MAP_SHARED, netfd, 0) != dev->mem)
+ err(1, "could not mmap '%s'", filename);
+ verbose("device %p: shared net %s, peer %i\n",
+ (void *)(dev->desc->pfn * getpagesize()), filename,
+ dev->desc->features & ~LGUEST_NET_F_NOCSUM);
+}
+
+static u32 str2ip(const char *ipaddr)
+{
+ unsigned int byte[4];
+
+ sscanf(ipaddr, "%u.%u.%u.%u", &byte[0], &byte[1], &byte[2], &byte[3]);
+ return (byte[0] << 24) | (byte[1] << 16) | (byte[2] << 8) | byte[3];
+}
+
+/* adapted from libbridge */
+static void add_to_bridge(int fd, const char *if_name, const char *br_name)
+{
+ int ifidx;
+ struct ifreq ifr;
+
+ if (!*br_name)
+ errx(1, "must specify bridge name");
+
+ ifidx = if_nametoindex(if_name);
+ if (!ifidx)
+ errx(1, "interface %s does not exist!", if_name);
+
+ strncpy(ifr.ifr_name, br_name, IFNAMSIZ);
+ ifr.ifr_ifindex = ifidx;
+ if (ioctl(fd, SIOCBRADDIF, &ifr) < 0)
+ err(1, "can't add %s to bridge %s", if_name, br_name);
+}
+
+static void configure_device(int fd, const char *devname, u32 ipaddr,
+ unsigned char hwaddr[6])
+{
+ struct ifreq ifr;
+ struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
+
+ memset(&ifr, 0, sizeof(ifr));
+ strcpy(ifr.ifr_name, devname);
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = htonl(ipaddr);
+ if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
+ err(1, "Setting %s interface address", devname);
+ ifr.ifr_flags = IFF_UP;
+ if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
+ err(1, "Bringing interface %s up", devname);
+
+ if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0)
+ err(1, "getting hw address for %s", devname);
+
+ memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6);
+}
+
+static void setup_tun_net(const char *arg, struct device_list *devices)
+{
+ struct device *dev;
+ struct ifreq ifr;
+ int netfd, ipfd;
+ u32 ip;
+ const char *br_name = NULL;
+
+ netfd = open_or_die("/dev/net/tun", O_RDWR);
+ memset(&ifr, 0, sizeof(ifr));
+ ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
+ strcpy(ifr.ifr_name, "tap%d");
+ if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
+ err(1, "configuring /dev/net/tun");
+
+ /* You will be peer 1: we should create enough jitter to randomize */
+ dev = new_device(devices, LGUEST_DEVICE_T_NET, 1,
+ NET_PEERNUM|LGUEST_DEVICE_F_RANDOMNESS, netfd,
+ handle_tun_input, peer_offset(0), handle_tun_output);
+ dev->priv = malloc(sizeof(bool));
+ *(bool *)dev->priv = false;
+
+ ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
+ if (ipfd < 0)
+ err(1, "opening IP socket");
+
+ if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) {
+ ip = INADDR_ANY;
+ br_name = arg + strlen(BRIDGE_PFX);
+ add_to_bridge(ipfd, ifr.ifr_name, br_name);
+ } else
+ ip = str2ip(arg);
+
+ /* We are peer 0, ie. first slot. */
+ configure_device(ipfd, ifr.ifr_name, ip, dev->mem);
+ close(ipfd);
+
+ verbose("device %p: tun net %u.%u.%u.%u\n",
+ (void *)(dev->desc->pfn * getpagesize()),
+ (u8)(ip>>24), (u8)(ip>>16), (u8)(ip>>8), (u8)ip);
+ if (br_name)
+ verbose("attached to bridge: %s\n", br_name);
+}
+
+/* Now we know how much memory we have, we copy in device descriptors */
+static void map_device_descriptors(struct device_list *devs, unsigned long mem)
+{
+ struct device *i;
+ unsigned int num;
+ struct lguest_device_desc *descs;
+
+ /* Device descriptor array sits just above top of normal memory */
+ descs = map_zeroed_pages(mem, 1);
+
+ for (i = devs->dev, num = 0; i; i = i->next, num++) {
+ if (num == LGUEST_MAX_DEVICES)
+ errx(1, "too many devices");
+ verbose("Device %i: %s\n", num,
+ i->desc->type == LGUEST_DEVICE_T_NET ? "net"
+ : i->desc->type == LGUEST_DEVICE_T_CONSOLE ? "console"
+ : i->desc->type == LGUEST_DEVICE_T_BLOCK ? "block"
+ : "unknown");
+ descs[num] = *i->desc;
+ free(i->desc);
+ i->desc = &descs[num];
+ }
+}
+
+static void __attribute__((noreturn))
+run_guest(int lguest_fd, int waker_fd, struct device_list *device_list)
+{
+ sigset_t sigset;
+
+ sigemptyset(&sigset);
+ sigaddset(&sigset, SIGUSR1);
+ for (;;) {
+ unsigned long arr[2];
+ int readval;
+
+ sigprocmask(SIG_UNBLOCK, &sigset, NULL);
+ readval = read(lguest_fd, arr, sizeof(arr));
+ sigprocmask(SIG_BLOCK, &sigset, NULL);
+
+ if (readval == sizeof(arr))
+ handle_output(lguest_fd, arr[0], arr[1], device_list);
+ else if (errno == ENOENT) {
+ char reason[1024] = { 0 };
+ read(lguest_fd, reason, sizeof(reason)-1);
+ errx(1, "%s", reason);
+ } else if (errno != EINTR)
+ err(1, "Running guest failed");
+ handle_input(lguest_fd, waker_fd, device_list);
+ }
+}
+
+static struct option opts[] = {
+ { "verbose", 0, NULL, 'v' },
+ { "sharenet", 1, NULL, 's' },
+ { "tunnet", 1, NULL, 't' },
+ { "block", 1, NULL, 'b' },
+ { "initrd", 1, NULL, 'i' },
+};
+static void usage(void)
+{
+ errx(1, "Usage: lguest [--verbose] "
+ "[--sharenet=<filename>|--tunnet=(<ipaddr>|bridge:<bridgename>)\n"
+ "|--block=<filename>|--initrd=<filename>]...\n"
+ "<mem-in-mb> vmlinux [args...]");
+}
+
+int main(int argc, char *argv[])
+{
+ unsigned long mem, pgdir, start, page_offset;
+ int c, lguest_fd, waker_fd;
+ struct device_list device_list;
+ struct lguest_boot_info *boot = (void *)0;
+ const char *initrd_name = NULL;
+
+ device_list.max_infd = -1;
+ device_list.dev = NULL;
+ device_list.lastdev = &device_list.dev;
+ FD_ZERO(&device_list.infds);
+
+ while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) {
+ switch (c) {
+ case 'v':
+ verbose = true;
+ break;
+ case 's':
+ setup_net_file(optarg, &device_list);
+ break;
+ case 't':
+ setup_tun_net(optarg, &device_list);
+ break;
+ case 'b':
+ setup_block_file(optarg, &device_list);
+ break;
+ case 'i':
+ initrd_name = optarg;
+ break;
+ default:
+ warnx("Unknown argument %s", argv[optind]);
+ usage();
+ }
+ }
+ if (optind + 2 > argc)
+ usage();
+
+ /* We need a console device */
+ setup_console(&device_list);
+
+ /* First we map /dev/zero over all of guest-physical memory. */
+ mem = atoi(argv[optind]) * 1024 * 1024;
+ map_zeroed_pages(0, mem / getpagesize());
+
+ /* Now we load the kernel */
+ start = load_kernel(open_or_die(argv[optind+1], O_RDONLY),
+ &page_offset);
+
+ /* Write the device descriptors into memory. */
+ map_device_descriptors(&device_list, mem);
+
+ /* Map the initrd image if requested */
+ if (initrd_name)
+ boot->initrd_size = load_initrd(initrd_name, mem);
+
+ /* Set up the initial linear pagetables. */
+ pgdir = setup_pagetables(mem, boot->initrd_size, page_offset);
+
+ /* Give the guest the boot information it needs. */
+ concat(boot->cmdline, argv+optind+2);
+ boot->max_pfn = mem/getpagesize();
+
+ lguest_fd = tell_kernel(pgdir, start, page_offset);
+ waker_fd = setup_waker(&device_list);
+
+ run_guest(lguest_fd, waker_fd, &device_list);
+}
===================================================================
--- /dev/null
+++ b/Documentation/lguest/lguest.txt
@@ -0,0 +1,125 @@
+Rusty's Remarkably Unreliable Guide to Lguest
+ - or, A Young Coder's Illustrated Hypervisor
+http://lguest.ozlabs.org
+
+Lguest is designed to be a minimal hypervisor for the Linux kernel, for
+Linux developers and users to experiment with virtualization with the
+minimum of complexity. Nonetheless, it should have sufficient
+features to make it useful for specific tasks, and, of course, you are
+encouraged to fork and enhance it.
+
+Features:
+
+- Kernel module which runs in a normal kernel.
+- Simple I/O model for communication.
+- Simple program to create new guests.
+- Logo contains cute puppies: http://lguest.ozlabs.org
+
+Developer features:
+
+- Fun to hack on.
+- No ABI: being tied to a specific kernel anyway, you can change anything.
+- Many opportunities for improvement or feature implementation.
+
+Running Lguest:
+
+- You will need to configure your kernel with the following options:
+
+ CONFIG_HIGHMEM64G=n ("High Memory Support" "64GB")[1]
+ CONFIG_TUN=y/m ("Universal TUN/TAP device driver support")
+ CONFIG_EXPERIMENTAL=y ("Prompt for development and/or incomplete code/drivers")
+ CONFIG_PARAVIRT=y ("Paravirtualization support (EXPERIMENTAL)")
+ CONFIG_LGUEST=y/m ("Linux hypervisor example code")
+
+ and I recommend:
+ CONFIG_HZ=100 ("Timer frequency")[2]
+
+- A tool called "lguest" is available in this directory: type "make"
+ to build it.
+
+- Create or find a root disk image. There are several useful ones
+ around, such as the xm-test tiny root image at
+ http://xm-test.xensource.com/ramdisks/initrd-1.1-i386.img
+
+ For more serious work, I usually use a distribution ISO image and
+ install it under qemu, then make multiple copies:
+
+ dd if=/dev/zero of=rootfile bs=1M count=2048
+ qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d
+
+- "modprobe lg" if you built it as a module.
+
+- Run an lguest as root:
+
+ Documentation/lguest/lguest 64m vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/lgba
+
+ Explanation:
+ 64m: the amount of memory to use.
+
+ vmlinux: the kernel image found in the top of your build directory. You
+ can also use a standard bzImage.
+
+ --tunnet=192.168.19.1: configures a "tap" device for networking with this
+ IP address.
+
+ --block=rootfile: a file or block device which becomes /dev/lgba
+ inside the guest.
+
+ root=/dev/lgba: this (and anything else on the command line) are
+ kernel boot parameters.
+
+- Configuring networking. I usually have the host masquerade, using
+ "iptables -t nat -o eth0 -j MASQUERADE" and "echo 1 >
+ /proc/sys/net/ipv4/ip_forward". In this example, I would configure
+ eth0 inside the guest at 192.168.19.2.
+
+ Another method is to bridge the tap device to an external interface
+ using --tunnet=bridge:<bridgename>, and perhaps run dhcp on the guest
+ to obtain an IP address. The bridge needs to be configured first:
+ this option simply adds the tap interface to it.
+
+ A simple example on my system:
+
+ ifconfig eth0 0.0.0.0
+ brctl addbr lg0
+ ifconfig lg0 up
+ brctl addif lg0 eth0
+ dhclient lg0
+
+ Then use --tunnet=bridge:lg0 when launching the guest.
+
+ See http://linux-net.osdl.org/index.php/Bridge for general information
+ on how to get bridging working.
+
+- You can also create an inter-guest network using
+ "--sharenet=<filename>": any two guests using the same file are on
+ the same network. This file is created if it does not exist.
+
+Lguest I/O model:
+
+Lguest uses a simplified DMA model plus shared memory for I/O. Guests
+can communicate with each other if they share underlying memory
+(usually by the lguest program mmapping the same file), but they can
+use any non-shared memory to communicate with the lguest process.
+
+Guests can register DMA buffers at any key (must be a valid physical
+address) using the LHCALL_BIND_DMA(key, dmabufs, num<<8|irq)
+hypercall. "dmabufs" is the physical address of an array of "num"
+"struct lguest_dma": each contains a used_len, and an array of
+physical addresses and lengths. When a transfer occurs, the
+"used_len" field of one of the buffers which has used_len 0 will be
+set to the length transferred and the irq will fire.
+
+Using an irq value of 0 unbinds the dma buffers.
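+
+As a concrete (if hypothetical) illustration, a guest could bind two
+page-sized receive buffers like this; the names here are invented for
+the example, but the pattern is what lguestnet_open() above does:
+
+ #define EXAMPLE_IRQ 5
+ #define EXAMPLE_BUFS 2
+ static struct lguest_dma example_rx[EXAMPLE_BUFS];
+ static char example_buf[EXAMPLE_BUFS][PAGE_SIZE];
+
+ static void example_bind(unsigned long key)
+ {
+         unsigned int i;
+
+         for (i = 0; i < EXAMPLE_BUFS; i++) {
+                 example_rx[i].addr[0] = __pa(example_buf[i]);
+                 example_rx[i].len[0] = PAGE_SIZE;
+                 example_rx[i].len[1] = 0;   /* Terminate the sg list. */
+                 example_rx[i].used_len = 0; /* 0 == free for input. */
+         }
+         /* num<<8|irq, as described above. */
+         hcall(LHCALL_BIND_DMA, key, __pa(example_rx),
+               (EXAMPLE_BUFS << 8) | EXAMPLE_IRQ);
+ }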
+
+To send DMA, use the LHCALL_SEND_DMA(key, dma_physaddr) hypercall: the
+number of bytes transferred is written back to the used_len field.
+This can be 0 if no one else has bound a DMA buffer to that key, or on
+some other error. DMA buffers bound by the same guest are ignored.
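+
+Sending is the mirror image; a minimal sketch, again with invented
+names, matching what transfer_packet() in the net driver does (a
+single sg entry works because guest physical memory is linear):
+
+ static void example_send(unsigned long key, void *buf, unsigned int len)
+ {
+         struct lguest_dma dma;
+
+         dma.addr[0] = __pa(buf);
+         dma.len[0] = len;
+         dma.len[1] = 0;          /* Terminate the sg list. */
+         hcall(LHCALL_SEND_DMA, key, __pa(&dma), 0);
+         if (dma.used_len != len) /* 0: nothing bound, or an error. */
+                 printk("example: sent %u of %u bytes\n",
+                        dma.used_len, len);
+ }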
+
+Cheers!
+Rusty Russell rusty@rustcorp.com.au.
+
+[1] These are on various places on the TODO list, waiting for you to
+ get annoyed enough at the limitation to fix it.
+[2] Lguest is not yet tickless when idle. See [1].
* Re: [PATCH 7/8] lguest: the block driver
[not found] ` <1176203528.26372.40.camel@localhost.localdomain>
2007-04-10 11:12 ` [PATCH 8/8] lguest: the documentation, example launcher Rusty Russell
@ 2007-04-10 11:28 ` Pekka Enberg
[not found] ` <84144f020704100428r6e35e34cued8758c94728a6a0@mail.gmail.com>
2 siblings, 0 replies; 13+ messages in thread
From: Pekka Enberg @ 2007-04-10 11:28 UTC (permalink / raw)
To: Rusty Russell
Cc: Andi Kleen, Andrew Morton, lkml - Kernel Mailing List,
virtualization
Hi Rusty,
On 4/10/07, Rusty Russell <rusty@rustcorp.com.au> wrote:
> +/* Jens gave me this nice helper to end all chunks of a request. */
> +static void end_entire_request(struct request *req, int uptodate)
> +{
> + if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
> + BUG();
> + add_disk_randomness(req->rq_disk);
> + blkdev_dequeue_request(req);
> + end_that_request_last(req, uptodate);
> +}
Perhaps we should move this to generic code (i.e. block/ll_rw_blk.c)?
* Re: [PATCH 7/8] lguest: the block driver
[not found] ` <84144f020704100428r6e35e34cued8758c94728a6a0@mail.gmail.com>
@ 2007-04-10 11:36 ` Pekka Enberg
[not found] ` <84144f020704100436y3405f104x8345c72556d961e9@mail.gmail.com>
1 sibling, 0 replies; 13+ messages in thread
From: Pekka Enberg @ 2007-04-10 11:36 UTC (permalink / raw)
To: Rusty Russell
Cc: Andi Kleen, Andrew Morton, jens.axboe, lkml - Kernel Mailing List,
virtualization
On 4/10/07, Rusty Russell <rusty@rustcorp.com.au> wrote:
> > +/* Jens gave me this nice helper to end all chunks of a request. */
> > +static void end_entire_request(struct request *req, int uptodate)
> > +{
> > + if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
> > + BUG();
> > + add_disk_randomness(req->rq_disk);
> > + blkdev_dequeue_request(req);
> > + end_that_request_last(req, uptodate);
> > +}
On 4/10/07, Pekka Enberg <penberg@cs.helsinki.fi> wrote:
> Perhaps we should move this to generic code (i.e. block/ll_rw_blk.c)?
Uhm, I am a bit confused now. Why don't you just use end_request() here?
* Re: [PATCH 7/8] lguest: the block driver
[not found] ` <84144f020704100436y3405f104x8345c72556d961e9@mail.gmail.com>
@ 2007-04-11 4:00 ` Rusty Russell
2007-04-11 5:29 ` Pekka Enberg
0 siblings, 1 reply; 13+ messages in thread
From: Rusty Russell @ 2007-04-11 4:00 UTC (permalink / raw)
To: Pekka Enberg
Cc: Andi Kleen, Andrew Morton, jens.axboe, lkml - Kernel Mailing List,
virtualization
On Tue, 2007-04-10 at 14:36 +0300, Pekka Enberg wrote:
> On 4/10/07, Rusty Russell <rusty@rustcorp.com.au> wrote:
> > > +/* Jens gave me this nice helper to end all chunks of a request. */
> > > +static void end_entire_request(struct request *req, int uptodate)
> > > +{
> > > + if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
> > > + BUG();
> > > + add_disk_randomness(req->rq_disk);
> > > + blkdev_dequeue_request(req);
> > > + end_that_request_last(req, uptodate);
> > > +}
>
> On 4/10/07, Pekka Enberg <penberg@cs.helsinki.fi> wrote:
> > Perhaps we should move this to generic code (i.e. block/ll_rw_blk.c)?
Yeah, Jens said to put it in here and he'd hoist it later.
> Uhm, I am bit confused now. Why don't you just use end_request() here?
What a question! end_request() doesn't end a request! What a crazy
idea!
As far as I can tell, every name in the block layer is actually some
variant of "fuck off, this is too complicated for you to understand".
Hope that clarifies!
Rusty.
* Re: [PATCH 7/8] lguest: the block driver
2007-04-11 4:00 ` Rusty Russell
@ 2007-04-11 5:29 ` Pekka Enberg
0 siblings, 0 replies; 13+ messages in thread
From: Pekka Enberg @ 2007-04-11 5:29 UTC (permalink / raw)
To: Rusty Russell
Cc: Andi Kleen, Andrew Morton, jens.axboe, lkml - Kernel Mailing List,
virtualization
On 4/11/07, Rusty Russell <rusty@rustcorp.com.au> wrote:
> What a question! end_request() doesn't end a request! What a crazy
> idea!
Aah, indeed, end_request() uses req->hard_cur_sectors while
end_entire_request() uses req->hard_nr_sectors which I missed.
On 4/11/07, Rusty Russell <rusty@rustcorp.com.au> wrote:
> Hope that clarifies!
It does. Thanks Rusty :)
Pekka
end of thread
Thread overview: 13+ messages
[not found] <1176203068.26372.21.camel@localhost.localdomain>
2007-04-10 11:05 ` [PATCH 1/8] lguest: the guest code Rusty Russell
[not found] ` <1176203130.26372.23.camel@localhost.localdomain>
2007-04-10 11:06 ` [PATCH 2/8] lguest: the host code Rusty Russell
[not found] ` <1176203176.26372.25.camel@localhost.localdomain>
2007-04-10 11:07 ` [PATCH 3/8] lguest: the asm offsets Rusty Russell
[not found] ` <1176203231.26372.27.camel@localhost.localdomain>
2007-04-10 11:08 ` [PATCH 4/8] lguest: the Makefile and Kconfig Rusty Russell
2007-04-10 11:08 ` [PATCH 5/8] lguest: the console driver Rusty Russell
2007-04-10 11:09 ` [PATCH 5/8] lguest: the net driver Rusty Russell
2007-04-10 11:11 ` [PATCH 6/8] " Rusty Russell
[not found] ` <1176203475.26372.37.camel@localhost.localdomain>
2007-04-10 11:12 ` [PATCH 7/8] lguest: the block driver Rusty Russell
[not found] ` <1176203528.26372.40.camel@localhost.localdomain>
2007-04-10 11:12 ` [PATCH 8/8] lguest: the documentation, example launcher Rusty Russell
2007-04-10 11:28 ` [PATCH 7/8] lguest: the block driver Pekka Enberg
[not found] ` <84144f020704100428r6e35e34cued8758c94728a6a0@mail.gmail.com>
2007-04-10 11:36 ` Pekka Enberg
[not found] ` <84144f020704100436y3405f104x8345c72556d961e9@mail.gmail.com>
2007-04-11 4:00 ` Rusty Russell
2007-04-11 5:29 ` Pekka Enberg