* [patch] KVM: paravirtual guest support
@ 2007-01-09 9:33 Ingo Molnar
[not found] ` <20070109093354.GA10318-X9Un+BFzKDI@public.gmane.org>
0 siblings, 1 reply; 4+ messages in thread
From: Ingo Molnar @ 2007-01-09 9:33 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm-devel
Subject: [patch] KVM: paravirtual guest support
From: Ingo Molnar <mingo-X9Un+BFzKDI@public.gmane.org>
this enables a CONFIG_PARAVIRT Linux guest kernel to establish a
hypercall API to a KVM host. If successfully registered, then the Linux
guest will optimize a few things like its interrupt controller, io-delay
and it also registers its cr3-cache structures with the host. (but the
host will not touch those, just yet)
(this is fully backwards compatible - if the WRMSR fails then the Linux
guest continues to execute as a native kernel.)
Signed-off-by: Ingo Molnar <mingo-X9Un+BFzKDI@public.gmane.org>
---
arch/i386/kernel/paravirt.c | 275 ++++++++++++++++++++++++++++++++++++++++++++
drivers/serial/8250.c | 3
include/linux/paravirt.h | 12 +
init/main.c | 6
4 files changed, 294 insertions(+), 2 deletions(-)
Index: linux/arch/i386/kernel/paravirt.c
===================================================================
--- linux.orig/arch/i386/kernel/paravirt.c
+++ linux/arch/i386/kernel/paravirt.c
@@ -20,6 +20,7 @@
#include <linux/efi.h>
#include <linux/bcd.h>
#include <linux/start_kernel.h>
+#include <linux/kvm_para.h>
#include <asm/bug.h>
#include <asm/paravirt.h>
@@ -33,6 +34,9 @@
#include <asm/apic.h>
#include <asm/tlbflush.h>
+#include <asm/i8259.h>
+#include <io_ports.h>
+
/* nop stub */
static void native_nop(void)
{
@@ -683,3 +687,274 @@ struct paravirt_ops paravirt_ops = {
.irq_enable_sysexit = native_irq_enable_sysexit,
.iret = native_iret,
};
+
+/*
+ * KVM paravirtualization optimizations:
+ */
+int kvm_paravirt;
+
+/*
+ * No need for any "IO delay" on KVM:
+ */
+static void kvm_io_delay(void)
+{
+}
+
+static DEFINE_PER_CPU(struct kvm_vcpu_para_state, para_state);
+
+/*
+ * Special, register-to-cr3 instruction based hypercall API
+ * variant to the KVM host. This utilizes the cr3 filter capability
+ * of the hardware - if this works out then no VM exit happens,
+ * if a VM exit happens then KVM will get the virtual address too.
+ */
+static void kvm_write_cr3(unsigned long guest_cr3)
+{
+ struct kvm_vcpu_para_state *para_state = &get_cpu_var(para_state);
+ struct kvm_cr3_cache *cache = &para_state->cr3_cache;
+ int idx;
+
+ /*
+ * Check the cache (maintained by the host) for a matching
+ * guest_cr3 => host_cr3 mapping. Use it if found:
+ */
+ for (idx = 0; idx < cache->max_idx; idx++) {
+ if (cache->entry[idx].guest_cr3 == guest_cr3) {
+ /*
+ * Cache-hit: we load the cached host-CR3 value.
+ * This never causes any VM exit. (if it does then the
+ * hypervisor could do nothing with this instruction
+ * and the guest OS would be aborted)
+ */
+ asm volatile("movl %0, %%cr3"
+ : : "r" (cache->entry[idx].host_cr3));
+ goto out;
+ }
+ }
+
+ /*
+ * Cache-miss. Load the guest-cr3 value into cr3, which will
+ * cause a VM exit to the hypervisor, which then loads the
+ * host cr3 value and updates the cr3_cache.
+ */
+ asm volatile("movl %0, %%cr3" : : "r" (guest_cr3));
+out:
+ put_cpu_var(para_state);
+}
+
+/*
+ * Avoid the VM exit upon cr3 load by using the cached
+ * ->active_mm->pgd value:
+ */
+static void kvm_flush_tlb_user(void)
+{
+ kvm_write_cr3(__pa(current->active_mm->pgd));
+}
+
+static void kvm_flush_tlb_single(u32 addr)
+{
+ __native_flush_tlb_single(addr);
+}
+/*
+ * Disable global pages, do a flush, then enable global pages:
+ */
+static fastcall void kvm_flush_tlb_kernel(void)
+{
+ unsigned long orig_cr4 = read_cr4();
+
+ write_cr4(orig_cr4 & ~X86_CR4_PGE);
+ kvm_flush_tlb_user();
+ write_cr4(orig_cr4);
+}
+
+/*
+ * Simplified i8259A controller handling:
+ */
+static void mask_and_ack_kvm(unsigned int irq)
+{
+ unsigned int irqmask = 1 << irq;
+ unsigned long flags;
+
+ spin_lock_irqsave(&i8259A_lock, flags);
+ cached_irq_mask |= irqmask;
+
+ if (irq & 8) {
+ outb(cached_slave_mask, PIC_SLAVE_IMR);
+ outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */
+ outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */
+ } else {
+ outb(cached_master_mask, PIC_MASTER_IMR);
+ /* 'Specific EOI' to master: */
+ outb(0x60+irq, PIC_MASTER_CMD);
+ }
+ spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+static void disable_kvm_irq(unsigned int irq)
+{
+ unsigned int mask = 1 << irq;
+ unsigned long flags;
+
+ spin_lock_irqsave(&i8259A_lock, flags);
+ cached_irq_mask |= mask;
+ if (irq & 8)
+ outb(cached_slave_mask, PIC_SLAVE_IMR);
+ else
+ outb(cached_master_mask, PIC_MASTER_IMR);
+ spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+static void enable_kvm_irq(unsigned int irq)
+{
+ unsigned int mask = ~(1 << irq);
+ unsigned long flags;
+
+ spin_lock_irqsave(&i8259A_lock, flags);
+ cached_irq_mask &= mask;
+ if (irq & 8)
+ outb(cached_slave_mask, PIC_SLAVE_IMR);
+ else
+ outb(cached_master_mask, PIC_MASTER_IMR);
+ spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+static struct irq_chip kvm_chip = {
+ .name = "XT-PIC",
+ .mask = disable_kvm_irq,
+ .disable = disable_kvm_irq,
+ .unmask = enable_kvm_irq,
+ .mask_ack = mask_and_ack_kvm,
+};
+
+static void __init kvm_init_IRQ(void)
+{
+ int i;
+
+ printk("init KVM IRQ controller\n");
+#ifdef CONFIG_X86_LOCAL_APIC
+ init_bsp_APIC();
+#endif
+ init_8259A(0);
+
+ for (i = 0; i < NR_IRQS; i++) {
+ irq_desc[i].status = IRQ_DISABLED;
+ irq_desc[i].action = NULL;
+ irq_desc[i].depth = 1;
+
+ if (i < 16) {
+ /*
+ * 16 old-style INTA-cycle interrupts:
+ */
+ set_irq_chip_and_handler_name(i, &kvm_chip,
+ handle_level_irq, "XT");
+ } else {
+ /*
+ * 'high' PCI IRQs filled in on demand
+ */
+ irq_desc[i].chip = &no_irq_chip;
+ }
+ }
+
+ /*
+ * Cover the whole vector space, no vector can escape
+ * us. (some of these will be overridden and become
+ * 'special' SMP interrupts)
+ */
+ for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
+ int vector = FIRST_EXTERNAL_VECTOR + i;
+ if (i >= NR_IRQS)
+ break;
+ if (vector != SYSCALL_VECTOR)
+ set_intr_gate(vector, interrupt[i]);
+ }
+
+ /* setup after call gates are initialised (usually add in
+ * the architecture specific gates)
+ */
+ intr_init_hook();
+
+ irq_ctx_init(smp_processor_id());
+}
+
+/*
+ * This is the vm-syscall address - to be patched by the host to
+ * VMCALL (Intel) or VMMCALL (AMD), depending on the CPU model:
+ */
+asm (
+ " .globl vm_syscall_addr \n"
+ " .align 4 \n"
+ " vm_syscall_addr: \n"
+ " nop \n"
+ " nop \n"
+ " nop \n"
+ " ret \n"
+);
+
+extern unsigned char vm_syscall_addr[4];
+
+int kvm_guest_register_para(int cpu)
+{
+ struct kvm_vcpu_para_state *para_state = &per_cpu(para_state, cpu);
+
+ printk("kvm guest on VCPU#%d: trying to register para_state %p\n",
+ cpu, para_state);
+ /*
+ * Write the physical address of para_state to the KVM API
+ * magic MSR, and thus signal to KVM that we are entering
+ * paravirtualized mode:
+ */
+ para_state->guest_version = KVM_PARA_API_VERSION;
+ para_state->host_version = -1;
+ para_state->size = sizeof(*para_state);
+ para_state->ret = 0;
+ para_state->vm_syscall_addr = __pa(vm_syscall_addr);
+
+ if (wrmsr_safe(MSR_KVM_API_MAGIC, __pa(para_state), 0)) {
+ printk("KVM guest: WRMSR probe failed.\n");
+ return 0;
+ }
+
+ printk("kvm guest: host returned %d\n", para_state->ret);
+ printk("kvm guest: host version: %d\n", para_state->host_version);
+ printk("kvm guest: cr3 cache size: %d\n",
+ para_state->cr3_cache.max_idx);
+ printk("kvm guest: syscall entry: %02x %02x %02x %02x\n",
+ vm_syscall_addr[0], vm_syscall_addr[1],
+ vm_syscall_addr[2], vm_syscall_addr[3]);
+ if (para_state->ret) {
+ printk("kvm guest: host refused registration.\n");
+ return 0;
+ }
+
+ return 1;
+}
+
+static int __init kvm_paravirt_setup(char *s)
+{
+ printk("KVM paravirtualization setup\n");
+ if (sscanf(s, "%u", &kvm_paravirt) <= 0)
+ return 1;
+ if (!kvm_paravirt)
+ return 1;
+
+ kvm_paravirt = kvm_guest_register_para(smp_processor_id());
+ if (!kvm_paravirt)
+ return 1;
+
+ printk("KVM paravirtualized: OK\n");
+
+ paravirt_ops.name = "KVM";
+ paravirt_ops.io_delay = kvm_io_delay;
+ paravirt_ops.init_IRQ = kvm_init_IRQ;
+ paravirt_ops.flush_tlb_user = kvm_flush_tlb_user;
+ paravirt_ops.flush_tlb_kernel = kvm_flush_tlb_kernel;
+ paravirt_ops.flush_tlb_single = kvm_flush_tlb_single;
+ paravirt_ops.write_cr3 = kvm_write_cr3;
+ paravirt_ops.paravirt_enabled = 1;
+
+ return 1;
+}
+__setup("kvm_paravirt=", kvm_paravirt_setup);
+
+EXPORT_SYMBOL_GPL(paravirt_ops);
+
Index: linux/drivers/serial/8250.c
===================================================================
--- linux.orig/drivers/serial/8250.c
+++ linux/drivers/serial/8250.c
@@ -27,6 +27,7 @@
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/ioport.h>
+#include <linux/paravirt.h>
#include <linux/init.h>
#include <linux/console.h>
#include <linux/sysrq.h>
@@ -1371,7 +1372,7 @@ static irqreturn_t serial8250_interrupt(
l = l->next;
- if (l == i->head && pass_counter++ > PASS_LIMIT) {
+ if (!paravirt_enabled() && l == i->head && pass_counter++ > PASS_LIMIT) {
/* If we hit this, we're dead. */
printk(KERN_ERR "serial8250: too much work for "
"irq%d\n", irq);
Index: linux/include/linux/paravirt.h
===================================================================
--- /dev/null
+++ linux/include/linux/paravirt.h
@@ -0,0 +1,12 @@
+#ifndef __LINUX_PARAVIRT_H
+#define __LINUX_PARAVIRT_H
+
+/*
+ * Paravirtualization support
+ */
+
+#ifndef CONFIG_PARAVIRT
+# define paravirt_enabled() 0
+#endif
+
+#endif
Index: linux/init/main.c
===================================================================
--- linux.orig/init/main.c
+++ linux/init/main.c
@@ -374,7 +374,11 @@ static void __init setup_per_cpu_areas(v
if (size < PERCPU_ENOUGH_ROOM)
size = PERCPU_ENOUGH_ROOM;
#endif
- ptr = alloc_bootmem(size * nr_possible_cpus);
+ /*
+ * Align them to page size - in case someone page-aligns the
+ * per-CPU data, that alignment should be preserved:
+ */
+ ptr = alloc_bootmem_pages(size * nr_possible_cpus);
for_each_possible_cpu(i) {
__per_cpu_offset[i] = ptr - __per_cpu_start;
-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys - and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [patch] KVM: paravirtual guest support
[not found] ` <20070109093354.GA10318-X9Un+BFzKDI@public.gmane.org>
@ 2007-01-09 9:41 ` Ingo Molnar
2007-01-09 10:13 ` Avi Kivity
1 sibling, 0 replies; 4+ messages in thread
From: Ingo Molnar @ 2007-01-09 9:41 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm-devel
* Ingo Molnar <mingo-X9Un+BFzKDI@public.gmane.org> wrote:
> Subject: [patch] KVM: paravirtual guest support
> From: Ingo Molnar <mingo-X9Un+BFzKDI@public.gmane.org>
also needs this patch.
Ingo
Index: linux/include/asm-i386/processor.h
===================================================================
--- linux.orig/include/asm-i386/processor.h
+++ linux/include/asm-i386/processor.h
@@ -547,7 +547,6 @@ static inline void rep_nop(void)
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
-#define paravirt_enabled() 0
#define __cpuid native_cpuid
static inline void load_esp0(struct tss_struct *tss, struct thread_struct *thread)
-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys - and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [patch] KVM: paravirtual guest support
[not found] ` <20070109093354.GA10318-X9Un+BFzKDI@public.gmane.org>
2007-01-09 9:41 ` Ingo Molnar
@ 2007-01-09 10:13 ` Avi Kivity
[not found] ` <45A36AE4.6000904-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
1 sibling, 1 reply; 4+ messages in thread
From: Avi Kivity @ 2007-01-09 10:13 UTC (permalink / raw)
To: Ingo Molnar; +Cc: kvm-devel
Ingo Molnar wrote:
> Subject: [patch] KVM: paravirtual guest support
> From: Ingo Molnar <mingo-X9Un+BFzKDI@public.gmane.org>
>
> this enables a CONFIG_PARAVIRT Linux guest kernel to establish a
> hypercall API to a KVM host. If successfully registered, then the Linux
> guest will optimize a few things like its interrupt controller, io-delay
> and it also registers its cr3-cache structures with the host. (but the
> host will not touch those, just yet)
>
> (this is fully backwards compatible - if the WRMSR fails then the Linux
> guest continues to execute as a native kernel.)
>
> Signed-off-by: Ingo Molnar <mingo-X9Un+BFzKDI@public.gmane.org>
> ---
> + /*
> + * Cache-hit: we load the cached host-CR3 value.
> + * This never causes any VM exit. (if it does then the
> + * hypervisor could do nothing with this instruction
> + * and the guest OS would be aborted)
> + */
> + asm volatile("movl %0, %%cr3"
> + : : "r" (cache->entry[idx].host_cr3));
> + goto out;
> + }
> + }
> +
> + /*
> + * Cache-miss. Load the guest-cr3 value into cr3, which will
> + * cause a VM exit to the hypervisor, which then loads the
> + * host cr3 value and updates the cr3_cache.
> + */
> + asm volatile("movl %0, %%cr3" : : "r" (guest_cr3));
> +out:
> + put_cpu_var(para_state);
> +}
> +
>
There's a problem here. A cache-hit cr3 is an hpa, while a cache-miss
cr3 is a gpa. The two could alias:
/* pseudoassembler */
guest:
mov $0x1234, %cr3 /* cache miss */
host:
creates shadow for 0x1234 at 0x5678
set cr3 cache entry: $0x1234 -> 0x5678
guest:
mov $0x5678, %cr3 /* cache miss */
vmx accepts that as a cached cr3 equivalent to the guest's 0x1234,
whereas a 0x5678 gpa cr3 was intended.
The only solution I see is to use the hypercall API on a cache miss.
Once the guest indicates it wants to use the cr3 cache, set_cr3() should
fault, and cr3 should only be accepted from the hypercall.
--
error compiling committee.c: too many arguments to function
-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys - and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [patch] KVM: paravirtual guest support
[not found] ` <45A36AE4.6000904-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-01-09 10:46 ` Ingo Molnar
0 siblings, 0 replies; 4+ messages in thread
From: Ingo Molnar @ 2007-01-09 10:46 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm-devel
* Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote:
> There's a problem here. A cache-hit cr3 is an hpa, while a cache-miss
> cr3 is a gpa. The two could alias:
ok, agreed. This is nice in fact, because this will be a live test of
the hypercall API :-)
> The only solution I see is to use the hypercall API on a cache miss.
> Once the guest indicates it wants to use the cr3 cache, set_cr3()
> should fault, and cr3 should only be accepted from the hypercall.
yeah.
Ingo
-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys - and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2007-01-09 10:46 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
2007-01-09 9:33 [patch] KVM: paravirtual guest support Ingo Molnar
[not found] ` <20070109093354.GA10318-X9Un+BFzKDI@public.gmane.org>
2007-01-09 9:41 ` Ingo Molnar
2007-01-09 10:13 ` Avi Kivity
[not found] ` <45A36AE4.6000904-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-01-09 10:46 ` Ingo Molnar
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox