[patch] KVM: paravirtual guest support

public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed

* [patch] KVM: paravirtual guest support
@ 2007-01-09  9:33 Ingo Molnar
       [not found] ` <20070109093354.GA10318-X9Un+BFzKDI@public.gmane.org>
  0 siblings, 1 reply; 4+ messages in thread
From: Ingo Molnar @ 2007-01-09  9:33 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel

Subject: [patch] KVM: paravirtual guest support
From: Ingo Molnar <mingo-X9Un+BFzKDI@public.gmane.org>

this enables a CONFIG_PARAVIRT Linux guest kernel to establish a 
hypercall API to a KVM host. If successfully registered, then the Linux 
guest will optimize a few things like its interrupt controller, io-delay 
and it also registers its cr3-cache structures with the host. (but the 
host will not touch those, just yet)

(this is fully backwards compatible - if the WRMSR fails then the Linux
guest continues to execute as a native kernel.)

Signed-off-by: Ingo Molnar <mingo-X9Un+BFzKDI@public.gmane.org>
---
 arch/i386/kernel/paravirt.c |  275 ++++++++++++++++++++++++++++++++++++++++++++
 drivers/serial/8250.c       |    3 
 include/linux/paravirt.h    |   12 +
 init/main.c                 |    6 
 4 files changed, 294 insertions(+), 2 deletions(-)

Index: linux/arch/i386/kernel/paravirt.c
===================================================================
--- linux.orig/arch/i386/kernel/paravirt.c
+++ linux/arch/i386/kernel/paravirt.c
@@ -20,6 +20,7 @@
 #include <linux/efi.h>
 #include <linux/bcd.h>
 #include <linux/start_kernel.h>
+#include <linux/kvm_para.h>
 
 #include <asm/bug.h>
 #include <asm/paravirt.h>
@@ -33,6 +34,9 @@
 #include <asm/apic.h>
 #include <asm/tlbflush.h>
 
+#include <asm/i8259.h>
+#include <io_ports.h>
+
 /* nop stub */
 static void native_nop(void)
 {
@@ -683,3 +687,274 @@ struct paravirt_ops paravirt_ops = {
 	.irq_enable_sysexit = native_irq_enable_sysexit,
 	.iret = native_iret,
 };
+
+/*
+ * KVM paravirtualization optimizations:
+ */
+int kvm_paravirt;
+
+/*
+ * No need for any "IO delay" on KVM:
+ */
+static void kvm_io_delay(void)
+{
+}
+
+static DEFINE_PER_CPU(struct kvm_vcpu_para_state, para_state);
+
+/*
+ * Special, register-to-cr3 instruction based hypercall API
+ * variant to the KVM host. This utilizes the cr3 filter capability
+ * of the hardware - if this works out then no VM exit happens,
+ * if a VM exit happens then KVM will get the virtual address too.
+ */
+static void kvm_write_cr3(unsigned long guest_cr3)
+{
+	struct kvm_vcpu_para_state *para_state = &get_cpu_var(para_state);
+	struct kvm_cr3_cache *cache = &para_state->cr3_cache;
+	int idx;
+
+	/*
+	 * Check the cache (maintained by the host) for a matching
+	 * guest_cr3 => host_cr3 mapping. Use it if found:
+	 */
+	for (idx = 0; idx < cache->max_idx; idx++) {
+		if (cache->entry[idx].guest_cr3 == guest_cr3) {
+			/*
+			 * Cache-hit: we load the cached host-CR3 value.
+			 * This never causes any VM exit. (if it does then the
+			 * hypervisor could do nothing with this instruction
+			 * and the guest OS would be aborted)
+			 */
+			asm volatile("movl %0, %%cr3"
+				: : "r" (cache->entry[idx].host_cr3));
+			goto out;
+		}
+	}
+
+	/*
+	 * Cache-miss. Load the guest-cr3 value into cr3, which will
+	 * cause a VM exit to the hypervisor, which then loads the
+	 * host cr3 value and updates the cr3_cache.
+	 */
+	asm volatile("movl %0, %%cr3" : : "r" (guest_cr3));
+out:
+	put_cpu_var(para_state);
+}
+
+/*
+ * Avoid the VM exit upon cr3 load by using the cached
+ * ->active_mm->pgd value:
+ */
+static void kvm_flush_tlb_user(void)
+{
+	kvm_write_cr3(__pa(current->active_mm->pgd));
+}
+
+static void kvm_flush_tlb_single(u32 addr)
+{
+	__native_flush_tlb_single(addr);
+}
+/*
+ * Disable global pages, do a flush, then enable global pages:
+ */
+static fastcall void kvm_flush_tlb_kernel(void)
+{
+	unsigned long orig_cr4 = read_cr4();
+
+	write_cr4(orig_cr4 & ~X86_CR4_PGE);
+	kvm_flush_tlb_user();
+	write_cr4(orig_cr4);
+}
+
+/*
+ * Simplified i8259A controller handling:
+ */
+static void mask_and_ack_kvm(unsigned int irq)
+{
+	unsigned int irqmask = 1 << irq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+	cached_irq_mask |= irqmask;
+
+	if (irq & 8) {
+		outb(cached_slave_mask, PIC_SLAVE_IMR);
+		outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */
+		outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */
+	} else {
+		outb(cached_master_mask, PIC_MASTER_IMR);
+		/* 'Specific EOI' to master: */
+		outb(0x60+irq, PIC_MASTER_CMD);
+	}
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+static void disable_kvm_irq(unsigned int irq)
+{
+	unsigned int mask = 1 << irq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+	cached_irq_mask |= mask;
+	if (irq & 8)
+		outb(cached_slave_mask, PIC_SLAVE_IMR);
+	else
+		outb(cached_master_mask, PIC_MASTER_IMR);
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+static void enable_kvm_irq(unsigned int irq)
+{
+	unsigned int mask = ~(1 << irq);
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+	cached_irq_mask &= mask;
+	if (irq & 8)
+		outb(cached_slave_mask, PIC_SLAVE_IMR);
+	else
+		outb(cached_master_mask, PIC_MASTER_IMR);
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+static struct irq_chip kvm_chip = {
+	.name		= "XT-PIC",
+	.mask		= disable_kvm_irq,
+	.disable	= disable_kvm_irq,
+	.unmask		= enable_kvm_irq,
+	.mask_ack	= mask_and_ack_kvm,
+};
+
+static void __init kvm_init_IRQ(void)
+{
+	int i;
+
+	printk("init KVM IRQ controller\n");
+#ifdef CONFIG_X86_LOCAL_APIC
+	init_bsp_APIC();
+#endif
+	init_8259A(0);
+
+	for (i = 0; i < NR_IRQS; i++) {
+		irq_desc[i].status = IRQ_DISABLED;
+		irq_desc[i].action = NULL;
+		irq_desc[i].depth = 1;
+
+		if (i < 16) {
+			/*
+			 * 16 old-style INTA-cycle interrupts:
+			 */
+			set_irq_chip_and_handler_name(i, &kvm_chip,
+						      handle_level_irq, "XT");
+		} else {
+			/*
+			 * 'high' PCI IRQs filled in on demand
+			 */
+			irq_desc[i].chip = &no_irq_chip;
+		}
+	}
+
+	/*
+	 * Cover the whole vector space, no vector can escape
+	 * us. (some of these will be overridden and become
+	 * 'special' SMP interrupts)
+	 */
+	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
+		int vector = FIRST_EXTERNAL_VECTOR + i;
+		if (i >= NR_IRQS)
+			break;
+		if (vector != SYSCALL_VECTOR)
+			set_intr_gate(vector, interrupt[i]);
+	}
+
+	/* setup after call gates are initialised (usually add in
+	 * the architecture specific gates)
+	 */
+	intr_init_hook();
+
+	irq_ctx_init(smp_processor_id());
+}
+
+/*
+ * This is the vm-syscall address - to be patched by the host to
+ * VMCALL (Intel) or VMMCALL (AMD), depending on the CPU model:
+ */
+asm (
+	"	.globl vm_syscall_addr		\n"
+	"	.align 4			\n"
+	"	vm_syscall_addr:		\n"
+	"		nop			\n"
+	"		nop			\n"
+	"		nop			\n"
+	"		ret			\n"
+);
+
+extern unsigned char vm_syscall_addr[4];
+
+int kvm_guest_register_para(int cpu)
+{
+	struct kvm_vcpu_para_state *para_state = &per_cpu(para_state, cpu);
+
+	printk("kvm guest on VCPU#%d: trying to register para_state %p\n",
+		cpu, para_state);
+	/*
+	 * Fill in the para_state and write its physical address
+	 * to the magic KVM MSR, and thus signal to KVM that we
+	 * are entering paravirtualized mode:
+	 */
+	para_state->guest_version = KVM_PARA_API_VERSION;
+	para_state->host_version = -1;
+	para_state->size = sizeof(*para_state);
+	para_state->ret = 0;
+	para_state->vm_syscall_addr = __pa(vm_syscall_addr);
+
+	if (wrmsr_safe(MSR_KVM_API_MAGIC, __pa(para_state), 0)) {
+		printk("KVM guest: WRMSR probe failed.\n");
+		return 0;
+	}
+
+	printk("kvm guest: host returned %d\n", para_state->ret);
+	printk("kvm guest: host version: %d\n", para_state->host_version);
+	printk("kvm guest: cr3 cache size: %d\n",
+				para_state->cr3_cache.max_idx);
+	printk("kvm guest: syscall entry: %02x %02x %02x %02x\n",
+			vm_syscall_addr[0], vm_syscall_addr[1],
+			vm_syscall_addr[2], vm_syscall_addr[3]);
+	if (para_state->ret) {
+		printk("kvm guest: host refused registration.\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+static int __init kvm_paravirt_setup(char *s)
+{
+	printk("KVM paravirtualization setup\n");
+        if (sscanf(s, "%u", &kvm_paravirt) <= 0)
+		return 1;
+	if (!kvm_paravirt)
+		return 1;
+
+	kvm_paravirt = kvm_guest_register_para(smp_processor_id());
+	if (!kvm_paravirt)
+		return 1;
+
+	printk("KVM paravirtualized: OK\n");
+
+	paravirt_ops.name = "KVM";
+	paravirt_ops.io_delay = kvm_io_delay;
+	paravirt_ops.init_IRQ = kvm_init_IRQ;
+	paravirt_ops.flush_tlb_user = kvm_flush_tlb_user;
+	paravirt_ops.flush_tlb_kernel = kvm_flush_tlb_kernel;
+	paravirt_ops.flush_tlb_single = kvm_flush_tlb_single;
+	paravirt_ops.write_cr3 = kvm_write_cr3;
+	paravirt_ops.paravirt_enabled = 1;
+
+	return 1;
+}
+__setup("kvm_paravirt=", kvm_paravirt_setup);
+
+EXPORT_SYMBOL_GPL(paravirt_ops);
+
Index: linux/drivers/serial/8250.c
===================================================================
--- linux.orig/drivers/serial/8250.c
+++ linux/drivers/serial/8250.c
@@ -27,6 +27,7 @@
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/ioport.h>
+#include <linux/paravirt.h>
 #include <linux/init.h>
 #include <linux/console.h>
 #include <linux/sysrq.h>
@@ -1371,7 +1372,7 @@ static irqreturn_t serial8250_interrupt(
 
 		l = l->next;
 
-		if (l == i->head && pass_counter++ > PASS_LIMIT) {
+		if (!paravirt_enabled() && l == i->head && pass_counter++ > PASS_LIMIT) {
 			/* If we hit this, we're dead. */
 			printk(KERN_ERR "serial8250: too much work for "
 				"irq%d\n", irq);
Index: linux/include/linux/paravirt.h
===================================================================
--- /dev/null
+++ linux/include/linux/paravirt.h
@@ -0,0 +1,12 @@
+#ifndef __LINUX_PARAVIRT_H
+#define __LINUX_PARAVIRT_H
+
+/*
+ * Paravirtualization support
+ */
+
+#ifndef CONFIG_PARAVIRT
+# define paravirt_enabled()	0
+#endif
+
+#endif
Index: linux/init/main.c
===================================================================
--- linux.orig/init/main.c
+++ linux/init/main.c
@@ -374,7 +374,11 @@ static void __init setup_per_cpu_areas(v
 	if (size < PERCPU_ENOUGH_ROOM)
 		size = PERCPU_ENOUGH_ROOM;
 #endif
-	ptr = alloc_bootmem(size * nr_possible_cpus);
+	/*
+	 * Align the area to page size - in case someone page-aligns
+	 * per-CPU data, that alignment should then be preserved:
+	 */
+	ptr = alloc_bootmem_pages(size * nr_possible_cpus);
 
 	for_each_possible_cpu(i) {
 		__per_cpu_offset[i] = ptr - __per_cpu_start;

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys - and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [patch] KVM: paravirtual guest support
       [not found] ` <20070109093354.GA10318-X9Un+BFzKDI@public.gmane.org>
@ 2007-01-09  9:41   ` Ingo Molnar
  2007-01-09 10:13   ` Avi Kivity
  1 sibling, 0 replies; 4+ messages in thread
From: Ingo Molnar @ 2007-01-09  9:41 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel


* Ingo Molnar <mingo-X9Un+BFzKDI@public.gmane.org> wrote:

> Subject: [patch] KVM: paravirtual guest support
> From: Ingo Molnar <mingo-X9Un+BFzKDI@public.gmane.org>

also needs this patch.

	Ingo

Index: linux/include/asm-i386/processor.h
===================================================================
--- linux.orig/include/asm-i386/processor.h
+++ linux/include/asm-i386/processor.h
@@ -547,7 +547,6 @@ static inline void rep_nop(void)
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else
-#define paravirt_enabled() 0
 #define __cpuid native_cpuid
 
 static inline void load_esp0(struct tss_struct *tss, struct thread_struct *thread)

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys - and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [patch] KVM: paravirtual guest support
       [not found] ` <20070109093354.GA10318-X9Un+BFzKDI@public.gmane.org>
  2007-01-09  9:41   ` Ingo Molnar
@ 2007-01-09 10:13   ` Avi Kivity
       [not found]     ` <45A36AE4.6000904-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  1 sibling, 1 reply; 4+ messages in thread
From: Avi Kivity @ 2007-01-09 10:13 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: kvm-devel

Ingo Molnar wrote:
> Subject: [patch] KVM: paravirtual guest support
> From: Ingo Molnar <mingo-X9Un+BFzKDI@public.gmane.org>
>
> this enables a CONFIG_PARAVIRT Linux guest kernel to establish a 
> hypercall API to a KVM host. If successfully registered, then the Linux 
> guest will optimize a few things like its interrupt controller, io-delay 
> and it also registers its cr3-cache structures with the host. (but the 
> host will not touch those, just yet)
>
> (this is fully backwards compatible - if the WRMSR fails then the Linux
> guest continues to execute as a native kernel.)
>
> Signed-off-by: Ingo Molnar <mingo-X9Un+BFzKDI@public.gmane.org>
> ---
> +			/*
> +			 * Cache-hit: we load the cached host-CR3 value.
> +			 * This never causes any VM exit. (if it does then the
> +			 * hypervisor could do nothing with this instruction
> +			 * and the guest OS would be aborted)
> +			 */
> +			asm volatile("movl %0, %%cr3"
> +				: : "r" (cache->entry[idx].host_cr3));
> +			goto out;
> +		}
> +	}
> +
> +	/*
> +	 * Cache-miss. Load the guest-cr3 value into cr3, which will
> +	 * cause a VM exit to the hypervisor, which then loads the
> +	 * host cr3 value and updates the cr3_cache.
> +	 */
> +	asm volatile("movl %0, %%cr3" : : "r" (guest_cr3));
> +out:
> +	put_cpu_var(para_state);
> +}
> +
>   

There's a problem here.  A cache-hit cr3 is an hpa, while a cache-miss 
cr3 is a gpa.  The two could alias:

/* pseudoassembler */

guest:
  mov $0x1234, %cr3 /* cache miss */

host:
  creates shadow for 0x1234 at 0x5678
  set cr3 cache entry: $0x1234 -> 0x5678

guest:
  mov $0x5678, %cr3 /* cache miss */
  vmx accepts that as a cached cr3 equivalent to the guest's 0x1234, 
whereas a 0x5678 gpa cr3 was intended.

The only solution I see is to use the hypercall API on a cache miss.  
Once the guest indicates it wants to use the cr3 cache, set_cr3() should 
fault, and cr3 should only be accepted from the hypercall.

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys - and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [patch] KVM: paravirtual guest support
       [not found]     ` <45A36AE4.6000904-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-01-09 10:46       ` Ingo Molnar
  0 siblings, 0 replies; 4+ messages in thread
From: Ingo Molnar @ 2007-01-09 10:46 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel


* Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote:

> There's a problem here.  A cache-hit cr3 is an hpa, while a cache-miss 
> cr3 is a gpa.  The two could alias:

ok, agreed. This is nice in fact, because this will be a live test of 
the hypercall API :-)

> The only solution I see is to use the hypercall API on a cache miss.  
> Once the guest indicates it wants to use the cr3 cache, set_cr3() 
> should fault, and cr3 should only be accepted from the hypercall.

yeah.

	Ingo

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys - and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2007-01-09 10:46 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-01-09  9:33 [patch] KVM: paravirtual guest support Ingo Molnar
     [not found] ` <20070109093354.GA10318-X9Un+BFzKDI@public.gmane.org>
2007-01-09  9:41   ` Ingo Molnar
2007-01-09 10:13   ` Avi Kivity
     [not found]     ` <45A36AE4.6000904-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-01-09 10:46       ` Ingo Molnar

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox