[patch] KVM: paravirtual guest support

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Ingo Molnar <mingo-X9Un+BFzKDI@public.gmane.org>
To: Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
Cc: kvm-devel <kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org>
Subject: [patch] KVM: paravirtual guest support
Date: Tue, 9 Jan 2007 10:33:54 +0100	[thread overview]
Message-ID: <20070109093354.GA10318@elte.hu> (raw)

Subject: [patch] KVM: paravirtual guest support
From: Ingo Molnar <mingo-X9Un+BFzKDI@public.gmane.org>

This enables a CONFIG_PARAVIRT Linux guest kernel to establish a
hypercall API to a KVM host. If registration succeeds, the Linux guest
optimizes a few things, such as its interrupt controller handling and
io-delay, and it also registers its cr3-cache structures with the host
(although the host does not touch those just yet).

(this is fully backwards compatible - if the WRMSR fails then the Linux
guest continues to execute as a native kernel.)

Signed-off-by: Ingo Molnar <mingo-X9Un+BFzKDI@public.gmane.org>
---
 arch/i386/kernel/paravirt.c |  275 ++++++++++++++++++++++++++++++++++++++++++++
 drivers/serial/8250.c       |    3 
 include/linux/paravirt.h    |   12 +
 init/main.c                 |    6 
 4 files changed, 294 insertions(+), 2 deletions(-)

Index: linux/arch/i386/kernel/paravirt.c
===================================================================
--- linux.orig/arch/i386/kernel/paravirt.c
+++ linux/arch/i386/kernel/paravirt.c
@@ -20,6 +20,7 @@
 #include <linux/efi.h>
 #include <linux/bcd.h>
 #include <linux/start_kernel.h>
+#include <linux/kvm_para.h>
 
 #include <asm/bug.h>
 #include <asm/paravirt.h>
@@ -33,6 +34,9 @@
 #include <asm/apic.h>
 #include <asm/tlbflush.h>
 
+#include <asm/i8259.h>
+#include <io_ports.h>
+
 /* nop stub */
 static void native_nop(void)
 {
@@ -683,3 +687,274 @@ struct paravirt_ops paravirt_ops = {
 	.irq_enable_sysexit = native_irq_enable_sysexit,
 	.iret = native_iret,
 };
+
+/*
+ * KVM paravirtualization optimizations:
+ */
+int kvm_paravirt;
+
+/*
+ * No need for any "IO delay" on KVM:
+ */
+static void kvm_io_delay(void)
+{
+}
+
+static DEFINE_PER_CPU(struct kvm_vcpu_para_state, para_state);
+
+/*
+ * Special, register-to-cr3 instruction based hypercall API
+ * variant to the KVM host. This utilizes the cr3 filter capability
+ * of the hardware - if this works out then no VM exit happens,
+ * if a VM exit happens then KVM will get the virtual address too.
+ */
+static void kvm_write_cr3(unsigned long guest_cr3)
+{
+	struct kvm_vcpu_para_state *para_state = &get_cpu_var(para_state);
+	struct kvm_cr3_cache *cache = &para_state->cr3_cache;
+	int idx;
+
+	/*
+	 * Check the cache (maintained by the host) for a matching
+	 * guest_cr3 => host_cr3 mapping. Use it if found:
+	 */
+	for (idx = 0; idx < cache->max_idx; idx++) {
+		if (cache->entry[idx].guest_cr3 == guest_cr3) {
+			/*
+			 * Cache-hit: we load the cached host-CR3 value.
+			 * This never causes any VM exit. (if it does then the
+			 * hypervisor could do nothing with this instruction
+			 * and the guest OS would be aborted)
+			 */
+			asm volatile("movl %0, %%cr3"
+				: : "r" (cache->entry[idx].host_cr3));
+			goto out;
+		}
+	}
+
+	/*
+	 * Cache-miss. Load the guest-cr3 value into cr3, which will
+	 * cause a VM exit to the hypervisor, which then loads the
+	 * host cr3 value and updates the cr3_cache.
+	 */
+	asm volatile("movl %0, %%cr3" : : "r" (guest_cr3));
+out:
+	put_cpu_var(para_state);
+}
+
+/*
+ * Avoid the VM exit upon cr3 load by using the cached
+ * ->active_mm->pgd value:
+ */
+static void kvm_flush_tlb_user(void)
+{
+	kvm_write_cr3(__pa(current->active_mm->pgd));
+}
+
+static void kvm_flush_tlb_single(u32 addr)
+{
+	__native_flush_tlb_single(addr);
+}
+/*
+ * Disable global pages, do a flush, then enable global pages:
+ */
+static fastcall void kvm_flush_tlb_kernel(void)
+{
+	unsigned long orig_cr4 = read_cr4();
+
+	write_cr4(orig_cr4 & ~X86_CR4_PGE);
+	kvm_flush_tlb_user();
+	write_cr4(orig_cr4);
+}
+
+/*
+ * Simplified i8259A controller handling:
+ */
+static void mask_and_ack_kvm(unsigned int irq)
+{
+	unsigned int irqmask = 1 << irq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+	cached_irq_mask |= irqmask;
+
+	if (irq & 8) {
+		outb(cached_slave_mask, PIC_SLAVE_IMR);
+		outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */
+		outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */
+	} else {
+		outb(cached_master_mask, PIC_MASTER_IMR);
+		/* 'Specific EOI' to master: */
+		outb(0x60+irq, PIC_MASTER_CMD);
+	}
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+static void disable_kvm_irq(unsigned int irq)
+{
+	unsigned int mask = 1 << irq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+	cached_irq_mask |= mask;
+	if (irq & 8)
+		outb(cached_slave_mask, PIC_SLAVE_IMR);
+	else
+		outb(cached_master_mask, PIC_MASTER_IMR);
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+static void enable_kvm_irq(unsigned int irq)
+{
+	unsigned int mask = ~(1 << irq);
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+	cached_irq_mask &= mask;
+	if (irq & 8)
+		outb(cached_slave_mask, PIC_SLAVE_IMR);
+	else
+		outb(cached_master_mask, PIC_MASTER_IMR);
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+static struct irq_chip kvm_chip = {
+	.name		= "XT-PIC",
+	.mask		= disable_kvm_irq,
+	.disable	= disable_kvm_irq,
+	.unmask		= enable_kvm_irq,
+	.mask_ack	= mask_and_ack_kvm,
+};
+
+static void __init kvm_init_IRQ(void)
+{
+	int i;
+
+	printk("init KVM IRQ controller\n");
+#ifdef CONFIG_X86_LOCAL_APIC
+	init_bsp_APIC();
+#endif
+	init_8259A(0);
+
+	for (i = 0; i < NR_IRQS; i++) {
+		irq_desc[i].status = IRQ_DISABLED;
+		irq_desc[i].action = NULL;
+		irq_desc[i].depth = 1;
+
+		if (i < 16) {
+			/*
+			 * 16 old-style INTA-cycle interrupts:
+			 */
+			set_irq_chip_and_handler_name(i, &kvm_chip,
+						      handle_level_irq, "XT");
+		} else {
+			/*
+			 * 'high' PCI IRQs filled in on demand
+			 */
+			irq_desc[i].chip = &no_irq_chip;
+		}
+	}
+
+	/*
+	 * Cover the whole vector space, no vector can escape
+	 * us. (some of these will be overridden and become
+	 * 'special' SMP interrupts)
+	 */
+	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
+		int vector = FIRST_EXTERNAL_VECTOR + i;
+		if (i >= NR_IRQS)
+			break;
+		if (vector != SYSCALL_VECTOR)
+			set_intr_gate(vector, interrupt[i]);
+	}
+
+	/* setup after call gates are initialised (usually add in
+	 * the architecture specific gates)
+	 */
+	intr_init_hook();
+
+	irq_ctx_init(smp_processor_id());
+}
+
+/*
+ * This is the vm-syscall address - to be patched by the host to
+ * VMCALL (Intel) or VMMCALL (AMD), depending on the CPU model:
+ */
+asm (
+	"	.globl vm_syscall_addr		\n"
+	"	.align 4			\n"
+	"	vm_syscall_addr:		\n"
+	"		nop			\n"
+	"		nop			\n"
+	"		nop			\n"
+	"		ret			\n"
+);
+
+extern unsigned char vm_syscall_addr[4];
+
+int kvm_guest_register_para(int cpu)
+{
+	struct kvm_vcpu_para_state *para_state = &per_cpu(para_state, cpu);
+
+	printk("kvm guest on VCPU#%d: trying to register para_state %p\n",
+		cpu, para_state);
+	/*
+	 * Fill in this CPU's para_state, then signal to KVM that we are
+	 * entering paravirtualized mode by writing its physical address
+	 * to MSR_KVM_API_MAGIC. Returns 1 on success, 0 otherwise:
+	 */
+	para_state->guest_version = KVM_PARA_API_VERSION;
+	para_state->host_version = -1;
+	para_state->size = sizeof(*para_state);
+	para_state->ret = 0;
+	para_state->vm_syscall_addr = __pa(vm_syscall_addr);
+
+	if (wrmsr_safe(MSR_KVM_API_MAGIC, __pa(para_state), 0)) {
+		printk("KVM guest: WRMSR probe failed.\n");
+		return 0;
+	}
+
+	printk("kvm guest: host returned %d\n", para_state->ret);
+	printk("kvm guest: host version: %d\n", para_state->host_version);
+	printk("kvm guest: cr3 cache size: %d\n",
+				para_state->cr3_cache.max_idx);
+	printk("kvm guest: syscall entry: %02x %02x %02x %02x\n",
+			vm_syscall_addr[0], vm_syscall_addr[1],
+			vm_syscall_addr[2], vm_syscall_addr[3]);
+	if (para_state->ret) {
+		printk("kvm guest: host refused registration.\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+/* "kvm_paravirt=<n>" boot option: nonzero n tries to enable KVM guest mode. */
+static int __init kvm_paravirt_setup(char *s)
+{
+	printk("KVM paravirtualization setup\n");
+	/* kvm_paravirt is an int, hence "%d"; absent or zero means off: */
+	if (sscanf(s, "%d", &kvm_paravirt) <= 0 || !kvm_paravirt)
+		return 1;
+
+	kvm_paravirt = kvm_guest_register_para(smp_processor_id());
+	if (!kvm_paravirt)
+		return 1;
+
+	printk("KVM paravirtualized: OK\n");
+
+	paravirt_ops.name = "KVM";
+	paravirt_ops.io_delay = kvm_io_delay;
+	paravirt_ops.init_IRQ = kvm_init_IRQ;
+	paravirt_ops.flush_tlb_user = kvm_flush_tlb_user;
+	paravirt_ops.flush_tlb_kernel = kvm_flush_tlb_kernel;
+	paravirt_ops.flush_tlb_single = kvm_flush_tlb_single;
+	paravirt_ops.write_cr3 = kvm_write_cr3;
+	paravirt_ops.paravirt_enabled = 1;
+
+	return 1;
+}
+__setup("kvm_paravirt=", kvm_paravirt_setup);
+
+EXPORT_SYMBOL_GPL(paravirt_ops);
+
Index: linux/drivers/serial/8250.c
===================================================================
--- linux.orig/drivers/serial/8250.c
+++ linux/drivers/serial/8250.c
@@ -27,6 +27,7 @@
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/ioport.h>
+#include <linux/paravirt.h>
 #include <linux/init.h>
 #include <linux/console.h>
 #include <linux/sysrq.h>
@@ -1371,7 +1372,7 @@ static irqreturn_t serial8250_interrupt(
 
 		l = l->next;
 
-		if (l == i->head && pass_counter++ > PASS_LIMIT) {
+		if (!paravirt_enabled() && l == i->head && pass_counter++ > PASS_LIMIT) {
 			/* If we hit this, we're dead. */
 			printk(KERN_ERR "serial8250: too much work for "
 				"irq%d\n", irq);
Index: linux/include/linux/paravirt.h
===================================================================
--- /dev/null
+++ linux/include/linux/paravirt.h
@@ -0,0 +1,12 @@
+#ifndef __LINUX_PARAVIRT_H
+#define __LINUX_PARAVIRT_H
+
+/*
+ * Paravirtualization support
+ */
+
+#ifndef CONFIG_PARAVIRT
+# define paravirt_enabled()	0
+#endif
+
+#endif
Index: linux/init/main.c
===================================================================
--- linux.orig/init/main.c
+++ linux/init/main.c
@@ -374,7 +374,11 @@ static void __init setup_per_cpu_areas(v
 	if (size < PERCPU_ENOUGH_ROOM)
 		size = PERCPU_ENOUGH_ROOM;
 #endif
-	ptr = alloc_bootmem(size * nr_possible_cpus);
+	/*
+	 * Page-align the allocation: if someone aligns per-CPU data
+	 * to a page boundary, that alignment should be preserved:
+	 */
+	ptr = alloc_bootmem_pages(size * nr_possible_cpus);
 
 	for_each_possible_cpu(i) {
 		__per_cpu_offset[i] = ptr - __per_cpu_start;

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys - and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

next             reply	other threads:[~2007-01-09  9:33 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-01-09  9:33 Ingo Molnar [this message]
     [not found] ` <20070109093354.GA10318-X9Un+BFzKDI@public.gmane.org>
2007-01-09  9:41   ` [patch] KVM: paravirtual guest support Ingo Molnar
2007-01-09 10:13   ` Avi Kivity
     [not found]     ` <45A36AE4.6000904-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-01-09 10:46       ` Ingo Molnar

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20070109093354.GA10318@elte.hu \
    --to=mingo-x9un+bfzkdi@public.gmane.org \
    --cc=avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org \
    --cc=kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.