public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
* SMP guest boots
@ 2007-06-13  9:17 Li, Xin B
       [not found] ` <B30DA1341B0CFA4893EF8A36B40B5C5D01433C79-wq7ZOvIWXbNpB2pF5aRoyrfspsVTdybXVpNB7YpNyf8@public.gmane.org>
  0 siblings, 1 reply; 7+ messages in thread
From: Li, Xin B @ 2007-06-13  9:17 UTC (permalink / raw)
  To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

[-- Attachment #1: Type: text/plain, Size: 758 bytes --]

I've just got x86_64 Linux 2.6.20.3 SMP guest boots on KVM with 2 vCPUs
(I'm using an x86_64 RHEL5 image, but the RHEL5 kernel can't boot).
/proc/cpuinfo and /proc/interrupts show the 2 CPUs are working, but it's
still _not_ stable and I saw qemu segmentation faults, anyway it starts
working :-)
My base is Greg's APIC patch version 9 with some collision fixes, I've
attached them.
k.smp.patch and u.smp.patch are the real patches to enable KVM SMP, and
there is still some debug code in them.
I also changed the guest BIOS by hardcoding 2 CPUs and removing the logic
that detects the CPU count using an INIT-SIPI-SIPI broadcast, since our
code can't yet handle the INIT-SIPI-SIPI sequence twice (once in the
BIOS, and again in the OS).
If you hit any problems, please contact me.
Thanks
-Xin

[-- Attachment #2: k.apic.patch --]
[-- Type: application/octet-stream, Size: 99472 bytes --]

commit 639b23a91cfd1aabb34541ee61a2306d7e835abc
Author: root <root@vtsmp-xin.(none)>
Date:   Wed Jun 13 16:55:48 2007 +0800

    kvm apic support.

diff --git a/drivers/kvm/Makefile b/drivers/kvm/Makefile
index c0a789f..1aad737 100644
--- a/drivers/kvm/Makefile
+++ b/drivers/kvm/Makefile
@@ -2,7 +2,7 @@
 # Makefile for Kernel-based Virtual Machine module
 #
 
-kvm-objs := kvm_main.o mmu.o x86_emulate.o
+kvm-objs := kvm_main.o mmu.o x86_emulate.o userint.o lapic.o kernint.o
 obj-$(CONFIG_KVM) += kvm.o
 kvm-intel-objs = vmx.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/drivers/kvm/irqdevice.h b/drivers/kvm/irqdevice.h
new file mode 100644
index 0000000..173313d
--- /dev/null
+++ b/drivers/kvm/irqdevice.h
@@ -0,0 +1,179 @@
+/*
+ * Defines an interface for an abstract interrupt controller.  The model
+ * consists of a unit with an arbitrary number of input lines N (IRQ0-(N-1)),
+ * an arbitrary number of output lines (INTR) (LINT, EXTINT, NMI, etc), and
+ * methods for completing an interrupt-acknowledge cycle (INTA).  A particular
+ * implementation of this model will define various policies, such as
+ * irq-to-vector translation, INTA/auto-EOI policy, etc.
+ *
+ * In addition, the INTR callback mechanism allows the unit to be "wired" to
+ * an interruptible source in a very flexible manner. For instance, an
+ * irqdevice could have its INTR wired to a VCPU (ala LAPIC), or another
+ * interrupt controller (ala cascaded i8259s)
+ *
+ * Copyright (C) 2007 Novell
+ *
+ * Authors:
+ *   Gregory Haskins <ghaskins@novell.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef __IRQDEVICE_H
+#define __IRQDEVICE_H
+
+struct kvm_irqdevice;
+
+typedef enum {
+	kvm_irqpin_localint,
+	kvm_irqpin_extint,
+	kvm_irqpin_smi,
+	kvm_irqpin_nmi,
+	kvm_irqpin_invalid, /* must always be last */
+} kvm_irqpin_t;
+
+
+struct kvm_irqsink {
+	void (*set_intr)(struct kvm_irqsink *this,
+			 struct kvm_irqdevice *dev,
+			 kvm_irqpin_t pin);
+
+	void *private;
+};
+
+#define KVM_IRQACKDATA_VECTOR_VALID   (1 << 0)
+#define KVM_IRQACKDATA_VECTOR_PENDING (1 << 1)
+#define KVM_IRQACKDATA_NEXT_VALID     (1 << 2)
+
+#define KVM_IRQACK_FLAG_PEEK          (1 << 0)
+
+struct kvm_irqack_data {
+	int flags;
+	int vector;
+	int next;
+};
+
+struct kvm_irqdevice {
+	int  (*ack)(struct kvm_irqdevice *this, int flags,
+		    struct kvm_irqack_data *data);
+	int  (*set_pin)(struct kvm_irqdevice *this, int pin, int level);
+	void (*destructor)(struct kvm_irqdevice *this);
+
+	void               *private;
+	struct kvm_irqsink  sink;
+};
+
+/**
+ * kvm_irqdevice_init - initialize the kvm_irqdevice for use
+ * @dev: The device
+ *
+ * Description: Initialize the kvm_irqdevice for use.  Should be called before
+ *              calling any derived implementation init functions
+ *
+ * Returns: (void)
+ */
+static inline void kvm_irqdevice_init(struct kvm_irqdevice *dev)
+{
+	memset(dev, 0, sizeof(*dev));
+}
+
+/**
+ * kvm_irqdevice_ack - read and ack the highest priority vector from the device
+ * @dev: The device
+ * @flags: Modifies default behavior
+ *           [ KVM_IRQACK_FLAG_PEEK - Don't ack the vector, just check status ]
+ * @data: A pointer to a kvm_irqack_data structure to hold the result
+ *
+ * Description: Read the highest priority pending vector from the device,
+ *              potentially invoking auto-EOI depending on device policy
+ *
+ *              Successful return indicates that the *data* structure is valid
+ *
+ *               data.flags -
+ *                  [KVM_IRQACKDATA_VECTOR_VALID - data.vector is valid]
+ *                  [KVM_IRQACKDATA_VECTOR_PENDING - more vectors are pending]
+ *                  [KVM_IRQACKDATA_NEXT_VALID - next-vector is valid]
+ *
+ * Returns: (int)
+ *   [-1 = failure]
+ *   [ 0 = success]
+ */
+static inline int kvm_irqdevice_ack(struct kvm_irqdevice *dev, int flags,
+				    struct kvm_irqack_data *data)
+{
+	return dev->ack(dev, flags, data);
+}
+
+/**
+ * kvm_irqdevice_set_pin - allows the caller to assert/deassert an IRQ
+ * @dev: The device
+ * @pin: The input pin to alter
+ * @level: The value to set (1 = assert, 0 = deassert)
+ *
+ * Description: Allows the caller to assert/deassert an IRQ input pin to the
+ *              device according to device policy.
+ *
+ * Returns: (int)
+ *   [-1 = failure]
+ *   [ 0 = success]
+ */
+static inline int kvm_irqdevice_set_pin(struct kvm_irqdevice *dev, int pin,
+				  int level)
+{
+	return dev->set_pin(dev, pin, level);
+}
+
+/**
+ * kvm_irqdevice_register_sink - registers a kvm_irqsink object
+ * @dev: The device
+ * @sink: The sink to register.  Data will be copied so building object from
+ *        transient storage is ok.
+ *
+ * Description: Registers a kvm_irqsink object as an INTR callback
+ *
+ * Returns: (void)
+ */
+static inline void kvm_irqdevice_register_sink(struct kvm_irqdevice *dev,
+					       const struct kvm_irqsink *sink)
+{
+	dev->sink = *sink;
+}
+
+/**
+ * kvm_irqdevice_destructor - destroys an irqdevice
+ * @dev: The device
+ *
+ * Returns: (void)
+ */
+static inline void kvm_irqdevice_destructor(struct kvm_irqdevice *dev)
+{
+	dev->destructor(dev);
+}
+
+/**
+ * kvm_irqdevice_set_intr - invokes a registered INTR callback
+ * @dev: The device
+ * @pin: Identifies the pin to alter -
+ *           [ KVM_IRQPIN_LOCALINT (default) - a vector is pending on this
+ *                                             device]
+ *           [ KVM_IRQPIN_EXTINT - a vector is pending on an external device]
+ *           [ KVM_IRQPIN_SMI - system-management-interrupt pin]
+ *           [ KVM_IRQPIN_NMI - non-maskable-interrupt pin ]
+ *
+ * Description: Invokes a registered INTR callback (if present).  This
+ *              function is meant to be used privately by an irqdevice
+ *              implementation.
+ *
+ * Returns: (void)
+ */
+static inline void kvm_irqdevice_set_intr(struct kvm_irqdevice *dev,
+					  kvm_irqpin_t pin)
+{
+	struct kvm_irqsink *sink = &dev->sink;
+	if (sink->set_intr)
+		sink->set_intr(sink, dev, pin);
+}
+
+#endif /*  __IRQDEVICE_H */
diff --git a/drivers/kvm/kernint.c b/drivers/kvm/kernint.c
new file mode 100644
index 0000000..b5cbcae
--- /dev/null
+++ b/drivers/kvm/kernint.c
@@ -0,0 +1,149 @@
+/*
+ * Kernel Interrupt IRQ device
+ *
+ * Provides a model for connecting in-kernel interrupt resources to a VCPU.
+ *
+ * A typical modern x86 processor has the concept of an internal Local-APIC
+ * and some external signal pins.  The way in which interrupts are injected is
+ * dependent on whether software enables the LAPIC or not.  When enabled,
+ * interrupts are acknowledged through the LAPIC.  Otherwise they are through
+ * an externally connected PIC (typically an i8259 on the BSP)
+ *
+ * Copyright (C) 2007 Novell
+ *
+ * Authors:
+ *   Gregory Haskins <ghaskins@novell.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "kvm.h"
+
+struct kvm_kernint {
+	struct kvm_vcpu              *vcpu;
+	struct kvm_irqdevice         *self_irq;
+	struct kvm_irqdevice         *ext_irq;
+	struct kvm_irqdevice          apic_irq;
+
+};
+
+static struct kvm_irqdevice *get_irq_dev(struct kvm_kernint *s)
+{
+	struct kvm_irqdevice *dev;
+
+	if (kvm_lapic_enabled(s->vcpu))
+		dev = &s->apic_irq;
+	else
+		dev = s->ext_irq;
+
+	if (!dev)
+		kvm_crash_guest(s->vcpu->kvm);
+
+	return dev;
+}
+
+static int kernint_irqdev_ack(struct kvm_irqdevice *this, int flags,
+			      struct kvm_irqack_data *data)
+{
+	struct kvm_kernint *s = (struct kvm_kernint*)this->private;
+
+	return kvm_irqdevice_ack(get_irq_dev(s), flags, data);
+}
+
+static int kernint_irqdev_set_pin(struct kvm_irqdevice *this,
+				  int irq, int level)
+{
+	/* no-op */
+	return 0;
+}
+
+static void kernint_irqdev_destructor(struct kvm_irqdevice *this)
+{
+	struct kvm_kernint *s = (struct kvm_kernint*)this->private;
+
+	kvm_irqdevice_destructor(&s->apic_irq);
+	kvm_lapic_destroy(s->vcpu);
+	kfree(s);
+}
+
+static void kvm_apic_intr(struct kvm_irqsink *this,
+			  struct kvm_irqdevice *dev,
+			  kvm_irqpin_t pin)
+{
+	struct kvm_kernint *s = (struct kvm_kernint*)this->private;
+
+	/*
+	 * If the LAPIC sent us an interrupt it *must* be enabled,
+	 * just forward it on to the CPU
+	 */
+	kvm_irqdevice_set_intr(s->self_irq, pin);
+}
+
+static void kvm_ext_intr(struct kvm_irqsink *this,
+			 struct kvm_irqdevice *dev,
+			 kvm_irqpin_t pin)
+{
+	struct kvm_kernint *s = (struct kvm_kernint*)this->private;
+
+	/*
+	 * If the EXTINT device sent us an interrupt, forward it to the LINT0
+	 * pin of the LAPIC
+	 */
+	if (pin != kvm_irqpin_localint)
+		return;
+
+	/*
+	 * "irq 0" = LINT0, 1 = LINT1
+	 */
+	kvm_irqdevice_set_pin(&s->apic_irq, 0, 1);
+}
+
+int kvm_kernint_init(struct kvm_vcpu *vcpu)
+{
+	struct kvm_irqdevice *irqdev = &vcpu->irq.dev;
+	struct kvm_kernint *s;
+	struct kvm_irqsink apicsink;
+
+	s = kzalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	s->vcpu = vcpu;
+
+	/*
+	 * Configure the irqdevice interface
+	 */
+	irqdev->ack         = kernint_irqdev_ack;
+	irqdev->set_pin     = kernint_irqdev_set_pin;
+	irqdev->destructor  = kernint_irqdev_destructor;
+
+	irqdev->private = s;
+	s->self_irq = irqdev;
+
+	/*
+	 * Configure the EXTINT device if this is the BSP processor
+	 */
+	if (!vcpu_slot(vcpu)) {
+		struct kvm_irqsink extsink = {
+			.set_intr   = kvm_ext_intr,
+			.private    = s
+		};
+		s->ext_irq = &vcpu->kvm->isa_irq;
+		kvm_irqdevice_register_sink(s->ext_irq, &extsink);
+	}
+
+	/*
+	 * Configure the LAPIC device
+	 */
+	apicsink.set_intr = kvm_apic_intr;
+	apicsink.private  = s;
+
+	kvm_irqdevice_init(&s->apic_irq);
+	kvm_irqdevice_register_sink(&s->apic_irq, &apicsink);
+	kvm_lapic_init(vcpu, &s->apic_irq, 0);
+
+	return 0;
+}
+
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index b08272b..e6861ed 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -16,6 +16,7 @@
 #include <asm/signal.h>
 
 #include "vmx.h"
+#include "irqdevice.h"
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
 
@@ -168,6 +169,27 @@ struct vmcs {
 
 struct kvm_vcpu;
 
+int kvm_user_irqdev_init(struct kvm_irqdevice *dev);
+int kvm_user_irqdev_save(struct kvm_irqdevice *this, void *data);
+int kvm_user_irqdev_restore(struct kvm_irqdevice *this, void *data);
+int kvm_userint_init(struct kvm_vcpu *vcpu);
+int kvm_kernint_init(struct kvm_vcpu *vcpu);
+
+#define KVM_LAPIC_OPTION_USERMODE (1 << 0)
+
+int kvm_lapic_init(struct kvm_vcpu *vcpu, struct kvm_irqdevice *dev,
+		   int flags);
+void kvm_lapic_destroy(struct kvm_vcpu *vcpu);
+void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, u64 cr8);
+u64  kvm_lapic_get_tpr(struct kvm_vcpu *vcpu);
+void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 base);
+u64  kvm_lapic_get_base(struct kvm_vcpu *vcpu);
+void kvm_lapic_save(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
+void kvm_lapic_restore(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
+void kvm_lapic_reset(struct kvm_vcpu *vcpu);
+int  kvm_lapic_enabled(struct kvm_vcpu *vcpu);
+void *kvm_lapic_get_regs(struct kvm_vcpu *vcpu);
+
 /*
  * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
  * 32-bit).  The kvm_mmu structure abstracts the details of the current mmu
@@ -259,10 +281,95 @@ struct kvm_stat {
 	u32 signal_exits;
 	u32 irq_window_exits;
 	u32 halt_exits;
+	u32 halt_wakeup;
 	u32 request_irq_exits;
 	u32 irq_exits;
 	u32 light_exits;
 	u32 efer_reload;
+	u32 irq_posted;
+	u32 irq_ignored;
+	u32 irq_accepted;
+	u32 guest_preempt;
+	u32 apic_mmio;
+	u32 local_mmio;
+};
+
+struct kvm_io_device {
+	void (*read)(struct kvm_io_device *this,
+		     gpa_t addr,
+		     int len,
+		     void *val);
+	void (*write)(struct kvm_io_device *this,
+		      gpa_t addr,
+		      int len,
+		      const void *val);
+	int (*in_range)(struct kvm_io_device *this, gpa_t addr);
+	void (*destructor)(struct kvm_io_device *this);
+
+	void             *private;
+};
+
+static inline void kvm_iodevice_read(struct kvm_io_device *dev,
+				     gpa_t addr,
+				     int len,
+				     void *val)
+{
+	dev->read(dev, addr, len, val);
+}
+
+static inline void kvm_iodevice_write(struct kvm_io_device *dev,
+				      gpa_t addr,
+				      int len,
+				      const void *val)
+{
+	dev->write(dev, addr, len, val);
+}
+
+static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr)
+{
+	return dev->in_range(dev, addr);
+}
+
+static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
+{
+	dev->destructor(dev);
+}
+
+/*
+ * It would be nice to use something smarter than a linear search, TBD...
+ * Thankfully we dont expect many devices to register (famous last words :),
+ * so until then it will suffice.  At least its abstracted so we can change
+ * in one place.
+ */
+struct kvm_io_bus {
+	int                   dev_count;
+#define NR_IOBUS_DEVS 6
+	struct kvm_io_device *devs[NR_IOBUS_DEVS];
+};
+
+void kvm_io_bus_init(struct kvm_io_bus *bus);
+void kvm_io_bus_destroy(struct kvm_io_bus *bus);
+struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
+void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
+			     struct kvm_io_device *dev);
+
+#define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long)
+
+/*
+ * structure for maintaining info for interrupting an executing VCPU
+ */
+struct kvm_vcpu_irq {
+	spinlock_t           lock;
+	struct kvm_irqdevice dev;
+	int                  pending;
+	int                  deferred;
+	int                  guest_cpu;
+	wait_queue_head_t    wq;
+};
+
+struct kvm_lapic {
+	void                 *dev;
+	struct kvm_io_device *mmio;
 };
 
 struct kvm_vcpu {
@@ -277,11 +384,10 @@ struct kvm_vcpu {
 	u64 host_tsc;
 	struct kvm_run *run;
 	int interrupt_window_open;
+	struct kvm_vcpu_irq irq;
+	struct kvm_lapic apic;
 	int guest_mode;
 	unsigned long requests;
-	unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
-#define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long)
-	unsigned long irq_pending[NR_IRQ_WORDS];
 	unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */
 	unsigned long rip;      /* needs vcpu_load_rsp_rip() */
 
@@ -292,10 +398,8 @@ struct kvm_vcpu {
 	struct page *para_state_page;
 	gpa_t hypercall_gpa;
 	unsigned long cr4;
-	unsigned long cr8;
 	u64 pdptrs[4]; /* pae */
 	u64 shadow_efer;
-	u64 apic_base;
 	u64 ia32_misc_enable_msr;
 	int nmsrs;
 	int save_nmsrs;
@@ -360,6 +464,78 @@ struct kvm_vcpu {
 	struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES];
 };
 
+/*
+ * These two functions are helpers for determining if a standard interrupt
+ * is pending to replace the old "if (vcpu->irq_summary)" logic.
+ */
+
+/*
+ * Assumes lock already held
+ */
+static inline int __kvm_vcpu_irq_pending(struct kvm_vcpu *vcpu)
+{
+	int pending = vcpu->irq.pending;
+
+	if (vcpu->irq.deferred != -1)
+		__set_bit(kvm_irqpin_localint, &pending);
+
+	return pending;
+}
+
+static inline int kvm_vcpu_irq_pending(struct kvm_vcpu *vcpu)
+{
+	int ret = 0;
+	int flags;
+
+	spin_lock_irqsave(&vcpu->irq.lock, flags);
+	ret = __kvm_vcpu_irq_pending(vcpu);
+	spin_unlock_irqrestore(&vcpu->irq.lock, flags);
+
+	return ret;
+}
+
+/*
+ * Assumes lock already held
+ */
+static inline int kvm_vcpu_irq_pop(struct kvm_vcpu *vcpu,
+				   struct kvm_irqack_data *data)
+{
+	int ret = 0;
+
+	if (vcpu->irq.deferred != -1) {
+		ret = kvm_irqdevice_ack(&vcpu->irq.dev, KVM_IRQACK_FLAG_PEEK,
+					data);
+		data->flags |= KVM_IRQACKDATA_VECTOR_VALID;
+		data->vector = vcpu->irq.deferred;
+		vcpu->irq.deferred = -1;
+	} else
+		ret = kvm_irqdevice_ack(&vcpu->irq.dev, 0, data);
+
+	/*
+	 * If there are no more interrupts we must clear the status flag
+	 */
+	if (!(data->flags & KVM_IRQACKDATA_VECTOR_PENDING))
+		__clear_bit(kvm_irqpin_localint, &vcpu->irq.pending);
+
+	return ret;
+}
+
+static inline void __kvm_vcpu_irq_push(struct kvm_vcpu *vcpu, int irq)
+{
+	BUG_ON(vcpu->irq.deferred != -1); /* We can only hold one deferred */
+
+	vcpu->irq.deferred = irq;
+}
+
+static inline void kvm_vcpu_irq_push(struct kvm_vcpu *vcpu, int irq)
+{
+	int flags;
+
+	spin_lock_irqsave(&vcpu->irq.lock, flags);
+	__kvm_vcpu_irq_push(vcpu, irq);
+	spin_unlock_irqrestore(&vcpu->irq.lock, flags);
+}
+
 struct kvm_mem_alias {
 	gfn_t base_gfn;
 	unsigned long npages;
@@ -393,6 +569,9 @@ struct kvm {
 	unsigned long rmap_overflow;
 	struct list_head vm_list;
 	struct file *filp;
+	struct kvm_io_bus mmio_bus;
+	int enable_kernel_pic;
+	struct kvm_irqdevice isa_irq;
 };
 
 struct descriptor_table {
@@ -467,6 +646,10 @@ void kvm_exit_arch(void);
 int kvm_mmu_module_init(void);
 void kvm_mmu_module_exit(void);
 
+int kvm_apicbus_send(struct kvm *kvm, int dest, int trig_mode, int level,
+		     int dest_mode, int delivery_mode, int vector);
+int kvm_vcpu_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
+
 void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
 int kvm_mmu_create(struct kvm_vcpu *vcpu);
 int kvm_mmu_setup(struct kvm_vcpu *vcpu);
@@ -612,6 +795,13 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
 	return (struct kvm_mmu_page *)page_private(page);
 }
 
+static inline int vcpu_slot(struct kvm_vcpu *vcpu)
+{
+	return vcpu - vcpu->kvm->vcpus;
+}
+
+void kvm_crash_guest(struct kvm *kvm);
+
 static inline u16 read_fs(void)
 {
 	u16 seg;
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 633c2ed..152a7c9 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -72,10 +72,17 @@ static struct kvm_stats_debugfs_item {
 	{ "signal_exits", STAT_OFFSET(signal_exits) },
 	{ "irq_window", STAT_OFFSET(irq_window_exits) },
 	{ "halt_exits", STAT_OFFSET(halt_exits) },
+	{ "halt_wakeup", STAT_OFFSET(halt_wakeup) },
 	{ "request_irq", STAT_OFFSET(request_irq_exits) },
 	{ "irq_exits", STAT_OFFSET(irq_exits) },
 	{ "light_exits", STAT_OFFSET(light_exits) },
 	{ "efer_reload", STAT_OFFSET(efer_reload) },
+	{ "irq_posted", STAT_OFFSET(irq_posted) },
+	{ "irq_ignored", STAT_OFFSET(irq_ignored) },
+	{ "irq_accepted", STAT_OFFSET(irq_accepted) },
+	{ "guest_preempt", STAT_OFFSET(guest_preempt) },
+	{ "apic_mmio", STAT_OFFSET(apic_mmio) },
+	{ "local_mmio", STAT_OFFSET(local_mmio) },
 	{ NULL }
 };
 
@@ -363,6 +370,8 @@ static struct kvm *kvm_create_vm(void)
 
 	spin_lock_init(&kvm->lock);
 	INIT_LIST_HEAD(&kvm->active_mmu_pages);
+	kvm_io_bus_init(&kvm->mmio_bus);
+	kvm_irqdevice_init(&kvm->isa_irq);
 	spin_lock(&kvm_lock);
 	list_add(&kvm->vm_list, &vm_list);
 	spin_unlock(&kvm_lock);
@@ -370,6 +379,12 @@ static struct kvm *kvm_create_vm(void)
 		struct kvm_vcpu *vcpu = &kvm->vcpus[i];
 
 		mutex_init(&vcpu->mutex);
+
+		memset(&vcpu->irq, 0, sizeof(vcpu->irq));
+		spin_lock_init(&vcpu->irq.lock);
+		vcpu->irq.deferred = -1;
+		init_waitqueue_head(&vcpu->irq.wq);
+
 		vcpu->cpu = -1;
 		vcpu->kvm = kvm;
 		vcpu->mmu.root_hpa = INVALID_PAGE;
@@ -443,6 +458,7 @@ static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
 	vcpu_load(vcpu);
 	kvm_mmu_destroy(vcpu);
 	vcpu_put(vcpu);
+	kvm_irqdevice_destructor(&vcpu->irq.dev);
 	kvm_arch_ops->vcpu_free(vcpu);
 	free_page((unsigned long)vcpu->run);
 	vcpu->run = NULL;
@@ -464,6 +480,23 @@ static void kvm_free_vcpus(struct kvm *kvm)
 		kvm_free_vcpu(&kvm->vcpus[i]);
 }
 
+/*
+ * This function kills a guest while there are still user space processes
+ * holding a descriptor to it
+ */
+void kvm_crash_guest(struct kvm *kvm)
+{
+	unsigned int i;
+
+	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+		/*
+		 * FIXME: in the future it should send IPI to gracefully
+		 * stop the other vCPUs
+		 */
+		kvm_free_vcpu(&kvm->vcpus[i]);
+	}
+}
+
 static int kvm_dev_release(struct inode *inode, struct file *filp)
 {
 	return 0;
@@ -474,6 +507,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	spin_lock(&kvm_lock);
 	list_del(&kvm->vm_list);
 	spin_unlock(&kvm_lock);
+	kvm_io_bus_destroy(&kvm->mmio_bus);
+	if (kvm->enable_kernel_pic)
+		kvm_irqdevice_destructor(&kvm->isa_irq);
 	kvm_free_vcpus(kvm);
 	kvm_free_physmem(kvm);
 	kfree(kvm);
@@ -679,7 +715,7 @@ void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 		inject_gp(vcpu);
 		return;
 	}
-	vcpu->cr8 = cr8;
+	kvm_lapic_set_tpr(vcpu, cr8);
 }
 EXPORT_SYMBOL_GPL(set_cr8);
 
@@ -980,6 +1016,69 @@ out:
 	return r;
 }
 
+static int kvm_vm_ioctl_enable_kernel_pic(struct kvm *kvm, __u32 val)
+{
+	/*
+	 * FIXME: We should not allow this if VCPUs have already been created
+	 */
+	if (kvm->enable_kernel_pic)
+		return -EINVAL;
+
+	/*
+	 * Someday we may offer two levels of in-kernel PIC support:
+	 *
+	 *  level 0 = (default) compatibility mode (everything in userspace)
+	 *  level 1 = LAPIC in kernel, IOAPIC/i8259 in userspace
+	 *  level 2 = All three in kernel
+	 *
+	 * For now we only support levels 0 and 1.  However, you can't set
+	 * level 0
+	 */
+	if (val != 1)
+		return -EINVAL;
+
+	kvm->enable_kernel_pic = val;
+
+	printk(KERN_INFO "KVM: Setting in-kernel PIC level to %d\n", val);
+
+	/*
+	 * installing a user_irqdev model to the kvm->isa_irq device
+	 * creates a level-1 environment, where the userspace completely
+	 * controls the ISA domain interrupts in the IOAPIC/i8259.
+	 * Interrupts come down to the VCPU either as an ISA vector to
+	 * this controller, or as an APIC bus message (or both)
+	 */
+	kvm_user_irqdev_init(&kvm->isa_irq);
+
+	return 0;
+}
+
+static int kvm_vm_ioctl_isa_interrupt(struct kvm *kvm,
+				      struct kvm_interrupt *irq)
+{
+	if (irq->irq < 0 || irq->irq >= 256)
+		return -EINVAL;
+
+	if (!kvm->enable_kernel_pic)
+		return -EINVAL;
+
+	return kvm_irqdevice_set_pin(&kvm->isa_irq, irq->irq, 1);
+}
+
+static int kvm_vm_ioctl_apic_msg(struct kvm *kvm,
+				 struct kvm_apic_msg *msg)
+{
+	if (!kvm->enable_kernel_pic)
+		return -EINVAL;
+
+	msg->delivery_mode = (msg->delivery_mode << 8) & 0xF00;
+
+	kvm_apicbus_send(kvm, msg->dest, msg->trig_mode, 1, msg->dest_mode,
+			 msg->delivery_mode, msg->vector);
+
+	return 0;
+}
+
 static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
 {
 	int i;
@@ -1097,12 +1196,37 @@ static int emulator_write_std(unsigned long addr,
 	return X86EMUL_UNHANDLEABLE;
 }
 
+static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
+						gpa_t addr)
+{
+	struct kvm_io_device *dev = vcpu->apic.mmio;
+
+	/*
+	 * First check if the LAPIC will snarf this request
+	 */
+	if (dev && dev->in_range(dev, addr)) {
+		++vcpu->stat.apic_mmio;
+		return dev;
+	}
+
+	/*
+	 * And then fallback to allow any device to participate
+	 */
+	dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
+	if (dev)
+		++vcpu->stat.local_mmio;
+
+	return dev;
+}
+
 static int emulator_read_emulated(unsigned long addr,
 				  void *val,
 				  unsigned int bytes,
 				  struct x86_emulate_ctxt *ctxt)
 {
-	struct kvm_vcpu *vcpu = ctxt->vcpu;
+	struct kvm_vcpu      *vcpu = ctxt->vcpu;
+	struct kvm_io_device *mmio_dev;
+	gpa_t                 gpa;
 
 	if (vcpu->mmio_read_completed) {
 		memcpy(val, vcpu->mmio_data, bytes);
@@ -1111,18 +1235,26 @@ static int emulator_read_emulated(unsigned long addr,
 	} else if (emulator_read_std(addr, val, bytes, ctxt)
 		   == X86EMUL_CONTINUE)
 		return X86EMUL_CONTINUE;
-	else {
-		gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
 
-		if (gpa == UNMAPPED_GVA)
-			return X86EMUL_PROPAGATE_FAULT;
-		vcpu->mmio_needed = 1;
-		vcpu->mmio_phys_addr = gpa;
-		vcpu->mmio_size = bytes;
-		vcpu->mmio_is_write = 0;
+	gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
+	if (gpa == UNMAPPED_GVA)
+		return X86EMUL_PROPAGATE_FAULT;
 
-		return X86EMUL_UNHANDLEABLE;
+	/*
+	 * Is this MMIO handled locally?
+	 */
+	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
+	if (mmio_dev) {
+		kvm_iodevice_read(mmio_dev, gpa, bytes, val);
+		return X86EMUL_CONTINUE;
 	}
+
+	vcpu->mmio_needed = 1;
+	vcpu->mmio_phys_addr = gpa;
+	vcpu->mmio_size = bytes;
+	vcpu->mmio_is_write = 0;
+
+	return X86EMUL_UNHANDLEABLE;
 }
 
 static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -1150,8 +1282,9 @@ static int emulator_write_emulated(unsigned long addr,
 				   unsigned int bytes,
 				   struct x86_emulate_ctxt *ctxt)
 {
-	struct kvm_vcpu *vcpu = ctxt->vcpu;
-	gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
+	struct kvm_vcpu      *vcpu = ctxt->vcpu;
+	struct kvm_io_device *mmio_dev;
+	gpa_t                 gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
 
 	if (gpa == UNMAPPED_GVA) {
 		kvm_arch_ops->inject_page_fault(vcpu, addr, 2);
@@ -1161,6 +1294,15 @@ static int emulator_write_emulated(unsigned long addr,
 	if (emulator_write_phys(vcpu, gpa, val, bytes))
 		return X86EMUL_CONTINUE;
 
+	/*
+	 * Is this MMIO handled locally?
+	 */
+	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
+	if (mmio_dev) {
+		kvm_iodevice_write(mmio_dev, gpa, bytes, val);
+		return X86EMUL_CONTINUE;
+	}
+
 	vcpu->mmio_needed = 1;
 	vcpu->mmio_phys_addr = gpa;
 	vcpu->mmio_size = bytes;
@@ -1331,7 +1473,7 @@ EXPORT_SYMBOL_GPL(emulate_instruction);
 
 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 {
-	if (vcpu->irq_summary)
+	if (kvm_vcpu_irq_pending(vcpu))
 		return 1;
 
 	vcpu->run->exit_reason = KVM_EXIT_HLT;
@@ -1550,7 +1692,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 		data = 3;
 		break;
 	case MSR_IA32_APICBASE:
-		data = vcpu->apic_base;
+		data = kvm_lapic_get_base(vcpu);
 		break;
 	case MSR_IA32_MISC_ENABLE:
 		data = vcpu->ia32_misc_enable_msr;
@@ -1628,7 +1770,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	case 0x200 ... 0x2ff: /* MTRRs */
 		break;
 	case MSR_IA32_APICBASE:
-		vcpu->apic_base = data;
+		kvm_lapic_set_base(vcpu, data);
 		break;
 	case MSR_IA32_MISC_ENABLE:
 		vcpu->ia32_misc_enable_msr = data;
@@ -1892,8 +2034,9 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
-	/* re-sync apic's tpr */
-	vcpu->cr8 = kvm_run->cr8;
+	if (!vcpu->kvm->enable_kernel_pic)
+		/* re-sync apic's tpr if the APIC is in userspace */
+		kvm_lapic_set_tpr(vcpu, kvm_run->cr8);
 
 	if (vcpu->pio.cur_count) {
 		r = complete_pio(vcpu);
@@ -2042,12 +2185,12 @@ static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 	sregs->cr2 = vcpu->cr2;
 	sregs->cr3 = vcpu->cr3;
 	sregs->cr4 = vcpu->cr4;
-	sregs->cr8 = vcpu->cr8;
 	sregs->efer = vcpu->shadow_efer;
-	sregs->apic_base = vcpu->apic_base;
 
-	memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
-	       sizeof sregs->interrupt_bitmap);
+	kvm_lapic_save(vcpu, sregs);
+
+	if (!vcpu->kvm->enable_kernel_pic)
+		kvm_user_irqdev_save(&vcpu->irq.dev, &sregs->interrupt_bitmap);
 
 	vcpu_put(vcpu);
 
@@ -2064,7 +2207,6 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 				    struct kvm_sregs *sregs)
 {
 	int mmu_reset_needed = 0;
-	int i;
 	struct descriptor_table dt;
 
 	vcpu_load(vcpu);
@@ -2080,14 +2222,10 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
 	vcpu->cr3 = sregs->cr3;
 
-	vcpu->cr8 = sregs->cr8;
-
 	mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
 #ifdef CONFIG_X86_64
 	kvm_arch_ops->set_efer(vcpu, sregs->efer);
 #endif
-	vcpu->apic_base = sregs->apic_base;
-
 	kvm_arch_ops->decache_cr4_guest_bits(vcpu);
 
 	mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
@@ -2101,12 +2239,11 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	if (mmu_reset_needed)
 		kvm_mmu_reset_context(vcpu);
 
-	memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
-	       sizeof vcpu->irq_pending);
-	vcpu->irq_summary = 0;
-	for (i = 0; i < NR_IRQ_WORDS; ++i)
-		if (vcpu->irq_pending[i])
-			__set_bit(i, &vcpu->irq_summary);
+	kvm_lapic_restore(vcpu, sregs);
+
+	if (!vcpu->kvm->enable_kernel_pic)
+		kvm_user_irqdev_restore(&vcpu->irq.dev,
+					&sregs->interrupt_bitmap[0]);
 
 	set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
 	set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
@@ -2267,14 +2404,8 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 {
 	if (irq->irq < 0 || irq->irq >= 256)
 		return -EINVAL;
-	vcpu_load(vcpu);
-
-	set_bit(irq->irq, vcpu->irq_pending);
-	set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
-
-	vcpu_put(vcpu);
 
-	return 0;
+	return kvm_irqdevice_set_pin(&vcpu->irq.dev, irq->irq, 1);
 }
 
 static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
@@ -2376,6 +2507,171 @@ out1:
 }
 
 /*
+ * The vCPU has executed a HLT instruction with in-kernel mode enabled.
+ */
+static int kvm_vcpu_kern_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	int r = 1;
+
+	spin_lock_irq(&vcpu->irq.lock);
+	__add_wait_queue(&vcpu->irq.wq, &wait);
+
+	/*
+	 * We will block until either an interrupt or a signal wakes us up
+	 */
+	while(!__kvm_vcpu_irq_pending(vcpu)
+	      && !signal_pending(current)
+	      && !kvm_run->request_interrupt_window) {
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		spin_unlock_irq(&vcpu->irq.lock);
+		vcpu_put(vcpu);
+
+		schedule();
+
+		vcpu_load(vcpu);
+		spin_lock_irq(&vcpu->irq.lock);
+	}
+
+	/*
+	 * If userspace is waiting for an injection point, we can't sleep here
+	 */
+	if (kvm_run->request_interrupt_window
+	    && !__kvm_vcpu_irq_pending(vcpu)) {
+		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
+		r = 0;
+	}
+
+	__remove_wait_queue(&vcpu->irq.wq, &wait);
+	__set_current_state(TASK_RUNNING);
+	spin_unlock_irq(&vcpu->irq.lock);
+
+	return r;
+}
+
+/*
+ * The vCPU has executed a HLT instruction.
+ */
+int kvm_vcpu_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	int r = -EINVAL;
+
+	++vcpu->stat.halt_exits;
+
+	if (vcpu->kvm->enable_kernel_pic)
+		/*
+		 * If the in-kernel PIC is enabled, we will perform HLT
+		 * in-kernel as well
+		 */
+		r = kvm_vcpu_kern_halt(vcpu, kvm_run);
+	else {
+		/*
+		 * Else, we decide to go back to userspace or vmenter depending
+		 * on whether there are interrupts currently pending or not
+		 */
+		r = kvm_vcpu_irq_pending(vcpu) ? 1 : 0;
+		if (!r)
+			kvm_run->exit_reason = KVM_EXIT_HLT;
+	}
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
+
+/*
+ * This function is invoked whenever we want to interrupt a vcpu that is
+ * currently executing in guest-mode.  It currently is a no-op because
+ * the simple delivery of the IPI to execute this function accomplishes our
+ * goal: To cause a VMEXIT.  We pass the vcpu (which contains the
+ * vcpu->irq.task, etc) for future use
+ */
+static void kvm_vcpu_guest_intr(void *info)
+{
+#ifdef NOT_YET
+	struct kvm_vcpu *vcpu = (struct kvm_vcpu*)info;
+#endif
+}
+
+/*
+ * This function will be invoked whenever the vcpu->irq.dev raises its INTR
+ * line
+ */
+static void kvm_vcpu_intr(struct kvm_irqsink *this,
+			  struct kvm_irqdevice *dev,
+			  kvm_irqpin_t pin)
+{
+	struct kvm_vcpu *vcpu = (struct kvm_vcpu*)this->private;
+	int direct_ipi = -1;
+
+	++vcpu->stat.irq_posted;
+
+	spin_lock_irq(&vcpu->irq.lock);
+
+	if (!test_bit(pin, &vcpu->irq.pending)) {
+		/*
+		 * Record the change..
+		 */
+		__set_bit(pin, &vcpu->irq.pending);
+
+		/*
+		 * then wake up the vcpu (if necessary)
+		 */
+		if (vcpu->irq.guest_cpu != -1) {
+			/*
+			 * If we are in guest mode, we must send a host-IPI
+			 * to the CPU which is running the guest to cause
+			 * a VMEXIT.
+			 */
+			direct_ipi = vcpu->irq.guest_cpu;
+			BUG_ON(direct_ipi == smp_processor_id());
+			++vcpu->stat.guest_preempt;
+		}
+
+		/*
+		 * If the CPU is halted it will be waiting for a wake-up
+		 */
+		if (waitqueue_active(&vcpu->irq.wq)) {
+			wake_up_interruptible_sync(&vcpu->irq.wq);
+			set_tsk_need_resched(current);
+			++vcpu->stat.halt_wakeup;
+		}
+
+	} else
+		++vcpu->stat.irq_ignored;
+
+
+	spin_unlock_irq(&vcpu->irq.lock);
+
+	/*
+	 * we can safely send the IPI outside of the lock-scope because the
+	 * irq.pending has already been updated.  This code assumes that
+	 * userspace will not sleep on anything other than HLT instructions.
+	 * HLT is covered in a race-free way because irq.pending was updated
+	 * in the critical section, and handle_halt() checks if any
+	 * interrupts are pending before returning to userspace.
+	 *
+	 * If it turns out that userspace can sleep on conditions other than
+	 * HLT, this code will need to be enhanced to allow the irq.pending
+	 * flags to be exported to userspace
+	 */
+	if (direct_ipi != -1)
+		smp_call_function_single(direct_ipi,
+					 kvm_vcpu_guest_intr,
+					 vcpu, 0, 0);
+}
+
+static void kvm_vcpu_irqsink_init(struct kvm_vcpu *vcpu)
+{
+	struct kvm_irqsink sink = {
+		.set_intr   = kvm_vcpu_intr,
+		.private    = vcpu
+	};
+
+	kvm_irqdevice_register_sink(&vcpu->irq.dev, &sink);
+}
+
+/*
  * Creates some virtual cpus.  Good luck creating more than one.
  */
 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
@@ -2422,6 +2718,17 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
 	if (r < 0)
 		goto out_free_vcpus;
 
+	kvm_irqdevice_init(&vcpu->irq.dev);
+	kvm_vcpu_irqsink_init(vcpu);
+
+	if (kvm->enable_kernel_pic)
+		r = kvm_kernint_init(vcpu);
+	else
+		r = kvm_userint_init(vcpu);
+
+	if (r < 0)
+		goto out_free_vcpus;
+
 	kvm_arch_ops->vcpu_load(vcpu);
 	r = kvm_mmu_setup(vcpu);
 	if (r >= 0)
@@ -2567,6 +2874,12 @@ static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 	return 0;
 }
 
/* KVM_APIC_RESET ioctl backend: put the local APIC into power-on state. */
static int kvm_vcpu_ioctl_apic_reset(struct kvm_vcpu *vcpu)
{
	kvm_lapic_reset(vcpu);

	return 0;	/* reset cannot fail */
}
+
 static long kvm_vcpu_ioctl(struct file *filp,
 			   unsigned int ioctl, unsigned long arg)
 {
@@ -2736,6 +3049,13 @@ static long kvm_vcpu_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
+	case KVM_APIC_RESET: {
+		r = kvm_vcpu_ioctl_apic_reset(vcpu);
+		if (r)
+			goto out;
+		r = 0;
+		break;
+	}
 	default:
 		;
 	}
@@ -2789,6 +3109,41 @@ static long kvm_vm_ioctl(struct file *filp,
 			goto out;
 		break;
 	}
+	case KVM_ENABLE_KERNEL_PIC: {
+		__u32 val;
+
+		r = -EFAULT;
+		if (copy_from_user(&val, argp, sizeof val))
+			goto out;
+		r = kvm_vm_ioctl_enable_kernel_pic(kvm, val);
+		if (r)
+			goto out;
+		break;
+	}
+	case KVM_ISA_INTERRUPT: {
+		struct kvm_interrupt irq;
+
+		r = -EFAULT;
+		if (copy_from_user(&irq, argp, sizeof irq))
+			goto out;
+		r = kvm_vm_ioctl_isa_interrupt(kvm, &irq);
+		if (r)
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_APIC_MSG: {
+		struct kvm_apic_msg msg;
+
+		r = -EFAULT;
+		if (copy_from_user(&msg, argp, sizeof msg))
+			goto out;
+		r = kvm_vm_ioctl_apic_msg(kvm, &msg);
+		if (r)
+			goto out;
+		r = 0;
+		break;
+	}
 	default:
 		;
 	}
@@ -2920,12 +3275,21 @@ static long kvm_dev_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
-	case KVM_CHECK_EXTENSION:
-		/*
-		 * No extensions defined at present.
-		 */
-		r = 0;
+	case KVM_CHECK_EXTENSION: {
+		int ext = (long)argp;
+
+		switch (ext) {
+		case KVM_ISA_INTERRUPT:
+		case KVM_APIC_MSG:
+		case KVM_APIC_RESET:
+			r = 1;
+			break;
+		default:
+			r = 0;
+			break;
+		}
 		break;
+	}
 	case KVM_GET_VCPU_MMAP_SIZE:
 		r = -EINVAL;
 		if (arg)
@@ -3031,6 +3395,43 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
 	return NOTIFY_OK;
 }
 
+void kvm_io_bus_init(struct kvm_io_bus *bus)
+{
+	memset(bus, 0, sizeof(*bus));
+}
+
+void kvm_io_bus_destroy(struct kvm_io_bus *bus)
+{
+	int i;
+
+	for (i = 0; i < bus->dev_count; i++) {
+		struct kvm_io_device *pos = bus->devs[i];
+
+		kvm_iodevice_destructor(pos);
+	}
+}
+
+struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
+{
+	int i;
+
+	for (i = 0; i < bus->dev_count; i++) {
+		struct kvm_io_device *pos = bus->devs[i];
+
+		if (pos->in_range(pos, addr))
+			return pos;
+	}
+
+	return NULL;
+}
+
+void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
+{
+	BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
+
+	bus->devs[bus->dev_count++] = dev;
+}
+
 static struct notifier_block kvm_cpu_notifier = {
 	.notifier_call = kvm_cpu_hotplug,
 	.priority = 20, /* must be > scheduler priority */
diff --git a/drivers/kvm/lapic.c b/drivers/kvm/lapic.c
new file mode 100644
index 0000000..602e94c
--- /dev/null
+++ b/drivers/kvm/lapic.c
@@ -0,0 +1,1435 @@
+/*
+ * Local APIC virtualization
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2007 Novell
+ *
+ * Authors:
+ *   Dor Laor <dor.laor@qumranet.com>
+ *   Gregory Haskins <ghaskins@novell.com>
+ *
+ * Based on Xen 3.0 code, Copyright (c) 2004, Intel Corporation.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#include "kvm.h"
+#include <linux/kvm.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/smp.h>
+#include <linux/hrtimer.h>
+#include <asm/processor.h>
+#include <asm/io.h>
+#include <asm/msr.h>
+#include <asm/page.h>
+#include <asm/current.h>
+
+/*XXX remove this definition after GFW enabled */
+#define APIC_NO_BIOS
+
+#define PRId64 "d"
+#define PRIx64 "llx"
+#define PRIu64 "u"
+#define PRIo64 "o"
+
+#define APIC_BUS_CYCLE_NS 1
+
+/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
+#define apic_debug(fmt,arg...)
+
/*
 * In-kernel local APIC instance, one per vcpu.  Fields are protected
 * by @lock unless noted otherwise.
 */
struct kvm_kern_apic {
	spinlock_t              lock;
	atomic_t                ref_count;	/* apic_dropref() frees at zero */
	int                     usermode;	/* non-zero: APIC emulated in userspace */
	u32                     status;		/* _APIC_GLOB_DISABLE etc. bit flags */
	u32                     vcpu_id;
	u64                     base_msr;	/* raw IA32_APICBASE value */
	unsigned long           base_address;	/* MMIO window base, derived from base_msr */
	struct kvm_io_device    mmio_dev;	/* hook into the kvm_io_bus */
	struct {
		unsigned long   pending;	/* timer ticks accumulated by the callback */
		u32             divide_count;	/* decoded from APIC_TDCR writes */
		ktime_t         last_update;	/* reference point for TMCCT computation */
		struct hrtimer  dev;
	} timer;
	u32                     err_status;	/* accumulated ESR bits */
	u32                     err_write_count;/* ESR write toggle: second write clears */
	struct kvm_vcpu         *vcpu;
	struct kvm_irqdevice    *irq_dev;	/* where accepted interrupts are signalled */
	struct page             *regs_page;	/* backing page for @regs */
	void                    *regs;		/* 4K APIC register window */
};
+
+static __inline__ int find_highest_bit(unsigned long *data, int nr_bits)
+{
+	int length = BITS_TO_LONGS(nr_bits);
+	while (length && !data[--length])
+		continue;
+	return __ffs(data[length]) + (length * BITS_PER_LONG);
+}
+
#define APIC_LVT_NUM			6
/* 14 is the version for Xeon and Pentium 8.4.8*/
#define APIC_VERSION			(0x14UL | ((APIC_LVT_NUM - 1) << 16))
#define VLOCAL_APIC_MEM_LENGTH		(1 << 12)
/* followed define is not in apicdef.h */
#define APIC_SHORT_MASK			0xc0000
#define APIC_DEST_NOSHORT		0x0
#define APIC_DEST_MASK			0x800
/* _APIC_* are bit numbers in kvm_kern_apic.status; *_MASK are values */
#define _APIC_GLOB_DISABLE		0x0
#define APIC_GLOB_DISABLE_MASK		0x1
#define APIC_SOFTWARE_DISABLE_MASK	0x2
#define _APIC_BSP_ACCEPT_PIC		0x3
#define MAX_APIC_INT_VECTOR             256

#define inject_gp(vcpu) kvm_arch_ops->inject_gp(vcpu, 0);

/* APIC delivers only when neither globally nor software disabled */
#define apic_enabled(apic)              \
	(!((apic)->status &                   \
	   (APIC_GLOB_DISABLE_MASK | APIC_SOFTWARE_DISABLE_MASK)))

#define apic_global_enabled(apic)       \
	(!(test_bit(_APIC_GLOB_DISABLE, &(apic)->status)))

/* NOTE(review): LVT_MASK/LINT_MASK expansions are not parenthesized;
 * safe only while used as plain values, as they are below. */
#define LVT_MASK \
	APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK

#define LINT_MASK   \
	LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY |\
	APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER

#define KVM_APIC_ID(apic)   \
	(GET_APIC_ID(apic_get_reg(apic, APIC_ID)))

#define apic_lvt_enabled(apic, lvt_type)    \
	(!(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED))

#define apic_lvt_vector(apic, lvt_type)     \
	(apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK)

#define apic_lvt_dm(apic, lvt_type)           \
	(apic_get_reg(apic, lvt_type) & APIC_MODE_MASK)

#define apic_lvtt_period(apic)     \
	(apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC)
+
/* Read a 32-bit register from the APIC register page (no locking). */
static inline u32 apic_get_reg(struct kvm_kern_apic *apic, u32 reg)
{
	return *((u32 *)(apic->regs + reg));
}
+
/* Write a 32-bit register in the APIC register page (no locking). */
static inline void apic_set_reg(struct kvm_kern_apic *apic,
				u32 reg, u32 val)
{
	*((u32 *)(apic->regs + reg)) = val;
}
+
/* Writable-bit masks for the LVT registers, indexed in register order
 * (APIC_LVTT .. APIC_LVTERR, 0x10 apart). */
static unsigned int apic_lvt_mask[APIC_LVT_NUM] =
{
	LVT_MASK | APIC_LVT_TIMER_PERIODIC, 	/* LVTT */
	LVT_MASK | APIC_MODE_MASK, 		/* LVTTHMR */
	LVT_MASK | APIC_MODE_MASK, 		/* LVTPC */
	LINT_MASK, LINT_MASK, 			/* LVT0-1 */
	LVT_MASK 				/* LVTERR */
};
+
/* Log-and-BUG assertion; evaluates @x exactly once. */
#define ASSERT(x)  							     \
	if (!(x)) { 							     \
		printk(KERN_EMERG "assertion failed %s: %d: %s\n",           \
		       __FILE__, __LINE__, #x);                              \
		BUG();                                                       \
	}
+
+static int apic_find_highest_irr(struct kvm_kern_apic *apic)
+{
+	int result;
+
+	result = find_highest_bit((unsigned long *)(apic->regs + APIC_IRR),
+				  MAX_APIC_INT_VECTOR);
+
+	ASSERT( result == 0 || result >= 16);
+
+	return result;
+}
+
+
+static int apic_find_highest_isr(struct kvm_kern_apic *apic)
+{
+	int result;
+
+	result = find_highest_bit((unsigned long *)(apic->regs + APIC_ISR),
+				  MAX_APIC_INT_VECTOR);
+
+	ASSERT( result == 0 || result >= 16);
+
+	return result;
+}
+
+static void apic_dropref(struct kvm_kern_apic *apic)
+{
+	if (atomic_dec_and_test(&apic->ref_count)) {
+
+		spin_lock_bh(&apic->lock);
+
+		hrtimer_cancel(&apic->timer.dev);
+
+		if (apic->regs_page) {
+			__free_page(apic->regs_page);
+			apic->regs_page = 0;
+		}
+
+		spin_unlock_bh(&apic->lock);
+
+		kfree(apic);
+	}
+}
+
/* Debug-only helper that dumps the full APIC state to the kernel log.
 * Compiled out by default; flip the #if to enable. */
#if 0
static void apic_dump_state(struct kvm_kern_apic *apic)
{
	u64 *tmp;

	printk(KERN_INFO "%s begin\n", __FUNCTION__);

	printk(KERN_INFO "status = 0x%08x\n", apic->status);
	printk(KERN_INFO "base_msr=0x%016llx, apicbase = 0x%08lx\n",
	       apic->base_msr, apic->base_address);

	/* the 256-bit IRR/ISR/TMR bitmaps, highest word first */
	tmp = (u64*)(apic->regs + APIC_IRR);
	printk(KERN_INFO "IRR = 0x%016llx 0x%016llx 0x%016llx 0x%016llx\n",
	       tmp[3], tmp[2], tmp[1], tmp[0]);
	tmp = (u64*)(apic->regs + APIC_ISR);
	printk(KERN_INFO "ISR = 0x%016llx 0x%016llx 0x%016llx 0x%016llx\n",
	       tmp[3], tmp[2], tmp[1], tmp[0]);
	tmp = (u64*)(apic->regs + APIC_TMR);
	printk(KERN_INFO "TMR = 0x%016llx 0x%016llx 0x%016llx 0x%016llx\n",
	       tmp[3], tmp[2], tmp[1], tmp[0]);

	printk(KERN_INFO "APIC_ID=0x%08x\n", apic_get_reg(apic, APIC_ID));
	printk(KERN_INFO "APIC_TASKPRI=0x%08x\n",
	       apic_get_reg(apic, APIC_TASKPRI) & 0xff);
	printk(KERN_INFO "APIC_PROCPRI=0x%08x\n",
	       apic_get_reg(apic, APIC_PROCPRI));

	printk(KERN_INFO "APIC_DFR=0x%08x\n",
	       apic_get_reg(apic, APIC_DFR) | 0x0FFFFFFF);
	printk(KERN_INFO "APIC_LDR=0x%08x\n",
	       apic_get_reg(apic, APIC_LDR) & APIC_LDR_MASK);
	printk(KERN_INFO "APIC_SPIV=0x%08x\n",
	       apic_get_reg(apic, APIC_SPIV) & 0x3ff);
	printk(KERN_INFO "APIC_ESR=0x%08x\n",
	       apic_get_reg(apic, APIC_ESR));
	printk(KERN_INFO "APIC_ICR=0x%08x\n",
	       apic_get_reg(apic, APIC_ICR) & ~(1 << 12));
	printk(KERN_INFO "APIC_ICR2=0x%08x\n",
	       apic_get_reg(apic, APIC_ICR2) & 0xff000000);

	printk(KERN_INFO "APIC_LVTERR=0x%08x\n",
	       apic_get_reg(apic, APIC_LVTERR));
	printk(KERN_INFO "APIC_LVT1=0x%08x\n",
	       apic_get_reg(apic, APIC_LVT1));
	printk(KERN_INFO "APIC_LVT0=0x%08x\n",
	       apic_get_reg(apic, APIC_LVT0));
	printk(KERN_INFO "APIC_LVTPC=0x%08x\n",
	       apic_get_reg(apic, APIC_LVTPC));
	printk(KERN_INFO "APIC_LVTTHMR=0x%08x\n",
	       apic_get_reg(apic, APIC_LVTTHMR));
	printk(KERN_INFO "APIC_LVTT=0x%08x\n",
	       apic_get_reg(apic, APIC_LVTT));

	printk(KERN_INFO "APIC_TMICT=0x%08x\n",
	       apic_get_reg(apic, APIC_TMICT));
	printk(KERN_INFO "APIC_TDCR=0x%08x\n",
	       apic_get_reg(apic, APIC_TDCR));

	printk(KERN_INFO "%s end\n", __FUNCTION__);
}
#endif
+
+
/*
 * Recompute APIC_PROCPRI from the current TPR and the highest
 * in-service vector.  Returns non-zero when the new (lower) priority
 * uncovers a pending IRR vector that the old PPR was masking, i.e.
 * the caller should re-signal the interrupt pin.
 * Caller holds apic->lock.
 */
static int apic_update_ppr(struct kvm_kern_apic *apic)
{
	u32 tpr, isrv, ppr, orig_ppr;
	int irq;
	int masked = 0;
	int forward = 0;

	ppr = apic_get_reg(apic, APIC_PROCPRI);
	orig_ppr = ppr;

	/*
	 * Before we change anything, see if the only pending vectors we have
	 * are anything masked by PPR
	 */
	irq = apic_find_highest_irr(apic);
	if (irq && ((irq & 0xf0) <= ppr))
		masked = true;

	/*
	 * Compute the PPR value based on the current settings of TPR/ISR
	 */
	tpr = apic_get_reg(apic, APIC_TASKPRI);
	irq = apic_find_highest_isr(apic);
	isrv = (irq >> 4) & 0xf;

	/* PPR is the max of the TPR class and the in-service class */
	if ((tpr >> 4) >= isrv)
		ppr = tpr & 0xff;
	else
		ppr = isrv << 4;  /* low 4 bits of PPR have to be cleared */

	apic_set_reg(apic, APIC_PROCPRI, ppr);

	if (masked) {
		/*
		 * If we get here its because there were vectors that
		 * were masked by PPR.  Check again to see if anything is
		 * now available
		 */
		irq = apic_find_highest_irr(apic);
		if ((irq & 0xf0) > ppr)
			forward = 1;
	}

	apic_debug("%s: ppr 0x%x (old) 0x%x (new), isr 0x%x, isrv 0x%x\n",
	       __FUNCTION__, orig_ppr, ppr, irq, isrv);

	return forward;
}
+
/*
 * Store a new task priority and recompute PPR.  If lowering the
 * priority unmasks a pending vector, re-signal the local interrupt
 * pin; the lock is dropped around that call since the irqdevice path
 * takes its own locks.  Caller holds apic->lock.
 */
static void apic_set_tpr(struct kvm_kern_apic *apic, u32 tpr)
{
	int forward = 0;

	apic_debug("new value = %x\n", tpr);

	apic_set_reg(apic, APIC_TASKPRI, tpr);
	forward = apic_update_ppr(apic);

	if (forward) {
		spin_unlock_bh(&apic->lock);
		kvm_irqdevice_set_intr(apic->irq_dev, kvm_irqpin_localint);
		spin_lock_bh(&apic->lock);
	}
}
+
/*
 * Does @target accept a message addressed to @dest?  Physical mode
 * compares against the APIC ID; logical mode matches @dest against
 * the LDR, honouring flat vs. cluster DFR.  Lowest-priority broadcast
 * in cluster mode is unsupported and crashes the guest.
 * Returns non-zero on a match.
 */
static int apic_match_dest(struct kvm_kern_apic *target,
			   int dest,
			   int dest_mode,
			   int delivery_mode)
{
	int result = 0;

	spin_lock_bh(&target->lock);

	if (!dest_mode) /* Physical */
		result = (GET_APIC_ID(apic_get_reg(target, APIC_ID)) == dest);
	else { /* Logical */
		u32 ldr = apic_get_reg(target, APIC_LDR);

		/* Flat mode */
		if (apic_get_reg(target, APIC_DFR) == APIC_DFR_FLAT)
			result = GET_APIC_LOGICAL_ID(ldr) & dest;
		else {
			/* Cluster mode: high nibble selects the cluster,
			 * low nibble is a bitmask within it */
			if ((delivery_mode == APIC_DM_LOWEST) &&
			    (dest == 0xff)) {
				printk(KERN_ALERT "Broadcast IPI " \
				       "with lowest priority "
				       "delivery mode\n");
				spin_unlock_bh(&target->lock);
				kvm_crash_guest(target->vcpu->kvm);
				return 0;
			}
			if (GET_APIC_LOGICAL_ID(ldr) == (dest & 0xf))
				result = (GET_APIC_LOGICAL_ID(ldr) >> 4) &
					(dest >> 4);
			else
				result = 0;
		}
	}

	spin_unlock_bh(&target->lock);

	return result;
}
+
/*
 * Add a pending IRQ into lapic.
 * Return 1 if successfully added and 0 if discarded.
 *
 * Caller holds apic->lock.  The lock is dropped temporarily around
 * every call back out of the APIC (irqdevice signalling, the reset
 * triggered by an asserted INIT, crashing the guest) and re-taken
 * before returning.
 */
static int __apic_accept_irq(struct kvm_kern_apic *apic,
			     int delivery_mode,
			     int vector,
			     int level,
			     int trig_mode)
{
	kvm_irqpin_t pin = kvm_irqpin_invalid;

	switch (delivery_mode) {
	case APIC_DM_FIXED:
	case APIC_DM_LOWEST:
		if (unlikely(!apic_enabled(apic)))
			break;

		/* already pending: a level-triggered repeat is dropped */
		if (test_and_set_bit(vector, apic->regs + APIC_IRR)
		    && trig_mode) {
			apic_debug("level trig mode repeatedly for vector " \
				   "%d\n", vector);
			break;
		}

		if (trig_mode) {
			apic_debug("level trig mode for vector %d\n", vector);
			set_bit(vector, apic->regs + APIC_TMR);
		}

		apic_debug("FIXED/LOWEST interrupt for vector %d\n", vector);
		pin = kvm_irqpin_localint;
		break;
	case APIC_DM_REMRD:
		printk(KERN_WARNING "%s: Ignore deliver mode %d\n",
		       __FUNCTION__, delivery_mode);
		break;
	case APIC_DM_EXTINT:
		apic_debug("EXTINT interrupt\n");
		pin  = kvm_irqpin_extint;
		break;
	case APIC_DM_SMI:
		apic_debug("SMI interrupt\n");
		pin = kvm_irqpin_smi;
		break;
	case APIC_DM_NMI:
		apic_debug("NMI interrupt\n");
		pin = kvm_irqpin_nmi;
		break;
	case APIC_DM_INIT:
		apic_debug("INIT interrupt\n");
		/* asserted INIT resets the target APIC */
		if (level) {
			spin_unlock_bh(&apic->lock);
			kvm_lapic_reset(apic->vcpu);
			spin_lock_bh(&apic->lock);
		}
		break;
	case APIC_DM_STARTUP: /* FIXME: currently no support for SMP */
	default:
		printk(KERN_ALERT "TODO: support interrupt type %x\n",
		       delivery_mode);
		spin_unlock_bh(&apic->lock);
		kvm_crash_guest(apic->vcpu->kvm);
		spin_lock_bh(&apic->lock);
		break;
	}

	if (likely(pin != kvm_irqpin_invalid)) {
		/*
		 * temp release of the lock to transmit
		 */
		spin_unlock_bh(&apic->lock);
		kvm_irqdevice_set_intr(apic->irq_dev, pin);
		spin_lock_bh(&apic->lock);

		return 1;
	} else
		return 0;
}
+
+static int apic_accept_irq(struct kvm_kern_apic *apic,
+			   int delivery_mode,
+			   int vector,
+			   int level,
+			   int trig_mode)
+{
+	int ret;
+
+	spin_lock_bh(&apic->lock);
+	ret = __apic_accept_irq(apic, delivery_mode, vector,
+				level, trig_mode);
+	spin_unlock_bh(&apic->lock);
+
+	return ret;
+}
+
/*
 * Handle a write to the EOI register: retire the highest in-service
 * vector, clear its TMR bit and recompute PPR.  If the drop in
 * priority exposes a pending vector, re-signal the interrupt pin
 * (lock dropped around that call).  Caller holds apic->lock.
 */
static void apic_set_eoi(struct kvm_kern_apic *apic)
{
	int vector = apic_find_highest_isr(apic);
	int forward;

	/*
	 * Not every EOI write has a corresponding ISR bit; one example
	 * is the kernel's timer check in setup_IO_APIC.
	 */
	if (!vector)
		return;

	__clear_bit(vector, apic->regs + APIC_ISR);
	forward = apic_update_ppr(apic);

	__clear_bit(vector, apic->regs + APIC_TMR);

	if (forward) {
		spin_unlock_bh(&apic->lock);
		kvm_irqdevice_set_intr(apic->irq_dev, kvm_irqpin_localint);
		spin_lock_bh(&apic->lock);
	}
}
+
/*
 * Fixed-delivery vectors below 16 are illegal: latch 0x40 into the
 * pending ESR bits and raise the LVTERR interrupt.  Returns 1 when
 * the (dm, vector) pair is acceptable, 0 otherwise.  Caller holds
 * apic->lock (__apic_accept_irq drops it temporarily).
 */
static int apic_check_vector(struct kvm_kern_apic *apic,u32 dm, u32 vector)
{
	if ((dm == APIC_DM_FIXED) && (vector < 16)) {
		apic->err_status |= 0x40;
		__apic_accept_irq(apic, APIC_DM_FIXED,
				  apic_lvt_vector(apic, APIC_LVTERR), 0, 0);
		apic_debug("%s: check failed "
		       " dm %x vector %x\n", __FUNCTION__, dm, vector);
		return 0;
	}
	return 1;
}
+
+int kvm_apicbus_send(struct kvm *kvm, int dest, int trig_mode, int level,
+		     int dest_mode, int delivery_mode, int vector)
+{
+	int i;
+	u32 lpr_map = 0;
+
+	apic_debug("%s: %d %d %d %d %d %d\n", __FUNCTION__,
+		   dest, trig_mode, level, dest_mode, delivery_mode, vector);
+
+	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+		struct kvm_kern_apic *target;
+		target = kvm->vcpus[i].apic.dev;
+
+		if (!target)
+			continue;
+
+		if (apic_match_dest(target, dest, dest_mode, delivery_mode)) {
+			if (delivery_mode == APIC_DM_LOWEST)
+				__set_bit(target->vcpu_id, &lpr_map);
+			else
+				apic_accept_irq(target, delivery_mode,
+						vector, level, trig_mode);
+		}
+	}
+
+	if (delivery_mode == APIC_DM_LOWEST) {
+		struct kvm_kern_apic *target;
+
+		/* Currently only UP is supported */
+		target = kvm->vcpus[0].apic.dev;
+
+		if (target)
+			apic_accept_irq(target, delivery_mode,
+					vector, level, trig_mode);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_apicbus_send);
+
+static void apic_send_ipi(struct kvm_kern_apic *apic)
+{
+	u32 icr_low = apic_get_reg(apic, APIC_ICR);
+	u32 icr_high = apic_get_reg(apic, APIC_ICR2);
+
+	unsigned int dest =          GET_APIC_DEST_FIELD(icr_high);
+	unsigned int short_hand =    icr_low & APIC_SHORT_MASK;
+	unsigned int trig_mode =     icr_low & APIC_INT_LEVELTRIG;
+	unsigned int level =         icr_low & APIC_INT_ASSERT;
+	unsigned int dest_mode =     icr_low & APIC_DEST_MASK;
+	unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
+	unsigned int vector =        icr_low & APIC_VECTOR_MASK;
+
+	apic_debug("icr_high 0x%x, icr_low 0x%x, "
+		 "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
+		 "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
+		 icr_high, icr_low, short_hand, dest,
+		 trig_mode, level, dest_mode, delivery_mode, vector);
+
+	/*
+	 * We unlock here because we would enter this function in a lock
+	 * state and we dont want to remain this way while we transmit
+	 */
+	spin_unlock_bh(&apic->lock);
+
+	switch (short_hand) {
+	case APIC_DEST_NOSHORT:
+		/*
+		 * If no short-hand notation is in use, just forward the
+		 * message onto the apicbus and let the bus handle the routing.
+		 */
+		kvm_apicbus_send(apic->vcpu->kvm, dest, trig_mode, level,
+				 dest_mode, delivery_mode, vector);
+		break;
+	case APIC_DEST_SELF:
+		apic_accept_irq(apic, delivery_mode, vector, level, trig_mode);
+		break;
+	default: {
+		/*
+		 * Otherwise we need to consider the short-hand to find the
+		 * correct targets.
+		 */
+		unsigned int i;
+
+		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+			struct kvm_kern_apic *target;
+			int result = 0;
+
+			target = apic->vcpu->kvm->vcpus[i].apic.dev;
+
+			if (!target)
+				continue;
+
+			switch (short_hand) {
+			case APIC_DEST_ALLINC:
+				result = 1;
+				break;
+
+			case APIC_DEST_ALLBUT:
+				if (target != apic)
+					result = 1;
+				break;
+			default:
+				kvm_crash_guest(apic->vcpu->kvm);
+				return;
+			}
+
+			if (result)
+				apic_accept_irq(target, delivery_mode,
+						vector, level, trig_mode);
+		}
+	}
+	}
+
+	/*
+	 * Relock before returning
+	 */
+	spin_lock_bh(&apic->lock);
+
+}
+
+static u32 apic_get_tmcct(struct kvm_kern_apic *apic)
+{
+	u32 counter_passed;
+	ktime_t passed, now = apic->timer.dev.base->get_time();
+	u32 tmcct = apic_get_reg(apic, APIC_TMCCT);
+
+	ASSERT(apic != NULL);
+
+	if (unlikely(ktime_to_ns(now) <=
+		     ktime_to_ns(apic->timer.last_update))) {
+		/* Wrap around */
+		passed = ktime_add(
+			({ (ktime_t){
+				.tv64 = KTIME_MAX -
+					 (apic->timer.last_update).tv64 };
+			}), now);
+		apic_debug("time elapsed\n");
+	} else
+		passed = ktime_sub(now, apic->timer.last_update);
+
+	counter_passed = ktime_to_ns(passed) /
+		(APIC_BUS_CYCLE_NS * apic->timer.divide_count);
+	tmcct -= counter_passed;
+
+	if (tmcct <= 0) {
+		if (unlikely(!apic_lvtt_period(apic))) {
+			tmcct =  0;
+		} else {
+			do {
+				tmcct += apic_get_reg(apic, APIC_TMICT);
+			} while ( tmcct <= 0 );
+		}
+	}
+
+	apic->timer.last_update = now;
+	apic_set_reg(apic, APIC_TMCCT, tmcct);
+
+	return tmcct;
+}
+
+/*
+ *----------------------------------------------------------------------
+ * MMIO
+ *----------------------------------------------------------------------
+ */
+
+#define align(val, len) (val & ~(len-1))
+
+static int validate_mmio(struct kvm_kern_apic *apic, gpa_t address, int len)
+{
+	/*
+	 * According to IA 32 Manual, all registers should be accessed with
+	 * 32 bits alignment.
+	 */
+	if (align(address, 4) != align(address+(len-1), 4)) {
+		printk(KERN_WARNING "KVM: MMIO request for %d bytes at " \
+		       "0x%lx is not 32 bit aligned.  Injecting #GP\n",
+		       len, address);
+		inject_gp(apic->vcpu);
+		return 0;
+	}
+
+	return 1;
+}
+
/*
 * Return the 32-bit register at dword-aligned @offset.  TMCCT is
 * recomputed on demand; an ESR read resets the ESR write toggle and
 * then returns the stored value; ARBPRI and offsets past APIC_TDCR
 * read as zero.  Caller holds apic->lock.
 */
static u32 __apic_read(struct kvm_kern_apic *apic,
				unsigned int offset)
{
	u32 val = 0;

	if (offset > APIC_TDCR)
		return 0;

	switch (offset) {
	case APIC_ARBPRI:
		printk(KERN_WARNING "access local APIC ARBPRI register " \
		       "which is for P6\n");
		break;

	case APIC_TMCCT:        /* Timer CCR */
		val = apic_get_tmcct(apic);
		break;

	case APIC_ESR:
		apic->err_write_count = 0;
		/* fall through */
	default:
		val = apic_get_reg(apic, offset);
		break;
	}

	return val;
}
+
/*
 * MMIO read handler.  The containing 32-bit register is read under
 * the lock, then the requested 1/2/4 bytes are copied out at the
 * access's byte offset.  Straddling or odd-length accesses inject #GP.
 */
static void apic_mmio_read(struct kvm_io_device *this,
			   gpa_t address,
			   int len,
			   void *data)
{
	struct kvm_kern_apic *apic = (struct kvm_kern_apic*)this->private;
	unsigned int          offset = address - apic->base_address;
	unsigned char         alignment = offset & 0x3;
	u32                   val;

	if (!validate_mmio(apic, address, len))
		return;

	spin_lock_bh(&apic->lock);
	val = __apic_read(apic, offset & ~0x3);
	spin_unlock_bh(&apic->lock);

	switch (len) {
	case 1:
	case 2:
	case 4:
		memcpy(data, (char*)((char*)&val + alignment), len);
		break;
	default:
		printk(KERN_ALERT "Local APIC read with len = %x, " \
		       "should be 1,2, or 4 instead\n", len);
		inject_gp(apic->vcpu);
		break;
	}
}
+
+static void apic_mmio_write(struct kvm_io_device *this,
+			    gpa_t address,
+			    int len,
+			    const void *data)
+{
+	struct kvm_kern_apic *apic = (struct kvm_kern_apic*)this->private;
+	unsigned int          offset = address - apic->base_address;
+	unsigned char         alignment = offset & 0x3;
+	u32                   val;
+
+	if (!validate_mmio(apic, address, len))
+		return;
+
+	spin_lock_bh(&apic->lock);
+
+	switch (len) {
+	case 1:
+	case 2: {
+		unsigned int tmp;
+
+		/*
+		 * Some kernels will access with byte/word alignment
+		 */
+		apic_debug("Notice: Local APIC write with len = %x\n", len);
+		tmp = __apic_read(apic, offset & ~0x3);
+		switch (len) {
+		case 1:
+			val = *(u8*)data;
+
+			val = (tmp & ~(0xff << (8*alignment))) |
+			      ((val & 0xff) << (8*alignment));
+			break;
+
+		case 2:
+			if (alignment != 0x0 && alignment != 0x2) {
+				printk(KERN_ALERT "alignment error for apic " \
+				       "with len == 2\n");
+				inject_gp(apic->vcpu);
+			}
+
+			/*
+			 * assumes 16 bit alignment on the pointer.
+			 * Mis-alignment is a host-side issue, however, so
+			 * we crash
+			 */
+			BUG_ON(((long)data & 0x1));
+
+			val = *(u16*)data;
+
+			val = (tmp & ~(0xffff << (8*alignment))) |
+			      ((val & 0xffff) << (8*alignment));
+			break;
+		}
+
+		break;
+	}
+	case 4:
+		memcpy(&val, data, 4);
+		break;
+	default:
+		printk(KERN_ALERT "Local APIC write with len = %x, " \
+		       "should be 1,2, or 4 instead\n", len);
+		inject_gp(apic->vcpu);
+		break;
+	}
+
+	/* too common printing */
+	if (offset != APIC_EOI)
+		apic_debug("%s: offset 0x%x with length 0x%x, and value is " \
+			 "0x%lx\n",
+		       __FUNCTION__, offset, len, val);
+
+	offset &= 0xff0;
+
+	switch (offset) {
+	case APIC_ID:   /* Local APIC ID */
+		apic_set_reg(apic, APIC_ID, val);
+		break;
+
+	case APIC_TASKPRI:
+		apic_set_tpr(apic, val & 0xff);
+		break;
+
+	case APIC_EOI:
+		apic_set_eoi(apic);
+		break;
+
+	case APIC_LDR:
+		apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
+		break;
+
+	case APIC_DFR:
+		apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
+		break;
+
+	case APIC_SPIV:
+		apic_set_reg(apic, APIC_SPIV, val & 0x3ff);
+		if (!(val & APIC_SPIV_APIC_ENABLED)) {
+			int i;
+			u32 lvt_val;
+
+			apic->status |= APIC_SOFTWARE_DISABLE_MASK;
+			for (i = 0; i < APIC_LVT_NUM; i++) {
+				lvt_val = apic_get_reg(apic,
+							   APIC_LVTT +
+							   0x10 * i);
+				apic_set_reg(apic, APIC_LVTT + 0x10 * i,
+						 lvt_val | APIC_LVT_MASKED);
+			}
+
+			if ((apic_get_reg(apic, APIC_LVT0) &
+			     APIC_MODE_MASK) == APIC_DM_EXTINT)
+				clear_bit(_APIC_BSP_ACCEPT_PIC, &apic->status);
+		} else {
+			apic->status &= ~APIC_SOFTWARE_DISABLE_MASK;
+			if ((apic_get_reg(apic, APIC_LVT0) &
+			     APIC_MODE_MASK) == APIC_DM_EXTINT)
+				set_bit(_APIC_BSP_ACCEPT_PIC, &apic->status);
+		}
+		break;
+
+	case APIC_ESR:
+		apic->err_write_count = !apic->err_write_count;
+		if (!apic->err_write_count)
+			apic->err_status = 0;
+		break;
+
+	case APIC_ICR:
+		/* No delay here, so we always clear the pending bit*/
+		apic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
+		apic_send_ipi(apic);
+		break;
+
+	case APIC_ICR2:
+		apic_set_reg(apic, APIC_ICR2, val & 0xff000000);
+		break;
+
+	case APIC_LVTT:
+	case APIC_LVTTHMR:
+	case APIC_LVTPC:
+	case APIC_LVT0:
+	case APIC_LVT1:
+	case APIC_LVTERR:
+	{
+		if (apic->status & APIC_SOFTWARE_DISABLE_MASK)
+			val |= APIC_LVT_MASKED;
+
+		val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4];
+		apic_set_reg(apic, offset, val);
+
+		/* On hardware, when write vector less than 0x20 will error */
+		if (!(val & APIC_LVT_MASKED))
+			apic_check_vector(apic, apic_lvt_dm(apic, offset),
+					  apic_lvt_vector(apic, offset));
+		if (!apic->vcpu_id && (offset == APIC_LVT0)) {
+			if ((val & APIC_MODE_MASK) == APIC_DM_EXTINT)
+				if (val & APIC_LVT_MASKED)
+					clear_bit(_APIC_BSP_ACCEPT_PIC,
+						  &apic->status);
+				else
+					set_bit(_APIC_BSP_ACCEPT_PIC,
+						&apic->status);
+			else
+				clear_bit(_APIC_BSP_ACCEPT_PIC,
+					  &apic->status);
+		}
+	}
+		break;
+
+	case APIC_TMICT:
+	{
+		ktime_t now = apic->timer.dev.base->get_time();
+		u32 offset;
+
+		apic_set_reg(apic, APIC_TMICT, val);
+		apic_set_reg(apic, APIC_TMCCT, val);
+		apic->timer.last_update = now;
+		offset = APIC_BUS_CYCLE_NS * apic->timer.divide_count * val;
+
+		/* Make sure the lock ordering is coherent */
+		spin_unlock_bh(&apic->lock);
+		hrtimer_cancel(&apic->timer.dev);
+		hrtimer_start(&apic->timer.dev,
+			      ktime_add_ns(now, offset),
+			      HRTIMER_MODE_ABS);
+
+		apic_debug("%s: bus cycle is %"PRId64"ns, now 0x%016"PRIx64", "
+			 "timer initial count 0x%x, offset 0x%x, "
+			 "expire @ 0x%016"PRIx64".\n", __FUNCTION__,
+			 APIC_BUS_CYCLE_NS, ktime_to_ns(now),
+			 apic_get_reg(apic, APIC_TMICT),
+			 offset, ktime_to_ns(ktime_add_ns(now, offset)));
+	}
+		return;
+
+	case APIC_TDCR:
+	{
+		unsigned int tmp1, tmp2;
+
+		tmp1 = val & 0xf;
+		tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
+		apic->timer.divide_count = 0x1 << (tmp2 & 0x7);
+
+		apic_set_reg(apic, APIC_TDCR, val);
+
+		apic_debug("timer divide count is 0x%x\n",
+		       apic->timer.divide_count);
+	}
+		break;
+
+	default:
+		printk(KERN_WARNING "Local APIC Write to read-only register\n");
+		break;
+	}
+
+	spin_unlock_bh(&apic->lock);
+}
+
+static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr)
+{
+	struct kvm_kern_apic *apic = (struct kvm_kern_apic*)this->private;
+	int ret = 0;
+
+	spin_lock_bh(&apic->lock);
+
+	if (apic_global_enabled(apic) &&
+	    (addr >= apic->base_address) &&
+	    (addr < (apic->base_address + VLOCAL_APIC_MEM_LENGTH)))
+		ret = 1;
+
+	spin_unlock_bh(&apic->lock);
+
+	return ret;
+}
+
/* MMIO device teardown: drop the reference taken in apic_mmio_register(). */
static void apic_mmio_destructor(struct kvm_io_device *this)
{
	struct kvm_kern_apic *apic = (struct kvm_kern_apic*)this->private;

	apic_dropref(apic);
}
+
/*
 * Wire this APIC into the MMIO subsystem: fill in the kvm_io_device
 * callbacks, take a reference (released by apic_mmio_destructor())
 * and publish the device on the vcpu.
 */
static void apic_mmio_register(struct kvm_kern_apic *apic)
{
	/* Register ourselves with the MMIO subsystem */
	struct kvm_io_device *dev = &apic->mmio_dev;

	dev->read       = apic_mmio_read;
	dev->write      = apic_mmio_write;
	dev->in_range   = apic_mmio_range;
	dev->destructor = apic_mmio_destructor;

	dev->private = apic;
	atomic_inc(&apic->ref_count);

	apic->vcpu->apic.mmio = dev;
}
+
+/*
+ *----------------------------------------------------------------------
+ * LAPIC interface
+ *----------------------------------------------------------------------
+ */
+
+void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, u64 cr8)
+{
+	struct kvm_kern_apic *apic = (struct kvm_kern_apic*)vcpu->apic.dev;
+
+	spin_lock_bh(&apic->lock);
+	apic_set_tpr(apic, ((cr8 & 0x0f) << 4));
+	spin_unlock_bh(&apic->lock);
+}
+
+u64 kvm_lapic_get_tpr(struct kvm_vcpu *vcpu)
+{
+	struct kvm_kern_apic *apic = (struct kvm_kern_apic*)vcpu->apic.dev;
+	u64 tpr;
+
+	spin_lock_bh(&apic->lock);
+	tpr = (u64)apic_get_reg(apic, APIC_TASKPRI);
+	spin_unlock_bh(&apic->lock);
+
+	return (tpr & 0xf0) >> 4;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_get_tpr);
+
/*
 * MSR_IA32_APICBASE write handler.  Non-boot vcpus can never carry
 * the BSP flag.  The global-enable bit opens/closes the MMIO window
 * via the status bit checked by apic_mmio_range().
 */
void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
{
	struct kvm_kern_apic *apic = (struct kvm_kern_apic*)vcpu->apic.dev;

	spin_lock_bh(&apic->lock);
	if (apic->vcpu_id)
		value &= ~MSR_IA32_APICBASE_BSP;

	apic->base_msr = value;
	apic->base_address = apic->base_msr & MSR_IA32_APICBASE_BASE;

	/* with FSB delivery interrupt, we can restart APIC functionality */
	if (!(value & MSR_IA32_APICBASE_ENABLE))
		set_bit(_APIC_GLOB_DISABLE, &apic->status);
	else
		clear_bit(_APIC_GLOB_DISABLE, &apic->status);

	apic_debug("apic base msr is 0x%016"PRIx64", and base address is " \
		 "0x%lx.\n", apic->base_msr, apic->base_address);

	spin_unlock_bh(&apic->lock);
}
+
+u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
+{
+	struct kvm_kern_apic *apic = (struct kvm_kern_apic*)vcpu->apic.dev;
+	u64 base;
+
+	spin_lock_bh(&apic->lock);
+	base = apic->base_msr;
+	spin_unlock_bh(&apic->lock);
+
+	return base;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
+
/* Export the APIC state userspace currently consumes (TPR + base MSR). */
void kvm_lapic_save(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
	/*
	 * FIXME: This needs to support the entire register set when
	 * enabled
	 */
	sregs->cr8       = kvm_lapic_get_tpr(vcpu);
	sregs->apic_base = kvm_lapic_get_base(vcpu);
}
+
/* Mirror of kvm_lapic_save(): restore TPR and the base MSR from @sregs. */
void kvm_lapic_restore(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
	/*
	 * FIXME: This needs to support the entire register set when
	 * enabled
	 */
	kvm_lapic_set_tpr(vcpu, sregs->cr8);
	kvm_lapic_set_base(vcpu, sregs->apic_base);
}
+
/*
 * Bring the local APIC to its power-on/INIT state: all LVTs masked,
 * IRR/ISR/TMR cleared, default base address restored (BSP flag only
 * on vcpu 0), timer stopped.  Also invoked for an asserted INIT IPI
 * (see __apic_accept_irq) and from the KVM_APIC_RESET ioctl.
 */
void kvm_lapic_reset(struct kvm_vcpu *vcpu)
{
	struct kvm_kern_apic *apic;
	int i;

	apic_debug("%s\n", __FUNCTION__);

	ASSERT(vcpu);
	apic = vcpu->apic.dev;
	ASSERT(apic != NULL);

	/* Stop the timer in case it's a reset to an active apic */
	hrtimer_cancel(&apic->timer.dev);

	spin_lock_bh(&apic->lock);

	apic_set_reg(apic, APIC_ID, vcpu_slot(vcpu) << 24);
	apic_set_reg(apic, APIC_LVR, APIC_VERSION);

	/* all LVTs come up masked */
	for (i = 0; i < APIC_LVT_NUM; i++)
		apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);

	apic_set_reg(apic, APIC_DFR, 0xffffffffU);
	apic_set_reg(apic, APIC_SPIV, 0xff);
	apic_set_reg(apic, APIC_TASKPRI, 0);
	apic_set_reg(apic, APIC_LDR, 0);
	apic_set_reg(apic, APIC_ESR, 0);
	apic_set_reg(apic, APIC_ICR, 0);
	apic_set_reg(apic, APIC_ICR2, 0);
	apic_set_reg(apic, APIC_TDCR, 0);
	apic_set_reg(apic, APIC_TMICT, 0);
	memset((void*)(apic->regs + APIC_IRR), 0, KVM_IRQ_BITMAP_SIZE(u8));
	memset((void*)(apic->regs + APIC_ISR), 0, KVM_IRQ_BITMAP_SIZE(u8));
	memset((void*)(apic->regs + APIC_TMR), 0, KVM_IRQ_BITMAP_SIZE(u8));

	apic->base_msr =
		MSR_IA32_APICBASE_ENABLE |
		APIC_DEFAULT_PHYS_BASE;
	if (vcpu_slot(vcpu) == 0)
		apic->base_msr |= MSR_IA32_APICBASE_BSP;
	apic->base_address = apic->base_msr & MSR_IA32_APICBASE_BASE;

	apic->timer.divide_count = 0;
	apic->timer.pending = 0;
	apic->status = 0;

#ifdef APIC_NO_BIOS
	/*
	 * XXX According to mp specification, BIOS will enable LVT0/1,
	 * remove it after BIOS enabled
	 */
	if (!vcpu_slot(vcpu)) {
		apic_set_reg(apic, APIC_LVT0, APIC_MODE_EXTINT << 8);
		apic_set_reg(apic, APIC_LVT1, APIC_MODE_NMI << 8);
		set_bit(_APIC_BSP_ACCEPT_PIC, &apic->status);
	}
#endif

	spin_unlock_bh(&apic->lock);

	printk(KERN_INFO  "%s: vcpu=%p, id=%d, base_msr=" \
	       "0x%016"PRIx64", base_address=0x%0lx.\n", __FUNCTION__, vcpu,
	       GET_APIC_ID(apic_get_reg(apic, APIC_ID)),
	       apic->base_msr, apic->base_address);
}
+
+int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
+{
+	struct kvm_kern_apic *apic = (struct kvm_kern_apic*)vcpu->apic.dev;
+	int ret = 0;
+
+	spin_lock_bh(&apic->lock);
+	if (!apic->usermode)
+		ret = apic_enabled(apic);
+	spin_unlock_bh(&apic->lock);
+
+	return ret;
+}
+
+void *kvm_lapic_get_regs(struct kvm_vcpu *vcpu)
+{
+	struct kvm_kern_apic *apic = (struct kvm_kern_apic*)vcpu->apic.dev;
+	return apic->regs;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_get_regs);
+
+/*
+ *----------------------------------------------------------------------
+ * timer interface
+ *----------------------------------------------------------------------
+ */
+static int __apic_timer_fn(struct kvm_kern_apic *apic)
+{
+	u32 vector;
+	ktime_t now;
+	int result = 0;
+
+	if (unlikely(!apic_enabled(apic) ||
+		     !apic_lvt_enabled(apic, APIC_LVTT))) {
+		apic_debug("%s: time interrupt although apic is down\n",
+			 __FUNCTION__);
+		return 0;
+	}
+
+	vector                  = apic_lvt_vector(apic, APIC_LVTT);
+	now                     = apic->timer.dev.base->get_time();
+	apic->timer.last_update = now;
+	apic->timer.pending++;
+
+	__apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0);
+
+	if (apic_lvtt_period(apic)) {
+		u32 offset;
+		u32 tmict = apic_get_reg(apic, APIC_TMICT);
+
+		apic_set_reg(apic, APIC_TMCCT, tmict);
+		offset = APIC_BUS_CYCLE_NS * apic->timer.divide_count * tmict;
+
+		result = 1;
+		apic->timer.dev.expires = ktime_add_ns(now, offset);
+
+		apic_debug("%s: now 0x%016"PRIx64", expire @ 0x%016"PRIx64", "
+		       "timer initial count 0x%x, timer current count 0x%x.\n",
+		       __FUNCTION__,
+		       ktime_to_ns(now), ktime_add_ns(now, offset),
+		       apic_get_reg(apic, APIC_TMICT),
+	               apic_get_reg(apic, APIC_TMCCT));
+	} else {
+		apic_set_reg(apic, APIC_TMCCT, 0);
+		apic_debug("%s: now 0x%016"PRIx64", "
+		       "timer initial count 0x%x, timer current count 0x%x.\n",
+		       __FUNCTION__,
+		       ktime_to_ns(now), apic_get_reg(apic, APIC_TMICT),
+		       apic_get_reg(apic, APIC_TMCCT));
+	}
+
+	return result;
+}
+
+static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
+{
+	struct kvm_kern_apic *apic;
+	int restart_timer = 0;
+
+	apic = container_of(data, struct kvm_kern_apic, timer.dev);
+
+	spin_lock_bh(&apic->lock);
+	restart_timer = __apic_timer_fn(apic);
+	spin_unlock_bh(&apic->lock);
+
+	if (restart_timer)
+		return HRTIMER_RESTART;
+	else
+		return HRTIMER_NORESTART;
+}
+
+/*
+ *----------------------------------------------------------------------
+ * IRQDEVICE interface
+ *----------------------------------------------------------------------
+ */
+
+static int apic_irqdev_ack(struct kvm_irqdevice *this, int flags,
+			   struct kvm_irqack_data *data)
+{
+	struct kvm_kern_apic *apic = (struct kvm_kern_apic*)this->private;
+	int irq;
+
+	apic_debug("LAPIC ACK attempt\n");
+
+	spin_lock_bh(&apic->lock);
+
+	if (!apic_enabled(apic))
+		goto out;
+
+	if (!(flags & KVM_IRQACK_FLAG_PEEK)) {
+		irq = apic_find_highest_irr(apic);
+		if ((irq & 0xf0) > apic_get_reg(apic, APIC_PROCPRI)) {
+			BUG_ON (irq < 0x10);
+
+			__set_bit(irq, apic->regs + APIC_ISR);
+			__clear_bit(irq, apic->regs + APIC_IRR);
+			apic_update_ppr(apic);
+
+			/*
+			 * We have to special case the timer interrupt
+			 * because we want the vector to stay pending
+			 * for each tick of the clock, even for a backlog.
+			 * Therefore, if this was a timer vector and we
+			 * still have ticks pending, keep IRR set
+			 */
+			if (irq == apic_lvt_vector(apic, APIC_LVTT)) {
+				BUG_ON(!apic->timer.pending);
+				apic->timer.pending--;
+				if (apic->timer.pending)
+					__set_bit(irq, apic->regs + APIC_IRR);
+			}
+
+			data->flags |= KVM_IRQACKDATA_VECTOR_VALID;
+			data->vector = irq;
+		}
+		else
+			data->vector = -1;
+
+		apic_debug("ACK for vector %d\n", data->vector);
+	}
+
+	/*
+	 * See if there is anything still pending.  Don't forget that we may
+	 * have entered this function with PEEK just to check pending
+	 * status.  This is really the only way we could ever find something
+	 * still eligible, since otherwise we would have just injected
+	 * the highest priority vector above
+	 */
+	irq = apic_find_highest_irr(apic);
+	if ((irq & 0xf0) > apic_get_reg(apic, APIC_PROCPRI))
+		data->flags |= KVM_IRQACKDATA_VECTOR_PENDING;
+
+	if (irq) {
+		/*
+		 * We report the next pending vector here so that the system
+		 * can assess TPR thresholds for TPR-shadowing purposes
+		 * (if applicable)
+		 */
+		data->next   = irq;
+		data->flags |= KVM_IRQACKDATA_NEXT_VALID;
+	}
+
+ out:
+	spin_unlock_bh(&apic->lock);
+
+	return 0;
+}
+
+static int apic_irqdev_set_pin(struct kvm_irqdevice *this, int irq, int level)
+{
+	struct kvm_kern_apic *apic = (struct kvm_kern_apic*)this->private;
+	int lvt = 0;
+
+	spin_lock_bh(&apic->lock);
+
+	if (!apic_enabled(apic)) {
+		/*
+		 * If the LAPIC is disabled, we simply forward the interrupt
+		 * on to the output line
+		 */
+		__apic_accept_irq(apic, APIC_DM_EXTINT, 0, level, 1);
+		goto out;
+	}
+
+	/*
+	 * pin "0" is LINT0, and "1" is LINT1
+	 */
+	BUG_ON(irq > 1);
+
+	switch(irq) {
+	case 0:
+		lvt = APIC_LVT0;
+		break;
+	case 1:
+		lvt = APIC_LVT1;
+		break;
+	}
+
+	if (apic_lvt_enabled(apic, lvt))
+		__apic_accept_irq(apic,
+				  apic_lvt_dm(apic, lvt),
+				  apic_lvt_vector(apic, lvt),
+				  level,
+				  1);
+
+
+ out:
+	spin_unlock_bh(&apic->lock);
+
+	return 0;
+}
+
+static void apic_irqdev_destructor(struct kvm_irqdevice *this)
+{
+	struct kvm_kern_apic *apic = (struct kvm_kern_apic*)this->private;
+
+	apic_dropref(apic);
+}
+
+static void apic_irqdev_register(struct kvm_kern_apic *apic,
+				 struct kvm_irqdevice *dev)
+{
+	dev->ack         = apic_irqdev_ack;
+	dev->set_pin     = apic_irqdev_set_pin;
+	dev->destructor  = apic_irqdev_destructor;
+
+	dev->private = apic;
+	atomic_inc(&apic->ref_count);
+
+	apic->irq_dev = dev;
+}
+
+int kvm_lapic_init(struct kvm_vcpu *vcpu,
+		   struct kvm_irqdevice *irq_dev, int flags)
+{
+	struct kvm_kern_apic *apic = NULL;
+	struct kvm_io_device *mmio_dev = NULL;
+
+	ASSERT(vcpu != NULL);
+	apic_debug("apic_init %d\n", vcpu_slot(vcpu));
+
+	apic = kzalloc(sizeof(*apic), GFP_KERNEL);
+	if (!apic)
+		goto nomem;
+
+	spin_lock_init(&apic->lock);
+	atomic_inc(&apic->ref_count);
+	apic->vcpu_id = vcpu_slot(vcpu);
+
+	apic->regs_page = alloc_page(GFP_KERNEL);
+	if ( apic->regs_page == NULL ) {
+		printk(KERN_ALERT "malloc apic regs error for vcpu %x\n",
+		       vcpu_slot(vcpu));
+		goto nomem;
+	}
+	apic->regs = page_address(apic->regs_page);
+	memset(apic->regs, 0, PAGE_SIZE);
+
+	apic->vcpu = vcpu;
+	vcpu->apic.dev = apic;
+
+	if (!(flags & KVM_LAPIC_OPTION_USERMODE)) {
+		apic_irqdev_register(apic, irq_dev);
+		apic_mmio_register(apic);
+	} else
+		apic->usermode = 1;
+
+	hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	apic->timer.dev.function = apic_timer_fn;
+
+	kvm_lapic_reset(vcpu);
+	return 0;
+
+ nomem:
+	if (mmio_dev)
+		kfree(mmio_dev);
+
+	if (apic)
+		apic_dropref(apic);
+
+	return -ENOMEM;
+}
+
+void kvm_lapic_destroy(struct kvm_vcpu *vcpu)
+{
+	struct kvm_kern_apic *apic = vcpu->apic.dev;
+
+	if (vcpu->apic.mmio)
+		kvm_iodevice_destructor(vcpu->apic.mmio);
+
+	apic_dropref(apic);
+}
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
index 68841ef..03c58fd 100644
--- a/drivers/kvm/svm.c
+++ b/drivers/kvm/svm.c
@@ -107,24 +107,6 @@ static unsigned get_addr_size(struct kvm_vcpu *vcpu)
 				(cs_attrib & SVM_SELECTOR_DB_MASK) ? 4 : 2;
 }
 
-static inline u8 pop_irq(struct kvm_vcpu *vcpu)
-{
-	int word_index = __ffs(vcpu->irq_summary);
-	int bit_index = __ffs(vcpu->irq_pending[word_index]);
-	int irq = word_index * BITS_PER_LONG + bit_index;
-
-	clear_bit(bit_index, &vcpu->irq_pending[word_index]);
-	if (!vcpu->irq_pending[word_index])
-		clear_bit(word_index, &vcpu->irq_summary);
-	return irq;
-}
-
-static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
-{
-	set_bit(irq, vcpu->irq_pending);
-	set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
-}
-
 static inline void clgi(void)
 {
 	asm volatile (SVM_CLGI);
@@ -589,9 +571,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
 
 	fx_init(vcpu);
 	vcpu->fpu_active = 1;
-	vcpu->apic_base = 0xfee00000 |
-			/*for vcpu 0*/ MSR_IA32_APICBASE_BSP |
-			MSR_IA32_APICBASE_ENABLE;
 
 	return 0;
 
@@ -905,7 +884,12 @@ static int pf_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	int r;
 
 	if (is_external_interrupt(exit_int_info))
-		push_irq(vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
+		/*
+		 * An exception was taken while we were trying to inject an
+		 * IRQ.  We must defer the injection of the vector until
+		 * the next window.
+		 */
+		kvm_vcpu_irq_push(vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
 
 	spin_lock(&vcpu->kvm->lock);
 
@@ -1115,7 +1099,7 @@ static int halt_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 1;
 	skip_emulated_instruction(vcpu);
-	return kvm_emulate_halt(vcpu);
+	return kvm_vcpu_halt(vcpu, kvm_run);
 }
 
 static int vmmcall_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -1281,7 +1265,7 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu,
 	 * possible
 	 */
 	if (kvm_run->request_interrupt_window &&
-	    !vcpu->irq_summary) {
+	    !kvm_vcpu_irq_pending(vcpu)) {
 		++vcpu->stat.irq_window_exits;
 		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
 		return 0;
@@ -1380,63 +1364,148 @@ static void pre_svm_run(struct kvm_vcpu *vcpu)
 }
 
 
-static inline void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
-{
-	struct vmcb_control_area *control;
-
-	control = &vcpu->svm->vmcb->control;
-	control->int_vector = pop_irq(vcpu);
-	control->int_ctl &= ~V_INTR_PRIO_MASK;
-	control->int_ctl |= V_IRQ_MASK |
-		((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
-}
-
 static void kvm_reput_irq(struct kvm_vcpu *vcpu)
 {
 	struct vmcb_control_area *control = &vcpu->svm->vmcb->control;
 
 	if (control->int_ctl & V_IRQ_MASK) {
 		control->int_ctl &= ~V_IRQ_MASK;
-		push_irq(vcpu, control->int_vector);
+		kvm_vcpu_irq_push(vcpu, control->int_vector);
 	}
 
 	vcpu->interrupt_window_open =
 		!(control->int_state & SVM_INTERRUPT_SHADOW_MASK);
 }
 
-static void do_interrupt_requests(struct kvm_vcpu *vcpu,
-				       struct kvm_run *kvm_run)
+static void do_intr_requests(struct kvm_vcpu *vcpu,
+			    struct kvm_run *kvm_run,
+			    kvm_irqpin_t pin)
 {
 	struct vmcb_control_area *control = &vcpu->svm->vmcb->control;
+	int pending = 0;
 
 	vcpu->interrupt_window_open =
 		(!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
 		 (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF));
 
-	if (vcpu->interrupt_window_open && vcpu->irq_summary)
+	if (vcpu->interrupt_window_open) {
 		/*
-		 * If interrupts enabled, and not blocked by sti or mov ss. Good.
+		 * If interrupts enabled, and not blocked by sti or mov ss.
+		 * Good.
 		 */
-		kvm_do_inject_irq(vcpu);
+		struct kvm_irqack_data ack;
+		int r = 0;
+
+		memset(&ack, 0, sizeof(ack));
+
+		switch (pin) {
+		case kvm_irqpin_localint:
+			r = kvm_vcpu_irq_pop(vcpu, &ack);
+			break;
+		case kvm_irqpin_extint:
+			r = kvm_irqdevice_ack(&vcpu->kvm->isa_irq, 0, &ack);
+			if (!(ack.flags & KVM_IRQACKDATA_VECTOR_PENDING))
+				__clear_bit(pin, &vcpu->irq.pending);
+			break;
+		case kvm_irqpin_nmi:
+			/*
+			 * FIXME: Someday we will handle this using the
+			 * specific SVM NMI features.  For now, just inject
+			 * the NMI as a standard interrupt on vector 2
+			 */
+			ack.flags |= KVM_IRQACKDATA_VECTOR_VALID;
+			ack.vector = 2;
+			__clear_bit(pin, &vcpu->irq.pending);
+			break;
+		default:
+			panic("KVM: unknown interrupt pin raised: %d\n", pin);
+			break;
+		}
+
+		BUG_ON(r < 0);
+
+		if (ack.flags & KVM_IRQACKDATA_VECTOR_VALID) {
+			control = &vcpu->svm->vmcb->control;
+			control->int_vector = ack.vector;
+			control->int_ctl &= ~V_INTR_PRIO_MASK;
+			control->int_ctl |= V_IRQ_MASK |
+				((/*control->int_vector >> 4*/ 0xf) <<
+				 V_INTR_PRIO_SHIFT);
+
+			++vcpu->stat.irq_accepted;
+		}
+	}
 
 	/*
-	 * Interrupts blocked.  Wait for unblock.
+	 * Re-read the pending interrupt state.  If anything is still
+	 * pending we need to cause an exit on the next window
 	 */
-	if (!vcpu->interrupt_window_open &&
-	    (vcpu->irq_summary || kvm_run->request_interrupt_window)) {
+	pending = __kvm_vcpu_irq_pending(vcpu);
+
+	if (test_bit(pin, &pending))
+		/*
+		 * Trigger a VMEXIT on the next IRQ window
+		 */
 		control->intercept |= 1ULL << INTERCEPT_VINTR;
-	} else
+}
+
+static void clear_pending_controls(struct kvm_vcpu *vcpu,
+				  struct kvm_run *kvm_run)
+{
+	struct vmcb_control_area *control = &vcpu->svm->vmcb->control;
+
+	if (kvm_run->request_interrupt_window)
+		control->intercept |= 1ULL << INTERCEPT_VINTR;
+	else
 		control->intercept &= ~(1ULL << INTERCEPT_VINTR);
 }
 
+static void do_interrupt_requests(struct kvm_vcpu *vcpu,
+				  struct kvm_run *kvm_run)
+{
+	int pending = __kvm_vcpu_irq_pending(vcpu);
+
+	clear_pending_controls(vcpu, kvm_run);
+
+	while (pending) {
+		kvm_irqpin_t pin = __fls(pending);
+
+		switch (pin) {
+		case kvm_irqpin_localint:
+		case kvm_irqpin_extint:
+		case kvm_irqpin_nmi:
+			do_intr_requests(vcpu, kvm_run, pin);
+			break;
+		case kvm_irqpin_smi:
+			/* ignored (for now) */
+			printk(KERN_WARNING "KVM: dropping unhandled SMI\n");
+			__clear_bit(pin, &vcpu->irq.pending);
+			break;
+		case kvm_irqpin_invalid:
+			/* drop */
+			break;
+		default:
+			panic("KVM: unknown interrupt pin raised: %d\n", pin);
+			break;
+		}
+
+		__clear_bit(pin, &pending);
+	}
+}
+
 static void post_kvm_run_save(struct kvm_vcpu *vcpu,
 			      struct kvm_run *kvm_run)
 {
-	kvm_run->ready_for_interrupt_injection = (vcpu->interrupt_window_open &&
-						  vcpu->irq_summary == 0);
+	struct kvm_irqack_data ack;
+	kvm_irqdevice_ack(&vcpu->irq.dev, KVM_IRQACK_FLAG_PEEK, &ack);
+
+	kvm_run->ready_for_interrupt_injection =
+		(vcpu->interrupt_window_open &&
+		 !kvm_vcpu_irq_pending(vcpu) &&
+		 !(ack.flags & KVM_IRQACKDATA_NEXT_VALID));
 	kvm_run->if_flag = (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF) != 0;
-	kvm_run->cr8 = vcpu->cr8;
-	kvm_run->apic_base = vcpu->apic_base;
+	kvm_run->cr8 = kvm_lapic_get_tpr(vcpu);
+	kvm_run->apic_base = kvm_lapic_get_base(vcpu);
 }
 
 /*
@@ -1448,7 +1517,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
 					  struct kvm_run *kvm_run)
 {
-	return (!vcpu->irq_summary &&
+	return (!kvm_vcpu_irq_pending(vcpu) &&
 		kvm_run->request_interrupt_window &&
 		vcpu->interrupt_window_open &&
 		(vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF));
@@ -1492,6 +1561,42 @@ again:
 
 	clgi();
 
+	spin_lock(&vcpu->irq.lock);
+
+	/*
+	 * If there are any signals pending (virtual interrupt related or
+	 * otherwise), don't even bother trying to enter guest mode...
+	 */
+	if (signal_pending(current)) {
+		kvm_run->exit_reason = KVM_EXIT_INTR;
+		spin_unlock(&vcpu->irq.lock);
+		stgi();
+		r = -EINTR;
+		/*
+		 * FIXME: We probably want to move this whole lock-block below
+		 * the host->guest state loading so we don't restore when
+		 * the system was never saved to begin with
+		 */
+		goto out;
+	}
+
+	/*
+	 * There are optimizations we can make when signaling interrupts
+	 * if we know the VCPU is in GUEST mode, so record the guest's
+	 * CPU to both serve as an indicator of vcpu state and a target
+	 * for our interrupts
+	 */
+	vcpu->irq.guest_cpu = task_cpu(current);
+
+	/*
+	 * We must inject interrupts (if any) while the irq_lock
+	 * is held
+	 */
+	if (!vcpu->mmio_read_completed)
+		do_interrupt_requests(vcpu, kvm_run);
+
+	spin_unlock(&vcpu->irq.lock);
+
 	vcpu->guest_mode = 1;
 	if (vcpu->requests)
 		if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
@@ -1658,6 +1763,13 @@ again:
 		profile_hit(KVM_PROFILING,
 			(void *)(unsigned long)vcpu->svm->vmcb->save.rip);
 
+	/*
+	 * Signal that we have transitioned back to host mode
+	 */
+	spin_lock(&vcpu->irq.lock);
+	vcpu->irq.guest_cpu = -1;
+	spin_unlock(&vcpu->irq.lock);
+
 	stgi();
 
 	kvm_reput_irq(vcpu);
@@ -1668,28 +1780,31 @@ again:
 		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		kvm_run->fail_entry.hardware_entry_failure_reason
 			= vcpu->svm->vmcb->control.exit_code;
-		post_kvm_run_save(vcpu, kvm_run);
-		return 0;
+		r = 0;
+		goto out;
 	}
 
 	r = handle_exit(vcpu, kvm_run);
 	if (r > 0) {
 		if (signal_pending(current)) {
 			++vcpu->stat.signal_exits;
-			post_kvm_run_save(vcpu, kvm_run);
 			kvm_run->exit_reason = KVM_EXIT_INTR;
-			return -EINTR;
+			r = -EINTR;
+			goto out;
 		}
 
 		if (dm_request_for_irq_injection(vcpu, kvm_run)) {
 			++vcpu->stat.request_irq_exits;
 			post_kvm_run_save(vcpu, kvm_run);
 			kvm_run->exit_reason = KVM_EXIT_INTR;
-			return -EINTR;
+			r = -EINTR;
+			goto out;
 		}
 		kvm_resched(vcpu);
 		goto again;
 	}
+
+ out:
 	post_kvm_run_save(vcpu, kvm_run);
 	return r;
 }
diff --git a/drivers/kvm/userint.c b/drivers/kvm/userint.c
new file mode 100644
index 0000000..c6118b0
--- /dev/null
+++ b/drivers/kvm/userint.c
@@ -0,0 +1,229 @@
+/*
+ * User Interrupts IRQ device
+ *
+ * This acts as an extension of an interrupt controller that exists elsewhere
+ * (typically in userspace/QEMU).  Because this PIC is a pseudo device that
+ * is downstream from a real emulated PIC, the "IRQ-to-vector" mapping has
+ * already occurred.  Therefore, this PIC has the following unusual properties:
+ *
+ * 1) It has 256 "pins" which are literal vectors (i.e. no translation)
+ * 2) It only supports "auto-EOI" behavior since it is expected that the
+ *    upstream emulated PIC will handle the real EOIs (if applicable)
+ * 3) It only listens to "asserts" on the pins (deasserts are dropped)
+ *    because it's an auto-EOI device anyway.
+ *
+ * Copyright (C) 2007 Novell
+ *
+ * bitarray code based on original vcpu->irq_pending code,
+ *     Copyright (C) 2007 Qumranet
+ *
+ * Authors:
+ *   Gregory Haskins <ghaskins@novell.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "kvm.h"
+
+/*
+ *----------------------------------------------------------------------
+ * optimized bitarray object - works like bitarrays in bitops, but uses
+ * a summary field to accelerate lookups.  Assumes external locking
+ *---------------------------------------------------------------------
+ */
+
+struct bitarray {
+	unsigned long summary; /* 1 per word in pending */
+	unsigned long pending[NR_IRQ_WORDS];
+};
+
+static inline int bitarray_pending(struct bitarray *this)
+{
+	return this->summary ? 1 : 0;
+}
+
+static inline int bitarray_findhighest(struct bitarray *this)
+{
+	if (!this->summary)
+		return -1;
+	else {
+		int word_index = __fls(this->summary);
+		int bit_index  = __fls(this->pending[word_index]);
+
+		return word_index * BITS_PER_LONG + bit_index;
+	}
+}
+
+static inline void bitarray_set(struct bitarray *this, int nr)
+{
+	__set_bit(nr, &this->pending);
+	__set_bit(nr / BITS_PER_LONG, &this->summary);
+}
+
+static inline void bitarray_clear(struct bitarray *this, int nr)
+{
+	int word = nr / BITS_PER_LONG;
+
+	__clear_bit(nr, &this->pending);
+	if (!this->pending[word])
+		__clear_bit(word, &this->summary);
+}
+
+static inline int bitarray_test(struct bitarray *this, int nr)
+{
+	return test_bit(nr, &this->pending);
+}
+
+static inline int bitarray_test_and_set(struct bitarray *this, int nr, int val)
+{
+	if (bitarray_test(this, nr) != val) {
+		if (val)
+			bitarray_set(this, nr);
+		else
+			bitarray_clear(this, nr);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ *----------------------------------------------------------------------
+ * userint interface - provides the actual kvm_irqdevice implementation
+ *---------------------------------------------------------------------
+ */
+
+struct kvm_user_irqdev {
+	spinlock_t      lock;
+	atomic_t        ref_count;
+	struct bitarray pending;
+};
+
+static int user_irqdev_ack(struct kvm_irqdevice *this, int flags,
+			   struct kvm_irqack_data *data)
+{
+	struct kvm_user_irqdev *s = (struct kvm_user_irqdev*)this->private;
+
+	spin_lock(&s->lock);
+
+	if (!(flags & KVM_IRQACK_FLAG_PEEK)) {
+		int irq = bitarray_findhighest(&s->pending);
+
+		if (irq > -1) {
+			/*
+			 * Automatically clear the interrupt as the EOI
+			 * mechanism (if any) will take place in userspace
+			 */
+			bitarray_clear(&s->pending, irq);
+
+			data->flags |= KVM_IRQACKDATA_VECTOR_VALID;
+		}
+
+		data->vector = irq;
+	}
+
+	if (bitarray_pending(&s->pending))
+		data->flags |= KVM_IRQACKDATA_VECTOR_PENDING;
+
+	spin_unlock(&s->lock);
+
+	return 0;
+}
+
+static int user_irqdev_set_pin(struct kvm_irqdevice *this, int irq, int level)
+{
+	struct kvm_user_irqdev *s = (struct kvm_user_irqdev*)this->private;
+	int forward = 0;
+
+	spin_lock(&s->lock);
+	forward = bitarray_test_and_set(&s->pending, irq, level);
+	spin_unlock(&s->lock);
+
+	/*
+	 * alert the higher layer software we have changes
+	 */
+	if (forward)
+		kvm_irqdevice_set_intr(this, kvm_irqpin_localint);
+
+	return 0;
+}
+
+static void user_irqdev_destructor(struct kvm_irqdevice *this)
+{
+	struct kvm_user_irqdev *s = (struct kvm_user_irqdev*)this->private;
+
+	if (atomic_dec_and_test(&s->ref_count))
+		kfree(s);
+}
+
+int kvm_user_irqdev_init(struct kvm_irqdevice *irqdev)
+{
+	struct kvm_user_irqdev *s;
+
+	s = kzalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	spin_lock_init(&s->lock);
+
+	irqdev->ack         = user_irqdev_ack;
+	irqdev->set_pin     = user_irqdev_set_pin;
+	irqdev->destructor  = user_irqdev_destructor;
+
+	irqdev->private = s;
+	atomic_inc(&s->ref_count);
+
+	return 0;
+}
+
+int kvm_user_irqdev_save(struct kvm_irqdevice *this, void *data)
+{
+	struct kvm_user_irqdev *s = (struct kvm_user_irqdev*)this->private;
+
+	spin_lock(&s->lock);
+	memcpy(data, s->pending.pending, sizeof s->pending.pending);
+	spin_unlock(&s->lock);
+
+	return 0;
+}
+
+int kvm_user_irqdev_restore(struct kvm_irqdevice *this, void *data)
+{
+	struct kvm_user_irqdev *s = (struct kvm_user_irqdev*)this->private;
+	int i;
+	int forward = 0;
+
+	spin_lock(&s->lock);
+
+	/*
+	 * walk the interrupt-bitmap and inject an IRQ for each bit found
+	 */
+	for (i = 0; i < 256; ++i) {
+		int val  = test_bit(i, data);
+		forward |= bitarray_test_and_set(&s->pending, i, val);
+	}
+
+	spin_unlock(&s->lock);
+
+	/*
+	 * alert the higher layer software we have changes
+	 */
+	if (forward)
+		kvm_irqdevice_set_intr(this, kvm_irqpin_localint);
+
+	return 0;
+}
+
+int kvm_userint_init(struct kvm_vcpu *vcpu)
+{
+	int ret;
+
+	ret = kvm_user_irqdev_init(&vcpu->irq.dev);
+	if (ret < 0)
+		return ret;
+
+	return kvm_lapic_init(vcpu, NULL, KVM_LAPIC_OPTION_USERMODE);
+}
+
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index b909b54..fdcdc60 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -1237,10 +1237,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
 
 	memset(vcpu->regs, 0, sizeof(vcpu->regs));
 	vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val();
-	vcpu->cr8 = 0;
-	vcpu->apic_base = 0xfee00000 |
-			/*for vcpu 0*/ MSR_IA32_APICBASE_BSP |
-			MSR_IA32_APICBASE_ENABLE;
 
 	fx_init(vcpu);
 
@@ -1455,52 +1451,126 @@ static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq)
 	vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
 }
 
-static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
+static void do_intr_requests(struct kvm_vcpu *vcpu,
+			    struct kvm_run *kvm_run,
+			    kvm_irqpin_t pin)
 {
-	int word_index = __ffs(vcpu->irq_summary);
-	int bit_index = __ffs(vcpu->irq_pending[word_index]);
-	int irq = word_index * BITS_PER_LONG + bit_index;
-
-	clear_bit(bit_index, &vcpu->irq_pending[word_index]);
-	if (!vcpu->irq_pending[word_index])
-		clear_bit(word_index, &vcpu->irq_summary);
-
-	if (vcpu->rmode.active) {
-		inject_rmode_irq(vcpu, irq);
-		return;
-	}
-	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-			irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
-}
-
-
-static void do_interrupt_requests(struct kvm_vcpu *vcpu,
-				       struct kvm_run *kvm_run)
-{
-	u32 cpu_based_vm_exec_control;
+	int pending = 0;
 
 	vcpu->interrupt_window_open =
 		((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
 		 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
 
 	if (vcpu->interrupt_window_open &&
-	    vcpu->irq_summary &&
-	    !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
+	    !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) {
 		/*
-		 * If interrupts enabled, and not blocked by sti or mov ss. Good.
+		 * If interrupts enabled, and not blocked by sti or mov ss.
+		 * Good.
 		 */
-		kvm_do_inject_irq(vcpu);
+		struct kvm_irqack_data ack;
+		int r = 0;
+
+		memset(&ack, 0, sizeof(ack));
+
+		switch (pin) {
+		case kvm_irqpin_localint:
+			r = kvm_vcpu_irq_pop(vcpu, &ack);
+			break;
+		case kvm_irqpin_extint:
+			r = kvm_irqdevice_ack(&vcpu->kvm->isa_irq, 0, &ack);
+			if (!(ack.flags & KVM_IRQACKDATA_VECTOR_PENDING))
+				__clear_bit(pin, &vcpu->irq.pending);
+			break;
+		case kvm_irqpin_nmi:
+			/*
+			 * FIXME: Someday we will handle this using the
+			 * specific VMX NMI features.  For now, just inject
+			 * the NMI as a standard interrupt on vector 2
+			 */
+			ack.flags |= KVM_IRQACKDATA_VECTOR_VALID;
+			ack.vector = 2;
+			__clear_bit(pin, &vcpu->irq.pending);
+			break;
+		default:
+			panic("KVM: unknown interrupt pin raised: %d\n", pin);
+			break;
+		}
+
+		BUG_ON(r < 0);
 
-	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-	if (!vcpu->interrupt_window_open &&
-	    (vcpu->irq_summary || kvm_run->request_interrupt_window))
+		if (ack.flags & KVM_IRQACKDATA_VECTOR_VALID) {
+			if (vcpu->rmode.active)
+				inject_rmode_irq(vcpu, ack.vector);
+			else
+				vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+					     ack.vector |
+					     INTR_TYPE_EXT_INTR |
+					     INTR_INFO_VALID_MASK);
+
+			++vcpu->stat.irq_accepted;
+		}
+	}
+
+	/*
+	 * Re-read the pending interrupt state.  If anything is still
+	 * pending we need to cause an exit on the next window
+	 */
+	pending = __kvm_vcpu_irq_pending(vcpu);
+
+	if (test_bit(pin, &pending) || kvm_run->request_interrupt_window) {
 		/*
-		 * Interrupts blocked.  Wait for unblock.
+		 * Trigger a VMEXIT on the next IRQ window
 		 */
-		cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
-	else
-		cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
-	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+		u32 cbvec = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+		cbvec |= CPU_BASED_VIRTUAL_INTR_PENDING;
+		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cbvec);
+	}
+}
+
+static void clear_pending_controls(struct kvm_vcpu *vcpu,
+				  struct kvm_run *kvm_run)
+{
+	u32 cbvec = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+
+	if (kvm_run->request_interrupt_window)
+		cbvec |= CPU_BASED_VIRTUAL_INTR_PENDING;
+	else
+		cbvec &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cbvec);
+}
+
+static void do_interrupt_requests(struct kvm_vcpu *vcpu,
+				  struct kvm_run *kvm_run)
+{
+	int pending = __kvm_vcpu_irq_pending(vcpu);
+
+	clear_pending_controls(vcpu, kvm_run);
+
+	while (pending) {
+		kvm_irqpin_t pin = __fls(pending);
+
+		switch (pin) {
+		case kvm_irqpin_localint:
+		case kvm_irqpin_extint:
+		case kvm_irqpin_nmi:
+			do_intr_requests(vcpu, kvm_run, pin);
+			break;
+		case kvm_irqpin_smi:
+			/* ignored (for now) */
+			printk(KERN_WARNING "KVM: dropping unhandled SMI\n");
+			__clear_bit(pin, &vcpu->irq.pending);
+			break;
+		case kvm_irqpin_invalid:
+			/* drop */
+			break;
+		default:
+			panic("KVM: unknown interrupt pin raised: %d\n", pin);
+			break;
+		}
+
+		__clear_bit(pin, &pending);
+	}
 }
 
 static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
@@ -1555,9 +1625,13 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	}
 
 	if (is_external_interrupt(vect_info)) {
+		/*
+		 * An exception was taken while we were trying to inject an
+		 * IRQ.  We must defer the injection of the vector until
+		 * the next window.
+		 */
 		int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
-		set_bit(irq, vcpu->irq_pending);
-		set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
+		kvm_vcpu_irq_push(vcpu, irq);
 	}
 
 	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */
@@ -1611,7 +1685,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 								error_code)) {
 		if (vcpu->halt_request) {
 			vcpu->halt_request = 0;
-			return kvm_emulate_halt(vcpu);
+			return kvm_vcpu_halt(vcpu, kvm_run);
 		}
 		return 1;
 	}
@@ -1779,7 +1853,7 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 			return 1;
 		case 8:
 			vcpu_load_rsp_rip(vcpu);
-			vcpu->regs[reg] = vcpu->cr8;
+			vcpu->regs[reg] = kvm_lapic_get_tpr(vcpu);
 			vcpu_put_rsp_rip(vcpu);
 			skip_emulated_instruction(vcpu);
 			return 1;
@@ -1875,11 +1949,16 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 static void post_kvm_run_save(struct kvm_vcpu *vcpu,
 			      struct kvm_run *kvm_run)
 {
+	struct kvm_irqack_data ack;
+	kvm_irqdevice_ack(&vcpu->irq.dev, KVM_IRQACK_FLAG_PEEK, &ack);
+
 	kvm_run->if_flag = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) != 0;
-	kvm_run->cr8 = vcpu->cr8;
-	kvm_run->apic_base = vcpu->apic_base;
-	kvm_run->ready_for_interrupt_injection = (vcpu->interrupt_window_open &&
-						  vcpu->irq_summary == 0);
+	kvm_run->cr8 = kvm_lapic_get_tpr(vcpu);
+	kvm_run->apic_base = kvm_lapic_get_base(vcpu);
+	kvm_run->ready_for_interrupt_injection =
+		(vcpu->interrupt_window_open &&
+		 !kvm_vcpu_irq_pending(vcpu) &&
+		 !(ack.flags & KVM_IRQACKDATA_NEXT_VALID));
 }
 
 static int handle_interrupt_window(struct kvm_vcpu *vcpu,
@@ -1890,7 +1969,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
 	 * possible
 	 */
 	if (kvm_run->request_interrupt_window &&
-	    !vcpu->irq_summary) {
+	    !kvm_vcpu_irq_pending(vcpu)) {
 		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
 		++vcpu->stat.irq_window_exits;
 		return 0;
@@ -1901,7 +1980,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
 static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	skip_emulated_instruction(vcpu);
-	return kvm_emulate_halt(vcpu);
+	return kvm_vcpu_halt(vcpu, kvm_run);
 }
 
 static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -1966,7 +2045,7 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
 					  struct kvm_run *kvm_run)
 {
-	return (!vcpu->irq_summary &&
+	return (!kvm_vcpu_irq_pending(vcpu) &&
 		kvm_run->request_interrupt_window &&
 		vcpu->interrupt_window_open &&
 		(vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
@@ -1983,13 +2062,42 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	int r;
 
 preempted:
-	if (!vcpu->mmio_read_completed)
-		do_interrupt_requests(vcpu, kvm_run);
-
 	if (vcpu->guest_debug.enabled)
 		kvm_guest_debug_pre(vcpu);
 
 again:
+	/*
+	 * We disable interrupts until the next VMEXIT to eliminate a race
+	 * condition for delivery of virtual interrutps.  Note that this is
+	 * probably not as bad as it sounds, as interrupts will still invoke
+	 * a VMEXIT once transitioned to GUEST mode (and thus exit this lock
+	 * scope) even if they are disabled.
+	 */
+	local_irq_disable();
+
+	spin_lock(&vcpu->irq.lock);
+
+	/*
+	 * If there are any signals pending (virtual interrupt related or
+	 * otherwise), don't even bother trying to enter guest mode...
+	 */
+	if (signal_pending(current)) {
+		kvm_run->exit_reason = KVM_EXIT_INTR;
+		spin_unlock(&vcpu->irq.lock);
+		local_irq_enable();
+		r = -EINTR;
+		goto out;
+	}
+
+	/*
+	 * We must inject interrupts (if any) while the irq.lock
+	 * is held
+	 */
+	if (!vcpu->mmio_read_completed)
+		do_interrupt_requests(vcpu, kvm_run);
+
+	spin_unlock(&vcpu->irq.lock);
+
 	vmx_save_host_state(vcpu);
 	kvm_load_guest_fpu(vcpu);
 
@@ -2004,6 +2112,14 @@ again:
 
 	local_irq_disable();
 
+	/*
+	 * There are optimizations we can make when signaling interrupts
+	 * if we know the VCPU is in GUEST mode, so record the guest's
+	 * CPU to both serve as an indicator of vcpu state and a target
+	 * for our interrupts
+	 */
+	vcpu->irq.guest_cpu = task_cpu(current);
+
 	vcpu->guest_mode = 1;
 	if (vcpu->requests)
 		if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
@@ -2126,6 +2242,14 @@ again:
 	      : "cc", "memory" );
 
 	vcpu->guest_mode = 0;
+
+	/*
+	 * Signal that we have transitioned back to host mode
+	 */
+	spin_lock(&vcpu->irq.lock);
+	vcpu->irq.guest_cpu = -1;
+	spin_unlock(&vcpu->irq.lock);
+
 	local_irq_enable();
 
 	++vcpu->stat.exits;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index e6edca8..aaa826e 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -231,6 +231,17 @@ struct kvm_dirty_log {
 	};
 };
 
+/* for KVM_APIC */
+struct kvm_apic_msg {
+	/* in */
+	__u32 dest;
+	__u32 trig_mode;
+	__u32 dest_mode;
+	__u32 delivery_mode;
+	__u32 vector;
+	__u32 padding;
+};
+
 struct kvm_cpuid_entry {
 	__u32 function;
 	__u32 eax;
@@ -282,6 +293,9 @@ struct kvm_signal_mask {
 #define KVM_CREATE_VCPU           _IO(KVMIO,  0x41)
 #define KVM_GET_DIRTY_LOG         _IOW(KVMIO, 0x42, struct kvm_dirty_log)
 #define KVM_SET_MEMORY_ALIAS      _IOW(KVMIO, 0x43, struct kvm_memory_alias)
+#define KVM_ENABLE_KERNEL_PIC     _IOW(KVMIO, 0x44, __u32)
+#define KVM_ISA_INTERRUPT         _IOW(KVMIO, 0x45, struct kvm_interrupt)
+#define KVM_APIC_MSG		  _IOW(KVMIO, 0x46, struct kvm_apic_msg)
 
 /*
  * ioctls for vcpu fds
@@ -300,5 +314,6 @@ struct kvm_signal_mask {
 #define KVM_SET_SIGNAL_MASK       _IOW(KVMIO,  0x8b, struct kvm_signal_mask)
 #define KVM_GET_FPU               _IOR(KVMIO,  0x8c, struct kvm_fpu)
 #define KVM_SET_FPU               _IOW(KVMIO,  0x8d, struct kvm_fpu)
+#define KVM_APIC_RESET		  _IO(KVMIO,   0x8e)
 
 #endif

[-- Attachment #3: k.smp.patch --]
[-- Type: application/octet-stream, Size: 10983 bytes --]

diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index e6861ed..a3a394b 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -372,6 +372,9 @@ struct kvm_lapic {
 	struct kvm_io_device *mmio;
 };
 
+#define KVM_VCPU_INIT_SIPI_SIPI_STATE_NORM       1
+#define KVM_VCPU_INIT_SIPI_SIPI_STATE_WAIT_SIPI  0
+
 struct kvm_vcpu {
 	struct kvm *kvm;
 	union {
@@ -379,6 +382,7 @@ struct kvm_vcpu {
 		struct vcpu_svm *svm;
 	};
 	struct mutex mutex;
+	int   vcpu_id;
 	int   cpu;
 	int   launched;
 	u64 host_tsc;
@@ -386,6 +390,8 @@ struct kvm_vcpu {
 	int interrupt_window_open;
 	struct kvm_vcpu_irq irq;
 	struct kvm_lapic apic;
+	int init_sipi_sipi_state;
+	u8 sipi_vector;
 	int guest_mode;
 	unsigned long requests;
 	unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */
@@ -459,9 +465,6 @@ struct kvm_vcpu {
 		} tr, es, ds, fs, gs;
 	} rmode;
 	int halt_request; /* real mode on Intel only */
-
-	int cpuid_nent;
-	struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES];
 };
 
 /*
@@ -572,6 +575,9 @@ struct kvm {
 	struct kvm_io_bus mmio_bus;
 	int enable_kernel_pic;
 	struct kvm_irqdevice isa_irq;
+
+	int cpuid_nent;
+	struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES];
 };
 
 struct descriptor_table {
@@ -629,6 +635,7 @@ struct kvm_arch_ops {
 	void (*inject_gp)(struct kvm_vcpu *vcpu, unsigned err_code);
 
 	int (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
+	int (*vcpu_init)(struct kvm_vcpu *vcpu);
 	int (*vcpu_setup)(struct kvm_vcpu *vcpu);
 	void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
 	void (*patch_hypercall)(struct kvm_vcpu *vcpu,
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 152a7c9..3f749d8 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -332,6 +332,8 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
 	struct kvm_vcpu *vcpu;
 	atomic_t completed;
 
+	return;
+
 	atomic_set(&completed, 0);
 	cpus_clear(cpus);
 	needed = 0;
@@ -1840,8 +1842,8 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
 	vcpu->regs[VCPU_REGS_RCX] = 0;
 	vcpu->regs[VCPU_REGS_RDX] = 0;
 	best = NULL;
-	for (i = 0; i < vcpu->cpuid_nent; ++i) {
-		e = &vcpu->cpuid_entries[i];
+	for (i = 0; i < vcpu->kvm->cpuid_nent; ++i) {
+		e = &vcpu->kvm->cpuid_entries[i];
 		if (e->function == function) {
 			best = e;
 			break;
@@ -2029,8 +2031,22 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	int r;
 	sigset_t sigsaved;
 
+	while (1) {
+		if (vcpu->init_sipi_sipi_state ==
+			KVM_VCPU_INIT_SIPI_SIPI_STATE_NORM)
+			break;
+		yield();
+	}
+
+	//if (vcpu->vcpu_id) printk("run vcpu %d.\n", vcpu->vcpu_id);
+
 	vcpu_load(vcpu);
 
+	if (vcpu->sipi_vector) {
+		kvm_arch_ops->vcpu_setup(vcpu);
+		vcpu->sipi_vector = 0;
+	}
+
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
@@ -2624,7 +2640,7 @@ static void kvm_vcpu_intr(struct kvm_irqsink *this,
 			 * a VMEXIT.
 			 */
 			direct_ipi = vcpu->irq.guest_cpu;
-			BUG_ON(direct_ipi == smp_processor_id());
+			//BUG_ON(direct_ipi == smp_processor_id());
 			++vcpu->stat.guest_preempt;
 		}
 
@@ -2686,8 +2702,14 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
 
 	vcpu = &kvm->vcpus[n];
 
+	vcpu->init_sipi_sipi_state = (n == 0) ? 
+		KVM_VCPU_INIT_SIPI_SIPI_STATE_NORM:
+		KVM_VCPU_INIT_SIPI_SIPI_STATE_WAIT_SIPI;
+
 	mutex_lock(&vcpu->mutex);
 
+	vcpu->vcpu_id = n;
+
 	if (vcpu->vmcs) {
 		mutex_unlock(&vcpu->mutex);
 		return -EEXIST;
@@ -2732,7 +2754,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
 	kvm_arch_ops->vcpu_load(vcpu);
 	r = kvm_mmu_setup(vcpu);
 	if (r >= 0)
-		r = kvm_arch_ops->vcpu_setup(vcpu);
+		r = kvm_arch_ops->vcpu_init(vcpu);
 	vcpu_put(vcpu);
 
 	if (r < 0)
@@ -2768,8 +2790,8 @@ static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
 
 	rdmsrl(MSR_EFER, efer);
 	entry = NULL;
-	for (i = 0; i < vcpu->cpuid_nent; ++i) {
-		e = &vcpu->cpuid_entries[i];
+	for (i = 0; i < vcpu->kvm->cpuid_nent; ++i) {
+		e = &vcpu->kvm->cpuid_entries[i];
 		if (e->function == 0x80000001) {
 			entry = e;
 			break;
@@ -2791,10 +2813,10 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
 	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
 		goto out;
 	r = -EFAULT;
-	if (copy_from_user(&vcpu->cpuid_entries, entries,
+	if (copy_from_user(&vcpu->kvm->cpuid_entries, entries,
 			   cpuid->nent * sizeof(struct kvm_cpuid_entry)))
 		goto out;
-	vcpu->cpuid_nent = cpuid->nent;
+	vcpu->kvm->cpuid_nent = cpuid->nent;
 	cpuid_fix_nx_cap(vcpu);
 	return 0;
 
diff --git a/drivers/kvm/lapic.c b/drivers/kvm/lapic.c
index 602e94c..4cff061 100644
--- a/drivers/kvm/lapic.c
+++ b/drivers/kvm/lapic.c
@@ -407,11 +407,21 @@ static int __apic_accept_irq(struct kvm_kern_apic *apic,
 		apic_debug("INIT interrupt\n");
 		if (level) {
 			spin_unlock_bh(&apic->lock);
+			apic->vcpu->init_sipi_sipi_state =
+				KVM_VCPU_INIT_SIPI_SIPI_STATE_WAIT_SIPI;
 			kvm_lapic_reset(apic->vcpu);
 			spin_lock_bh(&apic->lock);
 		}
 		break;
-	case APIC_DM_STARTUP: /* FIXME: currently no support for SMP */
+	case APIC_DM_STARTUP:
+		printk("SIPI interrupt: %x\n", vector);
+		if (apic->vcpu->init_sipi_sipi_state ==
+				KVM_VCPU_INIT_SIPI_SIPI_STATE_WAIT_SIPI) {
+			apic->vcpu->sipi_vector = vector;
+			apic->vcpu->init_sipi_sipi_state =
+				KVM_VCPU_INIT_SIPI_SIPI_STATE_NORM;
+		}
+		break;
 	default:
 		printk(KERN_ALERT "TODO: support interrupt type %x\n",
 		       delivery_mode);
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
index 03c58fd..01b787c 100644
--- a/drivers/kvm/svm.c
+++ b/drivers/kvm/svm.c
@@ -439,6 +439,11 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
 	seg->base = 0;
 }
 
+static int svm_vcpu_init(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
 static int svm_vcpu_setup(struct kvm_vcpu *vcpu)
 {
 	return 0;
@@ -1911,6 +1916,7 @@ static struct kvm_arch_ops svm_arch_ops = {
 
 	.run = svm_vcpu_run,
 	.skip_emulated_instruction = skip_emulated_instruction,
+	.vcpu_init = svm_vcpu_init,
 	.vcpu_setup = svm_vcpu_setup,
 	.patch_hypercall = svm_patch_hypercall,
 };
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index fdcdc60..ad9e9ec 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -885,9 +885,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
 	vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
 }
 
-static int rmode_tss_base(struct kvm* kvm)
+static gpa_t rmode_tss_base(struct kvm* kvm, int vcpu_id)
 {
-	gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3;
+	gfn_t base_gfn = kvm->memslots[0].base_gfn +
+			 kvm->memslots[0].npages - 3 * (1 + vcpu_id);
 	return base_gfn << PAGE_SHIFT;
 }
 
@@ -911,7 +912,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 	vcpu->rmode.active = 1;
 
 	vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
-	vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
+	vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm, vcpu->vcpu_id));
 
 	vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
 	vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
@@ -936,7 +937,21 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 	vmcs_write32(GUEST_CS_LIMIT, 0xffff);
 	if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
 		vmcs_writel(GUEST_CS_BASE, 0xf0000);
+	if (vcpu->vcpu_id)
+	printk("%s(%d): sipi vector %x.\n", __FUNCTION__, vcpu->vcpu_id, vcpu->sipi_vector);
+	if (vcpu->vcpu_id) {
+		vmcs_writel(GUEST_CS_BASE, vcpu->sipi_vector << 12);
+		if (vcpu->vcpu_id)
+		printk("%s(%d): cs base %lx.\n", __FUNCTION__, vcpu->vcpu_id, vmcs_readl(GUEST_CS_BASE));
+		vmcs_writel(GUEST_RIP, 0);
+	}
 	vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
+	if (vcpu->vcpu_id)
+	printk("%s(%d): cs base %lx.\n", __FUNCTION__, vcpu->vcpu_id, vmcs_readl(GUEST_CS_BASE));
+	if (vcpu->vcpu_id)
+	printk("%s(%d): cs sel %lx.\n", __FUNCTION__, vcpu->vcpu_id, vmcs_readl(GUEST_CS_SELECTOR));
+	if (vcpu->vcpu_id)
+	printk("%s(%d): ip %lx.\n", __FUNCTION__, vcpu->vcpu_id, vmcs_readl(GUEST_RIP));
 
 	fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es);
 	fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds);
@@ -1164,10 +1179,10 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
 	vmcs_writel(GUEST_GDTR_BASE, dt->base);
 }
 
-static int init_rmode_tss(struct kvm* kvm)
+static int init_rmode_tss(struct kvm* kvm, int vcpu_id)
 {
 	struct page *p1, *p2, *p3;
-	gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
+	gfn_t fn = rmode_tss_base(kvm, vcpu_id) >> PAGE_SHIFT;
 	char *page;
 
 	p1 = gfn_to_page(kvm, fn++);
@@ -1227,14 +1242,8 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
 	unsigned long a;
 	struct descriptor_table dt;
 	int i;
-	int ret = 0;
 	unsigned long kvm_vmx_return;
 
-	if (!init_rmode_tss(vcpu->kvm)) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
 	memset(vcpu->regs, 0, sizeof(vcpu->regs));
 	vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val();
 
@@ -1401,9 +1410,15 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
 	update_exception_bitmap(vcpu);
 
 	return 0;
+}
 
-out:
-	return ret;
+static int vmx_vcpu_init(struct kvm_vcpu *vcpu)
+{
+	if (!init_rmode_tss(vcpu->kvm, vcpu->vcpu_id))
+		return -ENOMEM;
+	if (vcpu->vcpu_id)
+		return 0;
+	return vmx_vcpu_setup(vcpu);
 }
 
 static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq)
@@ -1685,6 +1700,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 								error_code)) {
 		if (vcpu->halt_request) {
 			vcpu->halt_request = 0;
+			printk("real mode halt.\n");
 			return kvm_vcpu_halt(vcpu, kvm_run);
 		}
 		return 1;
@@ -2066,6 +2082,9 @@ preempted:
 		kvm_guest_debug_pre(vcpu);
 
 again:
+	//if (vcpu->vcpu_id)
+	//	printk("entry vcpu %d rip @ %lx.\n", vcpu->vcpu_id, vmcs_readl(GUEST_RIP));
+
 	/*
 	 * We disable interrupts until the next VMEXIT to eliminate a race
 	 * condition for delivery of virtual interrutps.  Note that this is
@@ -2110,7 +2129,9 @@ again:
 	 */
 	vmcs_writel(HOST_CR0, read_cr0());
 
+redo_requests:
 	local_irq_disable();
+	vcpu->guest_mode = 1;
 
 	/*
 	 * There are optimizations we can make when signaling interrupts
@@ -2120,10 +2141,12 @@ again:
 	 */
 	vcpu->irq.guest_cpu = task_cpu(current);
 
-	vcpu->guest_mode = 1;
 	if (vcpu->requests)
-		if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
-		    vmx_flush_tlb(vcpu);
+		if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests)) {
+			local_irq_enable();
+			vmx_flush_tlb(vcpu);
+			goto redo_requests;
+		}
 
 	asm (
 		/* Store host registers */
@@ -2252,6 +2275,10 @@ again:
 
 	local_irq_enable();
 
+	//if (vcpu->vcpu_id)
+	//	printk("exit vcpu %d rip @ %lx with reason %x.\n",
+	//	       vcpu->vcpu_id, vmcs_readl(GUEST_RIP), vmcs_read32(VM_EXIT_REASON));
+
 	++vcpu->stat.exits;
 
 	vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
@@ -2426,6 +2453,7 @@ static struct kvm_arch_ops vmx_arch_ops = {
 
 	.run = vmx_vcpu_run,
 	.skip_emulated_instruction = skip_emulated_instruction,
+	.vcpu_init = vmx_vcpu_init,
 	.vcpu_setup = vmx_vcpu_setup,
 	.patch_hypercall = vmx_patch_hypercall,
 };

[-- Attachment #4: u.apic.patch --]
[-- Type: application/octet-stream, Size: 18484 bytes --]

commit 03328e4f12e580df69e5c8d13212af9e6ca76048
Author: root <root@vtsmp-xin.(none)>
Date:   Wed Jun 13 16:59:32 2007 +0800

    kvm apic user support.

diff --git a/kernel/Kbuild b/kernel/Kbuild
index e9bcda7..103a179 100644
--- a/kernel/Kbuild
+++ b/kernel/Kbuild
@@ -1,5 +1,5 @@
 EXTRA_CFLAGS := -I$(src)/include -include $(src)/external-module-compat.h
 obj-m := kvm.o kvm-intel.o kvm-amd.o
-kvm-objs := kvm_main.o mmu.o x86_emulate.o
+kvm-objs := kvm_main.o mmu.o x86_emulate.o userint.o kernint.o lapic.o
 kvm-intel-objs := vmx.o vmx-debug.o
 kvm-amd-objs := svm.o
diff --git a/qemu/hw/apic.c b/qemu/hw/apic.c
index 5704224..9fbeba2 100644
--- a/qemu/hw/apic.c
+++ b/qemu/hw/apic.c
@@ -18,6 +18,7 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 #include "vl.h"
+#include "qemu-kvm.h"
 
 //#define DEBUG_APIC
 //#define DEBUG_IOAPIC
@@ -87,6 +88,7 @@ typedef struct APICState {
 } APICState;
 
 struct IOAPICState {
+    CPUState *cpu_env;
     uint8_t id;
     uint8_t ioregsel;
 
@@ -866,7 +868,7 @@ int apic_init(CPUState *env)
     return 0;
 }
 
-static void ioapic_service(IOAPICState *s)
+void ioapic_service(IOAPICState *s)
 {
     uint8_t i;
     uint8_t trig_mode;
@@ -895,10 +897,16 @@ static void ioapic_service(IOAPICState *s)
                     vector = pic_read_irq(isa_pic);
                 else
                     vector = entry & 0xff;
-                
-                apic_get_delivery_bitmask(deliver_bitmask, dest, dest_mode);
-                apic_bus_deliver(deliver_bitmask, delivery_mode, 
-                                 vector, polarity, trig_mode);
+             
+		if (use_kernel_apic())
+		    ext_apic_bus_deliver(dest, trig_mode, dest_mode,
+					 delivery_mode, vector);
+		else {
+		    apic_get_delivery_bitmask(deliver_bitmask, dest,
+					      dest_mode);
+		    apic_bus_deliver(deliver_bitmask, delivery_mode, 
+				     vector, polarity, trig_mode);
+		}
             }
         }
     }
@@ -916,7 +924,7 @@ void ioapic_set_irq(void *opaque, int vector, int level)
             /* level triggered */
             if (level) {
                 s->irr |= mask;
-                ioapic_service(s);
+		ioapic_service(s);
             } else {
                 s->irr &= ~mask;
             }
@@ -924,7 +932,7 @@ void ioapic_set_irq(void *opaque, int vector, int level)
             /* edge triggered */
             if (level) {
                 s->irr |= mask;
-                ioapic_service(s);
+		ioapic_service(s);
             }
         }
     }
@@ -1052,7 +1060,7 @@ static CPUWriteMemoryFunc *ioapic_mem_write[3] = {
     ioapic_mem_writel,
 };
 
-IOAPICState *ioapic_init(void)
+IOAPICState *ioapic_init(CPUState *env)
 {
     IOAPICState *s;
     int io_memory;
@@ -1061,6 +1069,7 @@ IOAPICState *ioapic_init(void)
     if (!s)
         return NULL;
     ioapic_reset(s);
+    s->cpu_env = env;
     s->id = last_apic_id++;
 
     io_memory = cpu_register_io_memory(0, ioapic_mem_read, 
diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
index eda49cf..f50909f 100644
--- a/qemu/hw/pc.c
+++ b/qemu/hw/pc.c
@@ -91,16 +91,19 @@ int cpu_get_pic_interrupt(CPUState *env)
 {
     int intno;
 
+    if (use_kernel_apic())
+	return -1;
+
     intno = apic_get_interrupt(env);
     if (intno >= 0) {
-        /* set irq request if a PIC irq is still pending */
-        /* XXX: improve that */
-        pic_update_irq(isa_pic); 
-        return intno;
+	/* set irq request if a PIC irq is still pending */
+	/* XXX: improve that */
+	pic_update_irq(isa_pic); 
+	return intno;
     }
     /* read the irq from the PIC */
     if (!apic_accept_pic_intr(env))
-        return -1;
+	return -1;
 
     intno = pic_read_irq(isa_pic);
     return intno;
@@ -115,6 +118,13 @@ static void pic_irq_request(void *opaque, int level)
         cpu_reset_interrupt(env, CPU_INTERRUPT_HARD);
 }
 
+static void kernel_pic_irq_request(void *opaque, int level)
+{
+    int intno = pic_read_irq(isa_pic);
+    if (intno > 0)
+	ext_set_isa_irq(intno);
+}
+
 /* PC cmos mappings */
 
 #define REG_EQUIPMENT_BYTE          0x14
@@ -483,9 +493,9 @@ static void pc_init1(int ram_size, int vga_ram_size, int boot_device,
         }
         register_savevm("cpu", i, 4, cpu_save, cpu_load, env);
         qemu_register_reset(main_cpu_reset, env);
-        if (pci_enabled) {
-            apic_init(env);
-        }
+	if (!use_kernel_apic() && pci_enabled) {
+	    apic_init(env);
+	}
     }
 
     /* allocate RAM */
@@ -671,13 +681,16 @@ static void pc_init1(int ram_size, int vga_ram_size, int boot_device,
     register_ioport_write(0x92, 1, 1, ioport92_write, NULL);
 
     if (pci_enabled) {
-        ioapic = ioapic_init();
+        ioapic = ioapic_init(env);
     }
-    isa_pic = pic_init(pic_irq_request, first_cpu);
+    if (use_kernel_apic())
+	isa_pic = pic_init(kernel_pic_irq_request, first_cpu);
+    else
+	isa_pic = pic_init(pic_irq_request, first_cpu);
     pit = pit_init(0x40, 0);
     pcspk_init(pit);
     if (pci_enabled) {
-        pic_set_alt_irq_func(isa_pic, ioapic_set_irq, ioapic);
+	pic_set_alt_irq_func(isa_pic, ioapic_set_irq, ioapic);
     }
 
     for(i = 0; i < MAX_SERIAL_PORTS; i++) {
diff --git a/qemu/qemu-kvm.c b/qemu/qemu-kvm.c
index 1849997..12df433 100644
--- a/qemu/qemu-kvm.c
+++ b/qemu/qemu-kvm.c
@@ -3,6 +3,25 @@
 #include "config-host.h"
 
 #ifdef USE_KVM
+ #define KVM_ALLOWED_DEFAULT 1
+#else
+ #define KVM_ALLOWED_DEFAULT 0
+#endif
+
+int kvm_allowed = KVM_ALLOWED_DEFAULT;
+
+struct kvm_apic
+{
+    int user_specified;
+    int level;
+};
+
+struct kvm_apic kvm_apic = {
+    .user_specified = 0,
+    .level = 1,
+};
+
+#ifdef USE_KVM
 
 #include "exec.h"
 
@@ -14,7 +33,6 @@
 
 extern void perror(const char *s);
 
-int kvm_allowed = 1;
 kvm_context_t kvm_context;
 static struct kvm_msr_list *kvm_msr_list;
 static int kvm_has_msr_star;
@@ -227,9 +245,16 @@ static void load_regs(CPUState *env)
     sregs.cr3 = env->cr[3];
     sregs.cr4 = env->cr[4];
 
-    sregs.apic_base = cpu_get_apic_base(env);
+    if (!kvm_apic.level) {
+	/* These two are no longer used once the in-kernel APIC is enabled */
+	sregs.apic_base = 0;
+	sregs.cr8 = 0;
+    } else {
+	sregs.apic_base = cpu_get_apic_base(env);
+	sregs.cr8 = cpu_get_apic_tpr(env);
+    }
+
     sregs.efer = env->efer;
-    sregs.cr8 = cpu_get_apic_tpr(env);
 
     kvm_set_sregs(kvm_context, 0, &sregs);
 
@@ -321,10 +346,12 @@ static void save_regs(CPUState *env)
     env->cr[3] = sregs.cr3;
     env->cr[4] = sregs.cr4;
 
-    cpu_set_apic_base(env, sregs.apic_base);
+    if (!kvm_apic.level) {
+	cpu_set_apic_base(env, sregs.apic_base);
+	//cpu_set_apic_tpr(env, sregs.cr8);
+    }
 
     env->efer = sregs.efer;
-    //cpu_set_apic_tpr(env, sregs.cr8);
 
 #define HFLAG_COPY_MASK ~( \
 			HF_CPL_MASK | HF_PE_MASK | HF_MP_MASK | HF_EM_MASK | \
@@ -407,12 +434,19 @@ static int try_push_interrupts(void *opaque)
     CPUState **envs = opaque, *env;
     env = envs[0];
 
+    if (kvm_apic.level)
+	return 0;
+
     if (env->ready_for_interrupt_injection &&
         (env->interrupt_request & CPU_INTERRUPT_HARD) &&
         (env->eflags & IF_MASK)) {
+	    int irq = cpu_get_pic_interrupt(env);
+
+	    if (irq != -1)
+		// for now using cpu 0
+		kvm_inject_irq(kvm_context, 0, irq);
+	    
             env->interrupt_request &= ~CPU_INTERRUPT_HARD;
-            // for now using cpu 0
-            kvm_inject_irq(kvm_context, 0, cpu_get_pic_interrupt(env));
     }
 
     return (env->interrupt_request & CPU_INTERRUPT_HARD) != 0;
@@ -427,8 +461,11 @@ static void post_kvm_run(void *opaque, int vcpu)
 	? env->eflags | IF_MASK : env->eflags & ~IF_MASK;
     env->ready_for_interrupt_injection
 	= kvm_is_ready_for_interrupt_injection(kvm_context, vcpu);
-    //cpu_set_apic_tpr(env, kvm_run->cr8);
-    cpu_set_apic_base(env, kvm_get_apic_base(kvm_context, vcpu));
+
+    if (!kvm_apic.level) {
+	//cpu_set_apic_tpr(env, kvm_run->cr8);
+	cpu_set_apic_base(env, kvm_get_apic_base(kvm_context, vcpu));
+    }
 }
 
 static void pre_kvm_run(void *opaque, int vcpu)
@@ -436,7 +473,20 @@ static void pre_kvm_run(void *opaque, int vcpu)
     CPUState **envs = opaque, *env;
     env = envs[0];
 
-    kvm_set_cr8(kvm_context, vcpu, cpu_get_apic_tpr(env));
+    if (!kvm_apic.level)
+	kvm_set_cr8(kvm_context, vcpu, cpu_get_apic_tpr(env));
+}
+
+int ext_apic_bus_deliver(int dest, int trig_mode, int dest_mode,
+			 int delivery_mode, int vector)
+{
+	return kvm_apic_bus_deliver(kvm_context, dest, trig_mode, dest_mode,
+				    delivery_mode, vector);
+}
+
+int ext_set_isa_irq(int vector)
+{
+	return kvm_inject_isa_irq(kvm_context, vector);
 }
 
 void kvm_load_registers(CPUState *env)
@@ -452,11 +502,8 @@ void kvm_save_registers(CPUState *env)
 int kvm_cpu_exec(CPUState *env)
 {
     int r;
-    int pending = (!env->ready_for_interrupt_injection ||
-                   ((env->interrupt_request & CPU_INTERRUPT_HARD) &&
-		   (env->eflags & IF_MASK)));
 
-    if (!pending && (env->interrupt_request & CPU_INTERRUPT_EXIT)) {
+    if (env->interrupt_request & CPU_INTERRUPT_EXIT) {
         env->interrupt_request &= ~CPU_INTERRUPT_EXIT;
         env->exception_index = EXCP_INTERRUPT;
         cpu_loop_exit();
@@ -663,8 +710,36 @@ int kvm_qemu_init()
 int kvm_qemu_create_context(void)
 {
     int i;
+    struct kvm_extensions ext;
+
+    if (kvm_check_extension(kvm_context, &ext) < 0) {
+	kvm_qemu_destroy();
+	return -1;
+    }
+
+    if (!ext.lapic && kvm_apic.level) {
+	/*
+	 * Oops... the kernel doesn't support APIC emulation even though
+	 * userspace is currently configured to enable it.  If this was
+	 * simply because of our defaults, silently disable the feature
+	 * and continue.
+	 */
+	if (!kvm_apic.user_specified)
+	    kvm_apic.level = 0;
+	else {
+	    /*
+	     * Otherwise, it's a fatal error that the user must correct
+	     */
+
+	    /* FIXME: Should we also log this officially */
+	    printf("FATAL: kernel does not support -kvm_apic = %d setting",
+		   kvm_apic.level);
+	    exit(-1);
+	}
+    }
 
-    if (kvm_create(kvm_context, phys_ram_size, (void**)&phys_ram_base) < 0) {
+    if (kvm_create(kvm_context, phys_ram_size, kvm_apic.level,
+		   (void**)&phys_ram_base) < 0) {
 	kvm_qemu_destroy();
 	return -1;
     }
@@ -923,4 +998,20 @@ int kvm_get_phys_ram_page_bitmap(unsigned char *bitmap)
  out:
     return r;
 }
+
+int kvm_qemu_set_apic_level(int level)
+{
+    if (level < 0 || level > 1)
+	return -1;
+
+    kvm_apic.user_specified = 1;
+    kvm_apic.level = level;
+    return 0;
+}
+
+int kvm_qemu_get_apic_level(void)
+{
+    return kvm_apic.level;
+}
+
 #endif
diff --git a/qemu/qemu-kvm.h b/qemu/qemu-kvm.h
index 8a1c25d..09f9e55 100644
--- a/qemu/qemu-kvm.h
+++ b/qemu/qemu-kvm.h
@@ -3,9 +3,14 @@
 
 #include "kvmctl.h"
 
+
+
 int kvm_qemu_init(void);
 int kvm_qemu_create_context(void);
 void kvm_qemu_destroy(void);
+int ext_apic_bus_deliver(int dest, int trig_mode, int dest_mode,
+			 int delivery_mode, int vector);
+int ext_set_isa_irq(int vector);
 void kvm_load_registers(CPUState *env);
 void kvm_save_registers(CPUState *env);
 int kvm_cpu_exec(CPUState *env);
@@ -16,6 +21,9 @@ int kvm_physical_memory_set_dirty_tracking(int enable);
 int kvm_update_dirty_pages_log(void);
 int kvm_get_phys_ram_page_bitmap(unsigned char *bitmap);
 
+int kvm_qemu_set_apic_level(int level);
+int kvm_qemu_get_apic_level(void);
+
 #define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))
 #define BITMAP_SIZE(m) (ALIGN(((m)>>TARGET_PAGE_BITS), HOST_LONG_BITS) / 8)
 #endif
diff --git a/qemu/vl.c b/qemu/vl.c
index 7df1c80..4bf61e8 100644
--- a/qemu/vl.c
+++ b/qemu/vl.c
@@ -6531,6 +6531,7 @@ enum {
     QEMU_OPTION_vnc,
     QEMU_OPTION_no_acpi,
     QEMU_OPTION_no_kvm,
+    QEMU_OPTION_kvm_apic,
     QEMU_OPTION_no_reboot,
     QEMU_OPTION_daemonize,
     QEMU_OPTION_option_rom,
@@ -6600,6 +6601,7 @@ const QEMUOption qemu_options[] = {
 #endif
 #ifdef USE_KVM
     { "no-kvm", 0, QEMU_OPTION_no_kvm },
+    { "kvm_apic", HAS_ARG, QEMU_OPTION_kvm_apic },
 #endif
 #if defined(TARGET_PPC) || defined(TARGET_SPARC)
     { "g", 1, QEMU_OPTION_g },
@@ -7309,6 +7311,14 @@ int main(int argc, char **argv)
 	    case QEMU_OPTION_no_kvm:
 		kvm_allowed = 0;
 		break;
+	    case QEMU_OPTION_kvm_apic:
+		if (kvm_qemu_set_apic_level(atoi(optarg)) < 0) {
+		    fprintf(stderr, "kvm_apic = %d is not supported by " \
+			    "userspace\n", atoi(optarg));
+		    exit(1);
+		}
+			
+		break;
 #endif
             case QEMU_OPTION_usb:
                 usb_enabled = 1;
diff --git a/qemu/vl.h b/qemu/vl.h
index debd17c..6e573f1 100644
--- a/qemu/vl.h
+++ b/qemu/vl.h
@@ -158,6 +158,7 @@ extern int graphic_depth;
 extern const char *keyboard_layout;
 extern int kqemu_allowed;
 extern int kvm_allowed;
+extern struct kvm_apic kvm_apic;
 extern int win2k_install_hack;
 extern int usb_enabled;
 extern int smp_cpus;
@@ -166,6 +167,19 @@ extern int semihosting_enabled;
 extern int autostart;
 extern int time_drift_fix;
 
+#ifdef USE_KVM
+extern int kvm_qemu_get_apic_level(void);
+#endif
+
+static inline int use_kernel_apic()
+{
+#ifdef USE_KVM
+    return kvm_allowed && kvm_qemu_get_apic_level();
+#else
+    return 0;
+#endif
+}
+
 #define MAX_OPTION_ROMS 16
 extern const char *option_rom[MAX_OPTION_ROMS];
 extern int nb_option_roms;
@@ -1059,7 +1073,7 @@ typedef struct IOAPICState IOAPICState;
 int apic_init(CPUState *env);
 int apic_get_interrupt(CPUState *env);
 int apic_accept_pic_intr(CPUState *env);
-IOAPICState *ioapic_init(void);
+IOAPICState *ioapic_init(CPUState *env);
 void ioapic_set_irq(void *opaque, int vector, int level);
 
 /* i8254.c */
diff --git a/user/kvmctl.c b/user/kvmctl.c
index ce361df..2879ce4 100644
--- a/user/kvmctl.c
+++ b/user/kvmctl.c
@@ -226,7 +226,8 @@ int kvm_create_vcpu(kvm_context_t kvm, int slot)
 	return 0;
 }
 
-int kvm_create(kvm_context_t kvm, unsigned long memory, void **vm_mem)
+int kvm_create(kvm_context_t kvm, unsigned long memory, int apic_level,
+	       void **vm_mem)
 {
 	unsigned long dosmem = 0xa0000;
 	unsigned long exmem = 0xc0000;
@@ -282,6 +283,14 @@ int kvm_create(kvm_context_t kvm, unsigned long memory, void **vm_mem)
 	     MAP_PRIVATE|MAP_FIXED, zfd, 0);
 	close(zfd);
 
+	if (apic_level) {
+		r = ioctl(fd, KVM_ENABLE_KERNEL_PIC, &apic_level);
+		if (r == -1) {
+			fprintf(stderr, "kvm_enable_kernel_pic: %m\n");
+			return -1;
+		}
+	}
+
 	r = kvm_create_vcpu(kvm, 0);
 	if (r < 0)
 		return r;
@@ -1011,6 +1020,30 @@ int kvm_inject_irq(kvm_context_t kvm, int vcpu, unsigned irq)
 	return ioctl(kvm->vcpu_fd[vcpu], KVM_INTERRUPT, &intr);
 }
 
+int kvm_inject_isa_irq(kvm_context_t kvm, unsigned irq)
+{
+	struct kvm_interrupt intr;
+
+	intr.irq = irq;
+	return ioctl(kvm->vm_fd, KVM_ISA_INTERRUPT, &intr);
+}
+
+int kvm_apic_bus_deliver(kvm_context_t kvm, int dest, int trig_mode,
+			 int dest_mode, int delivery_mode, int vector)
+{
+	struct kvm_apic_msg msg;
+
+	memset(&msg, 0, sizeof(msg));
+
+	msg.dest          = dest;
+	msg.trig_mode     = trig_mode;
+	msg.dest_mode     = dest_mode;
+	msg.delivery_mode = delivery_mode;
+	msg.vector        = vector;
+
+	return ioctl(kvm->vm_fd, KVM_APIC_MSG, &msg);
+}
+
 int kvm_guest_debug(kvm_context_t kvm, int vcpu, struct kvm_debug_guest *dbg)
 {
 	return ioctl(kvm->vcpu_fd[vcpu], KVM_DEBUG_GUEST, dbg);
@@ -1057,3 +1090,17 @@ int kvm_set_signal_mask(kvm_context_t kvm, int vcpu, const sigset_t *sigset)
 	free(sigmask);
 	return r;
 }
+
+int kvm_check_extension(kvm_context_t kvm, struct kvm_extensions *ext)
+{
+	memset(ext, 0, sizeof(*ext));
+
+	int r = ioctl(kvm->fd, KVM_CHECK_EXTENSION, KVM_APIC_MSG);
+	if (r < 0)
+		return r;
+
+	if (r)
+		ext->lapic = 1;
+
+	return 0;
+}
diff --git a/user/kvmctl.h b/user/kvmctl.h
index 181f5d1..a3200b4 100644
--- a/user/kvmctl.h
+++ b/user/kvmctl.h
@@ -100,12 +100,14 @@ void kvm_finalize(kvm_context_t kvm);
  *
  * \param kvm Pointer to the current kvm_context
  * \param phys_mem_bytes The amount of physical ram you want the VM to have
+ * \param apic_level The APIC emulation level (0=QEMU, 1=KVM)
  * \param phys_mem This pointer will be set to point to the memory that
  * kvm_create allocates for physical RAM
  * \return 0 on success
  */
 int kvm_create(kvm_context_t kvm,
 	       unsigned long phys_mem_bytes,
+	       int apic_level,
 	       void **phys_mem);
 /*!
  * \brief Create a new virtual cpu
@@ -292,12 +294,38 @@ int kvm_set_msrs(kvm_context_t, int vcpu, struct kvm_msr_entry *msrs, int n);
  * This allows you to simulate an external vectored interrupt.
  *
  * \param kvm Pointer to the current kvm_context
- * \param vcpu Which virtual CPU should get dumped
+ * \param vcpu Which virtual CPU should handle interrupt
  * \param irq Vector number
  * \return 0 on success
  */
 int kvm_inject_irq(kvm_context_t kvm, int vcpu, unsigned irq);
 
+/*!
+ * \brief Simulate an external vectored interrupt to the ISA bus
+ *
+ * This allows you to simulate an external vectored interrupt.
+ *
+ * \param kvm Pointer to the current kvm_context
+ * \param irq Vector number
+ * \return 0 on success
+ */
+int kvm_inject_isa_irq(kvm_context_t kvm, unsigned irq);
+
+/*!
+ * \brief Simulate an external vectored interrupt to the APIC bus
+ *
+ * This allows you to simulate a vectored interrupt via the LAPIC mechanism.
+ *
+ * \param kvm Pointer to the current kvm_context
+ * \param dest Encoded destination
+ * \param trig_mode 0=edge-trigger, 1=level-trigger
+ * \param dest_mode Destination mode encoding
+ * \param delivery_mode Delivery_mode encoding
+ * \param vector The vector number
+ * \return 0 on success
+ */
+int kvm_apic_bus_deliver(kvm_context_t kvm, int dest, int trig_mode,
+			 int dest_mode, int delivery_mode, int vector);
 int kvm_guest_debug(kvm_context_t, int vcpu, struct kvm_debug_guest *dbg);
 
 /*!
@@ -410,4 +438,12 @@ int kvm_dirty_pages_log_enable_all(kvm_context_t kvm);
  */
 int kvm_dirty_pages_log_reset(kvm_context_t kvm);
 
+struct kvm_extensions
+{
+    int lapic; /* Today we only have one.  Add more here as they come up */
+};
+
+int kvm_check_extension(kvm_context_t kvm, struct kvm_extensions *ext);
+
+
 #endif
diff --git a/user/main.c b/user/main.c
index 4c48531..6e5e3c8 100644
--- a/user/main.c
+++ b/user/main.c
@@ -367,7 +367,7 @@ int main(int ac, char **av)
 	    fprintf(stderr, "kvm_init failed\n");
 	    return 1;
 	}
-	if (kvm_create(kvm, 128 * 1024 * 1024, &vm_mem) < 0) {
+	if (kvm_create(kvm, 128 * 1024 * 1024, 1, &vm_mem) < 0) {
 	    kvm_finalize(kvm);
 	    fprintf(stderr, "kvm_create failed\n");
 	    return 1;

[-- Attachment #5: u.smp.patch --]
[-- Type: application/octet-stream, Size: 6444 bytes --]

diff --git a/qemu/cpu-exec.c b/qemu/cpu-exec.c
index e1293ad..683a056 100644
--- a/qemu/cpu-exec.c
+++ b/qemu/cpu-exec.c
@@ -249,6 +249,8 @@ int cpu_exec(CPUState *env1)
 
 #if defined(TARGET_I386)
     /* handle exit of HALTED state */
+    if (env1->cpu_index) printf("cpu exec %d.\n", env1->cpu_index);
+    if (env1->cpu_index) printf("cpu flags %lx.\n", env1->hflags);
     if (env1->hflags & HF_HALTED_MASK) {
         /* disable halt condition */
         if ((env1->interrupt_request & CPU_INTERRUPT_HARD) &&
@@ -336,6 +338,7 @@ int cpu_exec(CPUState *env1)
     env->exception_index = -1;
 
     /* prepare setjmp context for exception handling */
+    if (env->cpu_index) printf("cpu exec %d.\n", env->cpu_index);
     for(;;) {
         if (setjmp(env->jmp_env) == 0) {
             env->current_tb = NULL;
diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
index f50909f..2961cdd 100644
--- a/qemu/hw/pc.c
+++ b/qemu/hw/pc.c
@@ -484,9 +484,13 @@ static void pc_init1(int ram_size, int vga_ram_size, int boot_device,
 
     /* init CPUs */
     for(i = 0; i < smp_cpus; i++) {
+        printf("cpu init %d.\n", i);
         env = cpu_init();
+        printf("cpu init %d done with %p.\n", i, env);
+#ifndef USE_KVM
         if (i != 0)
             env->hflags |= HF_HALTED_MASK;
+#endif
         if (smp_cpus > 1) {
             /* XXX: enable it in all cases */
             env->cpuid_features |= CPUID_APIC;
diff --git a/qemu/qemu-kvm.c b/qemu/qemu-kvm.c
index 12df433..0d99544 100644
--- a/qemu/qemu-kvm.c
+++ b/qemu/qemu-kvm.c
@@ -28,6 +28,7 @@ struct kvm_apic kvm_apic = {
 #include "qemu-kvm.h"
 #include <kvmctl.h>
 #include <string.h>
+#include <pthread.h>
 
 #define MSR_IA32_TSC		0x10
 
@@ -37,6 +38,8 @@ kvm_context_t kvm_context;
 static struct kvm_msr_list *kvm_msr_list;
 static int kvm_has_msr_star;
 
+extern int smp_cpus;
+
 #define NR_CPU 16
 static CPUState *saved_env[NR_CPU];
 
@@ -160,8 +163,8 @@ static void load_regs(CPUState *env)
     int rc, n, i;
 
     /* hack: save env */
-    if (!saved_env[0])
-	saved_env[0] = env;
+    if (!saved_env[env->cpu_index])
+	saved_env[env->cpu_index] = env;
 
     regs.rax = env->regs[R_EAX];
     regs.rbx = env->regs[R_EBX];
@@ -455,7 +458,7 @@ static int try_push_interrupts(void *opaque)
 static void post_kvm_run(void *opaque, int vcpu)
 {
     CPUState **envs = opaque, *env;
-    env = envs[0];
+    env = envs[vcpu];
 
     env->eflags = kvm_get_interrupt_flag(kvm_context, vcpu)
 	? env->eflags | IF_MASK : env->eflags & ~IF_MASK;
@@ -471,7 +474,7 @@ static void post_kvm_run(void *opaque, int vcpu)
 static void pre_kvm_run(void *opaque, int vcpu)
 {
     CPUState **envs = opaque, *env;
-    env = envs[0];
+    env = envs[vcpu];
 
     if (!kvm_apic.level)
 	kvm_set_cr8(kvm_context, vcpu, cpu_get_apic_tpr(env));
@@ -510,10 +513,11 @@ int kvm_cpu_exec(CPUState *env)
     }
 
     
-    if (!saved_env[0])
-	saved_env[0] = env;
+    if (!saved_env[env->cpu_index])
+	saved_env[env->cpu_index] = env;
 
-    r = kvm_run(kvm_context, 0);
+    //if (env->cpu_index) printf("kvm_run %d\n", env->cpu_index);
+    r = kvm_run(kvm_context, env->cpu_index);
     if (r < 0) {
         printf("kvm_run returned %d\n", r);
         exit(1);
@@ -526,7 +530,7 @@ static int kvm_debug(void *opaque, int vcpu)
 {
     CPUState **envs = opaque;
 
-    env = envs[0];
+    env = envs[vcpu];
     env->exception_index = EXCP_DEBUG;
     return 1;
 }
@@ -656,7 +660,7 @@ static int kvm_halt(void *opaque, int vcpu)
 {
     CPUState **envs = opaque, *env;
 
-    env = envs[0];
+    env = envs[vcpu];
     if (!((env->interrupt_request & CPU_INTERRUPT_HARD) &&
 	  (env->eflags & IF_MASK))) {
 	    env->hflags |= HF_HALTED_MASK;
@@ -707,6 +711,38 @@ int kvm_qemu_init()
     return 0;
 }
 
+void kvm_ap_func(void *arg)
+{
+    unsigned long slot = (unsigned long)arg;
+    CPUState *env;
+    void main_loop(CPUState *env);
+
+    kvm_create_vcpu(kvm_context, slot);
+    printf("cpu %d create done.\n", slot);
+    for (env = first_cpu; env != NULL; env = env->next_cpu) {
+        if (env->cpu_index == slot)
+            break;
+    }
+    printf("cpu %d env %p.\n", slot, env);
+    if (env == NULL)
+        return;
+    main_loop(env);
+}
+
+void kvm_init_aps(void)
+{
+    int i;
+
+    for (i = 1; i < smp_cpus; ++i) {
+        int r;
+        pthread_t thr;
+        r = pthread_create(&thr, NULL, kvm_ap_func, (void *)i);
+        if (r != 0) {
+            printf("pthread create failed: %m\n");
+        }
+    }
+}
+
 int kvm_qemu_create_context(void)
 {
     int i;
diff --git a/qemu/vl.c b/qemu/vl.c
index 4bf61e8..5a42656 100644
--- a/qemu/vl.c
+++ b/qemu/vl.c
@@ -6246,42 +6246,32 @@ void main_loop_wait(int timeout)
                     qemu_get_clock(rt_clock));
 }
 
-static CPUState *cur_cpu;
-
-int main_loop(void)
+void main_loop(CPUState *env)
 {
     int ret, timeout;
 #ifdef CONFIG_PROFILER
     int64_t ti;
 #endif
-    CPUState *env;
 
-    cur_cpu = first_cpu;
     for(;;) {
         if (vm_running) {
 
-            env = cur_cpu;
             for(;;) {
-                /* get next cpu */
-                env = env->next_cpu;
-                if (!env)
-                    env = first_cpu;
 #ifdef CONFIG_PROFILER
                 ti = profile_getclock();
 #endif
+                if (env->cpu_index) printf("cpu %d loop\n", env->cpu_index);
                 ret = cpu_exec(env);
 #ifdef CONFIG_PROFILER
                 qemu_time += profile_getclock() - ti;
 #endif
                 if (ret != EXCP_HALTED)
                     break;
+
                 /* all CPUs are halted ? */
-                if (env == cur_cpu) {
-                    ret = EXCP_HLT;
-                    break;
-                }
+                sleep(1);
+		printf("halted?\n");
             }
-            cur_cpu = env;
 
             if (shutdown_requested) {
                 ret = EXCP_INTERRUPT;
@@ -6322,7 +6312,6 @@ int main_loop(void)
 #endif
     }
     cpu_disable_ticks();
-    return ret;
 }
 
 void help(void)
@@ -7737,7 +7726,8 @@ int main(int argc, char **argv)
 	close(fd);
     }
 
-    main_loop();
+    kvm_init_aps();
+    main_loop(first_cpu);
     quit_timers();
     return 0;
 }
diff --git a/qemu/vl.h b/qemu/vl.h
index 6e573f1..388bb92 100644
--- a/qemu/vl.h
+++ b/qemu/vl.h
@@ -194,7 +194,7 @@ extern int nb_option_roms;
 #endif
 
 #if USE_KVM
-#define KVM_EXTRA_PAGES 3
+#define KVM_EXTRA_PAGES (3*4)
 #endif
 
 /* keyboard/mouse support */

[-- Attachment #6: BIOS-bochs-latest --]
[-- Type: application/octet-stream, Size: 131072 bytes --]

[-- Attachment #7: Type: text/plain, Size: 286 bytes --]

-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/

[-- Attachment #8: Type: text/plain, Size: 186 bytes --]

_______________________________________________
kvm-devel mailing list
kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org
https://lists.sourceforge.net/lists/listinfo/kvm-devel

^ permalink raw reply related	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2007-06-14  8:30 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-06-13  9:17 SMP guest boots Li, Xin B
     [not found] ` <B30DA1341B0CFA4893EF8A36B40B5C5D01433C79-wq7ZOvIWXbNpB2pF5aRoyrfspsVTdybXVpNB7YpNyf8@public.gmane.org>
2007-06-13 11:26   ` Gregory Haskins
     [not found]     ` <1181734005.26394.18.camel-5CR4LY5GPkvLDviKLk5550HKjMygAv58XqFh9Ls21Oc@public.gmane.org>
2007-06-13 12:53       ` Li, Xin B
     [not found]         ` <B30DA1341B0CFA4893EF8A36B40B5C5D01433CD2-wq7ZOvIWXbNpB2pF5aRoyrfspsVTdybXVpNB7YpNyf8@public.gmane.org>
2007-06-13 13:03           ` Gregory Haskins
2007-06-13 16:25   ` Avi Kivity
     [not found]     ` <46701A7F.1090800-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-06-14  1:52       ` Li, Xin B
     [not found]         ` <B30DA1341B0CFA4893EF8A36B40B5C5D01433E81-wq7ZOvIWXbNpB2pF5aRoyrfspsVTdybXVpNB7YpNyf8@public.gmane.org>
2007-06-14  8:30           ` Avi Kivity

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox