public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/9] in-kernel APIC v9 (kernel side)
@ 2007-05-31 18:08 Gregory Haskins
       [not found] ` <20070531180005.1810.23884.stgit-5CR4LY5GPkvLDviKLk5550HKjMygAv58XqFh9Ls21Oc@public.gmane.org>
  0 siblings, 1 reply; 21+ messages in thread
From: Gregory Haskins @ 2007-05-31 18:08 UTC (permalink / raw)
  To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Incorporates v8 plus the following changes:

1) Fixes a hang on AMD
2) Fixes an issue where irq-windows are inaccurately reported to userspace
3) Fixes an issue where irq-window-exiting requests can be ignored in some cases

Note that we no longer need the backlog.patch to handle corner cases now.

As before, this has been tested on 32 bit XP w/ACPI and 64 bit windows.  It
offers a 17% performance improvement over git HEAD in my testing.  Note that I
am not able to fully verify that this works on AMD, as even git-head does not
work on my system.  I am able to verify that it no longer hangs the kernel
hard.  The guest hangs, but it hangs without my patches as well.  Perhaps
someone with a known good environment on AMD can verify for me?

I am being pulled off of my KVM work for a little while, so I will not be able
to contribute again until further notice. If there are any remaining issues
that need to be addressed and someone wants to carry the torch, feel free to
do so.  Otherwise, I will pick up the effort to get this merged in when I am
able to return to KVM.

Thanks all for the feedback/comments/suggestions through all of this.  It has
been very fun and quite a learning experience.

-Greg
 

-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH 1/9] KVM: Adds support for in-kernel mmio handlers
       [not found] ` <20070531180005.1810.23884.stgit-5CR4LY5GPkvLDviKLk5550HKjMygAv58XqFh9Ls21Oc@public.gmane.org>
@ 2007-05-31 18:08   ` Gregory Haskins
  2007-05-31 18:08   ` [PATCH 2/9] KVM: VMX - fix interrupt checking on light-exit Gregory Haskins
                     ` (10 subsequent siblings)
  11 siblings, 0 replies; 21+ messages in thread
From: Gregory Haskins @ 2007-05-31 18:08 UTC (permalink / raw)
  To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Signed-off-by: Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
---

 drivers/kvm/kvm.h      |   60 +++++++++++++++++++++++++++++++
 drivers/kvm/kvm_main.c |   94 ++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 142 insertions(+), 12 deletions(-)

diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index 0632d0b..1aa20ff 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -259,6 +259,65 @@ struct kvm_stat {
 	u32 efer_reload;
 };
 
+struct kvm_io_device {
+	void (*read)(struct kvm_io_device *this,
+		     gpa_t addr,
+		     int len,
+		     void *val);
+	void (*write)(struct kvm_io_device *this,
+		      gpa_t addr,
+		      int len,
+		      const void *val);
+	int (*in_range)(struct kvm_io_device *this, gpa_t addr);
+	void (*destructor)(struct kvm_io_device *this);
+
+	void             *private;
+};
+
+static inline void kvm_iodevice_read(struct kvm_io_device *dev,
+				     gpa_t addr,
+				     int len,
+				     void *val)
+{
+	dev->read(dev, addr, len, val);
+}
+
+static inline void kvm_iodevice_write(struct kvm_io_device *dev,
+				      gpa_t addr,
+				      int len,
+				      const void *val)
+{
+	dev->write(dev, addr, len, val);
+}
+
+static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr)
+{
+	return dev->in_range(dev, addr);
+}
+
+static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
+{
+	dev->destructor(dev);
+}
+
+/*
+ * It would be nice to use something smarter than a linear search, TBD...
+ * Thankfully we dont expect many devices to register (famous last words :),
+ * so until then it will suffice.  At least its abstracted so we can change
+ * in one place.
+ */
+struct kvm_io_bus {
+	int                   dev_count;
+#define NR_IOBUS_DEVS 6
+	struct kvm_io_device *devs[NR_IOBUS_DEVS];
+};
+
+void kvm_io_bus_init(struct kvm_io_bus *bus);
+void kvm_io_bus_destroy(struct kvm_io_bus *bus);
+struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
+void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
+			     struct kvm_io_device *dev);
+
 struct kvm_vcpu {
 	struct kvm *kvm;
 	union {
@@ -383,6 +442,7 @@ struct kvm {
 	unsigned long rmap_overflow;
 	struct list_head vm_list;
 	struct file *filp;
+	struct kvm_io_bus mmio_bus;
 };
 
 struct descriptor_table {
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 0e6d5d6..008e898 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -318,6 +318,7 @@ static struct kvm *kvm_create_vm(void)
 
 	spin_lock_init(&kvm->lock);
 	INIT_LIST_HEAD(&kvm->active_mmu_pages);
+	kvm_io_bus_init(&kvm->mmio_bus);
 	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
 		struct kvm_vcpu *vcpu = &kvm->vcpus[i];
 
@@ -414,6 +415,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	spin_lock(&kvm_lock);
 	list_del(&kvm->vm_list);
 	spin_unlock(&kvm_lock);
+	kvm_io_bus_destroy(&kvm->mmio_bus);
 	kvm_free_vcpus(kvm);
 	kvm_free_physmem(kvm);
 	kfree(kvm);
@@ -1037,12 +1039,25 @@ static int emulator_write_std(unsigned long addr,
 	return X86EMUL_UNHANDLEABLE;
 }
 
+static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
+						gpa_t addr)
+{
+	/*
+	 * Note that its important to have this wrapper function because
+	 * in the very near future we will be checking for MMIOs against
+	 * the LAPIC as well as the general MMIO bus
+	 */
+	return kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
+}
+
 static int emulator_read_emulated(unsigned long addr,
 				  void *val,
 				  unsigned int bytes,
 				  struct x86_emulate_ctxt *ctxt)
 {
-	struct kvm_vcpu *vcpu = ctxt->vcpu;
+	struct kvm_vcpu      *vcpu = ctxt->vcpu;
+	struct kvm_io_device *mmio_dev;
+	gpa_t                 gpa;
 
 	if (vcpu->mmio_read_completed) {
 		memcpy(val, vcpu->mmio_data, bytes);
@@ -1051,18 +1066,26 @@ static int emulator_read_emulated(unsigned long addr,
 	} else if (emulator_read_std(addr, val, bytes, ctxt)
 		   == X86EMUL_CONTINUE)
 		return X86EMUL_CONTINUE;
-	else {
-		gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
 
-		if (gpa == UNMAPPED_GVA)
-			return X86EMUL_PROPAGATE_FAULT;
-		vcpu->mmio_needed = 1;
-		vcpu->mmio_phys_addr = gpa;
-		vcpu->mmio_size = bytes;
-		vcpu->mmio_is_write = 0;
+	gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
+	if (gpa == UNMAPPED_GVA)
+		return X86EMUL_PROPAGATE_FAULT;
 
-		return X86EMUL_UNHANDLEABLE;
+	/*
+	 * Is this MMIO handled locally?
+	 */
+	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
+	if (mmio_dev) {
+		kvm_iodevice_read(mmio_dev, gpa, bytes, val);
+		return X86EMUL_CONTINUE;
 	}
+
+	vcpu->mmio_needed = 1;
+	vcpu->mmio_phys_addr = gpa;
+	vcpu->mmio_size = bytes;
+	vcpu->mmio_is_write = 0;
+
+	return X86EMUL_UNHANDLEABLE;
 }
 
 static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -1090,8 +1113,9 @@ static int emulator_write_emulated(unsigned long addr,
 				   unsigned int bytes,
 				   struct x86_emulate_ctxt *ctxt)
 {
-	struct kvm_vcpu *vcpu = ctxt->vcpu;
-	gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
+	struct kvm_vcpu      *vcpu = ctxt->vcpu;
+	struct kvm_io_device *mmio_dev;
+	gpa_t                 gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
 
 	if (gpa == UNMAPPED_GVA) {
 		kvm_arch_ops->inject_page_fault(vcpu, addr, 2);
@@ -1101,6 +1125,15 @@ static int emulator_write_emulated(unsigned long addr,
 	if (emulator_write_phys(vcpu, gpa, val, bytes))
 		return X86EMUL_CONTINUE;
 
+	/*
+	 * Is this MMIO handled locally?
+	 */
+	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
+	if (mmio_dev) {
+		kvm_iodevice_write(mmio_dev, gpa, bytes, val);
+		return X86EMUL_CONTINUE;
+	}
+
 	vcpu->mmio_needed = 1;
 	vcpu->mmio_phys_addr = gpa;
 	vcpu->mmio_size = bytes;
@@ -2955,6 +2988,43 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
 	return NOTIFY_OK;
 }
 
+void kvm_io_bus_init(struct kvm_io_bus *bus)
+{
+	memset(bus, 0, sizeof(*bus));
+}
+
+void kvm_io_bus_destroy(struct kvm_io_bus *bus)
+{
+	int i;
+
+	for (i = 0; i < bus->dev_count; i++) {
+		struct kvm_io_device *pos = bus->devs[i];
+
+		kvm_iodevice_destructor(pos);
+	}
+}
+
+struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
+{
+	int i;
+
+	for (i = 0; i < bus->dev_count; i++) {
+		struct kvm_io_device *pos = bus->devs[i];
+
+		if (pos->in_range(pos, addr))
+			return pos;
+	}
+
+	return NULL;
+}
+
+void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
+{
+	BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
+
+	bus->devs[bus->dev_count++] = dev;
+}
+
 static struct notifier_block kvm_cpu_notifier = {
 	.notifier_call = kvm_cpu_hotplug,
 	.priority = 20, /* must be > scheduler priority */


-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 2/9] KVM: VMX - fix interrupt checking on light-exit
       [not found] ` <20070531180005.1810.23884.stgit-5CR4LY5GPkvLDviKLk5550HKjMygAv58XqFh9Ls21Oc@public.gmane.org>
  2007-05-31 18:08   ` [PATCH 1/9] KVM: Adds support for in-kernel mmio handlers Gregory Haskins
@ 2007-05-31 18:08   ` Gregory Haskins
  2007-05-31 18:09   ` [PATCH 3/9] KVM: Add irqdevice object Gregory Haskins
                     ` (9 subsequent siblings)
  11 siblings, 0 replies; 21+ messages in thread
From: Gregory Haskins @ 2007-05-31 18:08 UTC (permalink / raw)
  To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Signed-off-by: Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
---

 drivers/kvm/vmx.c |    6 +++---
 1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index bfd5f8e..3411813 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -1977,13 +1977,13 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	int r;
 
 preempted:
-	if (!vcpu->mmio_read_completed)
-		do_interrupt_requests(vcpu, kvm_run);
-
 	if (vcpu->guest_debug.enabled)
 		kvm_guest_debug_pre(vcpu);
 
 again:
+	if (!vcpu->mmio_read_completed)
+		do_interrupt_requests(vcpu, kvm_run);
+
 	vmx_save_host_state(vcpu);
 	kvm_load_guest_fpu(vcpu);
 


-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 3/9] KVM: Add irqdevice object
       [not found] ` <20070531180005.1810.23884.stgit-5CR4LY5GPkvLDviKLk5550HKjMygAv58XqFh9Ls21Oc@public.gmane.org>
  2007-05-31 18:08   ` [PATCH 1/9] KVM: Adds support for in-kernel mmio handlers Gregory Haskins
  2007-05-31 18:08   ` [PATCH 2/9] KVM: VMX - fix interrupt checking on light-exit Gregory Haskins
@ 2007-05-31 18:09   ` Gregory Haskins
       [not found]     ` <20070531180903.1810.87474.stgit-5CR4LY5GPkvLDviKLk5550HKjMygAv58XqFh9Ls21Oc@public.gmane.org>
  2007-05-31 18:09   ` [PATCH 4/9] KVM: Adds ability to preempt an executing VCPU Gregory Haskins
                     ` (8 subsequent siblings)
  11 siblings, 1 reply; 21+ messages in thread
From: Gregory Haskins @ 2007-05-31 18:09 UTC (permalink / raw)
  To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

The current code is geared towards using a user-mode (A)PIC.  This patch adds
an "irqdevice" abstraction, and implements a "userint" model to handle the
duties of the original code.  Later, we can develop other irqdevice models
to handle objects like the LAPIC, IOAPIC, i8259, etc., as appropriate.

Signed-off-by: Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
---

 drivers/kvm/Makefile    |    2 
 drivers/kvm/irqdevice.h |  176 +++++++++++++++++++++++++++++++++++++
 drivers/kvm/kvm.h       |   94 +++++++++++++++++++-
 drivers/kvm/kvm_main.c  |   58 +++++++++---
 drivers/kvm/svm.c       |  168 ++++++++++++++++++++++++++---------
 drivers/kvm/userint.c   |  223 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/kvm/vmx.c       |  171 +++++++++++++++++++++++++++---------
 7 files changed, 786 insertions(+), 106 deletions(-)

diff --git a/drivers/kvm/Makefile b/drivers/kvm/Makefile
index c0a789f..540afbc 100644
--- a/drivers/kvm/Makefile
+++ b/drivers/kvm/Makefile
@@ -2,7 +2,7 @@
 # Makefile for Kernel-based Virtual Machine module
 #
 
-kvm-objs := kvm_main.o mmu.o x86_emulate.o
+kvm-objs := kvm_main.o mmu.o x86_emulate.o userint.o
 obj-$(CONFIG_KVM) += kvm.o
 kvm-intel-objs = vmx.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/drivers/kvm/irqdevice.h b/drivers/kvm/irqdevice.h
new file mode 100644
index 0000000..097d179
--- /dev/null
+++ b/drivers/kvm/irqdevice.h
@@ -0,0 +1,176 @@
+/*
+ * Defines an interface for an abstract interrupt controller.  The model
+ * consists of a unit with an arbitrary number of input lines N (IRQ0-(N-1)),
+ * an arbitrary number of output lines (INTR) (LINT, EXTINT, NMI, etc), and
+ * methods for completing an interrupt-acknowledge cycle (INTA).  A particular
+ * implementation of this model will define various policies, such as
+ * irq-to-vector translation, INTA/auto-EOI policy, etc.
+ *
+ * In addition, the INTR callback mechanism allows the unit to be "wired" to
+ * an interruptible source in a very flexible manner. For instance, an
+ * irqdevice could have its INTR wired to a VCPU (ala LAPIC), or another
+ * interrupt controller (ala cascaded i8259s)
+ *
+ * Copyright (C) 2007 Novell
+ *
+ * Authors:
+ *   Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef __IRQDEVICE_H
+#define __IRQDEVICE_H
+
+struct kvm_irqdevice;
+
+typedef enum {
+	kvm_irqpin_localint,
+	kvm_irqpin_extint,
+	kvm_irqpin_smi,
+	kvm_irqpin_nmi,
+	kvm_irqpin_invalid, /* must always be last */
+} kvm_irqpin_t;
+
+
+struct kvm_irqsink {
+	void (*set_intr)(struct kvm_irqsink *this,
+			 struct kvm_irqdevice *dev,
+			 kvm_irqpin_t pin);
+
+	void *private;
+};
+
+#define KVM_IRQACKDATA_VECTOR_VALID   (1 << 0)
+#define KVM_IRQACKDATA_VECTOR_PENDING (1 << 1)
+
+#define KVM_IRQACK_FLAG_PEEK          (1 << 0)
+
+struct kvm_irqack_data {
+	int flags;
+	int vector;
+};
+
+struct kvm_irqdevice {
+	int  (*ack)(struct kvm_irqdevice *this, int flags,
+		    struct kvm_irqack_data *data);
+	int  (*set_pin)(struct kvm_irqdevice *this, int pin, int level);
+	void (*destructor)(struct kvm_irqdevice *this);
+
+	void               *private;
+	struct kvm_irqsink  sink;
+};
+
+/**
+ * kvm_irqdevice_init - initialize the kvm_irqdevice for use
+ * @dev: The device
+ *
+ * Description: Initialize the kvm_irqdevice for use.  Should be called before
+ *              calling any derived implementation init functions
+ *
+ * Returns: (void)
+ */
+static inline void kvm_irqdevice_init(struct kvm_irqdevice *dev)
+{
+	memset(dev, 0, sizeof(*dev));
+}
+
+/**
+ * kvm_irqdevice_ack - read and ack the highest priority vector from the device
+ * @dev: The device
+ * @flags: Modifies default behavior
+ *           [ KVM_IRQACK_FLAG_PEEK - Dont ack vector, just check status ]
+ * @data: A pointer to a kvm_irqack_data structure to hold the result
+ *
+ * Description: Read the highest priority pending vector from the device,
+ *              potentially invoking auto-EOI depending on device policy
+ *
+ *              Successful return indicates that the *data* structure is valid
+ *
+ *               data.flags -
+ *                  [KVM_IRQACKDATA_VECTOR_VALID - data.vector is valid]
+ *                  [KVM_IRQACKDATA_VECTOR_PENDING - more vectors are pending]
+ *
+ * Returns: (int)
+ *   [-1 = failure]
+ *   [ 0 = success]
+ */
+static inline int kvm_irqdevice_ack(struct kvm_irqdevice *dev, int flags,
+				    struct kvm_irqack_data *data)
+{
+	return dev->ack(dev, flags, data);
+}
+
+/**
+ * kvm_irqdevice_set_pin - allows the caller to assert/deassert an IRQ
+ * @dev: The device
+ * @pin: The input pin to alter
+ * @level: The value to set (1 = assert, 0 = deassert)
+ *
+ * Description: Allows the caller to assert/deassert an IRQ input pin to the
+ *              device according to device policy.
+ *
+ * Returns: (int)
+ *   [-1 = failure]
+ *   [ 0 = success]
+ */
+static inline int kvm_irqdevice_set_pin(struct kvm_irqdevice *dev, int pin,
+				  int level)
+{
+	return dev->set_pin(dev, pin, level);
+}
+
+/**
+ * kvm_irqdevice_register_sink - registers an kvm_irqsink object
+ * @dev: The device
+ * @sink: The sink to register.  Data will be copied so building object from
+ *        transient storage is ok.
+ *
+ * Description: Registers an kvm_irqsink object as an INTR callback
+ *
+ * Returns: (void)
+ */
+static inline void kvm_irqdevice_register_sink(struct kvm_irqdevice *dev,
+					       const struct kvm_irqsink *sink)
+{
+	dev->sink = *sink;
+}
+
+/**
+ * kvm_irqdevice_destructor - destroys an irqdevice
+ * @dev: The device
+ *
+ * Returns: (void)
+ */
+static inline void kvm_irqdevice_destructor(struct kvm_irqdevice *dev)
+{
+	dev->destructor(dev);
+}
+
+/**
+ * kvm_irqdevice_set_intr - invokes a registered INTR callback
+ * @dev: The device
+ * @pin: Identifies the pin to alter -
+ *           [ KVM_IRQPIN_LOCALINT (default) - an vector is pending on this
+ *                                             device]
+ *           [ KVM_IRQPIN_EXTINT - a vector is pending on an external device]
+ *           [ KVM_IRQPIN_SMI - system-management-interrupt pin]
+ *           [ KVM_IRQPIN_NMI - non-maskable-interrupt pin
+ *
+ * Description: Invokes a registered INTR callback (if present).  This
+ *              function is meant to be used privately by a irqdevice
+ *              implementation.
+ *
+ * Returns: (void)
+ */
+static inline void kvm_irqdevice_set_intr(struct kvm_irqdevice *dev,
+					  kvm_irqpin_t pin)
+{
+	struct kvm_irqsink *sink = &dev->sink;
+	if (sink->set_intr)
+		sink->set_intr(sink, dev, pin);
+}
+
+#endif /*  __IRQDEVICE_H */
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index 1aa20ff..78025c3 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -15,6 +15,7 @@
 #include <linux/mm.h>
 
 #include "vmx.h"
+#include "irqdevice.h"
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
 
@@ -162,6 +163,11 @@ struct vmcs {
 
 struct kvm_vcpu;
 
+int kvm_user_irqdev_init(struct kvm_irqdevice *dev);
+int kvm_user_irqdev_save(struct kvm_irqdevice *this, void *data);
+int kvm_user_irqdev_restore(struct kvm_irqdevice *this, void *data);
+int kvm_userint_init(struct kvm_vcpu *vcpu);
+
 /*
  * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
  * 32-bit).  The kvm_mmu structure abstracts the details of the current mmu
@@ -318,6 +324,18 @@ struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
 void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
 			     struct kvm_io_device *dev);
 
+#define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long)
+
+/*
+ * structure for maintaining info for interrupting an executing VCPU
+ */
+struct kvm_vcpu_irq {
+	spinlock_t           lock;
+	struct kvm_irqdevice dev;
+	int                  pending;
+	int                  deferred;
+};
+
 struct kvm_vcpu {
 	struct kvm *kvm;
 	union {
@@ -330,9 +348,7 @@ struct kvm_vcpu {
 	u64 host_tsc;
 	struct kvm_run *run;
 	int interrupt_window_open;
-	unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
-#define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long)
-	unsigned long irq_pending[NR_IRQ_WORDS];
+	struct kvm_vcpu_irq irq;
 	unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */
 	unsigned long rip;      /* needs vcpu_load_rsp_rip() */
 
@@ -410,6 +426,78 @@ struct kvm_vcpu {
 	struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES];
 };
 
+/*
+ * These two functions are helpers for determining if a standard interrupt
+ * is pending to replace the old "if (vcpu->irq_summary)" logic.
+ */
+
+/*
+ * Assumes lock already held
+ */
+static inline int __kvm_vcpu_irq_pending(struct kvm_vcpu *vcpu)
+{
+	int pending = vcpu->irq.pending;
+
+	if (vcpu->irq.deferred != -1)
+		__set_bit(kvm_irqpin_localint, &pending);
+
+	return pending;
+}
+
+static inline int kvm_vcpu_irq_pending(struct kvm_vcpu *vcpu)
+{
+	int ret = 0;
+	int flags;
+
+	spin_lock_irqsave(&vcpu->irq.lock, flags);
+	ret = __kvm_vcpu_irq_pending(vcpu);
+	spin_unlock_irqrestore(&vcpu->irq.lock, flags);
+
+	return ret;
+}
+
+/*
+ * Assumes lock already held
+ */
+static inline int kvm_vcpu_irq_pop(struct kvm_vcpu *vcpu,
+				   struct kvm_irqack_data *data)
+{
+	int ret = 0;
+
+	if (vcpu->irq.deferred != -1) {
+		ret = kvm_irqdevice_ack(&vcpu->irq.dev, KVM_IRQACK_FLAG_PEEK,
+					data);
+		data->flags |= KVM_IRQACKDATA_VECTOR_VALID;
+		data->vector = vcpu->irq.deferred;
+		vcpu->irq.deferred = -1;
+	} else
+		ret = kvm_irqdevice_ack(&vcpu->irq.dev, 0, data);
+
+	/*
+	 * If there are no more interrupts we must clear the status flag
+	 */
+	if (!(data->flags & KVM_IRQACKDATA_VECTOR_PENDING))
+		__clear_bit(kvm_irqpin_localint, &vcpu->irq.pending);
+
+	return ret;
+}
+
+static inline void __kvm_vcpu_irq_push(struct kvm_vcpu *vcpu, int irq)
+{
+	BUG_ON(vcpu->irq.deferred != -1); /* We can only hold one deferred */
+
+	vcpu->irq.deferred = irq;
+}
+
+static inline void kvm_vcpu_irq_push(struct kvm_vcpu *vcpu, int irq)
+{
+	int flags;
+
+	spin_lock_irqsave(&vcpu->irq.lock, flags);
+	__kvm_vcpu_irq_push(vcpu, irq);
+	spin_unlock_irqrestore(&vcpu->irq.lock, flags);
+}
+
 struct kvm_mem_alias {
 	gfn_t base_gfn;
 	unsigned long npages;
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 008e898..dfab3f3 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -323,6 +323,11 @@ static struct kvm *kvm_create_vm(void)
 		struct kvm_vcpu *vcpu = &kvm->vcpus[i];
 
 		mutex_init(&vcpu->mutex);
+
+		memset(&vcpu->irq, 0, sizeof(vcpu->irq));
+		spin_lock_init(&vcpu->irq.lock);
+		vcpu->irq.deferred = -1;
+
 		vcpu->cpu = -1;
 		vcpu->kvm = kvm;
 		vcpu->mmu.root_hpa = INVALID_PAGE;
@@ -389,6 +394,7 @@ static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
 	vcpu_load(vcpu);
 	kvm_mmu_destroy(vcpu);
 	vcpu_put(vcpu);
+	kvm_irqdevice_destructor(&vcpu->irq.dev);
 	kvm_arch_ops->vcpu_free(vcpu);
 	free_page((unsigned long)vcpu->run);
 	vcpu->run = NULL;
@@ -2008,8 +2014,7 @@ static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 	sregs->efer = vcpu->shadow_efer;
 	sregs->apic_base = vcpu->apic_base;
 
-	memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
-	       sizeof sregs->interrupt_bitmap);
+	kvm_user_irqdev_save(&vcpu->irq.dev, &sregs->interrupt_bitmap);
 
 	vcpu_put(vcpu);
 
@@ -2026,7 +2031,6 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 				    struct kvm_sregs *sregs)
 {
 	int mmu_reset_needed = 0;
-	int i;
 	struct descriptor_table dt;
 
 	vcpu_load(vcpu);
@@ -2063,12 +2067,8 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	if (mmu_reset_needed)
 		kvm_mmu_reset_context(vcpu);
 
-	memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
-	       sizeof vcpu->irq_pending);
-	vcpu->irq_summary = 0;
-	for (i = 0; i < NR_IRQ_WORDS; ++i)
-		if (vcpu->irq_pending[i])
-			__set_bit(i, &vcpu->irq_summary);
+	kvm_user_irqdev_restore(&vcpu->irq.dev,
+				&sregs->interrupt_bitmap[0]);
 
 	set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
 	set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
@@ -2229,14 +2229,8 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 {
 	if (irq->irq < 0 || irq->irq >= 256)
 		return -EINVAL;
-	vcpu_load(vcpu);
-
-	set_bit(irq->irq, vcpu->irq_pending);
-	set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
 
-	vcpu_put(vcpu);
-
-	return 0;
+	return kvm_irqdevice_set_pin(&vcpu->irq.dev, irq->irq, 1);
 }
 
 static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
@@ -2338,6 +2332,32 @@ out1:
 }
 
 /*
+ * This function will be invoked whenever the vcpu->irq.dev raises its INTR
+ * line
+ */
+static void kvm_vcpu_intr(struct kvm_irqsink *this,
+			  struct kvm_irqdevice *dev,
+			  kvm_irqpin_t pin)
+{
+	struct kvm_vcpu *vcpu = (struct kvm_vcpu*)this->private;
+	unsigned long flags;
+
+	spin_lock_irqsave(&vcpu->irq.lock, flags);
+	__set_bit(pin, &vcpu->irq.pending);
+	spin_unlock_irqrestore(&vcpu->irq.lock, flags);
+}
+
+static void kvm_vcpu_irqsink_init(struct kvm_vcpu *vcpu)
+{
+	struct kvm_irqsink sink = {
+		.set_intr   = kvm_vcpu_intr,
+		.private    = vcpu
+	};
+
+	kvm_irqdevice_register_sink(&vcpu->irq.dev, &sink);
+}
+
+/*
  * Creates some virtual cpus.  Good luck creating more than one.
  */
 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
@@ -2384,6 +2404,12 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
 	if (r < 0)
 		goto out_free_vcpus;
 
+	kvm_irqdevice_init(&vcpu->irq.dev);
+	kvm_vcpu_irqsink_init(vcpu);
+	r = kvm_userint_init(vcpu);
+	if (r < 0)
+		goto out_free_vcpus;
+
 	kvm_arch_ops->vcpu_load(vcpu);
 	r = kvm_mmu_setup(vcpu);
 	if (r >= 0)
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
index b621403..8395662 100644
--- a/drivers/kvm/svm.c
+++ b/drivers/kvm/svm.c
@@ -106,24 +106,6 @@ static unsigned get_addr_size(struct kvm_vcpu *vcpu)
 				(cs_attrib & SVM_SELECTOR_DB_MASK) ? 4 : 2;
 }
 
-static inline u8 pop_irq(struct kvm_vcpu *vcpu)
-{
-	int word_index = __ffs(vcpu->irq_summary);
-	int bit_index = __ffs(vcpu->irq_pending[word_index]);
-	int irq = word_index * BITS_PER_LONG + bit_index;
-
-	clear_bit(bit_index, &vcpu->irq_pending[word_index]);
-	if (!vcpu->irq_pending[word_index])
-		clear_bit(word_index, &vcpu->irq_summary);
-	return irq;
-}
-
-static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
-{
-	set_bit(irq, vcpu->irq_pending);
-	set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
-}
-
 static inline void clgi(void)
 {
 	asm volatile (SVM_CLGI);
@@ -904,7 +886,12 @@ static int pf_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	int r;
 
 	if (is_external_interrupt(exit_int_info))
-		push_irq(vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
+		/*
+		 * An exception was taken while we were trying to inject an
+		 * IRQ.  We must defer the injection of the vector until
+		 * the next window.
+		 */
+		kvm_vcpu_irq_push(vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
 
 	spin_lock(&vcpu->kvm->lock);
 
@@ -1114,7 +1101,7 @@ static int halt_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 1;
 	skip_emulated_instruction(vcpu);
-	if (vcpu->irq_summary)
+	if (kvm_vcpu_irq_pending(vcpu))
 		return 1;
 
 	kvm_run->exit_reason = KVM_EXIT_HLT;
@@ -1285,7 +1272,7 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu,
 	 * possible
 	 */
 	if (kvm_run->request_interrupt_window &&
-	    !vcpu->irq_summary) {
+	    !kvm_vcpu_irq_pending(vcpu)) {
 		++vcpu->stat.irq_window_exits;
 		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
 		return 0;
@@ -1384,60 +1371,143 @@ static void pre_svm_run(struct kvm_vcpu *vcpu)
 }
 
 
-static inline void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
-{
-	struct vmcb_control_area *control;
-
-	control = &vcpu->svm->vmcb->control;
-	control->int_vector = pop_irq(vcpu);
-	control->int_ctl &= ~V_INTR_PRIO_MASK;
-	control->int_ctl |= V_IRQ_MASK |
-		((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
-}
-
 static void kvm_reput_irq(struct kvm_vcpu *vcpu)
 {
 	struct vmcb_control_area *control = &vcpu->svm->vmcb->control;
 
 	if (control->int_ctl & V_IRQ_MASK) {
 		control->int_ctl &= ~V_IRQ_MASK;
-		push_irq(vcpu, control->int_vector);
+		kvm_vcpu_irq_push(vcpu, control->int_vector);
 	}
 
 	vcpu->interrupt_window_open =
 		!(control->int_state & SVM_INTERRUPT_SHADOW_MASK);
 }
 
-static void do_interrupt_requests(struct kvm_vcpu *vcpu,
-				       struct kvm_run *kvm_run)
+static void do_intr_requests(struct kvm_vcpu *vcpu,
+			    struct kvm_run *kvm_run,
+			    kvm_irqpin_t pin)
 {
 	struct vmcb_control_area *control = &vcpu->svm->vmcb->control;
+	int pending = 0;
 
 	vcpu->interrupt_window_open =
 		(!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
 		 (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF));
 
-	if (vcpu->interrupt_window_open && vcpu->irq_summary)
+	if (vcpu->interrupt_window_open) {
 		/*
-		 * If interrupts enabled, and not blocked by sti or mov ss. Good.
+		 * If interrupts enabled, and not blocked by sti or mov ss.
+		 * Good.
 		 */
-		kvm_do_inject_irq(vcpu);
+		struct kvm_irqack_data ack;
+		int r = 0;
+
+		memset(&ack, 0, sizeof(ack));
+
+		switch (pin) {
+		case kvm_irqpin_localint:
+			r = kvm_vcpu_irq_pop(vcpu, &ack);
+			break;
+		case kvm_irqpin_extint:
+			printk(KERN_WARNING "KVM: external-interrupts not " \
+			       "handled yet\n");
+			__clear_bit(pin, &vcpu->irq.pending);
+			break;
+		case kvm_irqpin_nmi:
+			/*
+			 * FIXME: Someday we will handle this using the
+			 * specific SVN NMI features.  For now, just inject
+			 * the NMI as a standard interrupt on vector 2
+			 */
+			ack.flags |= KVM_IRQACKDATA_VECTOR_VALID;
+			ack.vector = 2;
+			__clear_bit(pin, &vcpu->irq.pending);
+			break;
+		default:
+			panic("KVM: unknown interrupt pin raised: %d\n", pin);
+			break;
+		}
+
+		BUG_ON(r < 0);
+
+		if (ack.flags & KVM_IRQACKDATA_VECTOR_VALID) {
+			control = &vcpu->svm->vmcb->control;
+			control->int_vector = ack.vector;
+			control->int_ctl &= ~V_INTR_PRIO_MASK;
+			control->int_ctl |= V_IRQ_MASK |
+				((/*control->int_vector >> 4*/ 0xf) <<
+				 V_INTR_PRIO_SHIFT);
+		}
+	}
 
 	/*
-	 * Interrupts blocked.  Wait for unblock.
+	 * Re-read the pending interrupt state.  If anything is still
+	 * pending we need to cause an exit on the next window
 	 */
-	if (!vcpu->interrupt_window_open &&
-	    (vcpu->irq_summary || kvm_run->request_interrupt_window)) {
+	pending = __kvm_vcpu_irq_pending(vcpu);
+
+	if (test_bit(pin, &pending))
+		/*
+		 * Trigger a VMEXIT on the next IRQ window
+		 */
 		control->intercept |= 1ULL << INTERCEPT_VINTR;
-	} else
+}
+
+static void clear_pending_controls(struct kvm_vcpu *vcpu,
+				  struct kvm_run *kvm_run)
+{
+	struct vmcb_control_area *control = &vcpu->svm->vmcb->control;
+
+	if (kvm_run->request_interrupt_window)
+		control->intercept |= 1ULL << INTERCEPT_VINTR;
+	else
 		control->intercept &= ~(1ULL << INTERCEPT_VINTR);
 }
 
+static void do_interrupt_requests(struct kvm_vcpu *vcpu,
+				  struct kvm_run *kvm_run)
+{
+	int pending = __kvm_vcpu_irq_pending(vcpu);
+
+	clear_pending_controls(vcpu, kvm_run);
+
+	while (pending) {
+		kvm_irqpin_t pin = __fls(pending);
+
+		switch (pin) {
+		case kvm_irqpin_localint:
+		case kvm_irqpin_extint:
+		case kvm_irqpin_nmi:
+			do_intr_requests(vcpu, kvm_run, pin);
+			break;
+		case kvm_irqpin_smi:
+			/* ignored (for now) */
+			printk(KERN_WARNING "KVM: dropping unhandled SMI\n");
+			__clear_bit(pin, &vcpu->irq.pending);
+			break;
+		case kvm_irqpin_invalid:
+			/* drop */
+			break;
+		default:
+			panic("KVM: unknown interrupt pin raised: %d\n", pin);
+			break;
+		}
+
+		__clear_bit(pin, &pending);
+	}
+}
+
 static void post_kvm_run_save(struct kvm_vcpu *vcpu,
 			      struct kvm_run *kvm_run)
 {
-	kvm_run->ready_for_interrupt_injection = (vcpu->interrupt_window_open &&
-						  vcpu->irq_summary == 0);
+	struct kvm_irqack_data ack;
+	kvm_irqdevice_ack(&vcpu->irq.dev, KVM_IRQACK_FLAG_PEEK, &ack);
+
+	kvm_run->ready_for_interrupt_injection =
+		(vcpu->interrupt_window_open &&
+		 !kvm_vcpu_irq_pending(vcpu) &&
+		 !(ack.flags & KVM_IRQACKDATA_NEXT_VALID));
 	kvm_run->if_flag = (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF) != 0;
 	kvm_run->cr8 = vcpu->cr8;
 	kvm_run->apic_base = vcpu->apic_base;
@@ -1452,7 +1522,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
 					  struct kvm_run *kvm_run)
 {
-	return (!vcpu->irq_summary &&
+	return (!kvm_vcpu_irq_pending(vcpu) &&
 		kvm_run->request_interrupt_window &&
 		vcpu->interrupt_window_open &&
 		(vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF));
@@ -1482,9 +1552,17 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	int r;
 
 again:
+	spin_lock(&vcpu->irq.lock);
+
+	/*
+	 * We must inject interrupts (if any) while the irq_lock
+	 * is held
+	 */
 	if (!vcpu->mmio_read_completed)
 		do_interrupt_requests(vcpu, kvm_run);
 
+	spin_unlock(&vcpu->irq.lock);
+
 	clgi();
 
 	pre_svm_run(vcpu);
diff --git a/drivers/kvm/userint.c b/drivers/kvm/userint.c
new file mode 100644
index 0000000..a60707d
--- /dev/null
+++ b/drivers/kvm/userint.c
@@ -0,0 +1,223 @@
+/*
+ * User Interrupts IRQ device
+ *
+ * This acts as an extension of an interrupt controller that exists elsewhere
+ * (typically in userspace/QEMU).  Because this PIC is a pseudo device that
+ * is downstream from a real emulated PIC, the "IRQ-to-vector" mapping has
+ * already occurred.  Therefore, this PIC has the following unusual properties:
+ *
+ * 1) It has 256 "pins" which are literal vectors (i.e. no translation)
+ * 2) It only supports "auto-EOI" behavior since it is expected that the
+ *    upstream emulated PIC will handle the real EOIs (if applicable)
+ * 3) It only listens to "asserts" on the pins (deasserts are dropped)
+ *    because it's an auto-EOI device anyway.
+ *
+ * Copyright (C) 2007 Novell
+ *
+ * bitarray code based on original vcpu->irq_pending code,
+ *     Copyright (C) 2007 Qumranet
+ *
+ * Authors:
+ *   Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "kvm.h"
+
+/*
+ *----------------------------------------------------------------------
+ * optimized bitarray object - works like bitarrays in bitops, but uses
+ * a summary field to accelerate lookups.  Assumes external locking
+ *---------------------------------------------------------------------
+ */
+
+struct bitarray {
+	unsigned long summary; /* 1 per word in pending */
+	unsigned long pending[NR_IRQ_WORDS];
+};
+
+static inline int bitarray_pending(struct bitarray *this)
+{
+	return this->summary ? 1 : 0;
+}
+
+static inline int bitarray_findhighest(struct bitarray *this)
+{
+	if (!this->summary)
+		return -1;
+	else {
+		int word_index = __fls(this->summary);
+		int bit_index  = __fls(this->pending[word_index]);
+
+		return word_index * BITS_PER_LONG + bit_index;
+	}
+}
+
+static inline void bitarray_set(struct bitarray *this, int nr)
+{
+	__set_bit(nr, &this->pending);
+	__set_bit(nr / BITS_PER_LONG, &this->summary);
+}
+
+static inline void bitarray_clear(struct bitarray *this, int nr)
+{
+	int word = nr / BITS_PER_LONG;
+
+	__clear_bit(nr, &this->pending);
+	if (!this->pending[word])
+		__clear_bit(word, &this->summary);
+}
+
+static inline int bitarray_test(struct bitarray *this, int nr)
+{
+	return test_bit(nr, &this->pending);
+}
+
+static inline int bitarray_test_and_set(struct bitarray *this, int nr, int val)
+{
+	if (bitarray_test(this, nr) != val) {
+		if (val)
+			bitarray_set(this, nr);
+		else
+			bitarray_clear(this, nr);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ *----------------------------------------------------------------------
+ * userint interface - provides the actual kvm_irqdevice implementation
+ *---------------------------------------------------------------------
+ */
+
+struct kvm_user_irqdev {
+	spinlock_t      lock;
+	atomic_t        ref_count;
+	struct bitarray pending;
+};
+
+static int user_irqdev_ack(struct kvm_irqdevice *this, int flags,
+			   struct kvm_irqack_data *data)
+{
+	struct kvm_user_irqdev *s = (struct kvm_user_irqdev*)this->private;
+
+	spin_lock(&s->lock);
+
+	if (!(flags & KVM_IRQACK_FLAG_PEEK)) {
+		int irq = bitarray_findhighest(&s->pending);
+
+		if (irq > -1) {
+			/*
+			 * Automatically clear the interrupt as the EOI
+			 * mechanism (if any) will take place in userspace
+			 */
+			bitarray_clear(&s->pending, irq);
+
+			data->flags |= KVM_IRQACKDATA_VECTOR_VALID;
+		}
+
+		data->vector = irq;
+	}
+
+	if (bitarray_pending(&s->pending))
+		data->flags |= KVM_IRQACKDATA_VECTOR_PENDING;
+
+	spin_unlock(&s->lock);
+
+	return 0;
+}
+
+static int user_irqdev_set_pin(struct kvm_irqdevice *this, int irq, int level)
+{
+	struct kvm_user_irqdev *s = (struct kvm_user_irqdev*)this->private;
+	int forward = 0;
+
+	spin_lock(&s->lock);
+	forward = bitarray_test_and_set(&s->pending, irq, level);
+	spin_unlock(&s->lock);
+
+	/*
+	 * alert the higher layer software we have changes
+	 */
+	if (forward)
+		kvm_irqdevice_set_intr(this, kvm_irqpin_localint);
+
+	return 0;
+}
+
+static void user_irqdev_destructor(struct kvm_irqdevice *this)
+{
+	struct kvm_user_irqdev *s = (struct kvm_user_irqdev*)this->private;
+
+	if (atomic_dec_and_test(&s->ref_count))
+		kfree(s);
+}
+
+int kvm_user_irqdev_init(struct kvm_irqdevice *irqdev)
+{
+	struct kvm_user_irqdev *s;
+
+	s = kzalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	spin_lock_init(&s->lock);
+
+	irqdev->ack         = user_irqdev_ack;
+	irqdev->set_pin     = user_irqdev_set_pin;
+	irqdev->destructor  = user_irqdev_destructor;
+
+	irqdev->private = s;
+	atomic_inc(&s->ref_count);
+
+	return 0;
+}
+
+int kvm_user_irqdev_save(struct kvm_irqdevice *this, void *data)
+{
+	struct kvm_user_irqdev *s = (struct kvm_user_irqdev*)this->private;
+
+	spin_lock(&s->lock);
+	memcpy(data, s->pending.pending, sizeof s->pending.pending);
+	spin_unlock(&s->lock);
+
+	return 0;
+}
+
+int kvm_user_irqdev_restore(struct kvm_irqdevice *this, void *data)
+{
+	struct kvm_user_irqdev *s = (struct kvm_user_irqdev*)this->private;
+	int i;
+	int forward = 0;
+
+	spin_lock(&s->lock);
+
+	/*
+	 * walk the interrupt-bitmap and inject an IRQ for each bit found
+	 */
+	for (i = 0; i < 256; ++i) {
+		int val  = test_bit(i, data);
+		forward |= bitarray_test_and_set(&s->pending, i, val);
+	}
+
+	spin_unlock(&s->lock);
+
+	/*
+	 * alert the higher layer software we have changes
+	 */
+	if (forward)
+		kvm_irqdevice_set_intr(this, kvm_irqpin_localint);
+
+	return 0;
+}
+
+int kvm_userint_init(struct kvm_vcpu *vcpu)
+{
+	return kvm_user_irqdev_init(&vcpu->irq.dev);
+}
+
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 3411813..6c56ac0 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -1454,52 +1454,124 @@ static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq)
 	vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
 }
 
-static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
+static void do_intr_requests(struct kvm_vcpu *vcpu,
+			    struct kvm_run *kvm_run,
+			    kvm_irqpin_t pin)
 {
-	int word_index = __ffs(vcpu->irq_summary);
-	int bit_index = __ffs(vcpu->irq_pending[word_index]);
-	int irq = word_index * BITS_PER_LONG + bit_index;
-
-	clear_bit(bit_index, &vcpu->irq_pending[word_index]);
-	if (!vcpu->irq_pending[word_index])
-		clear_bit(word_index, &vcpu->irq_summary);
-
-	if (vcpu->rmode.active) {
-		inject_rmode_irq(vcpu, irq);
-		return;
-	}
-	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-			irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
-}
-
-
-static void do_interrupt_requests(struct kvm_vcpu *vcpu,
-				       struct kvm_run *kvm_run)
-{
-	u32 cpu_based_vm_exec_control;
+	int pending = 0;
 
 	vcpu->interrupt_window_open =
 		((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
 		 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
 
 	if (vcpu->interrupt_window_open &&
-	    vcpu->irq_summary &&
-	    !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
+	    !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) {
 		/*
-		 * If interrupts enabled, and not blocked by sti or mov ss. Good.
+		 * If interrupts enabled, and not blocked by sti or mov ss.
+		 * Good.
 		 */
-		kvm_do_inject_irq(vcpu);
+		struct kvm_irqack_data ack;
+		int r = 0;
+
+		memset(&ack, 0, sizeof(ack));
+
+		switch (pin) {
+		case kvm_irqpin_localint:
+			r = kvm_vcpu_irq_pop(vcpu, &ack);
+			break;
+		case kvm_irqpin_extint:
+			printk(KERN_WARNING "KVM: external-interrupts not " \
+			       "handled yet\n");
+			__clear_bit(pin, &vcpu->irq.pending);
+			break;
+		case kvm_irqpin_nmi:
+			/*
+			 * FIXME: Someday we will handle this using the
+			 * specific VMX NMI features.  For now, just inject
+			 * the NMI as a standard interrupt on vector 2
+			 */
+			ack.flags |= KVM_IRQACKDATA_VECTOR_VALID;
+			ack.vector = 2;
+			__clear_bit(pin, &vcpu->irq.pending);
+			break;
+		default:
+			panic("KVM: unknown interrupt pin raised: %d\n", pin);
+			break;
+		}
+
+		BUG_ON(r < 0);
 
-	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-	if (!vcpu->interrupt_window_open &&
-	    (vcpu->irq_summary || kvm_run->request_interrupt_window))
+		if (ack.flags & KVM_IRQACKDATA_VECTOR_VALID) {
+			if (vcpu->rmode.active)
+				inject_rmode_irq(vcpu, ack.vector);
+			else
+				vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+					     ack.vector |
+					     INTR_TYPE_EXT_INTR |
+					     INTR_INFO_VALID_MASK);
+		}
+	}
+
+	/*
+	 * Re-read the pending interrupt state.  If anything is still
+	 * pending we need to cause an exit on the next window
+	 */
+	pending = __kvm_vcpu_irq_pending(vcpu);
+
+	if (test_bit(pin, &pending) || kvm_run->request_interrupt_window) {
 		/*
-		 * Interrupts blocked.  Wait for unblock.
+		 * Trigger a VMEXIT on the next IRQ window
 		 */
-		cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
-	else
-		cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
-	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+		u32 cbvec = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+		cbvec |= CPU_BASED_VIRTUAL_INTR_PENDING;
+		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cbvec);
+	}
+}
+
+static void clear_pending_controls(struct kvm_vcpu *vcpu,
+				  struct kvm_run *kvm_run)
+{
+	u32 cbvec = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ 
+ 	if (kvm_run->request_interrupt_window)
+ 		cbvec |= CPU_BASED_VIRTUAL_INTR_PENDING;
+ 	else
+ 		cbvec &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cbvec);
+}
+
+static void do_interrupt_requests(struct kvm_vcpu *vcpu,
+				  struct kvm_run *kvm_run)
+{
+	int pending = __kvm_vcpu_irq_pending(vcpu);
+
+	clear_pending_controls(vcpu, kvm_run);
+
+	while (pending) {
+		kvm_irqpin_t pin = __fls(pending);
+
+		switch (pin) {
+		case kvm_irqpin_localint:
+		case kvm_irqpin_extint:
+		case kvm_irqpin_nmi:
+			do_intr_requests(vcpu, kvm_run, pin);
+			break;
+		case kvm_irqpin_smi:
+			/* ignored (for now) */
+			printk(KERN_WARNING "KVM: dropping unhandled SMI\n");
+			__clear_bit(pin, &vcpu->irq.pending);
+			break;
+		case kvm_irqpin_invalid:
+			/* drop */
+			break;
+		default:
+			panic("KVM: unknown interrupt pin raised: %d\n", pin);
+			break;
+		}
+
+		__clear_bit(pin, &pending);
+	}
 }
 
 static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
@@ -1554,9 +1626,13 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	}
 
 	if (is_external_interrupt(vect_info)) {
+		/*
+		 * An exception was taken while we were trying to inject an
+		 * IRQ.  We must defer the injection of the vector until
+		 * the next window.
+		 */
 		int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
-		set_bit(irq, vcpu->irq_pending);
-		set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
+		kvm_vcpu_irq_push(vcpu, irq);
 	}
 
 	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */
@@ -1869,11 +1945,16 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 static void post_kvm_run_save(struct kvm_vcpu *vcpu,
 			      struct kvm_run *kvm_run)
 {
+ 	struct kvm_irqack_data ack;
+ 	kvm_irqdevice_ack(&vcpu->irq.dev, KVM_IRQACK_FLAG_PEEK, &ack);
+ 
 	kvm_run->if_flag = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) != 0;
 	kvm_run->cr8 = vcpu->cr8;
 	kvm_run->apic_base = vcpu->apic_base;
-	kvm_run->ready_for_interrupt_injection = (vcpu->interrupt_window_open &&
-						  vcpu->irq_summary == 0);
+	kvm_run->ready_for_interrupt_injection =
+		(vcpu->interrupt_window_open &&
+		 !kvm_vcpu_irq_pending(vcpu) &&
+		 !(ack.flags & KVM_IRQACKDATA_NEXT_VALID));
 }
 
 static int handle_interrupt_window(struct kvm_vcpu *vcpu,
@@ -1884,7 +1965,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
 	 * possible
 	 */
 	if (kvm_run->request_interrupt_window &&
-	    !vcpu->irq_summary) {
+	    !kvm_vcpu_irq_pending(vcpu)) {
 		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
 		++vcpu->stat.irq_window_exits;
 		return 0;
@@ -1895,7 +1976,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
 static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	skip_emulated_instruction(vcpu);
-	if (vcpu->irq_summary)
+	if (kvm_vcpu_irq_pending(vcpu))
 		return 1;
 
 	kvm_run->exit_reason = KVM_EXIT_HLT;
@@ -1965,7 +2046,7 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
 					  struct kvm_run *kvm_run)
 {
-	return (!vcpu->irq_summary &&
+	return (!kvm_vcpu_irq_pending(vcpu) &&
 		kvm_run->request_interrupt_window &&
 		vcpu->interrupt_window_open &&
 		(vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
@@ -1981,9 +2062,17 @@ preempted:
 		kvm_guest_debug_pre(vcpu);
 
 again:
+	spin_lock(&vcpu->irq.lock);
+
+	/*
+	 * We must inject interrupts (if any) while the irq.lock
+	 * is held
+	 */
 	if (!vcpu->mmio_read_completed)
 		do_interrupt_requests(vcpu, kvm_run);
 
+	spin_unlock(&vcpu->irq.lock);
+
 	vmx_save_host_state(vcpu);
 	kvm_load_guest_fpu(vcpu);
 


-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 4/9] KVM: Adds ability to preempt an executing VCPU
       [not found] ` <20070531180005.1810.23884.stgit-5CR4LY5GPkvLDviKLk5550HKjMygAv58XqFh9Ls21Oc@public.gmane.org>
                     ` (2 preceding siblings ...)
  2007-05-31 18:09   ` [PATCH 3/9] KVM: Add irqdevice object Gregory Haskins
@ 2007-05-31 18:09   ` Gregory Haskins
  2007-05-31 18:09   ` [PATCH 5/9] KVM: Add support for in-kernel LAPIC model Gregory Haskins
                     ` (7 subsequent siblings)
  11 siblings, 0 replies; 21+ messages in thread
From: Gregory Haskins @ 2007-05-31 18:09 UTC (permalink / raw)
  To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

The VCPU executes synchronously w.r.t. userspace today, and therefore
interrupt injection is pretty straightforward.  However, we will soon need
to be able to inject interrupts asynchronous to the execution of the VCPU
due to the introduction of SMP, paravirtualized drivers, and asynchronous
hypercalls.  This patch adds support to the interrupt mechanism to force
a VCPU to VMEXIT when a new interrupt is pending.

Signed-off-by: Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
---

 drivers/kvm/kvm.h      |    1 +
 drivers/kvm/kvm_main.c |   59 +++++++++++++++++++++++++++++++++++++++++++++---
 drivers/kvm/svm.c      |   49 ++++++++++++++++++++++++++++++++++------
 drivers/kvm/vmx.c      |   38 +++++++++++++++++++++++++++++++
 4 files changed, 136 insertions(+), 11 deletions(-)

diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index 78025c3..f84950c 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -334,6 +334,7 @@ struct kvm_vcpu_irq {
 	struct kvm_irqdevice dev;
 	int                  pending;
 	int                  deferred;
+	int                  guest_cpu;
 };
 
 struct kvm_vcpu {
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index dfab3f3..2957023 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -2332,6 +2332,20 @@ out1:
 }
 
 /*
+ * This function is invoked whenever we want to interrupt a vcpu that is
+ * currently executing in guest-mode.  It currently is a no-op because
+ * the simple delivery of the IPI to execute this function accomplishes our
+ * goal: To cause a VMEXIT.  We pass the vcpu (which contains the
+ * vcpu->irq.task, etc) for future use
+ */
+static void kvm_vcpu_guest_intr(void *info)
+{
+#ifdef NOT_YET
+	struct kvm_vcpu *vcpu = (struct kvm_vcpu*)info;
+#endif
+}
+
+/*
  * This function will be invoked whenever the vcpu->irq.dev raises its INTR
  * line
  */
@@ -2340,11 +2354,48 @@ static void kvm_vcpu_intr(struct kvm_irqsink *this,
 			  kvm_irqpin_t pin)
 {
 	struct kvm_vcpu *vcpu = (struct kvm_vcpu*)this->private;
-	unsigned long flags;
+	int direct_ipi = -1;
 
-	spin_lock_irqsave(&vcpu->irq.lock, flags);
-	__set_bit(pin, &vcpu->irq.pending);
-	spin_unlock_irqrestore(&vcpu->irq.lock, flags);
+	spin_lock_irq(&vcpu->irq.lock);
+
+	if (!test_bit(pin, &vcpu->irq.pending)) {
+		/*
+		 * Record the change..
+		 */
+		__set_bit(pin, &vcpu->irq.pending);
+
+		/*
+		 * then wake up the vcpu (if necessary)
+		 */
+		if (vcpu->irq.guest_cpu != -1) {
+			/*
+			 * If we are in guest mode, we must send a host-IPI
+			 * to the CPU which is running the guest to cause
+			 * a VMEXIT.
+			 */
+			direct_ipi = vcpu->irq.guest_cpu;
+			BUG_ON(direct_ipi == smp_processor_id());
+		}
+	}
+
+	spin_unlock_irq(&vcpu->irq.lock);
+
+	/*
+	 * we can safely send the IPI outside of the lock-scope because the
+	 * irq.pending has already been updated.  This code assumes that
+	 * userspace will not sleep on anything other than HLT instructions.
+	 * HLT is covered in a race-free way because irq.pending was updated
+	 * in the critical section, and handle_halt() checks if any
+	 * interrupts are pending before returning to userspace.
+	 *
+	 * If it turns out that userspace can sleep on conditions other than
+	 * HLT, this code will need to be enhanced to allow the irq.pending
+	 * flags to be exported to userspace
+	 */
+	if (direct_ipi != -1)
+		smp_call_function_single(direct_ipi,
+					 kvm_vcpu_guest_intr,
+					 vcpu, 0, 0);
 }
 
 static void kvm_vcpu_irqsink_init(struct kvm_vcpu *vcpu)
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
index 8395662..1b9d633 100644
--- a/drivers/kvm/svm.c
+++ b/drivers/kvm/svm.c
@@ -1552,9 +1552,36 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	int r;
 
 again:
+	clgi();
+
 	spin_lock(&vcpu->irq.lock);
 
 	/*
+	 * If there are any signals pending (virtual interrupt related or
+	 * otherwise), don't even bother trying to enter guest mode...
+	 */
+	if (signal_pending(current)) {
+		kvm_run->exit_reason = KVM_EXIT_INTR;
+		spin_unlock(&vcpu->irq.lock);
+		stgi();
+		r = -EINTR;
+		/*
+		 * FIXME: We probably want to move this whole lock-block below
+		 * the host->guest state loading so we don't restore when
+		 * the system was never saved to begin with
+		 */
+		goto out;
+	}
+
+	/*
+	 * There are optimizations we can make when signaling interrupts
+	 * if we know the VCPU is in GUEST mode, so record the guest's
+	 * CPU to both serve as an indicator of vcpu state and a target
+	 * for our interrupts
+	 */
+	vcpu->irq.guest_cpu = task_cpu(current);
+
+	/*
 	 * We must inject interrupts (if any) while the irq_lock
 	 * is held
 	 */
@@ -1563,8 +1590,6 @@ again:
 
 	spin_unlock(&vcpu->irq.lock);
 
-	clgi();
-
 	pre_svm_run(vcpu);
 
 	save_host_msrs(vcpu);
@@ -1724,6 +1749,13 @@ again:
 		profile_hit(KVM_PROFILING,
 			(void *)(unsigned long)vcpu->svm->vmcb->save.rip);
 
+	/*
+	 * Signal that we have transitioned back to host mode
+	 */
+	spin_lock(&vcpu->irq.lock);
+	vcpu->irq.guest_cpu = -1;
+	spin_unlock(&vcpu->irq.lock);
+
 	stgi();
 
 	kvm_reput_irq(vcpu);
@@ -1734,28 +1766,31 @@ again:
 		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		kvm_run->fail_entry.hardware_entry_failure_reason
 			= vcpu->svm->vmcb->control.exit_code;
-		post_kvm_run_save(vcpu, kvm_run);
-		return 0;
+		r = 0;
+		goto out;
 	}
 
 	r = handle_exit(vcpu, kvm_run);
 	if (r > 0) {
 		if (signal_pending(current)) {
 			++vcpu->stat.signal_exits;
-			post_kvm_run_save(vcpu, kvm_run);
 			kvm_run->exit_reason = KVM_EXIT_INTR;
-			return -EINTR;
+			r = -EINTR;
+			goto out;
 		}
 
 		if (dm_request_for_irq_injection(vcpu, kvm_run)) {
 			++vcpu->stat.request_irq_exits;
 			post_kvm_run_save(vcpu, kvm_run);
 			kvm_run->exit_reason = KVM_EXIT_INTR;
-			return -EINTR;
+			r = -EINTR;
+			goto out;
 		}
 		kvm_resched(vcpu);
 		goto again;
 	}
+
+ out:
 	post_kvm_run_save(vcpu, kvm_run);
 	return r;
 }
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 6c56ac0..7f2af92 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -2062,9 +2062,38 @@ preempted:
 		kvm_guest_debug_pre(vcpu);
 
 again:
+	/*
+	 * We disable interrupts until the next VMEXIT to eliminate a race
+	 * condition for delivery of virtual interrutps.  Note that this is
+	 * probably not as bad as it sounds, as interrupts will still invoke
+	 * a VMEXIT once transitioned to GUEST mode (and thus exit this lock
+	 * scope) even if they are disabled.
+	 */
+	local_irq_disable();
+
 	spin_lock(&vcpu->irq.lock);
 
 	/*
+	 * If there are any signals pending (virtual interrupt related or
+	 * otherwise), don't even bother trying to enter guest mode...
+	 */
+	if (signal_pending(current)) {
+		kvm_run->exit_reason = KVM_EXIT_INTR;
+		spin_unlock(&vcpu->irq.lock);
+		local_irq_enable();
+		r = -EINTR;
+		goto out;
+	}
+
+	/*
+	 * There are optimizations we can make when signaling interrupts
+	 * if we know the VCPU is in GUEST mode, so record the guest's
+	 * CPU to both serve as an indicator of vcpu state and a target
+	 * for our interrupts
+	 */
+	vcpu->irq.guest_cpu = task_cpu(current);
+
+	/*
 	 * We must inject interrupts (if any) while the irq.lock
 	 * is held
 	 */
@@ -2199,6 +2228,15 @@ again:
 		[cr2]"i"(offsetof(struct kvm_vcpu, cr2))
 	      : "cc", "memory" );
 
+	/*
+	 * Signal that we have transitioned back to host mode
+	 */
+	spin_lock(&vcpu->irq.lock);
+	vcpu->irq.guest_cpu = -1;
+	spin_unlock(&vcpu->irq.lock);
+
+	local_irq_enable();
+
 	++vcpu->stat.exits;
 
 	vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;


-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 5/9] KVM: Add support for in-kernel LAPIC model
       [not found] ` <20070531180005.1810.23884.stgit-5CR4LY5GPkvLDviKLk5550HKjMygAv58XqFh9Ls21Oc@public.gmane.org>
                     ` (3 preceding siblings ...)
  2007-05-31 18:09   ` [PATCH 4/9] KVM: Adds ability to preempt an executing VCPU Gregory Haskins
@ 2007-05-31 18:09   ` Gregory Haskins
  2007-05-31 18:09   ` [PATCH 6/9] KVM: Adds support for real NMI injection on VMX processors Gregory Haskins
                     ` (6 subsequent siblings)
  11 siblings, 0 replies; 21+ messages in thread
From: Gregory Haskins @ 2007-05-31 18:09 UTC (permalink / raw)
  To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Signed-off-by: Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
---

 drivers/kvm/Makefile   |    2 
 drivers/kvm/kernint.c  |  149 +++++
 drivers/kvm/kvm.h      |   35 +
 drivers/kvm/kvm_main.c |  198 ++++++-
 drivers/kvm/lapic.c    | 1418 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/kvm/svm.c      |   13 
 drivers/kvm/userint.c  |    8 
 drivers/kvm/vmx.c      |   16 -
 include/linux/kvm.h    |   15 +
 9 files changed, 1809 insertions(+), 45 deletions(-)

diff --git a/drivers/kvm/Makefile b/drivers/kvm/Makefile
index 540afbc..1aad737 100644
--- a/drivers/kvm/Makefile
+++ b/drivers/kvm/Makefile
@@ -2,7 +2,7 @@
 # Makefile for Kernel-based Virtual Machine module
 #
 
-kvm-objs := kvm_main.o mmu.o x86_emulate.o userint.o
+kvm-objs := kvm_main.o mmu.o x86_emulate.o userint.o lapic.o kernint.o
 obj-$(CONFIG_KVM) += kvm.o
 kvm-intel-objs = vmx.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/drivers/kvm/kernint.c b/drivers/kvm/kernint.c
new file mode 100644
index 0000000..b5cbcae
--- /dev/null
+++ b/drivers/kvm/kernint.c
@@ -0,0 +1,149 @@
+/*
+ * Kernel Interrupt IRQ device
+ *
+ * Provides a model for connecting in-kernel interrupt resources to a VCPU.
+ *
+ * A typical modern x86 processor has the concept of an internal Local-APIC
+ * and some external signal pins.  The way in which interrupts are injected is
+ * dependent on whether software enables the LAPIC or not.  When enabled,
+ * interrupts are acknowledged through the LAPIC.  Otherwise they are through
+ * an externally connected PIC (typically an i8259 on the BSP)
+ *
+ * Copyright (C) 2007 Novell
+ *
+ * Authors:
+ *   Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "kvm.h"
+
+struct kvm_kernint {
+	struct kvm_vcpu              *vcpu;
+	struct kvm_irqdevice         *self_irq;
+	struct kvm_irqdevice         *ext_irq;
+	struct kvm_irqdevice          apic_irq;
+
+};
+
+static struct kvm_irqdevice *get_irq_dev(struct kvm_kernint *s)
+{
+	struct kvm_irqdevice *dev;
+
+	if (kvm_lapic_enabled(s->vcpu))
+		dev = &s->apic_irq;
+	else
+		dev = s->ext_irq;
+
+	if (!dev)
+		kvm_crash_guest(s->vcpu->kvm);
+
+	return dev;
+}
+
+static int kernint_irqdev_ack(struct kvm_irqdevice *this, int flags,
+			      struct kvm_irqack_data *data)
+{
+	struct kvm_kernint *s = (struct kvm_kernint*)this->private;
+
+	return kvm_irqdevice_ack(get_irq_dev(s), flags, data);
+}
+
+static int kernint_irqdev_set_pin(struct kvm_irqdevice *this,
+				  int irq, int level)
+{
+	/* no-op */
+	return 0;
+}
+
+static void kernint_irqdev_destructor(struct kvm_irqdevice *this)
+{
+	struct kvm_kernint *s = (struct kvm_kernint*)this->private;
+
+	kvm_irqdevice_destructor(&s->apic_irq);
+	kvm_lapic_destroy(s->vcpu);
+	kfree(s);
+}
+
+static void kvm_apic_intr(struct kvm_irqsink *this,
+			  struct kvm_irqdevice *dev,
+			  kvm_irqpin_t pin)
+{
+	struct kvm_kernint *s = (struct kvm_kernint*)this->private;
+
+	/*
+	 * If the LAPIC sent us an interrupt it *must* be enabled,
+	 * just forward it on to the CPU
+	 */
+	kvm_irqdevice_set_intr(s->self_irq, pin);
+}
+
+static void kvm_ext_intr(struct kvm_irqsink *this,
+			 struct kvm_irqdevice *dev,
+			 kvm_irqpin_t pin)
+{
+	struct kvm_kernint *s = (struct kvm_kernint*)this->private;
+
+	/*
+	 * If the EXTINT device sent us an interrupt, forward it to the LINT0
+	 * pin of the LAPIC
+	 */
+	if (pin != kvm_irqpin_localint)
+		return;
+
+	/*
+	 * "irq 0" = LINT0, 1 = LINT1
+	 */
+	kvm_irqdevice_set_pin(&s->apic_irq, 0, 1);
+}
+
+int kvm_kernint_init(struct kvm_vcpu *vcpu)
+{
+	struct kvm_irqdevice *irqdev = &vcpu->irq.dev;
+	struct kvm_kernint *s;
+	struct kvm_irqsink apicsink;
+
+	s = kzalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	s->vcpu = vcpu;
+
+	/*
+	 * Configure the irqdevice interface
+	 */
+	irqdev->ack         = kernint_irqdev_ack;
+	irqdev->set_pin     = kernint_irqdev_set_pin;
+	irqdev->destructor  = kernint_irqdev_destructor;
+
+	irqdev->private = s;
+	s->self_irq = irqdev;
+
+	/*
+	 * Configure the EXTINT device if this is the BSP processor
+	 */
+	if (!vcpu_slot(vcpu)) {
+		struct kvm_irqsink extsink = {
+			.set_intr   = kvm_ext_intr,
+			.private    = s
+		};
+		s->ext_irq = &vcpu->kvm->isa_irq;
+		kvm_irqdevice_register_sink(s->ext_irq, &extsink);
+	}
+
+	/*
+	 * Configure the LAPIC device
+	 */
+	apicsink.set_intr = kvm_apic_intr;
+	apicsink.private  = s;
+
+	kvm_irqdevice_init(&s->apic_irq);
+	kvm_irqdevice_register_sink(&s->apic_irq, &apicsink);
+	kvm_lapic_init(vcpu, &s->apic_irq, 0);
+
+	return 0;
+}
+
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index f84950c..1f30274 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -167,6 +167,21 @@ int kvm_user_irqdev_init(struct kvm_irqdevice *dev);
 int kvm_user_irqdev_save(struct kvm_irqdevice *this, void *data);
 int kvm_user_irqdev_restore(struct kvm_irqdevice *this, void *data);
 int kvm_userint_init(struct kvm_vcpu *vcpu);
+int kvm_kernint_init(struct kvm_vcpu *vcpu);
+
+#define KVM_LAPIC_OPTION_USERMODE (1 << 0)
+
+int kvm_lapic_init(struct kvm_vcpu *vcpu, struct kvm_irqdevice *dev,
+		   int flags);
+void kvm_lapic_destroy(struct kvm_vcpu *vcpu);
+void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, u64 cr8);
+u64  kvm_lapic_get_tpr(struct kvm_vcpu *vcpu);
+void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 base);
+u64  kvm_lapic_get_base(struct kvm_vcpu *vcpu);
+void kvm_lapic_save(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
+void kvm_lapic_restore(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
+void kvm_lapic_reset(struct kvm_vcpu *vcpu);
+int  kvm_lapic_enabled(struct kvm_vcpu *vcpu);
 
 /*
  * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
@@ -337,6 +352,11 @@ struct kvm_vcpu_irq {
 	int                  guest_cpu;
 };
 
+struct kvm_lapic {
+	void                 *dev;
+	struct kvm_io_device *mmio;
+};
+
 struct kvm_vcpu {
 	struct kvm *kvm;
 	union {
@@ -350,6 +370,7 @@ struct kvm_vcpu {
 	struct kvm_run *run;
 	int interrupt_window_open;
 	struct kvm_vcpu_irq irq;
+	struct kvm_lapic apic;
 	unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */
 	unsigned long rip;      /* needs vcpu_load_rsp_rip() */
 
@@ -360,10 +381,8 @@ struct kvm_vcpu {
 	struct page *para_state_page;
 	gpa_t hypercall_gpa;
 	unsigned long cr4;
-	unsigned long cr8;
 	u64 pdptrs[4]; /* pae */
 	u64 shadow_efer;
-	u64 apic_base;
 	u64 ia32_misc_enable_msr;
 	int nmsrs;
 	int save_nmsrs;
@@ -532,6 +551,8 @@ struct kvm {
 	struct list_head vm_list;
 	struct file *filp;
 	struct kvm_io_bus mmio_bus;
+	int enable_kernel_pic;
+	struct kvm_irqdevice isa_irq;
 };
 
 struct descriptor_table {
@@ -606,6 +627,9 @@ void kvm_exit_arch(void);
 int kvm_mmu_module_init(void);
 void kvm_mmu_module_exit(void);
 
+int kvm_apicbus_send(struct kvm *kvm, int dest, int trig_mode, int level,
+		     int dest_mode, int delivery_mode, int vector);
+
 void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
 int kvm_mmu_create(struct kvm_vcpu *vcpu);
 int kvm_mmu_setup(struct kvm_vcpu *vcpu);
@@ -739,6 +763,13 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
 	return (struct kvm_mmu_page *)page_private(page);
 }
 
+static inline int vcpu_slot(struct kvm_vcpu *vcpu)
+{
+	return vcpu - vcpu->kvm->vcpus;
+}
+
+void kvm_crash_guest(struct kvm *kvm);
+
 static inline u16 read_fs(void)
 {
 	u16 seg;
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 2957023..2a1b376 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -319,6 +319,7 @@ static struct kvm *kvm_create_vm(void)
 	spin_lock_init(&kvm->lock);
 	INIT_LIST_HEAD(&kvm->active_mmu_pages);
 	kvm_io_bus_init(&kvm->mmio_bus);
+	kvm_irqdevice_init(&kvm->isa_irq);
 	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
 		struct kvm_vcpu *vcpu = &kvm->vcpus[i];
 
@@ -411,6 +412,23 @@ static void kvm_free_vcpus(struct kvm *kvm)
 		kvm_free_vcpu(&kvm->vcpus[i]);
 }
 
+/*
+ * This function kills a guest while there is still a user space process
+ * with a descriptor to it
+ */
+void kvm_crash_guest(struct kvm *kvm)
+{
+	unsigned int i;
+
+	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+		/*
+		 * FIXME: in the future it should send IPI to gracefully
+		 * stop the other vCPUs
+		 */
+		kvm_free_vcpu(&kvm->vcpus[i]);
+	}
+}
+
 static int kvm_dev_release(struct inode *inode, struct file *filp)
 {
 	return 0;
@@ -422,6 +440,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	list_del(&kvm->vm_list);
 	spin_unlock(&kvm_lock);
 	kvm_io_bus_destroy(&kvm->mmio_bus);
+	if (kvm->enable_kernel_pic)
+		kvm_irqdevice_destructor(&kvm->isa_irq);
 	kvm_free_vcpus(kvm);
 	kvm_free_physmem(kvm);
 	kfree(kvm);
@@ -627,7 +647,7 @@ void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 		inject_gp(vcpu);
 		return;
 	}
-	vcpu->cr8 = cr8;
+	kvm_lapic_set_tpr(vcpu, cr8);
 }
 EXPORT_SYMBOL_GPL(set_cr8);
 
@@ -928,6 +948,69 @@ out:
 	return r;
 }
 
+static int kvm_vm_ioctl_enable_kernel_pic(struct kvm *kvm, __u32 val)
+{
+	/*
+	 * FIXME: We should not allow this if VCPUs have already been created
+	 */
+	if (kvm->enable_kernel_pic)
+		return -EINVAL;
+
+	/*
+	 * Someday we may offer two levels of in-kernel PIC support:
+	 *
+	 *  level 0 = (default) compatiblity mode (everything in userspace)
+	 *  level 1 = LAPIC in kernel, IOAPIC/i8259 in userspace
+	 *  level 2 = All three in kernel
+	 *
+	 * For now we only support level 0 and 1.  However, you cant set
+	 * level 0
+	 */
+	if (val != 1)
+		return -EINVAL;
+
+	kvm->enable_kernel_pic = val;
+
+	printk(KERN_INFO "KVM: Setting in-kernel PIC level to %d\n", val);
+
+	/*
+	 * installing a user_irqdev model to the kvm->isa_irq device
+	 * creates a level-1 environment, where the userspace completely
+	 * controls the ISA domain interrupts in the IOAPIC/i8259.
+	 * Interrupts come down to the VCPU either as an ISA vector to
+	 * this controller, or as an APIC bus message (or both)
+	 */
+	kvm_user_irqdev_init(&kvm->isa_irq);
+
+	return 0;
+}
+
+static int kvm_vm_ioctl_isa_interrupt(struct kvm *kvm,
+				      struct kvm_interrupt *irq)
+{
+	if (irq->irq < 0 || irq->irq >= 256)
+		return -EINVAL;
+
+	if (!kvm->enable_kernel_pic)
+		return -EINVAL;
+
+	return kvm_irqdevice_set_pin(&kvm->isa_irq, irq->irq, 1);
+}
+
+static int kvm_vm_ioctl_apic_msg(struct kvm *kvm,
+				 struct kvm_apic_msg *msg)
+{
+	if (!kvm->enable_kernel_pic)
+		return -EINVAL;
+
+	msg->delivery_mode = (msg->delivery_mode << 8) & 0xF00;
+
+	kvm_apicbus_send(kvm, msg->dest, msg->trig_mode, 1, msg->dest_mode,
+			 msg->delivery_mode, msg->vector);
+
+	return 0;
+}
+
 static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
 {
 	int i;
@@ -1048,10 +1131,16 @@ static int emulator_write_std(unsigned long addr,
 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
 						gpa_t addr)
 {
+	struct kvm_io_device *dev = vcpu->apic.mmio;
+
+	/*
+	 * First check if the LAPIC will snarf this request
+	 */
+	if (dev && dev->in_range(dev, addr))
+		return dev;
+
 	/*
-	 * Note that its important to have this wrapper function because
-	 * in the very near future we will be checking for MMIOs against
-	 * the LAPIC as well as the general MMIO bus
+	 * And then fallback to allow any device to participate
 	 */
 	return kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
 }
@@ -1518,7 +1607,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 		data = 3;
 		break;
 	case MSR_IA32_APICBASE:
-		data = vcpu->apic_base;
+		data = kvm_lapic_get_base(vcpu);
 		break;
 	case MSR_IA32_MISC_ENABLE:
 		data = vcpu->ia32_misc_enable_msr;
@@ -1596,7 +1685,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	case 0x200 ... 0x2ff: /* MTRRs */
 		break;
 	case MSR_IA32_APICBASE:
-		vcpu->apic_base = data;
+		kvm_lapic_set_base(vcpu, data);
 		break;
 	case MSR_IA32_MISC_ENABLE:
 		vcpu->ia32_misc_enable_msr = data;
@@ -1860,8 +1949,9 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
-	/* re-sync apic's tpr */
-	vcpu->cr8 = kvm_run->cr8;
+	if (!vcpu->kvm->enable_kernel_pic)
+		/* re-sync apic's tpr if the APIC is in userspace */
+		kvm_lapic_set_tpr(vcpu, kvm_run->cr8);
 
 	if (vcpu->pio.cur_count) {
 		r = complete_pio(vcpu);
@@ -2010,11 +2100,12 @@ static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 	sregs->cr2 = vcpu->cr2;
 	sregs->cr3 = vcpu->cr3;
 	sregs->cr4 = vcpu->cr4;
-	sregs->cr8 = vcpu->cr8;
 	sregs->efer = vcpu->shadow_efer;
-	sregs->apic_base = vcpu->apic_base;
 
-	kvm_user_irqdev_save(&vcpu->irq.dev, &sregs->interrupt_bitmap);
+	kvm_lapic_save(vcpu, sregs);
+
+	if (!vcpu->kvm->enable_kernel_pic)
+		kvm_user_irqdev_save(&vcpu->irq.dev, &sregs->interrupt_bitmap);
 
 	vcpu_put(vcpu);
 
@@ -2046,14 +2137,10 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
 	vcpu->cr3 = sregs->cr3;
 
-	vcpu->cr8 = sregs->cr8;
-
 	mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
 #ifdef CONFIG_X86_64
 	kvm_arch_ops->set_efer(vcpu, sregs->efer);
 #endif
-	vcpu->apic_base = sregs->apic_base;
-
 	kvm_arch_ops->decache_cr4_guest_bits(vcpu);
 
 	mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
@@ -2067,8 +2154,11 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	if (mmu_reset_needed)
 		kvm_mmu_reset_context(vcpu);
 
-	kvm_user_irqdev_restore(&vcpu->irq.dev,
-				&sregs->interrupt_bitmap[0]);
+	kvm_lapic_restore(vcpu, sregs);
+
+	if (!vcpu->kvm->enable_kernel_pic)
+		kvm_user_irqdev_restore(&vcpu->irq.dev,
+					&sregs->interrupt_bitmap[0]);
 
 	set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
 	set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
@@ -2457,7 +2547,12 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
 
 	kvm_irqdevice_init(&vcpu->irq.dev);
 	kvm_vcpu_irqsink_init(vcpu);
-	r = kvm_userint_init(vcpu);
+
+	if (kvm->enable_kernel_pic)
+		r = kvm_kernint_init(vcpu);
+	else
+		r = kvm_userint_init(vcpu);
+
 	if (r < 0)
 		goto out_free_vcpus;
 
@@ -2601,6 +2696,12 @@ static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 	return 0;
 }
 
+static int kvm_vcpu_ioctl_apic_reset(struct kvm_vcpu *vcpu)
+{
+	kvm_lapic_reset(vcpu);
+	return 0;
+}
+
 static long kvm_vcpu_ioctl(struct file *filp,
 			   unsigned int ioctl, unsigned long arg)
 {
@@ -2770,6 +2871,13 @@ static long kvm_vcpu_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
+	case KVM_APIC_RESET: {
+		r = kvm_vcpu_ioctl_apic_reset(vcpu);
+		if (r)
+			goto out;
+		r = 0;
+		break;
+	}
 	default:
 		;
 	}
@@ -2823,6 +2931,41 @@ static long kvm_vm_ioctl(struct file *filp,
 			goto out;
 		break;
 	}
+	case KVM_ENABLE_KERNEL_PIC: {
+		__u32 val;
+
+		r = -EFAULT;
+		if (copy_from_user(&val, argp, sizeof val))
+			goto out;
+		r = kvm_vm_ioctl_enable_kernel_pic(kvm, val);
+		if (r)
+			goto out;
+		break;
+	}
+	case KVM_ISA_INTERRUPT: {
+		struct kvm_interrupt irq;
+
+		r = -EFAULT;
+		if (copy_from_user(&irq, argp, sizeof irq))
+			goto out;
+		r = kvm_vm_ioctl_isa_interrupt(kvm, &irq);
+		if (r)
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_APIC_MSG: {
+		struct kvm_apic_msg msg;
+
+		r = -EFAULT;
+		if (copy_from_user(&msg, argp, sizeof msg))
+			goto out;
+		r = kvm_vm_ioctl_apic_msg(kvm, &msg);
+		if (r)
+			goto out;
+		r = 0;
+		break;
+	}
 	default:
 		;
 	}
@@ -2954,12 +3097,21 @@ static long kvm_dev_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
-	case KVM_CHECK_EXTENSION:
-		/*
-		 * No extensions defined at present.
-		 */
-		r = 0;
+	case KVM_CHECK_EXTENSION: {
+		int ext = (long)argp;
+
+		switch (ext) {
+		case KVM_ISA_INTERRUPT:
+		case KVM_APIC_MSG:
+		case KVM_APIC_RESET:
+			r = 1;
+			break;
+		default:
+			r = 0;
+			break;
+		}
 		break;
+	}
 	case KVM_GET_VCPU_MMAP_SIZE:
 		r = -EINVAL;
 		if (arg)
diff --git a/drivers/kvm/lapic.c b/drivers/kvm/lapic.c
new file mode 100644
index 0000000..9f0ab7e
--- /dev/null
+++ b/drivers/kvm/lapic.c
@@ -0,0 +1,1418 @@
+/*
+ * Local APIC virtualization
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2007 Novell
+ *
+ * Authors:
+ *   Dor Laor <dor.laor-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
+ *   Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
+ *
+ * Based on Xen 3.0 code, Copyright (c) 2004, Intel Corporation.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#include "kvm.h"
+#include <linux/kvm.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/smp.h>
+#include <linux/hrtimer.h>
+#include <asm/processor.h>
+#include <asm/io.h>
+#include <asm/msr.h>
+#include <asm/page.h>
+#include <asm/current.h>
+
+/*XXX remove this definition after GFW enabled */
+#define APIC_NO_BIOS
+
+#define PRId64 "d"
+#define PRIx64 "llx"
+#define PRIu64 "u"
+#define PRIo64 "o"
+
+#define APIC_BUS_CYCLE_NS 1
+
+/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
+#define apic_debug(fmt,arg...)
+
+struct kvm_kern_apic {
+	spinlock_t              lock;
+	atomic_t                ref_count;
+	int                     usermode;
+	u32                     status;
+	u32                     vcpu_id;
+	u64                     base_msr;
+	unsigned long           base_address;
+	struct kvm_io_device    mmio_dev;
+	struct {
+		unsigned long   pending;
+		u32             divide_count;
+		ktime_t         last_update;
+		struct hrtimer  dev;
+	} timer;
+	u32                     err_status;
+	u32                     err_write_count;
+	struct kvm_vcpu         *vcpu;
+	struct kvm_irqdevice    *irq_dev;
+	struct page             *regs_page;
+	void                    *regs;
+};
+
+static __inline__ int find_highest_bit(unsigned long *data, int nr_bits)
+{
+	int length = BITS_TO_LONGS(nr_bits);
+	while (length && !data[--length])
+		continue;
+	return __ffs(data[length]) + (length * BITS_PER_LONG);
+}
+
+#define APIC_LVT_NUM			6
+/* 14 is the version for Xeon and Pentium 8.4.8*/
+#define APIC_VERSION			(0x14UL | ((APIC_LVT_NUM - 1) << 16))
+#define VLOCAL_APIC_MEM_LENGTH		(1 << 12)
+/* followed define is not in apicdef.h */
+#define APIC_SHORT_MASK			0xc0000
+#define APIC_DEST_NOSHORT		0x0
+#define APIC_DEST_MASK			0x800
+#define _APIC_GLOB_DISABLE		0x0
+#define APIC_GLOB_DISABLE_MASK		0x1
+#define APIC_SOFTWARE_DISABLE_MASK	0x2
+#define _APIC_BSP_ACCEPT_PIC		0x3
+#define MAX_APIC_INT_VECTOR             256
+
+#define inject_gp(vcpu) kvm_arch_ops->inject_gp(vcpu, 0);
+
+#define apic_enabled(apic)              \
+	(!((apic)->status &                   \
+	   (APIC_GLOB_DISABLE_MASK | APIC_SOFTWARE_DISABLE_MASK)))
+
+#define apic_global_enabled(apic)       \
+	(!(test_bit(_APIC_GLOB_DISABLE, &(apic)->status)))
+
+#define LVT_MASK \
+	APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK
+
+#define LINT_MASK   \
+	LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY |\
+	APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER
+
+#define KVM_APIC_ID(apic)   \
+	(GET_APIC_ID(apic_get_reg(apic, APIC_ID)))
+
+#define apic_lvt_enabled(apic, lvt_type)    \
+	(!(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED))
+
+#define apic_lvt_vector(apic, lvt_type)     \
+	(apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK)
+
+#define apic_lvt_dm(apic, lvt_type)           \
+	(apic_get_reg(apic, lvt_type) & APIC_MODE_MASK)
+
+#define apic_lvtt_period(apic)     \
+	(apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC)
+
+static inline u32 apic_get_reg(struct kvm_kern_apic *apic, u32 reg)
+{
+	return *((u32 *)(apic->regs + reg));
+}
+
+static inline void apic_set_reg(struct kvm_kern_apic *apic,
+				u32 reg, u32 val)
+{
+	*((u32 *)(apic->regs + reg)) = val;
+}
+
+static unsigned int apic_lvt_mask[APIC_LVT_NUM] =
+{
+	LVT_MASK | APIC_LVT_TIMER_PERIODIC, 	/* LVTT */
+	LVT_MASK | APIC_MODE_MASK, 		/* LVTTHMR */
+	LVT_MASK | APIC_MODE_MASK, 		/* LVTPC */
+	LINT_MASK, LINT_MASK, 			/* LVT0-1 */
+	LVT_MASK 				/* LVTERR */
+};
+
+#define ASSERT(x)  							     \
+	if (!(x)) { 							     \
+		printk(KERN_EMERG "assertion failed %s: %d: %s\n",           \
+		       __FILE__, __LINE__, #x);                              \
+		BUG();                                                       \
+	}
+
+static int apic_find_highest_irr(struct kvm_kern_apic *apic)
+{
+	int result;
+
+	result = find_highest_bit((unsigned long *)(apic->regs + APIC_IRR),
+				  MAX_APIC_INT_VECTOR);
+
+	ASSERT( result == 0 || result >= 16);
+
+	return result;
+}
+
+
+static int apic_find_highest_isr(struct kvm_kern_apic *apic)
+{
+	int result;
+
+	result = find_highest_bit((unsigned long *)(apic->regs + APIC_ISR),
+				  MAX_APIC_INT_VECTOR);
+
+	ASSERT( result == 0 || result >= 16);
+
+	return result;
+}
+
+static void apic_dropref(struct kvm_kern_apic *apic)
+{
+	if (atomic_dec_and_test(&apic->ref_count)) {
+
+		spin_lock_bh(&apic->lock);
+
+		hrtimer_cancel(&apic->timer.dev);
+
+		if (apic->regs_page) {
+			__free_page(apic->regs_page);
+			apic->regs_page = 0;
+		}
+
+		spin_unlock_bh(&apic->lock);
+
+		kfree(apic);
+	}
+}
+
+#if 0
+static void apic_dump_state(struct kvm_kern_apic *apic)
+{
+	u64 *tmp;
+
+	printk(KERN_INFO "%s begin\n", __FUNCTION__);
+
+	printk(KERN_INFO "status = 0x%08x\n", apic->status);
+	printk(KERN_INFO "base_msr=0x%016llx, apicbase = 0x%08lx\n",
+	       apic->base_msr, apic->base_address);
+
+	tmp = (u64*)(apic->regs + APIC_IRR);
+	printk(KERN_INFO "IRR = 0x%016llx 0x%016llx 0x%016llx 0x%016llx\n",
+	       tmp[3], tmp[2], tmp[1], tmp[0]);
+	tmp = (u64*)(apic->regs + APIC_ISR);
+	printk(KERN_INFO "ISR = 0x%016llx 0x%016llx 0x%016llx 0x%016llx\n",
+	       tmp[3], tmp[2], tmp[1], tmp[0]);
+	tmp = (u64*)(apic->regs + APIC_TMR);
+	printk(KERN_INFO "TMR = 0x%016llx 0x%016llx 0x%016llx 0x%016llx\n",
+	       tmp[3], tmp[2], tmp[1], tmp[0]);
+
+	printk(KERN_INFO "APIC_ID=0x%08x\n", apic_get_reg(apic, APIC_ID));
+	printk(KERN_INFO "APIC_TASKPRI=0x%08x\n",
+	       apic_get_reg(apic, APIC_TASKPRI) & 0xff);
+	printk(KERN_INFO "APIC_PROCPRI=0x%08x\n",
+	       apic_get_reg(apic, APIC_PROCPRI));
+
+	printk(KERN_INFO "APIC_DFR=0x%08x\n",
+	       apic_get_reg(apic, APIC_DFR) | 0x0FFFFFFF);
+	printk(KERN_INFO "APIC_LDR=0x%08x\n",
+	       apic_get_reg(apic, APIC_LDR) & APIC_LDR_MASK);
+	printk(KERN_INFO "APIC_SPIV=0x%08x\n",
+	       apic_get_reg(apic, APIC_SPIV) & 0x3ff);
+	printk(KERN_INFO "APIC_ESR=0x%08x\n",
+	       apic_get_reg(apic, APIC_ESR));
+	printk(KERN_INFO "APIC_ICR=0x%08x\n",
+	       apic_get_reg(apic, APIC_ICR) & ~(1 << 12));
+	printk(KERN_INFO "APIC_ICR2=0x%08x\n",
+	       apic_get_reg(apic, APIC_ICR2) & 0xff000000);
+
+	printk(KERN_INFO "APIC_LVTERR=0x%08x\n",
+	       apic_get_reg(apic, APIC_LVTERR));
+	printk(KERN_INFO "APIC_LVT1=0x%08x\n",
+	       apic_get_reg(apic, APIC_LVT1));
+	printk(KERN_INFO "APIC_LVT0=0x%08x\n",
+	       apic_get_reg(apic, APIC_LVT0));
+	printk(KERN_INFO "APIC_LVTPC=0x%08x\n",
+	       apic_get_reg(apic, APIC_LVTPC));
+	printk(KERN_INFO "APIC_LVTTHMR=0x%08x\n",
+	       apic_get_reg(apic, APIC_LVTTHMR));
+	printk(KERN_INFO "APIC_LVTT=0x%08x\n",
+	       apic_get_reg(apic, APIC_LVTT));
+
+	printk(KERN_INFO "APIC_TMICT=0x%08x\n",
+	       apic_get_reg(apic, APIC_TMICT));
+	printk(KERN_INFO "APIC_TDCR=0x%08x\n",
+	       apic_get_reg(apic, APIC_TDCR));
+
+	printk(KERN_INFO "%s end\n", __FUNCTION__);
+}
+#endif
+
+
+static int apic_update_ppr(struct kvm_kern_apic *apic)
+{
+	u32 tpr, isrv, ppr, orig_ppr;
+	int irq;
+	int masked = 0;
+	int forward = 0;
+
+	ppr = apic_get_reg(apic, APIC_PROCPRI);
+	orig_ppr = ppr;
+
+	/*
+	 * Before we change anything, see if the only pending vectors we have
+	 * are anything masked by PPR
+	 */
+	irq = apic_find_highest_irr(apic);
+	if (irq && ((irq & 0xf0) <= ppr))
+		masked = true;
+
+	/*
+	 * Compute the PPR value based on the current settings of TPR/ISR
+	 */
+	tpr = apic_get_reg(apic, APIC_TASKPRI);
+	irq = apic_find_highest_isr(apic);
+	isrv = (irq >> 4) & 0xf;
+
+	if ((tpr >> 4) >= isrv)
+		ppr = tpr & 0xff;
+	else
+		ppr = isrv << 4;  /* low 4 bits of PPR have to be cleared */
+
+	apic_set_reg(apic, APIC_PROCPRI, ppr);
+
+	if (masked) {
+		/*
+		 * If we get here its because there were vectors that
+		 * were masked by PPR.  Check again to see if anything is
+		 * now available
+		 */
+		irq = apic_find_highest_irr(apic);
+		if ((irq & 0xf0) > ppr)
+			forward = 1;
+	}
+
+	apic_debug("%s: ppr 0x%x (old) 0x%x (new), isr 0x%x, isrv 0x%x\n",
+	       __FUNCTION__, orig_ppr, ppr, irq, isrv);
+
+	return forward;
+}
+
+static void apic_set_tpr(struct kvm_kern_apic *apic, u32 tpr)
+{
+	int forward = 0;
+
+	apic_debug("new value = %x\n", tpr);
+
+	apic_set_reg(apic, APIC_TASKPRI, tpr);
+	forward = apic_update_ppr(apic);
+
+	if (forward) {
+		spin_unlock_bh(&apic->lock);
+		kvm_irqdevice_set_intr(apic->irq_dev, kvm_irqpin_localint);
+		spin_lock_bh(&apic->lock);
+	}
+}
+
+static int apic_match_dest(struct kvm_kern_apic *target,
+			   int dest,
+			   int dest_mode,
+			   int delivery_mode)
+{
+	int result = 0;
+
+	spin_lock_bh(&target->lock);
+
+	if (!dest_mode) /* Physical */
+		result = (GET_APIC_ID(apic_get_reg(target, APIC_ID)) == dest);
+	else { /* Logical */
+		u32 ldr = apic_get_reg(target, APIC_LDR);
+
+		/* Flat mode */
+		if (apic_get_reg(target, APIC_DFR) == APIC_DFR_FLAT)
+			result = GET_APIC_LOGICAL_ID(ldr) & dest;
+		else {
+			if ((delivery_mode == APIC_DM_LOWEST) &&
+			    (dest == 0xff)) {
+				printk(KERN_ALERT "Broadcast IPI " \
+				       "with lowest priority "
+				       "delivery mode\n");
+				spin_unlock_bh(&target->lock);
+				kvm_crash_guest(target->vcpu->kvm);
+				return 0;
+			}
+			if (GET_APIC_LOGICAL_ID(ldr) == (dest & 0xf))
+				result = (GET_APIC_LOGICAL_ID(ldr) >> 4) &
+					(dest >> 4);
+			else
+				result = 0;
+		}
+	}
+
+	spin_unlock_bh(&target->lock);
+
+	return result;
+}
+
+/*
+ * Add a pending IRQ into lapic.
+ * Return 1 if successfully added and 0 if discarded.
+ */
+static int __apic_accept_irq(struct kvm_kern_apic *apic,
+			     int delivery_mode,
+			     int vector,
+			     int level,
+			     int trig_mode)
+{
+	kvm_irqpin_t pin = kvm_irqpin_invalid;
+
+	switch (delivery_mode) {
+	case APIC_DM_FIXED:
+	case APIC_DM_LOWEST:
+		if (unlikely(!apic_enabled(apic)))
+			break;
+
+		if (test_and_set_bit(vector, apic->regs + APIC_IRR)
+		    && trig_mode) {
+			apic_debug("level trig mode repeatedly for vector " \
+				   "%d\n", vector);
+			break;
+		}
+
+		if (trig_mode) {
+			apic_debug("level trig mode for vector %d\n", vector);
+			set_bit(vector, apic->regs + APIC_TMR);
+		}
+
+		apic_debug("FIXED/LOWEST interrupt for vector %d\n", vector);
+		pin = kvm_irqpin_localint;
+		break;
+	case APIC_DM_REMRD:
+		printk(KERN_WARNING "%s: Ignore deliver mode %d\n",
+		       __FUNCTION__, delivery_mode);
+		break;
+	case APIC_DM_EXTINT:
+		apic_debug("EXTINT interrupt\n");
+		pin  = kvm_irqpin_extint;
+		break;
+	case APIC_DM_SMI:
+		apic_debug("SMI interrupt\n");
+		pin = kvm_irqpin_smi;
+		break;
+	case APIC_DM_NMI:
+		apic_debug("NMI interrupt\n");
+		pin = kvm_irqpin_nmi;
+		break;
+	case APIC_DM_INIT:
+		apic_debug("INIT interrupt\n");
+		if (level) {
+			spin_unlock_bh(&apic->lock);
+			kvm_lapic_reset(apic->vcpu);
+			spin_lock_bh(&apic->lock);
+		}
+		break;
+	case APIC_DM_STARTUP: /* FIXME: currently no support for SMP */
+	default:
+		printk(KERN_ALERT "TODO: support interrupt type %x\n",
+		       delivery_mode);
+		spin_unlock_bh(&apic->lock);
+		kvm_crash_guest(apic->vcpu->kvm);
+		spin_lock_bh(&apic->lock);
+		break;
+	}
+
+	if (likely(pin != kvm_irqpin_invalid)) {
+		/*
+		 * temp release of the lock to transmit
+		 */
+		spin_unlock_bh(&apic->lock);
+		kvm_irqdevice_set_intr(apic->irq_dev, pin);
+		spin_lock_bh(&apic->lock);
+
+		return 1;
+	} else
+		return 0;
+}
+
+static int apic_accept_irq(struct kvm_kern_apic *apic,
+			   int delivery_mode,
+			   int vector,
+			   int level,
+			   int trig_mode)
+{
+	int ret;
+
+	spin_lock_bh(&apic->lock);
+	ret = __apic_accept_irq(apic, delivery_mode, vector,
+				level, trig_mode);
+	spin_unlock_bh(&apic->lock);
+
+	return ret;
+}
+
+static void apic_set_eoi(struct kvm_kern_apic *apic)
+{
+	int vector = apic_find_highest_isr(apic);
+	int forward;
+
+	/*
+	 * Not every write EOI will has corresponding ISR,
+	 * one example is when Kernel check timer on setup_IO_APIC
+	 */
+	if (!vector)
+		return;
+
+	__clear_bit(vector, apic->regs + APIC_ISR);
+	forward = apic_update_ppr(apic);
+
+	__clear_bit(vector, apic->regs + APIC_TMR);
+
+	if (forward) {
+		spin_unlock_bh(&apic->lock);
+		kvm_irqdevice_set_intr(apic->irq_dev, kvm_irqpin_localint);
+		spin_lock_bh(&apic->lock);
+	}
+}
+
+static int apic_check_vector(struct kvm_kern_apic *apic,u32 dm, u32 vector)
+{
+	if ((dm == APIC_DM_FIXED) && (vector < 16)) {
+		apic->err_status |= 0x40;
+		__apic_accept_irq(apic, APIC_DM_FIXED,
+				  apic_lvt_vector(apic, APIC_LVTERR), 0, 0);
+		apic_debug("%s: check failed "
+		       " dm %x vector %x\n", __FUNCTION__, dm, vector);
+		return 0;
+	}
+	return 1;
+}
+
+int kvm_apicbus_send(struct kvm *kvm, int dest, int trig_mode, int level,
+		     int dest_mode, int delivery_mode, int vector)
+{
+	int i;
+	u32 lpr_map = 0;
+
+	apic_debug("%s: %d %d %d %d %d %d\n", __FUNCTION__,
+		   dest, trig_mode, level, dest_mode, delivery_mode, vector);
+
+	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+		struct kvm_kern_apic *target;
+		target = kvm->vcpus[i].apic.dev;
+
+		if (!target)
+			continue;
+
+		if (apic_match_dest(target, dest, dest_mode, delivery_mode)) {
+			if (delivery_mode == APIC_DM_LOWEST)
+				__set_bit(target->vcpu_id, &lpr_map);
+			else
+				apic_accept_irq(target, delivery_mode,
+						vector, level, trig_mode);
+		}
+	}
+
+	if (delivery_mode == APIC_DM_LOWEST) {
+		struct kvm_kern_apic *target;
+
+		/* Currently only UP is supported */
+		target = kvm->vcpus[0].apic.dev;
+
+		if (target)
+			apic_accept_irq(target, delivery_mode,
+					vector, level, trig_mode);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_apicbus_send);
+
+static void apic_send_ipi(struct kvm_kern_apic *apic)
+{
+	u32 icr_low = apic_get_reg(apic, APIC_ICR);
+	u32 icr_high = apic_get_reg(apic, APIC_ICR2);
+
+	unsigned int dest =          GET_APIC_DEST_FIELD(icr_high);
+	unsigned int short_hand =    icr_low & APIC_SHORT_MASK;
+	unsigned int trig_mode =     icr_low & APIC_INT_LEVELTRIG;
+	unsigned int level =         icr_low & APIC_INT_ASSERT;
+	unsigned int dest_mode =     icr_low & APIC_DEST_MASK;
+	unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
+	unsigned int vector =        icr_low & APIC_VECTOR_MASK;
+
+	apic_debug("icr_high 0x%x, icr_low 0x%x, "
+		 "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
+		 "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
+		 icr_high, icr_low, short_hand, dest,
+		 trig_mode, level, dest_mode, delivery_mode, vector);
+
+	/*
+	 * We unlock here because we would enter this function in a lock
+	 * state and we dont want to remain this way while we transmit
+	 */
+	spin_unlock_bh(&apic->lock);
+
+	switch (short_hand) {
+	case APIC_DEST_NOSHORT:
+		/*
+		 * If no short-hand notation is in use, just forward the
+		 * message onto the apicbus and let the bus handle the routing.
+		 */
+		kvm_apicbus_send(apic->vcpu->kvm, dest, trig_mode, level,
+				 dest_mode, delivery_mode, vector);
+		break;
+	case APIC_DEST_SELF:
+		apic_accept_irq(apic, delivery_mode, vector, level, trig_mode);
+		break;
+	default: {
+		/*
+		 * Otherwise we need to consider the short-hand to find the
+		 * correct targets.
+		 */
+		unsigned int i;
+
+		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+			struct kvm_kern_apic *target;
+			int result = 0;
+
+			target = apic->vcpu->kvm->vcpus[i].apic.dev;
+
+			if (!target)
+				continue;
+
+			switch (short_hand) {
+			case APIC_DEST_ALLINC:
+				result = 1;
+				break;
+
+			case APIC_DEST_ALLBUT:
+				if (target != apic)
+					result = 1;
+				break;
+			default:
+				kvm_crash_guest(apic->vcpu->kvm);
+				return;
+			}
+
+			if (result)
+				apic_accept_irq(target, delivery_mode,
+						vector, level, trig_mode);
+		}
+	}
+	}
+
+	/*
+	 * Relock before returning
+	 */
+	spin_lock_bh(&apic->lock);
+
+}
+
+static u32 apic_get_tmcct(struct kvm_kern_apic *apic)
+{
+	u32 counter_passed;
+	ktime_t passed, now = apic->timer.dev.base->get_time();
+	u32 tmcct = apic_get_reg(apic, APIC_TMCCT);
+
+	ASSERT(apic != NULL);
+
+	if (unlikely(ktime_to_ns(now) <=
+		     ktime_to_ns(apic->timer.last_update))) {
+		/* Wrap around */
+		passed = ktime_add(
+			({ (ktime_t){
+				.tv64 = KTIME_MAX -
+					 (apic->timer.last_update).tv64 };
+			}), now);
+		apic_debug("time elapsed\n");
+	} else
+		passed = ktime_sub(now, apic->timer.last_update);
+
+	counter_passed = ktime_to_ns(passed) /
+		(APIC_BUS_CYCLE_NS * apic->timer.divide_count);
+	tmcct -= counter_passed;
+
+	if (tmcct <= 0) {
+		if (unlikely(!apic_lvtt_period(apic))) {
+			tmcct =  0;
+		} else {
+			do {
+				tmcct += apic_get_reg(apic, APIC_TMICT);
+			} while ( tmcct <= 0 );
+		}
+	}
+
+	apic->timer.last_update = now;
+	apic_set_reg(apic, APIC_TMCCT, tmcct);
+
+	return tmcct;
+}
+
+/*
+ *----------------------------------------------------------------------
+ * MMIO
+ *----------------------------------------------------------------------
+ */
+
+#define align(val, len) (val & ~(len-1))
+
+static int validate_mmio(struct kvm_kern_apic *apic, gpa_t address, int len)
+{
+	/*
+	 * According to IA 32 Manual, all registers should be accessed with
+	 * 32 bits alignment.
+	 */
+	if (align(address, 4) != align(address+(len-1), 4)) {
+		printk(KERN_WARNING "KVM: MMIO request for %d bytes at " \
+		       "0x%lx is not 32 bit aligned.  Injecting #GP\n",
+		       len, address);
+		inject_gp(apic->vcpu);
+		return 0;
+	}
+
+	return 1;
+}
+
+static u32 __apic_read(struct kvm_kern_apic *apic,
+				unsigned int offset)
+{
+	u32 val = 0;
+
+	if (offset > APIC_TDCR)
+		return 0;
+
+	switch (offset) {
+	case APIC_ARBPRI:
+		printk(KERN_WARNING "access local APIC ARBPRI register " \
+		       "which is for P6\n");
+		break;
+
+	case APIC_TMCCT:        /* Timer CCR */
+		val = apic_get_tmcct(apic);
+		break;
+
+	case APIC_ESR:
+		apic->err_write_count = 0;
+		/* fall through */
+	default:
+		val = apic_get_reg(apic, offset);
+		break;
+	}
+
+	return val;
+}
+
+static void apic_mmio_read(struct kvm_io_device *this,
+			   gpa_t address,
+			   int len,
+			   void *data)
+{
+	struct kvm_kern_apic *apic = (struct kvm_kern_apic*)this->private;
+	unsigned int          offset = address - apic->base_address;
+	unsigned char         alignment = offset & 0x3;
+	u32                   val;
+
+	if (!validate_mmio(apic, address, len))
+		return;
+
+	spin_lock_bh(&apic->lock);
+	val = __apic_read(apic, offset & ~0x3);
+	spin_unlock_bh(&apic->lock);
+
+	switch (len) {
+	case 1:
+	case 2:
+	case 4:
+		memcpy(data, (char*)((char*)&val + alignment), len);
+		break;
+	default:
+		printk(KERN_ALERT "Local APIC read with len = %x, " \
+		       "should be 1,2, or 4 instead\n", len);
+		inject_gp(apic->vcpu);
+		break;
+	}
+}
+
+static void apic_mmio_write(struct kvm_io_device *this,
+			    gpa_t address,
+			    int len,
+			    const void *data)
+{
+	struct kvm_kern_apic *apic = (struct kvm_kern_apic*)this->private;
+	unsigned int          offset = address - apic->base_address;
+	unsigned char         alignment = offset & 0x3;
+	u32                   val;
+
+	if (!validate_mmio(apic, address, len))
+		return;
+
+	spin_lock_bh(&apic->lock);
+
+	switch (len) {
+	case 1:
+	case 2: {
+		unsigned int tmp;
+
+		/*
+		 * Some kernels will access with byte/word alignment
+		 */
+		apic_debug("Notice: Local APIC write with len = %x\n", len);
+		tmp = __apic_read(apic, offset & ~0x3);
+		switch (len) {
+		case 1:
+			val = *(u8*)data;
+
+			val = (tmp & ~(0xff << (8*alignment))) |
+			      ((val & 0xff) << (8*alignment));
+			break;
+
+		case 2:
+			if (alignment != 0x0 && alignment != 0x2) {
+				printk(KERN_ALERT "alignment error for apic " \
+				       "with len == 2\n");
+				inject_gp(apic->vcpu);
+			}
+
+			/*
+			 * assumes 16 bit alignment on the pointer.
+			 * Mis-alignment is a host-side issue, however, so
+			 * we crash
+			 */
+			BUG_ON(((long)data & 0x1));
+
+			val = *(u16*)data;
+
+			val = (tmp & ~(0xffff << (8*alignment))) |
+			      ((val & 0xffff) << (8*alignment));
+			break;
+		}
+
+		break;
+	}
+	case 4:
+		memcpy(&val, data, 4);
+		break;
+	default:
+		printk(KERN_ALERT "Local APIC write with len = %x, " \
+		       "should be 1,2, or 4 instead\n", len);
+		inject_gp(apic->vcpu);
+		break;
+	}
+
+	/* too common printing */
+	if (offset != APIC_EOI)
+		apic_debug("%s: offset 0x%x with length 0x%x, and value is " \
+			 "0x%lx\n",
+		       __FUNCTION__, offset, len, val);
+
+	offset &= 0xff0;
+
+	switch (offset) {
+	case APIC_ID:   /* Local APIC ID */
+		apic_set_reg(apic, APIC_ID, val);
+		break;
+
+	case APIC_TASKPRI:
+		apic_set_tpr(apic, val & 0xff);
+		break;
+
+	case APIC_EOI:
+		apic_set_eoi(apic);
+		break;
+
+	case APIC_LDR:
+		apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
+		break;
+
+	case APIC_DFR:
+		apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
+		break;
+
+	case APIC_SPIV:
+		apic_set_reg(apic, APIC_SPIV, val & 0x3ff);
+		if (!(val & APIC_SPIV_APIC_ENABLED)) {
+			int i;
+			u32 lvt_val;
+
+			apic->status |= APIC_SOFTWARE_DISABLE_MASK;
+			for (i = 0; i < APIC_LVT_NUM; i++) {
+				lvt_val = apic_get_reg(apic,
+							   APIC_LVTT +
+							   0x10 * i);
+				apic_set_reg(apic, APIC_LVTT + 0x10 * i,
+						 lvt_val | APIC_LVT_MASKED);
+			}
+
+			if ((apic_get_reg(apic, APIC_LVT0) &
+			     APIC_MODE_MASK) == APIC_DM_EXTINT)
+				clear_bit(_APIC_BSP_ACCEPT_PIC, &apic->status);
+		} else {
+			apic->status &= ~APIC_SOFTWARE_DISABLE_MASK;
+			if ((apic_get_reg(apic, APIC_LVT0) &
+			     APIC_MODE_MASK) == APIC_DM_EXTINT)
+				set_bit(_APIC_BSP_ACCEPT_PIC, &apic->status);
+		}
+		break;
+
+	case APIC_ESR:
+		apic->err_write_count = !apic->err_write_count;
+		if (!apic->err_write_count)
+			apic->err_status = 0;
+		break;
+
+	case APIC_ICR:
+		/* No delay here, so we always clear the pending bit*/
+		apic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
+		apic_send_ipi(apic);
+		break;
+
+	case APIC_ICR2:
+		apic_set_reg(apic, APIC_ICR2, val & 0xff000000);
+		break;
+
+	case APIC_LVTT:
+	case APIC_LVTTHMR:
+	case APIC_LVTPC:
+	case APIC_LVT0:
+	case APIC_LVT1:
+	case APIC_LVTERR:
+	{
+		if (apic->status & APIC_SOFTWARE_DISABLE_MASK)
+			val |= APIC_LVT_MASKED;
+
+		val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4];
+		apic_set_reg(apic, offset, val);
+
+		/* On hardware, when write vector less than 0x20 will error */
+		if (!(val & APIC_LVT_MASKED))
+			apic_check_vector(apic, apic_lvt_dm(apic, offset),
+					  apic_lvt_vector(apic, offset));
+		if (!apic->vcpu_id && (offset == APIC_LVT0)) {
+			if ((val & APIC_MODE_MASK) == APIC_DM_EXTINT)
+				if (val & APIC_LVT_MASKED)
+					clear_bit(_APIC_BSP_ACCEPT_PIC,
+						  &apic->status);
+				else
+					set_bit(_APIC_BSP_ACCEPT_PIC,
+						&apic->status);
+			else
+				clear_bit(_APIC_BSP_ACCEPT_PIC,
+					  &apic->status);
+		}
+	}
+		break;
+
+	case APIC_TMICT:
+	{
+		ktime_t now = apic->timer.dev.base->get_time();
+		u32 offset;
+
+		apic_set_reg(apic, APIC_TMICT, val);
+		apic_set_reg(apic, APIC_TMCCT, val);
+		apic->timer.last_update = now;
+		offset = APIC_BUS_CYCLE_NS * apic->timer.divide_count * val;
+
+		/* Make sure the lock ordering is coherent */
+		spin_unlock_bh(&apic->lock);
+		hrtimer_cancel(&apic->timer.dev);
+		hrtimer_start(&apic->timer.dev,
+			      ktime_add_ns(now, offset),
+			      HRTIMER_MODE_ABS);
+
+		apic_debug("%s: bus cycle is %"PRId64"ns, now 0x%016"PRIx64", "
+			 "timer initial count 0x%x, offset 0x%x, "
+			 "expire @ 0x%016"PRIx64".\n", __FUNCTION__,
+			 APIC_BUS_CYCLE_NS, ktime_to_ns(now),
+			 apic_get_reg(apic, APIC_TMICT),
+			 offset, ktime_to_ns(ktime_add_ns(now, offset)));
+	}
+		return;
+
+	case APIC_TDCR:
+	{
+		unsigned int tmp1, tmp2;
+
+		tmp1 = val & 0xf;
+		tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
+		apic->timer.divide_count = 0x1 << (tmp2 & 0x7);
+
+		apic_set_reg(apic, APIC_TDCR, val);
+
+		apic_debug("timer divide count is 0x%x\n",
+		       apic->timer.divide_count);
+	}
+		break;
+
+	default:
+		printk(KERN_WARNING "Local APIC Write to read-only register\n");
+		break;
+	}
+
+	spin_unlock_bh(&apic->lock);
+}
+
+static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr)
+{
+	struct kvm_kern_apic *apic = (struct kvm_kern_apic*)this->private;
+	int ret = 0;
+
+	spin_lock_bh(&apic->lock);
+
+	if (apic_global_enabled(apic) &&
+	    (addr >= apic->base_address) &&
+	    (addr < (apic->base_address + VLOCAL_APIC_MEM_LENGTH)))
+		ret = 1;
+
+	spin_unlock_bh(&apic->lock);
+
+	return ret;
+}
+
+static void apic_mmio_destructor(struct kvm_io_device *this)
+{
+	struct kvm_kern_apic *apic = (struct kvm_kern_apic*)this->private;
+
+	apic_dropref(apic);
+}
+
+static void apic_mmio_register(struct kvm_kern_apic *apic)
+{
+	/* Register ourselves with the MMIO subsystem */
+	struct kvm_io_device *dev = &apic->mmio_dev;
+
+	dev->read       = apic_mmio_read;
+	dev->write      = apic_mmio_write;
+	dev->in_range   = apic_mmio_range;
+	dev->destructor = apic_mmio_destructor;
+
+	dev->private = apic;
+	atomic_inc(&apic->ref_count);
+
+	apic->vcpu->apic.mmio = dev;
+}
+
+/*
+ *----------------------------------------------------------------------
+ * LAPIC interface
+ *----------------------------------------------------------------------
+ */
+
+void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, u64 cr8)
+{
+	struct kvm_kern_apic *apic = (struct kvm_kern_apic*)vcpu->apic.dev;
+
+	spin_lock_bh(&apic->lock);
+	apic_set_tpr(apic, ((cr8 & 0x0f) << 4));
+	spin_unlock_bh(&apic->lock);
+}
+
+u64 kvm_lapic_get_tpr(struct kvm_vcpu *vcpu)
+{
+	struct kvm_kern_apic *apic = (struct kvm_kern_apic*)vcpu->apic.dev;
+	u64 tpr;
+
+	spin_lock_bh(&apic->lock);
+	tpr = (u64)apic_get_reg(apic, APIC_TASKPRI);
+	spin_unlock_bh(&apic->lock);
+
+	return (tpr & 0xf0) >> 4;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_get_tpr);
+
+void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
+{
+	struct kvm_kern_apic *apic = (struct kvm_kern_apic*)vcpu->apic.dev;
+
+	spin_lock_bh(&apic->lock);
+	if (apic->vcpu_id)
+		value &= ~MSR_IA32_APICBASE_BSP;
+
+	apic->base_msr = value;
+	apic->base_address = apic->base_msr & MSR_IA32_APICBASE_BASE;
+
+	/* with FSB delivery interrupt, we can restart APIC functionality */
+	if (!(value & MSR_IA32_APICBASE_ENABLE))
+		set_bit(_APIC_GLOB_DISABLE, &apic->status);
+	else
+		clear_bit(_APIC_GLOB_DISABLE, &apic->status);
+
+	apic_debug("apic base msr is 0x%016"PRIx64", and base address is " \
+		 "0x%lx.\n", apic->base_msr, apic->base_address);
+
+	spin_unlock_bh(&apic->lock);
+}
+
+u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
+{
+	struct kvm_kern_apic *apic = (struct kvm_kern_apic*)vcpu->apic.dev;
+	u64 base;
+
+	spin_lock_bh(&apic->lock);
+	base = apic->base_msr;
+	spin_unlock_bh(&apic->lock);
+
+	return base;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
+
+void kvm_lapic_save(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+	/*
+	 * FIXME: This needs to support the entire register set when
+	 * enabled
+	 */
+	sregs->cr8       = kvm_lapic_get_tpr(vcpu);
+	sregs->apic_base = kvm_lapic_get_base(vcpu);
+}
+
+void kvm_lapic_restore(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+	/*
+	 * FIXME: This needs to support the entire register set when
+	 * enabled
+	 */
+	kvm_lapic_set_tpr(vcpu, sregs->cr8);
+	kvm_lapic_set_base(vcpu, sregs->apic_base);
+}
+
+void kvm_lapic_reset(struct kvm_vcpu *vcpu)
+{
+	struct kvm_kern_apic *apic;
+	int i;
+
+	apic_debug("%s\n", __FUNCTION__);
+
+	ASSERT(vcpu);
+	apic = vcpu->apic.dev;
+	ASSERT(apic != NULL);
+
+	/* Stop the timer in case it's a reset to an active apic */
+	hrtimer_cancel(&apic->timer.dev);
+
+	spin_lock_bh(&apic->lock);
+
+	apic_set_reg(apic, APIC_ID, vcpu_slot(vcpu) << 24);
+	apic_set_reg(apic, APIC_LVR, APIC_VERSION);
+
+	for (i = 0; i < APIC_LVT_NUM; i++)
+		apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
+
+	apic_set_reg(apic, APIC_DFR, 0xffffffffU);
+	apic_set_reg(apic, APIC_SPIV, 0xff);
+	apic_set_reg(apic, APIC_TASKPRI, 0);
+	apic_set_reg(apic, APIC_LDR, 0);
+	apic_set_reg(apic, APIC_ESR, 0);
+	apic_set_reg(apic, APIC_ICR, 0);
+	apic_set_reg(apic, APIC_ICR2, 0);
+	apic_set_reg(apic, APIC_TDCR, 0);
+	apic_set_reg(apic, APIC_TMICT, 0);
+	memset((void*)(apic->regs + APIC_IRR), 0, KVM_IRQ_BITMAP_SIZE(u8));
+	memset((void*)(apic->regs + APIC_ISR), 0, KVM_IRQ_BITMAP_SIZE(u8));
+	memset((void*)(apic->regs + APIC_TMR), 0, KVM_IRQ_BITMAP_SIZE(u8));
+
+	apic->base_msr =
+		MSR_IA32_APICBASE_ENABLE |
+		APIC_DEFAULT_PHYS_BASE;
+	if (vcpu_slot(vcpu) == 0)
+		apic->base_msr |= MSR_IA32_APICBASE_BSP;
+	apic->base_address = apic->base_msr & MSR_IA32_APICBASE_BASE;
+
+	apic->timer.divide_count = 0;
+	apic->timer.pending = 0;
+	apic->status = 0;
+
+#ifdef APIC_NO_BIOS
+	/*
+	 * XXX According to the MP specification the BIOS enables LVT0/1;
+	 * remove this once BIOS support is in place
+	 */
+	if (!vcpu_slot(vcpu)) {
+		apic_set_reg(apic, APIC_LVT0, APIC_MODE_EXTINT << 8);
+		apic_set_reg(apic, APIC_LVT1, APIC_MODE_NMI << 8);
+		set_bit(_APIC_BSP_ACCEPT_PIC, &apic->status);
+	}
+#endif
+
+	spin_unlock_bh(&apic->lock);
+
+	printk(KERN_INFO  "%s: vcpu=%p, id=%d, base_msr=" \
+	       "0x%016"PRIx64", base_address=0x%0lx.\n", __FUNCTION__, vcpu,
+	       GET_APIC_ID(apic_get_reg(apic, APIC_ID)),
+	       apic->base_msr, apic->base_address);
+}
+
+int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
+{
+	struct kvm_kern_apic *apic = (struct kvm_kern_apic*)vcpu->apic.dev;
+	int ret = 0;
+
+	spin_lock_bh(&apic->lock);
+	if (!apic->usermode)
+		ret = apic_enabled(apic);
+	spin_unlock_bh(&apic->lock);
+
+	return ret;
+}
+
+/*
+ *----------------------------------------------------------------------
+ * timer interface
+ *----------------------------------------------------------------------
+ */
+static int __apic_timer_fn(struct kvm_kern_apic *apic)
+{
+	u32 vector;
+	ktime_t now;
+	int result = 0;
+
+	if (unlikely(!apic_enabled(apic) ||
+		     !apic_lvt_enabled(apic, APIC_LVTT))) {
+		apic_debug("%s: time interrupt although apic is down\n",
+			 __FUNCTION__);
+		return 0;
+	}
+
+	vector                  = apic_lvt_vector(apic, APIC_LVTT);
+	now                     = apic->timer.dev.base->get_time();
+	apic->timer.last_update = now;
+	apic->timer.pending++;
+
+	__apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0);
+
+	if (apic_lvtt_period(apic)) {
+		u32 offset;
+		u32 tmict = apic_get_reg(apic, APIC_TMICT);
+
+		apic_set_reg(apic, APIC_TMCCT, tmict);
+		offset = APIC_BUS_CYCLE_NS * apic->timer.divide_count * tmict;
+
+		result = 1;
+		apic->timer.dev.expires = ktime_add_ns(now, offset);
+
+		apic_debug("%s: now 0x%016"PRIx64", expire @ 0x%016"PRIx64", "
+		       "timer initial count 0x%x, timer current count 0x%x.\n",
+		       __FUNCTION__,
+		       ktime_to_ns(now), ktime_add_ns(now, offset),
+		       apic_get_reg(apic, APIC_TMICT),
+	               apic_get_reg(apic, APIC_TMCCT));
+	} else {
+		apic_set_reg(apic, APIC_TMCCT, 0);
+		apic_debug("%s: now 0x%016"PRIx64", "
+		       "timer initial count 0x%x, timer current count 0x%x.\n",
+		       __FUNCTION__,
+		       ktime_to_ns(now), apic_get_reg(apic, APIC_TMICT),
+		       apic_get_reg(apic, APIC_TMCCT));
+	}
+
+	return result;
+}
+
+static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
+{
+	struct kvm_kern_apic *apic;
+	int restart_timer = 0;
+
+	apic = container_of(data, struct kvm_kern_apic, timer.dev);
+
+	spin_lock_bh(&apic->lock);
+	restart_timer = __apic_timer_fn(apic);
+	spin_unlock_bh(&apic->lock);
+
+	if (restart_timer)
+		return HRTIMER_RESTART;
+	else
+		return HRTIMER_NORESTART;
+}
+
+/*
+ *----------------------------------------------------------------------
+ * IRQDEVICE interface
+ *----------------------------------------------------------------------
+ */
+
+static int apic_irqdev_ack(struct kvm_irqdevice *this, int flags,
+			   struct kvm_irqack_data *data)
+{
+	struct kvm_kern_apic *apic = (struct kvm_kern_apic*)this->private;
+	int irq;
+
+	apic_debug("LAPIC ACK attempt\n");
+
+	spin_lock_bh(&apic->lock);
+
+	if (!apic_enabled(apic))
+		goto out;
+
+	if (!(flags & KVM_IRQACK_FLAG_PEEK)) {
+		irq = apic_find_highest_irr(apic);
+		if ((irq & 0xf0) > apic_get_reg(apic, APIC_PROCPRI)) {
+			BUG_ON (irq < 0x10);
+
+			__set_bit(irq, apic->regs + APIC_ISR);
+			__clear_bit(irq, apic->regs + APIC_IRR);
+			apic_update_ppr(apic);
+
+			/*
+			 * We have to special case the timer interrupt
+			 * because we want the vector to stay pending
+			 * for each tick of the clock, even for a backlog.
+			 * Therefore, if this was a timer vector and we
+			 * still have ticks pending, keep IRR set
+			 */
+			if (irq == apic_lvt_vector(apic, APIC_LVTT)) {
+				BUG_ON(!apic->timer.pending);
+				apic->timer.pending--;
+				if (apic->timer.pending)
+					__set_bit(irq, apic->regs + APIC_IRR);
+			}
+
+			data->flags |= KVM_IRQACKDATA_VECTOR_VALID;
+			data->vector = irq;
+		}
+		else
+			data->vector = -1;
+
+		apic_debug("ACK for vector %d\n", data->vector);
+	}
+
+	/*
+	 * See if there is anything still pending.  Don't forget that we may
+	 * have entered this function with PEEK just to check pending
+	 * status.  This is really the only way we could ever find something
+	 * still eligible, since otherwise we would have just injected
+	 * the highest priority vector above
+	 */
+	irq = apic_find_highest_irr(apic);
+	if ((irq & 0xf0) > apic_get_reg(apic, APIC_PROCPRI))
+		data->flags |= KVM_IRQACKDATA_VECTOR_PENDING;
+
+ out:
+	spin_unlock_bh(&apic->lock);
+
+	return 0;
+}
+
+static int apic_irqdev_set_pin(struct kvm_irqdevice *this, int irq, int level)
+{
+	struct kvm_kern_apic *apic = (struct kvm_kern_apic*)this->private;
+	int lvt = 0;
+
+	spin_lock_bh(&apic->lock);
+
+	if (!apic_enabled(apic)) {
+		/*
+		 * If the LAPIC is disabled, we simply forward the interrupt
+		 * on to the output line
+		 */
+		__apic_accept_irq(apic, APIC_DM_EXTINT, 0, level, 1);
+		goto out;
+	}
+
+	/*
+	 * pin "0" is LINT0, and "1" is LINT1
+	 */
+	BUG_ON(irq > 1);
+
+	switch(irq) {
+	case 0:
+		lvt = APIC_LVT0;
+		break;
+	case 1:
+		lvt = APIC_LVT1;
+		break;
+	}
+
+	if (apic_lvt_enabled(apic, lvt))
+		__apic_accept_irq(apic,
+				  apic_lvt_dm(apic, lvt),
+				  apic_lvt_vector(apic, lvt),
+				  level,
+				  1);
+
+
+ out:
+	spin_unlock_bh(&apic->lock);
+
+	return 0;
+}
+
+static void apic_irqdev_destructor(struct kvm_irqdevice *this)
+{
+	struct kvm_kern_apic *apic = (struct kvm_kern_apic*)this->private;
+
+	apic_dropref(apic);
+}
+
+static void apic_irqdev_register(struct kvm_kern_apic *apic,
+				 struct kvm_irqdevice *dev)
+{
+	dev->ack         = apic_irqdev_ack;
+	dev->set_pin     = apic_irqdev_set_pin;
+	dev->destructor  = apic_irqdev_destructor;
+
+	dev->private = apic;
+	atomic_inc(&apic->ref_count);
+
+	apic->irq_dev = dev;
+}
+
+int kvm_lapic_init(struct kvm_vcpu *vcpu,
+		   struct kvm_irqdevice *irq_dev, int flags)
+{
+	struct kvm_kern_apic *apic = NULL;
+	struct kvm_io_device *mmio_dev = NULL;
+
+	ASSERT(vcpu != NULL);
+	apic_debug("apic_init %d\n", vcpu_slot(vcpu));
+
+	apic = kzalloc(sizeof(*apic), GFP_KERNEL);
+	if (!apic)
+		goto nomem;
+
+	spin_lock_init(&apic->lock);
+	atomic_inc(&apic->ref_count);
+	apic->vcpu_id = vcpu_slot(vcpu);
+
+	apic->regs_page = alloc_page(GFP_KERNEL);
+	if ( apic->regs_page == NULL ) {
+		printk(KERN_ALERT "malloc apic regs error for vcpu %x\n",
+		       vcpu_slot(vcpu));
+		goto nomem;
+	}
+	apic->regs = page_address(apic->regs_page);
+	memset(apic->regs, 0, PAGE_SIZE);
+
+	apic->vcpu = vcpu;
+	vcpu->apic.dev = apic;
+
+	if (!(flags & KVM_LAPIC_OPTION_USERMODE)) {
+		apic_irqdev_register(apic, irq_dev);
+		apic_mmio_register(apic);
+	} else
+		apic->usermode = 1;
+
+	hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	apic->timer.dev.function = apic_timer_fn;
+
+	kvm_lapic_reset(vcpu);
+	return 0;
+
+ nomem:
+	if (mmio_dev)
+		kfree(mmio_dev);
+
+	if (apic)
+		apic_dropref(apic);
+
+	return -ENOMEM;
+}
+
+void kvm_lapic_destroy(struct kvm_vcpu *vcpu)
+{
+	struct kvm_kern_apic *apic = vcpu->apic.dev;
+
+	if (vcpu->apic.mmio)
+		kvm_iodevice_destructor(vcpu->apic.mmio);
+
+	apic_dropref(apic);
+}
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
index 1b9d633..ccc5856 100644
--- a/drivers/kvm/svm.c
+++ b/drivers/kvm/svm.c
@@ -570,9 +570,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
 
 	fx_init(vcpu);
 	vcpu->fpu_active = 1;
-	vcpu->apic_base = 0xfee00000 |
-			/*for vcpu 0*/ MSR_IA32_APICBASE_BSP |
-			MSR_IA32_APICBASE_ENABLE;
 
 	return 0;
 
@@ -1410,9 +1407,9 @@ static void do_intr_requests(struct kvm_vcpu *vcpu,
 			r = kvm_vcpu_irq_pop(vcpu, &ack);
 			break;
 		case kvm_irqpin_extint:
-			printk(KERN_WARNING "KVM: external-interrupts not " \
-			       "handled yet\n");
-			__clear_bit(pin, &vcpu->irq.pending);
+			r = kvm_irqdevice_ack(&vcpu->kvm->isa_irq, 0, &ack);
+			if (!(ack.flags & KVM_IRQACKDATA_VECTOR_PENDING))
+				__clear_bit(pin, &vcpu->irq.pending);
 			break;
 		case kvm_irqpin_nmi:
 			/*
@@ -1509,8 +1506,8 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
 		 !kvm_vcpu_irq_pending(vcpu) &&
 		 !(ack.flags & KVM_IRQACKDATA_NEXT_VALID));
 	kvm_run->if_flag = (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF) != 0;
-	kvm_run->cr8 = vcpu->cr8;
-	kvm_run->apic_base = vcpu->apic_base;
+	kvm_run->cr8 = kvm_lapic_get_tpr(vcpu);
+	kvm_run->apic_base = kvm_lapic_get_base(vcpu);
 }
 
 /*
diff --git a/drivers/kvm/userint.c b/drivers/kvm/userint.c
index a60707d..c6118b0 100644
--- a/drivers/kvm/userint.c
+++ b/drivers/kvm/userint.c
@@ -218,6 +218,12 @@ int kvm_user_irqdev_restore(struct kvm_irqdevice *this, void *data)
 
 int kvm_userint_init(struct kvm_vcpu *vcpu)
 {
-	return kvm_user_irqdev_init(&vcpu->irq.dev);
+	int ret;
+
+	ret = kvm_user_irqdev_init(&vcpu->irq.dev);
+	if (ret < 0)
+		return ret;
+
+	return kvm_lapic_init(vcpu, NULL, KVM_LAPIC_OPTION_USERMODE);
 }
 
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 7f2af92..82e40c9 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -1236,10 +1236,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
 
 	memset(vcpu->regs, 0, sizeof(vcpu->regs));
 	vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val();
-	vcpu->cr8 = 0;
-	vcpu->apic_base = 0xfee00000 |
-			/*for vcpu 0*/ MSR_IA32_APICBASE_BSP |
-			MSR_IA32_APICBASE_ENABLE;
 
 	fx_init(vcpu);
 
@@ -1480,9 +1476,9 @@ static void do_intr_requests(struct kvm_vcpu *vcpu,
 			r = kvm_vcpu_irq_pop(vcpu, &ack);
 			break;
 		case kvm_irqpin_extint:
-			printk(KERN_WARNING "KVM: external-interrupts not " \
-			       "handled yet\n");
-			__clear_bit(pin, &vcpu->irq.pending);
+			r = kvm_irqdevice_ack(&vcpu->kvm->isa_irq, 0, &ack);
+			if (!(ack.flags & KVM_IRQACKDATA_VECTOR_PENDING))
+				__clear_bit(pin, &vcpu->irq.pending);
 			break;
 		case kvm_irqpin_nmi:
 			/*
@@ -1849,7 +1845,7 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 			return 1;
 		case 8:
 			vcpu_load_rsp_rip(vcpu);
-			vcpu->regs[reg] = vcpu->cr8;
+			vcpu->regs[reg] = kvm_lapic_get_tpr(vcpu);
 			vcpu_put_rsp_rip(vcpu);
 			skip_emulated_instruction(vcpu);
 			return 1;
@@ -1949,8 +1945,8 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
  	kvm_irqdevice_ack(&vcpu->irq.dev, KVM_IRQACK_FLAG_PEEK, &ack);
  
 	kvm_run->if_flag = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) != 0;
-	kvm_run->cr8 = vcpu->cr8;
-	kvm_run->apic_base = vcpu->apic_base;
+	kvm_run->cr8 = kvm_lapic_get_tpr(vcpu);
+	kvm_run->apic_base = kvm_lapic_get_base(vcpu);
 	kvm_run->ready_for_interrupt_injection =
 		(vcpu->interrupt_window_open &&
 		 !kvm_vcpu_irq_pending(vcpu) &&
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index e6edca8..aaa826e 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -231,6 +231,17 @@ struct kvm_dirty_log {
 	};
 };
 
+/* for KVM_APIC */
+struct kvm_apic_msg {
+	/* in */
+	__u32 dest;
+	__u32 trig_mode;
+	__u32 dest_mode;
+	__u32 delivery_mode;
+	__u32 vector;
+	__u32 padding;
+};
+
 struct kvm_cpuid_entry {
 	__u32 function;
 	__u32 eax;
@@ -282,6 +293,9 @@ struct kvm_signal_mask {
 #define KVM_CREATE_VCPU           _IO(KVMIO,  0x41)
 #define KVM_GET_DIRTY_LOG         _IOW(KVMIO, 0x42, struct kvm_dirty_log)
 #define KVM_SET_MEMORY_ALIAS      _IOW(KVMIO, 0x43, struct kvm_memory_alias)
+#define KVM_ENABLE_KERNEL_PIC     _IOW(KVMIO, 0x44, __u32)
+#define KVM_ISA_INTERRUPT         _IOW(KVMIO, 0x45, struct kvm_interrupt)
+#define KVM_APIC_MSG		  _IOW(KVMIO, 0x46, struct kvm_apic_msg)
 
 /*
  * ioctls for vcpu fds
@@ -300,5 +314,6 @@ struct kvm_signal_mask {
 #define KVM_SET_SIGNAL_MASK       _IOW(KVMIO,  0x8b, struct kvm_signal_mask)
 #define KVM_GET_FPU               _IOR(KVMIO,  0x8c, struct kvm_fpu)
 #define KVM_SET_FPU               _IOW(KVMIO,  0x8d, struct kvm_fpu)
+#define KVM_APIC_RESET		  _IO(KVMIO,   0x8e)
 
 #endif


-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 6/9] KVM: Adds support for real NMI injection on VMX processors
       [not found] ` <20070531180005.1810.23884.stgit-5CR4LY5GPkvLDviKLk5550HKjMygAv58XqFh9Ls21Oc@public.gmane.org>
                     ` (4 preceding siblings ...)
  2007-05-31 18:09   ` [PATCH 5/9] KVM: Add support for in-kernel LAPIC model Gregory Haskins
@ 2007-05-31 18:09   ` Gregory Haskins
       [not found]     ` <20070531180919.1810.30009.stgit-5CR4LY5GPkvLDviKLk5550HKjMygAv58XqFh9Ls21Oc@public.gmane.org>
  2007-05-31 18:09   ` [PATCH 7/9] KVM: Adds basic plumbing to support TPR shadow features Gregory Haskins
                     ` (5 subsequent siblings)
  11 siblings, 1 reply; 21+ messages in thread
From: Gregory Haskins @ 2007-05-31 18:09 UTC (permalink / raw)
  To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Signed-off-by: Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
---

 drivers/kvm/vmx.c |   56 +++++++++++++++++++++++++++++++++++++++++++++++++----
 drivers/kvm/vmx.h |    3 +++
 2 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 82e40c9..7923a42 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -1301,7 +1301,14 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
 			       PIN_BASED_VM_EXEC_CONTROL,
 			       PIN_BASED_EXT_INTR_MASK   /* 20.6.1 */
 			       | PIN_BASED_NMI_EXITING   /* 20.6.1 */
+			       | PIN_BASED_VIRTUAL_NMI   /* 20.6.1 */
 			);
+
+	if (!(vmcs_read32(PIN_BASED_VM_EXEC_CONTROL) & PIN_BASED_VIRTUAL_NMI))
+		printk(KERN_DEBUG "KVM: Warning - Host processor does " \
+		       "not support virtual-NMI injection.  Using IRQ " \
+		       "method\n");
+ 
 	vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS,
 			       CPU_BASED_VM_EXEC_CONTROL,
 			       CPU_BASED_HLT_EXITING         /* 20.6.2 */
@@ -1450,6 +1457,37 @@ static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq)
 	vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
 }
 
+static void do_nmi_requests(struct kvm_vcpu *vcpu)
+{
+	int nmi_window = 0;
+
+	BUG_ON(!(test_bit(kvm_irqpin_nmi, &vcpu->irq.pending)));
+
+	nmi_window =
+		(((vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 0xb) == 0)
+		 && (vmcs_read32(VM_ENTRY_INTR_INFO_FIELD)
+		     & INTR_INFO_VALID_MASK));
+
+	if (nmi_window) {
+		if (vcpu->rmode.active)
+			inject_rmode_irq(vcpu, 2);
+		else
+			vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+				     2 |
+				     INTR_TYPE_NMI |
+				     INTR_INFO_VALID_MASK);
+
+		__clear_bit(kvm_irqpin_nmi, &vcpu->irq.pending);
+	} else {
+		/*
+		 * NMIs blocked.  Wait for unblock.
+		 */
+		u32 cbvec = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+		cbvec |= CPU_BASED_NMI_EXITING;
+		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cbvec);	
+	}
+}
+
 static void do_intr_requests(struct kvm_vcpu *vcpu,
 			    struct kvm_run *kvm_run,
 			    kvm_irqpin_t pin)
@@ -1482,9 +1520,11 @@ static void do_intr_requests(struct kvm_vcpu *vcpu,
 			break;
 		case kvm_irqpin_nmi:
 			/*
-			 * FIXME: Someday we will handle this using the
-			 * specific VMX NMI features.  For now, just inject
-			 * the NMI as a standard interrupt on vector 2
+			 * We should only get here if the processor does
+			 * not support virtual NMIs.  Inject the NMI as a
+			 * standard interrupt on vector 2.  The implication is
+			 * that NMIs are going to be subject to RFLAGS.IF
+			 * masking, unfortunately.
 			 */
 			ack.flags |= KVM_IRQACKDATA_VECTOR_VALID;
 			ack.vector = 2;
@@ -1534,6 +1574,8 @@ static void clear_pending_controls(struct kvm_vcpu *vcpu,
  	else
  		cbvec &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
 
+	cbvec &= ~CPU_BASED_NMI_EXITING;
+
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cbvec);
 }
 
@@ -1550,7 +1592,6 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 		switch (pin) {
 		case kvm_irqpin_localint:
 		case kvm_irqpin_extint:
-		case kvm_irqpin_nmi:
 			do_intr_requests(vcpu, kvm_run, pin);
 			break;
 		case kvm_irqpin_smi:
@@ -1558,6 +1599,13 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 			printk(KERN_WARNING "KVM: dropping unhandled SMI\n");
 			__clear_bit(pin, &vcpu->irq.pending);
 			break;
+		case kvm_irqpin_nmi:
+			if (vmcs_read32(PIN_BASED_VM_EXEC_CONTROL)
+			    & PIN_BASED_VIRTUAL_NMI)
+				do_nmi_requests(vcpu);
+			else
+				do_intr_requests(vcpu, kvm_run, pin);	
+			break;
 		case kvm_irqpin_invalid:
 			/* drop */
 			break;
diff --git a/drivers/kvm/vmx.h b/drivers/kvm/vmx.h
index d0dc93d..d3fe017 100644
--- a/drivers/kvm/vmx.h
+++ b/drivers/kvm/vmx.h
@@ -35,6 +35,7 @@
 #define CPU_BASED_CR8_LOAD_EXITING      0x00080000
 #define CPU_BASED_CR8_STORE_EXITING     0x00100000
 #define CPU_BASED_TPR_SHADOW            0x00200000
+#define CPU_BASED_NMI_EXITING           0x00400000
 #define CPU_BASED_MOV_DR_EXITING        0x00800000
 #define CPU_BASED_UNCOND_IO_EXITING     0x01000000
 #define CPU_BASED_ACTIVATE_IO_BITMAP    0x02000000
@@ -44,6 +45,7 @@
 
 #define PIN_BASED_EXT_INTR_MASK 0x1
 #define PIN_BASED_NMI_EXITING   0x8
+#define PIN_BASED_VIRTUAL_NMI   0x20
 
 #define VM_EXIT_ACK_INTR_ON_EXIT        0x00008000
 #define VM_EXIT_HOST_ADD_SPACE_SIZE     0x00000200
@@ -221,6 +223,7 @@ enum vmcs_field {
 #define VECTORING_INFO_VALID_MASK       	INTR_INFO_VALID_MASK
 
 #define INTR_TYPE_EXT_INTR              (0 << 8) /* external interrupt */
+#define INTR_TYPE_NMI                   (2 << 8) /* non-maskable interrupt */
 #define INTR_TYPE_EXCEPTION             (3 << 8) /* processor exception */
 
 /*


-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 7/9] KVM: Adds basic plumbing to support TPR shadow features
       [not found] ` <20070531180005.1810.23884.stgit-5CR4LY5GPkvLDviKLk5550HKjMygAv58XqFh9Ls21Oc@public.gmane.org>
                     ` (5 preceding siblings ...)
  2007-05-31 18:09   ` [PATCH 6/9] KVM: Adds support for real NMI injection on VMX processors Gregory Haskins
@ 2007-05-31 18:09   ` Gregory Haskins
  2007-05-31 18:09   ` [PATCH 8/9] KVM: Add statistics from interrupt subsystem Gregory Haskins
                     ` (4 subsequent siblings)
  11 siblings, 0 replies; 21+ messages in thread
From: Gregory Haskins @ 2007-05-31 18:09 UTC (permalink / raw)
  To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Signed-off-by: Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
---

 drivers/kvm/irqdevice.h |    3 +++
 drivers/kvm/kvm.h       |    1 +
 drivers/kvm/lapic.c     |   17 +++++++++++++++++
 3 files changed, 21 insertions(+), 0 deletions(-)

diff --git a/drivers/kvm/irqdevice.h b/drivers/kvm/irqdevice.h
index 097d179..173313d 100644
--- a/drivers/kvm/irqdevice.h
+++ b/drivers/kvm/irqdevice.h
@@ -45,12 +45,14 @@ struct kvm_irqsink {
 
 #define KVM_IRQACKDATA_VECTOR_VALID   (1 << 0)
 #define KVM_IRQACKDATA_VECTOR_PENDING (1 << 1)
+#define KVM_IRQACKDATA_NEXT_VALID     (1 << 2)
 
 #define KVM_IRQACK_FLAG_PEEK          (1 << 0)
 
 struct kvm_irqack_data {
 	int flags;
 	int vector;
+	int next;
 };
 
 struct kvm_irqdevice {
@@ -92,6 +94,7 @@ static inline void kvm_irqdevice_init(struct kvm_irqdevice *dev)
  *               data.flags -
  *                  [KVM_IRQACKDATA_VECTOR_VALID - data.vector is valid]
  *                  [KVM_IRQACKDATA_VECTOR_PENDING - more vectors are pending]
+ *                  [KVM_IRQACKDATA_NEXT_VALID - next-vector is valid]
  *
  * Returns: (int)
  *   [-1 = failure]
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index 1f30274..76582e2 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -182,6 +182,7 @@ void kvm_lapic_save(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 void kvm_lapic_restore(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 void kvm_lapic_reset(struct kvm_vcpu *vcpu);
 int  kvm_lapic_enabled(struct kvm_vcpu *vcpu);
+void *kvm_lapic_get_regs(struct kvm_vcpu *vcpu);
 
 /*
  * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
diff --git a/drivers/kvm/lapic.c b/drivers/kvm/lapic.c
index 9f0ab7e..602e94c 100644
--- a/drivers/kvm/lapic.c
+++ b/drivers/kvm/lapic.c
@@ -1154,6 +1154,13 @@ int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
 	return ret;
 }
 
+void *kvm_lapic_get_regs(struct kvm_vcpu *vcpu)
+{
+	struct kvm_kern_apic *apic = (struct kvm_kern_apic*)vcpu->apic.dev;
+	return apic->regs;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_get_regs);
+
 /*
  *----------------------------------------------------------------------
  * timer interface
@@ -1286,6 +1293,16 @@ static int apic_irqdev_ack(struct kvm_irqdevice *this, int flags,
 	if ((irq & 0xf0) > apic_get_reg(apic, APIC_PROCPRI))
 		data->flags |= KVM_IRQACKDATA_VECTOR_PENDING;
 
+	if (irq) {
+		/*
+		 * We report the next pending vector here so that the system
+		 * can assess TPR thresholds for TPR-shadowing purposes
+		 * (if applicable)
+		 */
+		data->next   = irq;
+		data->flags |= KVM_IRQACKDATA_NEXT_VALID;
+	}
+
  out:
 	spin_unlock_bh(&apic->lock);
 


-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 8/9] KVM: Add statistics from interrupt subsystem
       [not found] ` <20070531180005.1810.23884.stgit-5CR4LY5GPkvLDviKLk5550HKjMygAv58XqFh9Ls21Oc@public.gmane.org>
                     ` (6 preceding siblings ...)
  2007-05-31 18:09   ` [PATCH 7/9] KVM: Adds basic plumbing to support TPR shadow features Gregory Haskins
@ 2007-05-31 18:09   ` Gregory Haskins
  2007-05-31 18:09   ` [PATCH 9/9] KVM: Adds support for halting in the kernel Gregory Haskins
                     ` (3 subsequent siblings)
  11 siblings, 0 replies; 21+ messages in thread
From: Gregory Haskins @ 2007-05-31 18:09 UTC (permalink / raw)
  To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Signed-off-by: Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
---

 drivers/kvm/kvm.h      |    6 ++++++
 drivers/kvm/kvm_main.c |   23 ++++++++++++++++++++---
 drivers/kvm/svm.c      |    2 ++
 drivers/kvm/vmx.c      |    2 ++
 4 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index 76582e2..57b6d14 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -279,6 +279,12 @@ struct kvm_stat {
 	u32 irq_exits;
 	u32 light_exits;
 	u32 efer_reload;
+	u32 irq_posted;
+	u32 irq_ignored;
+	u32 irq_accepted;
+	u32 guest_preempt;
+	u32 apic_mmio;
+	u32 local_mmio;
 };
 
 struct kvm_io_device {
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 2a1b376..79b6477 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -73,6 +73,12 @@ static struct kvm_stats_debugfs_item {
 	{ "irq_exits", STAT_OFFSET(irq_exits) },
 	{ "light_exits", STAT_OFFSET(light_exits) },
 	{ "efer_reload", STAT_OFFSET(efer_reload) },
+	{ "irq_posted", STAT_OFFSET(irq_posted) },
+	{ "irq_ignored", STAT_OFFSET(irq_ignored) },
+	{ "irq_accepted", STAT_OFFSET(irq_accepted) },
+	{ "guest_preempt", STAT_OFFSET(guest_preempt) },
+	{ "apic_mmio", STAT_OFFSET(apic_mmio) },
+	{ "local_mmio", STAT_OFFSET(local_mmio) },
 	{ NULL }
 };
 
@@ -1136,13 +1142,19 @@ static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
 	/*
 	 * First check if the LAPIC will snarf this request
 	 */
-	if (dev && dev->in_range(dev, addr))
+	if (dev && dev->in_range(dev, addr)) {
+		++vcpu->stat.apic_mmio;
 		return dev;
+	}
 
 	/*
 	 * And then fallback to allow any device to participate
 	 */
-	return kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
+	dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
+	if (dev)
+		++vcpu->stat.local_mmio;
+
+	return dev;
 }
 
 static int emulator_read_emulated(unsigned long addr,
@@ -2446,6 +2458,8 @@ static void kvm_vcpu_intr(struct kvm_irqsink *this,
 	struct kvm_vcpu *vcpu = (struct kvm_vcpu*)this->private;
 	int direct_ipi = -1;
 
+	++vcpu->stat.irq_posted;
+
 	spin_lock_irq(&vcpu->irq.lock);
 
 	if (!test_bit(pin, &vcpu->irq.pending)) {
@@ -2465,8 +2479,11 @@ static void kvm_vcpu_intr(struct kvm_irqsink *this,
 			 */
 			direct_ipi = vcpu->irq.guest_cpu;
 			BUG_ON(direct_ipi == smp_processor_id());
+			++vcpu->stat.guest_preempt;
 		}
-	}
+	} else
+		++vcpu->stat.irq_ignored;
+
 
 	spin_unlock_irq(&vcpu->irq.lock);
 
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
index ccc5856..61dfee2 100644
--- a/drivers/kvm/svm.c
+++ b/drivers/kvm/svm.c
@@ -1435,6 +1435,8 @@ static void do_intr_requests(struct kvm_vcpu *vcpu,
 			control->int_ctl |= V_IRQ_MASK |
 				((/*control->int_vector >> 4*/ 0xf) <<
 				 V_INTR_PRIO_SHIFT);
+
+			++vcpu->stat.irq_accepted;
 		}
 	}
 
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 7923a42..1dd8c9c 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -1545,6 +1545,8 @@ static void do_intr_requests(struct kvm_vcpu *vcpu,
 					     ack.vector |
 					     INTR_TYPE_EXT_INTR |
 					     INTR_INFO_VALID_MASK);
+
+			++vcpu->stat.irq_accepted;
 		}
 	}
 


-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 9/9] KVM: Adds support for halting in the kernel
       [not found] ` <20070531180005.1810.23884.stgit-5CR4LY5GPkvLDviKLk5550HKjMygAv58XqFh9Ls21Oc@public.gmane.org>
                     ` (7 preceding siblings ...)
  2007-05-31 18:09   ` [PATCH 8/9] KVM: Add statistics from interrupt subsystem Gregory Haskins
@ 2007-05-31 18:09   ` Gregory Haskins
       [not found]     ` <20070531180934.1810.45024.stgit-5CR4LY5GPkvLDviKLk5550HKjMygAv58XqFh9Ls21Oc@public.gmane.org>
  2007-06-02 22:04   ` [PATCH 0/9] in-kernel APIC v9 (kernel side) Dor Laor
                     ` (2 subsequent siblings)
  11 siblings, 1 reply; 21+ messages in thread
From: Gregory Haskins @ 2007-05-31 18:09 UTC (permalink / raw)
  To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Halting in userspace requires a relatively cumbersome mechanism to signal the
halted VCPU.  Implementing halt in the kernel is relatively straightforward
and eliminates the need for that signaling.

Signed-off-by: Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
---

 drivers/kvm/kvm.h      |    3 ++
 drivers/kvm/kvm_main.c |   85 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/kvm/svm.c      |    7 +---
 drivers/kvm/vmx.c      |    7 +---
 4 files changed, 90 insertions(+), 12 deletions(-)

diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index 57b6d14..7030f0d 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -275,6 +275,7 @@ struct kvm_stat {
 	u32 signal_exits;
 	u32 irq_window_exits;
 	u32 halt_exits;
+	u32 halt_wakeup;
 	u32 request_irq_exits;
 	u32 irq_exits;
 	u32 light_exits;
@@ -357,6 +358,7 @@ struct kvm_vcpu_irq {
 	int                  pending;
 	int                  deferred;
 	int                  guest_cpu;
+	wait_queue_head_t    wq;
 };
 
 struct kvm_lapic {
@@ -636,6 +638,7 @@ void kvm_mmu_module_exit(void);
 
 int kvm_apicbus_send(struct kvm *kvm, int dest, int trig_mode, int level,
 		     int dest_mode, int delivery_mode, int vector);
+int kvm_vcpu_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
 
 void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
 int kvm_mmu_create(struct kvm_vcpu *vcpu);
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 79b6477..59f94cf 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -69,6 +69,7 @@ static struct kvm_stats_debugfs_item {
 	{ "signal_exits", STAT_OFFSET(signal_exits) },
 	{ "irq_window", STAT_OFFSET(irq_window_exits) },
 	{ "halt_exits", STAT_OFFSET(halt_exits) },
+	{ "halt_wakeup", STAT_OFFSET(halt_wakeup) },
 	{ "request_irq", STAT_OFFSET(request_irq_exits) },
 	{ "irq_exits", STAT_OFFSET(irq_exits) },
 	{ "light_exits", STAT_OFFSET(light_exits) },
@@ -334,6 +335,7 @@ static struct kvm *kvm_create_vm(void)
 		memset(&vcpu->irq, 0, sizeof(vcpu->irq));
 		spin_lock_init(&vcpu->irq.lock);
 		vcpu->irq.deferred = -1;
+		init_waitqueue_head(&vcpu->irq.wq);
 
 		vcpu->cpu = -1;
 		vcpu->kvm = kvm;
@@ -2434,6 +2436,79 @@ out1:
 }
 
 /*
+ * The vCPU has executed a HLT instruction with in-kernel mode enabled.
+ */
+static int kvm_vcpu_kern_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	int r = 1;
+
+	spin_lock_irq(&vcpu->irq.lock);
+	__add_wait_queue(&vcpu->irq.wq, &wait);
+
+	/*
+	 * We will block until either an interrupt or a signal wakes us up
+	 */
+	while(!__kvm_vcpu_irq_pending(vcpu)
+	      && !signal_pending(current)
+	      && !kvm_run->request_interrupt_window) {
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		spin_unlock_irq(&vcpu->irq.lock);
+		vcpu_put(vcpu);
+
+		schedule();
+
+		vcpu_load(vcpu);
+		spin_lock_irq(&vcpu->irq.lock);
+	}
+
+	/*
+	 * If userspace is waiting for an injection point, we cant sleep here
+	 */
+	if (kvm_run->request_interrupt_window
+	    && !__kvm_vcpu_irq_pending(vcpu)) {
+		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
+		r = 0;
+	}
+
+	__remove_wait_queue(&vcpu->irq.wq, &wait);
+	__set_current_state(TASK_RUNNING);
+	spin_unlock_irq(&vcpu->irq.lock);
+
+	return r;
+}
+
+/*
+ * The vCPU has executed a HLT instruction.
+ */
+int kvm_vcpu_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	int r = -EINVAL;
+
+	++vcpu->stat.halt_exits;
+
+	if (vcpu->kvm->enable_kernel_pic)
+		/*
+		 * If the in-kernel PIC is enabled, we will perform HLT
+		 * in-kernel as well
+		 */
+		r = kvm_vcpu_kern_halt(vcpu, kvm_run);
+	else {
+		/*
+		 * Else, we decide to go back to userspace or vmenter depending
+		 * on whether there are interrupts currently pending or not
+		 */
+		r = kvm_vcpu_irq_pending(vcpu) ? 1 : 0;
+		if (!r)
+			kvm_run->exit_reason = KVM_EXIT_HLT;
+	}
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
+
+/*
  * This function is invoked whenever we want to interrupt a vcpu that is
  * currently executing in guest-mode.  It currently is a no-op because
  * the simple delivery of the IPI to execute this function accomplishes our
@@ -2481,6 +2556,16 @@ static void kvm_vcpu_intr(struct kvm_irqsink *this,
 			BUG_ON(direct_ipi == smp_processor_id());
 			++vcpu->stat.guest_preempt;
 		}
+
+		/*
+		 * If the CPU is halted it will be waiting for a wake-up
+		 */
+		if (waitqueue_active(&vcpu->irq.wq)) {
+			wake_up_interruptible_sync(&vcpu->irq.wq);
+			set_tsk_need_resched(current);
+			++vcpu->stat.halt_wakeup;
+		}
+
 	} else
 		++vcpu->stat.irq_ignored;
 
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
index 61dfee2..bdc5d98 100644
--- a/drivers/kvm/svm.c
+++ b/drivers/kvm/svm.c
@@ -1098,12 +1098,7 @@ static int halt_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 1;
 	skip_emulated_instruction(vcpu);
-	if (kvm_vcpu_irq_pending(vcpu))
-		return 1;
-
-	kvm_run->exit_reason = KVM_EXIT_HLT;
-	++vcpu->stat.halt_exits;
-	return 0;
+	return kvm_vcpu_halt(vcpu, kvm_run);
 }
 
 static int vmmcall_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 1dd8c9c..b7d756d 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -2022,12 +2022,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
 static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	skip_emulated_instruction(vcpu);
-	if (kvm_vcpu_irq_pending(vcpu))
-		return 1;
-
-	kvm_run->exit_reason = KVM_EXIT_HLT;
-	++vcpu->stat.halt_exits;
-	return 0;
+	return kvm_vcpu_halt(vcpu, kvm_run);
 }
 
 static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)


-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH 6/9] KVM: Adds support for real NMI injection onVMX processors
       [not found]     ` <20070531180919.1810.30009.stgit-5CR4LY5GPkvLDviKLk5550HKjMygAv58XqFh9Ls21Oc@public.gmane.org>
@ 2007-06-01  2:43       ` Li, Xin B
  0 siblings, 0 replies; 21+ messages in thread
From: Li, Xin B @ 2007-06-01  2:43 UTC (permalink / raw)
  To: Gregory Haskins, kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

I don't think this patch is complete enough to add NMI support to KVM; can
you please exclude it from the current APIC patchset?  We can revisit it
later with more thought.

-Xin 

>-----Original Message-----
>From: kvm-devel-bounces-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org 
>[mailto:kvm-devel-bounces-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org] On Behalf Of 
>Gregory Haskins
>Sent: Friday, June 01, 2007 2:09 AM
>To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org
>Subject: [kvm-devel] [PATCH 6/9] KVM: Adds support for real 
>NMI injection onVMX processors
>
>Signed-off-by: Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
>---
>
> drivers/kvm/vmx.c |   56 
>+++++++++++++++++++++++++++++++++++++++++++++++++----
> drivers/kvm/vmx.h |    3 +++
> 2 files changed, 55 insertions(+), 4 deletions(-)
>
>diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
>index 82e40c9..7923a42 100644
>--- a/drivers/kvm/vmx.c
>+++ b/drivers/kvm/vmx.c
>@@ -1301,7 +1301,14 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
> 			       PIN_BASED_VM_EXEC_CONTROL,
> 			       PIN_BASED_EXT_INTR_MASK   /* 20.6.1 */
> 			       | PIN_BASED_NMI_EXITING   /* 20.6.1 */
>+			       | PIN_BASED_VIRTUAL_NMI   /* 20.6.1 */
> 			);
>+
>+	if (!(vmcs_read32(PIN_BASED_VM_EXEC_CONTROL) & 
>PIN_BASED_VIRTUAL_NMI))
>+		printk(KERN_DEBUG "KVM: Warning - Host 
>processor does " \
>+		       "not support virtual-NMI injection.  
>Using IRQ " \
>+		       "method\n");
>+ 
> 	vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS,
> 			       CPU_BASED_VM_EXEC_CONTROL,
> 			       CPU_BASED_HLT_EXITING         /* 
>20.6.2 */
>@@ -1450,6 +1457,37 @@ static void inject_rmode_irq(struct 
>kvm_vcpu *vcpu, int irq)
> 	vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & 
>~0xffff) | (sp - 6));
> }
> 
>+static void do_nmi_requests(struct kvm_vcpu *vcpu)
>+{
>+	int nmi_window = 0;
>+
>+	BUG_ON(!(test_bit(kvm_irqpin_nmi, &vcpu->irq.pending)));
>+
>+	nmi_window =
>+		(((vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 0xb) == 0)
>+		 && (vmcs_read32(VM_ENTRY_INTR_INFO_FIELD)
>+		     & INTR_INFO_VALID_MASK));
>+
>+	if (nmi_window) {
>+		if (vcpu->rmode.active)
>+			inject_rmode_irq(vcpu, 2);
>+		else
>+			vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
>+				     2 |
>+				     INTR_TYPE_NMI |
>+				     INTR_INFO_VALID_MASK);
>+
>+		__clear_bit(kvm_irqpin_nmi, &vcpu->irq.pending);
>+	} else {
>+		/*
>+		 * NMIs blocked.  Wait for unblock.
>+		 */
>+		u32 cbvec = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
>+		cbvec |= CPU_BASED_NMI_EXITING;
>+		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cbvec);	
>+	}
>+}
>+
> static void do_intr_requests(struct kvm_vcpu *vcpu,
> 			    struct kvm_run *kvm_run,
> 			    kvm_irqpin_t pin)
>@@ -1482,9 +1520,11 @@ static void do_intr_requests(struct 
>kvm_vcpu *vcpu,
> 			break;
> 		case kvm_irqpin_nmi:
> 			/*
>-			 * FIXME: Someday we will handle this using the
>-			 * specific VMX NMI features.  For now, 
>just inject
>-			 * the NMI as a standard interrupt on vector 2
>+			 * We should only get here if the processor does
>+			 * not support virtual NMIs.  Inject 
>the NMI as a
>+			 * standard interrupt on vector 2.  The 
>implication is
>+			 * that NMIs are going to be subject to 
>RFLAGS.IF
>+			 * masking, unfortunately.
> 			 */
> 			ack.flags |= KVM_IRQACKDATA_VECTOR_VALID;
> 			ack.vector = 2;
>@@ -1534,6 +1574,8 @@ static void 
>clear_pending_controls(struct kvm_vcpu *vcpu,
>  	else
>  		cbvec &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
> 
>+	cbvec &= ~CPU_BASED_NMI_EXITING;
>+
> 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cbvec);
> }
> 
>@@ -1550,7 +1592,6 @@ static void do_interrupt_requests(struct 
>kvm_vcpu *vcpu,
> 		switch (pin) {
> 		case kvm_irqpin_localint:
> 		case kvm_irqpin_extint:
>-		case kvm_irqpin_nmi:
> 			do_intr_requests(vcpu, kvm_run, pin);
> 			break;
> 		case kvm_irqpin_smi:
>@@ -1558,6 +1599,13 @@ static void 
>do_interrupt_requests(struct kvm_vcpu *vcpu,
> 			printk(KERN_WARNING "KVM: dropping 
>unhandled SMI\n");
> 			__clear_bit(pin, &vcpu->irq.pending);
> 			break;
>+		case kvm_irqpin_nmi:
>+			if (vmcs_read32(PIN_BASED_VM_EXEC_CONTROL)
>+			    & PIN_BASED_VIRTUAL_NMI)
>+				do_nmi_requests(vcpu);
>+			else
>+				do_intr_requests(vcpu, kvm_run, pin);	
>+			break;
> 		case kvm_irqpin_invalid:
> 			/* drop */
> 			break;
>diff --git a/drivers/kvm/vmx.h b/drivers/kvm/vmx.h
>index d0dc93d..d3fe017 100644
>--- a/drivers/kvm/vmx.h
>+++ b/drivers/kvm/vmx.h
>@@ -35,6 +35,7 @@
> #define CPU_BASED_CR8_LOAD_EXITING      0x00080000
> #define CPU_BASED_CR8_STORE_EXITING     0x00100000
> #define CPU_BASED_TPR_SHADOW            0x00200000
>+#define CPU_BASED_NMI_EXITING           0x00400000
> #define CPU_BASED_MOV_DR_EXITING        0x00800000
> #define CPU_BASED_UNCOND_IO_EXITING     0x01000000
> #define CPU_BASED_ACTIVATE_IO_BITMAP    0x02000000
>@@ -44,6 +45,7 @@
> 
> #define PIN_BASED_EXT_INTR_MASK 0x1
> #define PIN_BASED_NMI_EXITING   0x8
>+#define PIN_BASED_VIRTUAL_NMI   0x20
> 
> #define VM_EXIT_ACK_INTR_ON_EXIT        0x00008000
> #define VM_EXIT_HOST_ADD_SPACE_SIZE     0x00000200
>@@ -221,6 +223,7 @@ enum vmcs_field {
> #define VECTORING_INFO_VALID_MASK       	INTR_INFO_VALID_MASK
> 
> #define INTR_TYPE_EXT_INTR              (0 << 8) /* external 
>interrupt */
>+#define INTR_TYPE_NMI                   (2 << 8) /* 
>non-maskable interrupt */
> #define INTR_TYPE_EXCEPTION             (3 << 8) /* processor 
>exception */
> 
> /*
>
>
>---------------------------------------------------------------
>----------
>This SF.net email is sponsored by DB2 Express
>Download DB2 Express C - the FREE version of DB2 express and take
>control of your XML. No limits. Just data. Click to get it now.
>http://sourceforge.net/powerbar/db2/
>_______________________________________________
>kvm-devel mailing list
>kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org
>https://lists.sourceforge.net/lists/listinfo/kvm-devel
>

-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 0/9] in-kernel APIC v9 (kernel side)
       [not found] ` <20070531180005.1810.23884.stgit-5CR4LY5GPkvLDviKLk5550HKjMygAv58XqFh9Ls21Oc@public.gmane.org>
                     ` (8 preceding siblings ...)
  2007-05-31 18:09   ` [PATCH 9/9] KVM: Adds support for halting in the kernel Gregory Haskins
@ 2007-06-02 22:04   ` Dor Laor
  2007-06-03  9:28   ` Avi Kivity
  2007-06-07  8:20   ` Dong, Eddie
  11 siblings, 0 replies; 21+ messages in thread
From: Dor Laor @ 2007-06-02 22:04 UTC (permalink / raw)
  To: Gregory Haskins, kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

>Incorporates v8 plus the following changes:
>
>1) Fix for hang on AMD
>2) Fixes issue where irq-windows are inaccurately reported to userspace
>3) Fixed issue where irq-window-exiting requests can be ignored in some
>cases
>
>Note that we no longer need the backlog.patch to handle a corner cases
now.
>
>As before, this has been tested on 32 bit XP w/ACPI and 64 bit windows.
It
>offers a 17% performance improvement over git HEAD in my testing.  Note
>that I
>am not able to fully verify that this works on AMD, as even git-head
does
>not
>work on my system.  I am able to verify that it no longer hangs the
kernel
>hard.  The guest hangs, but it hangs without my patches as well.
Perhaps
>someone with a known good environment on AMD can verify for me?

We'll re-run the regression tests, and if the results are OK we'll finally
commit your patch set ;)

>I am being pulled off of my KVM work for a little while, so I will not
be
>able
>to contribute again until further notice. If there are any remaining
issues
>that need to be addressed and someone wants to carry the torch, feel
free
>to
>do so.  Otherwise, I will pick up the effort to get this merged in when
I
>am
>able to return to KVM.
>
>Thanks all for the feedback/comments/suggestions through all of this.
It

Thanks for the important patches, without your help the apic would have
stayed in user space for a long time.

>has
>been very fun and quite a learning experience.
>
>-Greg
>
>
>-----------------------------------------------------------------------
--
>This SF.net email is sponsored by DB2 Express
>Download DB2 Express C - the FREE version of DB2 express and take
>control of your XML. No limits. Just data. Click to get it now.
>http://sourceforge.net/powerbar/db2/
>_______________________________________________
>kvm-devel mailing list
>kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org
>https://lists.sourceforge.net/lists/listinfo/kvm-devel

-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 0/9] in-kernel APIC v9 (kernel side)
       [not found] ` <20070531180005.1810.23884.stgit-5CR4LY5GPkvLDviKLk5550HKjMygAv58XqFh9Ls21Oc@public.gmane.org>
                     ` (9 preceding siblings ...)
  2007-06-02 22:04   ` [PATCH 0/9] in-kernel APIC v9 (kernel side) Dor Laor
@ 2007-06-03  9:28   ` Avi Kivity
       [not found]     ` <466289A4.9000201-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  2007-06-07  8:20   ` Dong, Eddie
  11 siblings, 1 reply; 21+ messages in thread
From: Avi Kivity @ 2007-06-03  9:28 UTC (permalink / raw)
  To: Gregory Haskins; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Gregory Haskins wrote:
> Incorporates v8 plus the following changes:
>
> 1) Fix for hang on AMD
> 2) Fixes issue where irq-windows are inaccurately reported to userspace
> 3) Fixed issue where irq-window-exiting requests can be ignored in some cases
>
>   

FC6 x86_64 hangs on this (AMD) after 'Detected 62.502 MHz APIC Timer.'.  
At least it doesn't kill the machine -- just the guest.  I'll try to see 
what's wrong.

Note that this is with just the kernel patches applied.

The patchset is available as the lapic branch in kvm.git (with some 
whitespace damage fixed).

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 0/9] in-kernel APIC v9 (kernel side)
       [not found]     ` <466289A4.9000201-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-06-03 15:39       ` Avi Kivity
  0 siblings, 0 replies; 21+ messages in thread
From: Avi Kivity @ 2007-06-03 15:39 UTC (permalink / raw)
  To: Gregory Haskins; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

[-- Attachment #1: Type: text/plain, Size: 1481 bytes --]

Avi Kivity wrote:
> Gregory Haskins wrote:
>> Incorporates v8 plus the following changes:
>>
>> 1) Fix for hang on AMD
>> 2) Fixes issue where irq-windows are inaccurately reported to userspace
>> 3) Fixed issue where irq-window-exiting requests can be ignored in 
>> some cases
>>
>>   
>
> FC6 x86_64 hangs on this (AMD) after 'Detected 62.502 MHz APIC 
> Timer.'.  At least it doesn't kill the machine -- just the guest.  
> I'll try to see what's wrong.
>
> Note that this is with just the kernel patches applied.
>
> The patchset is available as the lapic branch in kvm.git (with some 
> whitespace damage fixed).
>

Things are a little better with the attached patch.

On AMD, to detect the interrupt window opening, we queue an interrupt 
and then ask for an intercept immediately before interrupt dispatching.  
Effectively that means an intercept after sti, as on vmx.

The problem occurs when we want to inject an interrupt _and_ request an 
interrupt window.  Using the current code, we loop immediately because 
we queue the requested interrupt, then ask for an intercept when it is 
dispatched.  A complex way to spin.

The fix is to inject the interrupt instead of queueing it.  Injected 
events are not intercepted, so we can ask for an interrupt window 
concurrently with injecting an interrupt.

However, there are still problems (like Windows spontaneously 
rebooting).  Will investigate further.

-- 
error compiling committee.c: too many arguments to function


[-- Attachment #2: lapic-amd-fix.patch --]
[-- Type: text/x-patch, Size: 634 bytes --]

diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
index bdc5d98..651c860 100644
--- a/drivers/kvm/svm.c
+++ b/drivers/kvm/svm.c
@@ -1425,13 +1425,18 @@ static void do_intr_requests(struct kvm_vcpu *vcpu,
 
 		if (ack.flags & KVM_IRQACKDATA_VECTOR_VALID) {
 			control = &vcpu->svm->vmcb->control;
+#if 0
 			control->int_vector = ack.vector;
 			control->int_ctl &= ~V_INTR_PRIO_MASK;
 			control->int_ctl |= V_IRQ_MASK |
 				((/*control->int_vector >> 4*/ 0xf) <<
 				 V_INTR_PRIO_SHIFT);
+#endif
+			control->event_inj = ack.vector
+				| SVM_EVTINJ_VALID
+				| SVM_EVTINJ_TYPE_INTR;
 
 			++vcpu->stat.irq_accepted;
 		}
 	}
 

[-- Attachment #3: Type: text/plain, Size: 286 bytes --]

-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/

[-- Attachment #4: Type: text/plain, Size: 186 bytes --]

_______________________________________________
kvm-devel mailing list
kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org
https://lists.sourceforge.net/lists/listinfo/kvm-devel

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH 3/9] KVM: Add irqdevice object
       [not found]     ` <20070531180903.1810.87474.stgit-5CR4LY5GPkvLDviKLk5550HKjMygAv58XqFh9Ls21Oc@public.gmane.org>
@ 2007-06-04  7:13       ` Dong, Eddie
  0 siblings, 0 replies; 21+ messages in thread
From: Dong, Eddie @ 2007-06-04  7:13 UTC (permalink / raw)
  To: Gregory Haskins, kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Greg:
	Can you explain a bit why we need to distinguish
kvm_irqpin_extint from kvm_irqpin_localint?  I was not on the previous
thread about this.  I see the latest v09 has a hole here, e.g. in
handle_exception when IRQ injection fails: IDT_VECTORING_INFO_FIELD only
pushes a localint back, but not an extint.

	In Xen, when we implemented the APIC patch, we simplified it by
requiring that a single IRQ be delivered by either the PIC or the APIC;
we never handle the combination of both the PIC and the APIC servicing
them at the same time.  I.e., if the APIC is taking the IRQ, the PIC
must already mask the pin.  That has worked fine so far.  I am just
wondering if we should start from the simple model and improve it later
if we find problems.  The same goes for NMI/SMI handling.

	
	BTW, I am not sure if we should have this abstraction now; the irq
handling path was quite tricky when we did this in Xen.
thx,eddie


kvm-devel-bounces-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org wrote:
> The current code is geared towards using a user-mode (A)PIC.
> This patch adds
> an "irqdevice" abstraction, and implements a "userint" model
> to handle the
> duties of the original code.  Later, we can develop other
> irqdevice models
> to handle objects like LAPIC, IOAPIC, i8259, etc, as appropriate
> 
> Signed-off-by: Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org> ---
> 
> drivers/kvm/Makefile    |    2
> drivers/kvm/irqdevice.h |  176 +++++++++++++++++++++++++++++++++++++
> drivers/kvm/kvm.h       |   94 +++++++++++++++++++-
> drivers/kvm/kvm_main.c  |   58 +++++++++---
> drivers/kvm/svm.c       |  168 ++++++++++++++++++++++++++---------
> drivers/kvm/userint.c   |  223
> +++++++++++++++++++++++++++++++++++++++++++++++
> drivers/kvm/vmx.c       |  171 +++++++++++++++++++++++++++---------
> 7 files changed, 786 insertions(+), 106 deletions(-)
> 
> diff --git a/drivers/kvm/Makefile b/drivers/kvm/Makefile
> index c0a789f..540afbc 100644
> --- a/drivers/kvm/Makefile
> +++ b/drivers/kvm/Makefile
> @@ -2,7 +2,7 @@
> # Makefile for Kernel-based Virtual Machine module
> #
> 
> -kvm-objs := kvm_main.o mmu.o x86_emulate.o
> +kvm-objs := kvm_main.o mmu.o x86_emulate.o userint.o
> obj-$(CONFIG_KVM) += kvm.o kvm-intel-objs = vmx.o
> obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
> diff --git a/drivers/kvm/irqdevice.h b/drivers/kvm/irqdevice.h
> new file mode 100644
> index 0000000..097d179
> --- /dev/null
> +++ b/drivers/kvm/irqdevice.h
> @@ -0,0 +1,176 @@
> +/*
> + * Defines an interface for an abstract interrupt controller.
> The model
> + * consists of a unit with an arbitrary number of input lines
> N (IRQ0-(N-1)),
> + * an arbitrary number of output lines (INTR) (LINT, EXTINT,
> NMI, etc), and
> + * methods for completing an interrupt-acknowledge cycle (INTA).  A
> particular + * implementation of this model will define various
> policies, such as + * irq-to-vector translation, INTA/auto-EOI
> policy, etc. + * + * In addition, the INTR callback mechanism allows
> the unit 
> to be "wired" to
> + * an interruptible source in a very flexible manner. For instance,
> an + * irqdevice could have its INTR wired to a VCPU (ala LAPIC),
> or another
> + * interrupt controller (ala cascaded i8259s)
> + *
> + * Copyright (C) 2007 Novell
> + *
> + * Authors:
> + *   Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
> + *
> + * This work is licensed under the terms of the GNU GPL,
> version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + */
> +
> +#ifndef __IRQDEVICE_H
> +#define __IRQDEVICE_H
> +
> +struct kvm_irqdevice;
> +
> +typedef enum {
> +	kvm_irqpin_localint,
> +	kvm_irqpin_extint,
> +	kvm_irqpin_smi,
> +	kvm_irqpin_nmi,
> +	kvm_irqpin_invalid, /* must always be last */
> +} kvm_irqpin_t;
> +
> +
> +struct kvm_irqsink {
> +	void (*set_intr)(struct kvm_irqsink *this,
> +			 struct kvm_irqdevice *dev,
> +			 kvm_irqpin_t pin);
> +
> +	void *private;
> +};
> +
> +#define KVM_IRQACKDATA_VECTOR_VALID   (1 << 0)
> +#define KVM_IRQACKDATA_VECTOR_PENDING (1 << 1)
> +
> +#define KVM_IRQACK_FLAG_PEEK          (1 << 0)
> +
> +struct kvm_irqack_data {
> +	int flags;
> +	int vector;
> +};
> +
> +struct kvm_irqdevice {
> +	int  (*ack)(struct kvm_irqdevice *this, int flags,
> +		    struct kvm_irqack_data *data);
> +	int  (*set_pin)(struct kvm_irqdevice *this, int pin, int level);
> +	void (*destructor)(struct kvm_irqdevice *this);
> +
> +	void               *private;
> +	struct kvm_irqsink  sink;
> +};
> +
> +/**
> + * kvm_irqdevice_init - initialize the kvm_irqdevice for use + *
> @dev: The device + *
> + * Description: Initialize the kvm_irqdevice for use.  Should
> be called before
> + *              calling any derived implementation init functions + *
> + * Returns: (void)
> + */
> +static inline void kvm_irqdevice_init(struct kvm_irqdevice *dev) +{
> +	memset(dev, 0, sizeof(*dev));
> +}
> +
> +/**
> + * kvm_irqdevice_ack - read and ack the highest priority
> vector from the device
> + * @dev: The device
> + * @flags: Modifies default behavior
> + *           [ KVM_IRQACK_FLAG_PEEK - Dont ack vector, just
> check status ]
> + * @data: A pointer to a kvm_irqack_data structure to hold the
> result + * + * Description: Read the highest priority pending vector
> from 
> the device,
> + *              potentially invoking auto-EOI depending on
> device policy
> + *
> + *              Successful return indicates that the *data*
> structure is valid
> + *
> + *               data.flags -
> + *                  [KVM_IRQACKDATA_VECTOR_VALID -
> data.vector is valid]
> + *                  [KVM_IRQACKDATA_VECTOR_PENDING - more
> vectors are pending]
> + *
> + * Returns: (int)
> + *   [-1 = failure]
> + *   [ 0 = success]
> + */
> +static inline int kvm_irqdevice_ack(struct kvm_irqdevice *dev, int
> flags, +				    struct kvm_irqack_data
*data)
> +{
> +	return dev->ack(dev, flags, data);
> +}
> +
> +/**
> + * kvm_irqdevice_set_pin - allows the caller to assert/deassert an
> IRQ + * @dev: The device + * @pin: The input pin to alter
> + * @level: The value to set (1 = assert, 0 = deassert) + *
> + * Description: Allows the caller to assert/deassert an IRQ
> input pin to the
> + *              device according to device policy. + *
> + * Returns: (int)
> + *   [-1 = failure]
> + *   [ 0 = success]
> + */
> +static inline int kvm_irqdevice_set_pin(struct kvm_irqdevice *dev,
> int pin, +				  int level)
> +{
> +	return dev->set_pin(dev, pin, level);
> +}
> +
> +/**
> + * kvm_irqdevice_register_sink - registers an kvm_irqsink object + *
> @dev: The device + * @sink: The sink to register.  Data will be
> copied so 
> building object from
> + *        transient storage is ok.
> + *
> + * Description: Registers an kvm_irqsink object as an INTR callback
> + * + * Returns: (void)
> + */
> +static inline void kvm_irqdevice_register_sink(struct
> kvm_irqdevice *dev,
> +					       const struct
> kvm_irqsink *sink)
> +{
> +	dev->sink = *sink;
> +}
> +
> +/**
> + * kvm_irqdevice_destructor - destroys an irqdevice + * @dev: The
> device + *
> + * Returns: (void)
> + */
> +static inline void kvm_irqdevice_destructor(struct kvm_irqdevice
> *dev) +{ +	dev->destructor(dev);
> +}
> +
> +/**
> + * kvm_irqdevice_set_intr - invokes a registered INTR callback + *
> @dev: The device + * @pin: Identifies the pin to alter -
> + *           [ KVM_IRQPIN_LOCALINT (default) - an vector is
> pending on this
> + *                                             device]
> + *           [ KVM_IRQPIN_EXTINT - a vector is pending on an
> external device]
> + *           [ KVM_IRQPIN_SMI - system-management-interrupt pin]
> + *           [ KVM_IRQPIN_NMI - non-maskable-interrupt pin + *
> + * Description: Invokes a registered INTR callback (if present). 
> This + *              function is meant to be used privately by a
> irqdevice + *              implementation.
> + *
> + * Returns: (void)
> + */
> +static inline void kvm_irqdevice_set_intr(struct kvm_irqdevice *dev,
> +					  kvm_irqpin_t pin) +{
> +	struct kvm_irqsink *sink = &dev->sink;
> +	if (sink->set_intr)
> +		sink->set_intr(sink, dev, pin);
> +}
> +
> +#endif /*  __IRQDEVICE_H */
> diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
> index 1aa20ff..78025c3 100644
> --- a/drivers/kvm/kvm.h
> +++ b/drivers/kvm/kvm.h
> @@ -15,6 +15,7 @@
> #include <linux/mm.h>
> 
> #include "vmx.h"
> +#include "irqdevice.h"
> #include <linux/kvm.h>
> #include <linux/kvm_para.h>
> 
> @@ -162,6 +163,11 @@ struct vmcs {
> 
> struct kvm_vcpu;
> 
> +int kvm_user_irqdev_init(struct kvm_irqdevice *dev);
> +int kvm_user_irqdev_save(struct kvm_irqdevice *this, void *data);
> +int kvm_user_irqdev_restore(struct kvm_irqdevice *this, void *data);
> +int kvm_userint_init(struct kvm_vcpu *vcpu);
> +
> /*
>  * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and
> 2-level 
>  * 32-bit).  The kvm_mmu structure abstracts the details of
> the current mmu
> @@ -318,6 +324,18 @@ struct kvm_io_device
> *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
> void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
> 			     struct kvm_io_device *dev);
> 
> +#define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long) +
> +/*
> + * structure for maintaining info for interrupting an executing VCPU
> + */ +struct kvm_vcpu_irq {
> +	spinlock_t           lock;
> +	struct kvm_irqdevice dev;
> +	int                  pending;
> +	int                  deferred;
> +};
> +
> struct kvm_vcpu {
> 	struct kvm *kvm;
> 	union {
> @@ -330,9 +348,7 @@ struct kvm_vcpu {
> 	u64 host_tsc;
> 	struct kvm_run *run;
> 	int interrupt_window_open;
> -	unsigned long irq_summary; /* bit vector: 1 per word in
> irq_pending */
> -#define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long)
> -	unsigned long irq_pending[NR_IRQ_WORDS];
> +	struct kvm_vcpu_irq irq;
> 	unsigned long regs[NR_VCPU_REGS]; /* for rsp:
> vcpu_load_rsp_rip() */
> 	unsigned long rip;      /* needs vcpu_load_rsp_rip() */
> 
> @@ -410,6 +426,78 @@ struct kvm_vcpu {
> 	struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES]; };
> 
> +/*
> + * These two functions are helpers for determining if a
> standard interrupt
> + * is pending to replace the old "if (vcpu->irq_summary)" logic. + */
> +
> +/*
> + * Assumes lock already held
> + */
> +static inline int __kvm_vcpu_irq_pending(struct kvm_vcpu *vcpu) +{
> +	int pending = vcpu->irq.pending;
> +
> +	if (vcpu->irq.deferred != -1)
> +		__set_bit(kvm_irqpin_localint, &pending);
> +
> +	return pending;
> +}
> +
> +static inline int kvm_vcpu_irq_pending(struct kvm_vcpu *vcpu) +{
> +	int ret = 0;
> +	int flags;
> +
> +	spin_lock_irqsave(&vcpu->irq.lock, flags);
> +	ret = __kvm_vcpu_irq_pending(vcpu);
> +	spin_unlock_irqrestore(&vcpu->irq.lock, flags);
> +
> +	return ret;
> +}
> +
> +/*
> + * Assumes lock already held
> + */
> +static inline int kvm_vcpu_irq_pop(struct kvm_vcpu *vcpu,
> +				   struct kvm_irqack_data *data)
> +{
> +	int ret = 0;
> +
> +	if (vcpu->irq.deferred != -1) {
> +		ret = kvm_irqdevice_ack(&vcpu->irq.dev,
> KVM_IRQACK_FLAG_PEEK,
> +					data);
> +		data->flags |= KVM_IRQACKDATA_VECTOR_VALID;
> +		data->vector = vcpu->irq.deferred;
> +		vcpu->irq.deferred = -1;
> +	} else
> +		ret = kvm_irqdevice_ack(&vcpu->irq.dev, 0, data); +
> +	/*
> +	 * If there are no more interrupts we must clear the status flag
+	
> */ +	if (!(data->flags & KVM_IRQACKDATA_VECTOR_PENDING))
> +		__clear_bit(kvm_irqpin_localint, &vcpu->irq.pending); +
> +	return ret;
> +}
> +
> +static inline void __kvm_vcpu_irq_push(struct kvm_vcpu *vcpu, int
> irq) +{ +	BUG_ON(vcpu->irq.deferred != -1); /* We can only hold
> one deferred */
> +
> +	vcpu->irq.deferred = irq;
> +}
> +
> +static inline void kvm_vcpu_irq_push(struct kvm_vcpu *vcpu, int irq)
> +{ +	int flags;
> +
> +	spin_lock_irqsave(&vcpu->irq.lock, flags);
> +	__kvm_vcpu_irq_push(vcpu, irq);
> +	spin_unlock_irqrestore(&vcpu->irq.lock, flags);
> +}
> +
> struct kvm_mem_alias {
> 	gfn_t base_gfn;
> 	unsigned long npages;
> diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
> index 008e898..dfab3f3 100644
> --- a/drivers/kvm/kvm_main.c
> +++ b/drivers/kvm/kvm_main.c
> @@ -323,6 +323,11 @@ static struct kvm *kvm_create_vm(void)
> 		struct kvm_vcpu *vcpu = &kvm->vcpus[i];
> 
> 		mutex_init(&vcpu->mutex);
> +
> +		memset(&vcpu->irq, 0, sizeof(vcpu->irq));
> +		spin_lock_init(&vcpu->irq.lock);
> +		vcpu->irq.deferred = -1;
> +
> 		vcpu->cpu = -1;
> 		vcpu->kvm = kvm;
> 		vcpu->mmu.root_hpa = INVALID_PAGE;
> @@ -389,6 +394,7 @@ static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
> 	vcpu_load(vcpu); kvm_mmu_destroy(vcpu);
> 	vcpu_put(vcpu);
> +	kvm_irqdevice_destructor(&vcpu->irq.dev);
> 	kvm_arch_ops->vcpu_free(vcpu);
> 	free_page((unsigned long)vcpu->run);
> 	vcpu->run = NULL;
> @@ -2008,8 +2014,7 @@ static int
> kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
> 	sregs->efer = vcpu->shadow_efer;
> 	sregs->apic_base = vcpu->apic_base;
> 
> -	memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
> -	       sizeof sregs->interrupt_bitmap);
> +	kvm_user_irqdev_save(&vcpu->irq.dev, &sregs->interrupt_bitmap);
> 
> 	vcpu_put(vcpu);
> 
> @@ -2026,7 +2031,6 @@ static int
> kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
> 				    struct kvm_sregs *sregs)
> {
> 	int mmu_reset_needed = 0;
> -	int i;
> 	struct descriptor_table dt;
> 
> 	vcpu_load(vcpu);
> @@ -2063,12 +2067,8 @@ static int
> kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
> 	if (mmu_reset_needed)
> 		kvm_mmu_reset_context(vcpu);
> 
> -	memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
> -	       sizeof vcpu->irq_pending);
> -	vcpu->irq_summary = 0;
> -	for (i = 0; i < NR_IRQ_WORDS; ++i)
> -		if (vcpu->irq_pending[i])
> -			__set_bit(i, &vcpu->irq_summary);
> +	kvm_user_irqdev_restore(&vcpu->irq.dev,
> +				&sregs->interrupt_bitmap[0]);
> 
> 	set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
> 	set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
> @@ -2229,14 +2229,8 @@ static int
> kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
> {
> 	if (irq->irq < 0 || irq->irq >= 256)
> 		return -EINVAL;
> -	vcpu_load(vcpu);
> -
> -	set_bit(irq->irq, vcpu->irq_pending);
> -	set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
> 
> -	vcpu_put(vcpu);
> -
> -	return 0;
> +	return kvm_irqdevice_set_pin(&vcpu->irq.dev, irq->irq, 1); }
> 
> static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, @@
> -2338,6 +2332,32 @@ out1: }
> 
> /*
> + * This function will be invoked whenever the vcpu->irq.dev
> raises its INTR
> + * line
> + */
> +static void kvm_vcpu_intr(struct kvm_irqsink *this,
> +			  struct kvm_irqdevice *dev,
> +			  kvm_irqpin_t pin)
> +{
> +	struct kvm_vcpu *vcpu = (struct kvm_vcpu*)this->private; +
unsigned
> long flags; +
> +	spin_lock_irqsave(&vcpu->irq.lock, flags);
> +	__set_bit(pin, &vcpu->irq.pending);
> +	spin_unlock_irqrestore(&vcpu->irq.lock, flags);
> +}
> +
> +static void kvm_vcpu_irqsink_init(struct kvm_vcpu *vcpu) +{
> +	struct kvm_irqsink sink = {
> +		.set_intr   = kvm_vcpu_intr,
> +		.private    = vcpu
> +	};
> +
> +	kvm_irqdevice_register_sink(&vcpu->irq.dev, &sink); +}
> +
> +/*
>  * Creates some virtual cpus.  Good luck creating more than one.  */
> static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
> @@ -2384,6 +2404,12 @@ static int
> kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
> 	if (r < 0)
> 		goto out_free_vcpus;
> 
> +	kvm_irqdevice_init(&vcpu->irq.dev);
> +	kvm_vcpu_irqsink_init(vcpu);
> +	r = kvm_userint_init(vcpu);
> +	if (r < 0)
> +		goto out_free_vcpus;
> +
> 	kvm_arch_ops->vcpu_load(vcpu);
> 	r = kvm_mmu_setup(vcpu);
> 	if (r >= 0)
> diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
> index b621403..8395662 100644
> --- a/drivers/kvm/svm.c
> +++ b/drivers/kvm/svm.c
> @@ -106,24 +106,6 @@ static unsigned get_addr_size(struct
> kvm_vcpu *vcpu)
> 				(cs_attrib &
> SVM_SELECTOR_DB_MASK) ? 4 : 2;
> }
> 
> -static inline u8 pop_irq(struct kvm_vcpu *vcpu)
> -{
> -	int word_index = __ffs(vcpu->irq_summary);
> -	int bit_index = __ffs(vcpu->irq_pending[word_index]);
> -	int irq = word_index * BITS_PER_LONG + bit_index; -
> -	clear_bit(bit_index, &vcpu->irq_pending[word_index]);
> -	if (!vcpu->irq_pending[word_index])
> -		clear_bit(word_index, &vcpu->irq_summary);
> -	return irq;
> -}
> -
> -static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq) -{
> -	set_bit(irq, vcpu->irq_pending);
> -	set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); -}
> -
> static inline void clgi(void)
> {
> 	asm volatile (SVM_CLGI);
> @@ -904,7 +886,12 @@ static int pf_interception(struct
> kvm_vcpu *vcpu, struct kvm_run *kvm_run)
> 	int r;
> 
> 	if (is_external_interrupt(exit_int_info))
> -		push_irq(vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); +
/*
> +		 * An exception was taken while we were trying
> to inject an
> +		 * IRQ.  We must defer the injection of the vector until
+		 * the
> next window. +		 */
> +		kvm_vcpu_irq_push(vcpu, exit_int_info &
> SVM_EVTINJ_VEC_MASK);
> 
> 	spin_lock(&vcpu->kvm->lock);
> 
> @@ -1114,7 +1101,7 @@ static int halt_interception(struct
> kvm_vcpu *vcpu, struct kvm_run *kvm_run)
> {
> 	vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 1;
> 	skip_emulated_instruction(vcpu);
> -	if (vcpu->irq_summary)
> +	if (kvm_vcpu_irq_pending(vcpu))
> 		return 1;
> 
> 	kvm_run->exit_reason = KVM_EXIT_HLT;
> @@ -1285,7 +1272,7 @@ static int
> interrupt_window_interception(struct kvm_vcpu *vcpu,
> 	 * possible
> 	 */
> 	if (kvm_run->request_interrupt_window &&
> -	    !vcpu->irq_summary) {
> +	    !kvm_vcpu_irq_pending(vcpu)) {
> 		++vcpu->stat.irq_window_exits;
> 		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
> 		return 0;
> @@ -1384,60 +1371,143 @@ static void pre_svm_run(struct kvm_vcpu
> *vcpu) } 
> 
> 
> -static inline void kvm_do_inject_irq(struct kvm_vcpu *vcpu) -{
> -	struct vmcb_control_area *control;
> -
> -	control = &vcpu->svm->vmcb->control;
> -	control->int_vector = pop_irq(vcpu);
> -	control->int_ctl &= ~V_INTR_PRIO_MASK;
> -	control->int_ctl |= V_IRQ_MASK |
> -		((/*control->int_vector >> 4*/ 0xf) <<
> V_INTR_PRIO_SHIFT);
> -}
> -
> static void kvm_reput_irq(struct kvm_vcpu *vcpu)
> {
> 	struct vmcb_control_area *control = &vcpu->svm->vmcb->control;
> 
> 	if (control->int_ctl & V_IRQ_MASK) {
> 		control->int_ctl &= ~V_IRQ_MASK;
> -		push_irq(vcpu, control->int_vector);
> +		kvm_vcpu_irq_push(vcpu, control->int_vector);
> 	}
> 
> 	vcpu->interrupt_window_open =
> 		!(control->int_state & SVM_INTERRUPT_SHADOW_MASK); }
> 
> -static void do_interrupt_requests(struct kvm_vcpu *vcpu,
> -				       struct kvm_run *kvm_run)
> +static void do_intr_requests(struct kvm_vcpu *vcpu,
> +			    struct kvm_run *kvm_run,
> +			    kvm_irqpin_t pin)
> {
> 	struct vmcb_control_area *control = &vcpu->svm->vmcb->control; +
int
> pending = 0; 
> 
> 	vcpu->interrupt_window_open =
> 		(!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
> 		 (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF));
> 
> -	if (vcpu->interrupt_window_open && vcpu->irq_summary)
> +	if (vcpu->interrupt_window_open) {
> 		/*
> -		 * If interrupts enabled, and not blocked by
> sti or mov ss. Good.
> +		 * If interrupts enabled, and not blocked by
> sti or mov ss.
> +		 * Good.
> 		 */
> -		kvm_do_inject_irq(vcpu);
> +		struct kvm_irqack_data ack;
> +		int r = 0;
> +
> +		memset(&ack, 0, sizeof(ack));
> +
> +		switch (pin) {
> +		case kvm_irqpin_localint:
> +			r = kvm_vcpu_irq_pop(vcpu, &ack);
> +			break;
> +		case kvm_irqpin_extint:
> +			printk(KERN_WARNING "KVM:
> external-interrupts not " \
> +			       "handled yet\n");
> +			__clear_bit(pin, &vcpu->irq.pending);
> +			break;
> +		case kvm_irqpin_nmi:
> +			/*
> +			 * FIXME: Someday we will handle this using the
> +			 * specific SVN NMI features.  For now,
> just inject
> +			 * the NMI as a standard interrupt on vector 2
> +			 */
> +			ack.flags |= KVM_IRQACKDATA_VECTOR_VALID;
> +			ack.vector = 2;
> +			__clear_bit(pin, &vcpu->irq.pending);
> +			break;
> +		default:
> +			panic("KVM: unknown interrupt pin
> raised: %d\n", pin);
> +			break;
> +		}
> +
> +		BUG_ON(r < 0);
> +
> +		if (ack.flags & KVM_IRQACKDATA_VECTOR_VALID) {
> +			control = &vcpu->svm->vmcb->control;
> +			control->int_vector = ack.vector;
> +			control->int_ctl &= ~V_INTR_PRIO_MASK;
> +			control->int_ctl |= V_IRQ_MASK |
> +				((/*control->int_vector >> 4*/ 0xf) <<
> +				 V_INTR_PRIO_SHIFT);
> +		}
> +	}
> 
> 	/*
> -	 * Interrupts blocked.  Wait for unblock.
> +	 * Re-read the pending interrupt state.  If anything is still
> +	 * pending we need to cause an exit on the next window 	 */
> -	if (!vcpu->interrupt_window_open &&
> -	    (vcpu->irq_summary || kvm_run->request_interrupt_window)) {
> +	pending = __kvm_vcpu_irq_pending(vcpu);
> +
> +	if (test_bit(pin, &pending))
> +		/*
> +		 * Trigger a VMEXIT on the next IRQ window
> +		 */
> 		control->intercept |= 1ULL << INTERCEPT_VINTR;
> -	} else
> +}
> +
> +static void clear_pending_controls(struct kvm_vcpu *vcpu,
> +				  struct kvm_run *kvm_run)
> +{
> +	struct vmcb_control_area *control = &vcpu->svm->vmcb->control; +
> +	if (kvm_run->request_interrupt_window)
> +		control->intercept |= 1ULL << INTERCEPT_VINTR;
> +	else
> 		control->intercept &= ~(1ULL << INTERCEPT_VINTR); }
> 
> +static void do_interrupt_requests(struct kvm_vcpu *vcpu,
> +				  struct kvm_run *kvm_run)
> +{
> +	int pending = __kvm_vcpu_irq_pending(vcpu);
> +
> +	clear_pending_controls(vcpu, kvm_run);
> +
> +	while (pending) {
> +		kvm_irqpin_t pin = __fls(pending);
> +
> +		switch (pin) {
> +		case kvm_irqpin_localint:
> +		case kvm_irqpin_extint:
> +		case kvm_irqpin_nmi:
> +			do_intr_requests(vcpu, kvm_run, pin);
> +			break;
> +		case kvm_irqpin_smi:
> +			/* ignored (for now) */
> +			printk(KERN_WARNING "KVM: dropping
> unhandled SMI\n");
> +			__clear_bit(pin, &vcpu->irq.pending);
> +			break;
> +		case kvm_irqpin_invalid:
> +			/* drop */
> +			break;
> +		default:
> +			panic("KVM: unknown interrupt pin
> raised: %d\n", pin);
> +			break;
> +		}
> +
> +		__clear_bit(pin, &pending);
> +	}
> +}
> +
> static void post_kvm_run_save(struct kvm_vcpu *vcpu,
> 			      struct kvm_run *kvm_run)
> {
> -	kvm_run->ready_for_interrupt_injection =
> (vcpu->interrupt_window_open &&
> -
> vcpu->irq_summary == 0);
> +	struct kvm_irqack_data ack;
> +	kvm_irqdevice_ack(&vcpu->irq.dev, KVM_IRQACK_FLAG_PEEK, &ack); +
> +	kvm_run->ready_for_interrupt_injection =
> +		(vcpu->interrupt_window_open &&
> +		 !kvm_vcpu_irq_pending(vcpu) &&
> +		 !(ack.flags & KVM_IRQACKDATA_NEXT_VALID));
> 	kvm_run->if_flag = (vcpu->svm->vmcb->save.rflags &
> X86_EFLAGS_IF) != 0;
> 	kvm_run->cr8 = vcpu->cr8;
> 	kvm_run->apic_base = vcpu->apic_base;
> @@ -1452,7 +1522,7 @@ static void post_kvm_run_save(struct
> kvm_vcpu *vcpu,
> static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
> 					  struct kvm_run *kvm_run)
> {
> -	return (!vcpu->irq_summary &&
> +	return (!kvm_vcpu_irq_pending(vcpu) &&
> 		kvm_run->request_interrupt_window &&
> 		vcpu->interrupt_window_open &&
> 		(vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF));
> @@ -1482,9 +1552,17 @@ static int svm_vcpu_run(struct kvm_vcpu
> *vcpu, struct kvm_run *kvm_run)
> 	int r;
> 
> again:
> +	spin_lock(&vcpu->irq.lock);
> +
> +	/*
> +	 * We must inject interrupts (if any) while the irq_lock +
* is
> held +	 */
> 	if (!vcpu->mmio_read_completed)
> 		do_interrupt_requests(vcpu, kvm_run);
> 
> +	spin_unlock(&vcpu->irq.lock);
> +
> 	clgi();
> 
> 	pre_svm_run(vcpu);
> diff --git a/drivers/kvm/userint.c b/drivers/kvm/userint.c
> new file mode 100644
> index 0000000..a60707d
> --- /dev/null
> +++ b/drivers/kvm/userint.c
> @@ -0,0 +1,223 @@
> +/*
> + * User Interrupts IRQ device
> + *
> + * This acts as an extention of an interrupt controller that
> exists elsewhere
> + * (typically in userspace/QEMU).  Because this PIC is a
> pseudo device that
> + * is downstream from a real emulated PIC, the
> "IRQ-to-vector" mapping has
> + * already occured.  Therefore, this PIC has the following
> unusal properties:
> + *
> + * 1) It has 256 "pins" which are literal vectors (i.e. no
> translation) + * 2) It only supports "auto-EOI" behavior since it is
> expected that the
> + *    upstream emulated PIC will handle the real EOIs (if applicable)
> + * 3) It only listens to "asserts" on the pins (deasserts are
> dropped) + *    because its an auto-EOI device anyway.
> + *
> + * Copyright (C) 2007 Novell
> + *
> + * bitarray code based on original vcpu->irq_pending code,
> + *     Copyright (C) 2007 Qumranet
> + *
> + * Authors:
> + *   Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
> + *
> + * This work is licensed under the terms of the GNU GPL,
> version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include "kvm.h"
> +
> +/*
> +
>
*----------------------------------------------------------------------
> + * optimized bitarray object - works like bitarrays in
> bitops, but uses
> + * a summary field to accelerate lookups.  Assumes external locking +
> *---------------------------------------------------------------------
> + */ +
> +struct bitarray {
> +	unsigned long summary; /* 1 per word in pending */
> +	unsigned long pending[NR_IRQ_WORDS];
> +};
> +
> +static inline int bitarray_pending(struct bitarray *this) +{
> +	return this->summary ? 1 : 0;
> +}
> +
> +static inline int bitarray_findhighest(struct bitarray *this) +{
> +	if (!this->summary)
> +		return -1;
> +	else {
> +		int word_index = __fls(this->summary);
> +		int bit_index  = __fls(this->pending[word_index]); +
> +		return word_index * BITS_PER_LONG + bit_index;
> +	}
> +}
> +
> +static inline void bitarray_set(struct bitarray *this, int nr) +{
> +	__set_bit(nr, &this->pending);
> +	__set_bit(nr / BITS_PER_LONG, &this->summary);
> +}
> +
> +static inline void bitarray_clear(struct bitarray *this, int nr) +{
> +	int word = nr / BITS_PER_LONG;
> +
> +	__clear_bit(nr, &this->pending);
> +	if (!this->pending[word])
> +		__clear_bit(word, &this->summary);
> +}
> +
> +static inline int bitarray_test(struct bitarray *this, int nr) +{
> +	return test_bit(nr, &this->pending);
> +}
> +
> +static inline int bitarray_test_and_set(struct bitarray *this, int
> nr, int val) +{
> +	if (bitarray_test(this, nr) != val) {
> +		if (val)
> +			bitarray_set(this, nr);
> +		else
> +			bitarray_clear(this, nr);
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +
> +/*
> +
>
*----------------------------------------------------------------------
> + * userint interface - provides the actual kvm_irqdevice
> implementation +
> *---------------------------------------------------------------------
> + */ +
> +struct kvm_user_irqdev {
> +	spinlock_t      lock;
> +	atomic_t        ref_count;
> +	struct bitarray pending;
> +};
> +
> +static int user_irqdev_ack(struct kvm_irqdevice *this, int flags,
> +			   struct kvm_irqack_data *data)
> +{
> +	struct kvm_user_irqdev *s = (struct
> kvm_user_irqdev*)this->private;
> +
> +	spin_lock(&s->lock);
> +
> +	if (!(flags & KVM_IRQACK_FLAG_PEEK)) {
> +		int irq = bitarray_findhighest(&s->pending);
> +
> +		if (irq > -1) {
> +			/*
> +			 * Automatically clear the interrupt as the EOI
> +			 * mechanism (if any) will take place
> in userspace
> +			 */
> +			bitarray_clear(&s->pending, irq);
> +
> +			data->flags |= KVM_IRQACKDATA_VECTOR_VALID;
> +		}
> +
> +		data->vector = irq;
> +	}
> +
> +	if (bitarray_pending(&s->pending))
> +		data->flags |= KVM_IRQACKDATA_VECTOR_PENDING;
> +
> +	spin_unlock(&s->lock);
> +
> +	return 0;
> +}
> +
> +static int user_irqdev_set_pin(struct kvm_irqdevice *this,
> int irq, int level)
> +{
> +	struct kvm_user_irqdev *s = (struct
> kvm_user_irqdev*)this->private;
> +	int forward = 0;
> +
> +	spin_lock(&s->lock);
> +	forward = bitarray_test_and_set(&s->pending, irq, level);
> +	spin_unlock(&s->lock); +
> +	/*
> +	 * alert the higher layer software we have changes +	 */
> +	if (forward)
> +		kvm_irqdevice_set_intr(this, kvm_irqpin_localint); +
> +	return 0;
> +}
> +
> +static void user_irqdev_destructor(struct kvm_irqdevice *this) +{
> +	struct kvm_user_irqdev *s = (struct
> kvm_user_irqdev*)this->private;
> +
> +	if (atomic_dec_and_test(&s->ref_count))
> +		kfree(s);
> +}
> +
> +int kvm_user_irqdev_init(struct kvm_irqdevice *irqdev) +{
> +	struct kvm_user_irqdev *s;
> +
> +	s = kzalloc(sizeof(*s), GFP_KERNEL);
> +	if (!s)
> +		return -ENOMEM;
> +
> +	spin_lock_init(&s->lock);
> +
> +	irqdev->ack         = user_irqdev_ack;
> +	irqdev->set_pin     = user_irqdev_set_pin;
> +	irqdev->destructor  = user_irqdev_destructor;
> +
> +	irqdev->private = s;
> +	atomic_inc(&s->ref_count);
> +
> +	return 0;
> +}
> +
> +int kvm_user_irqdev_save(struct kvm_irqdevice *this, void *data) +{
> +	struct kvm_user_irqdev *s = (struct
> kvm_user_irqdev*)this->private;
> +
> +	spin_lock(&s->lock);
> +	memcpy(data, s->pending.pending, sizeof s->pending.pending);
> +	spin_unlock(&s->lock); +
> +	return 0;
> +}
> +
> +int kvm_user_irqdev_restore(struct kvm_irqdevice *this, void *data)
> +{ +	struct kvm_user_irqdev *s = (struct
> kvm_user_irqdev*)this->private;
> +	int i;
> +	int forward = 0;
> +
> +	spin_lock(&s->lock);
> +
> +	/*
> +	 * walk the interrupt-bitmap and inject an IRQ for each
> bit found
> +	 */
> +	for (i = 0; i < 256; ++i) {
> +		int val  = test_bit(i, data);
> +		forward |= bitarray_test_and_set(&s->pending, i, val); +
}
> +
> +	spin_unlock(&s->lock);
> +
> +	/*
> +	 * alert the higher layer software we have changes +	 */
> +	if (forward)
> +		kvm_irqdevice_set_intr(this, kvm_irqpin_localint); +
> +	return 0;
> +}
> +
> +int kvm_userint_init(struct kvm_vcpu *vcpu)
> +{
> +	return kvm_user_irqdev_init(&vcpu->irq.dev);
> +}
> +
> diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
> index 3411813..6c56ac0 100644
> --- a/drivers/kvm/vmx.c
> +++ b/drivers/kvm/vmx.c
> @@ -1454,52 +1454,124 @@ static void inject_rmode_irq(struct
> kvm_vcpu *vcpu, int irq)
> 	vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) &
> ~0xffff) | (sp - 6));
> }
> 
> -static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
> +static void do_intr_requests(struct kvm_vcpu *vcpu,
> +			    struct kvm_run *kvm_run,
> +			    kvm_irqpin_t pin)
> {
> -	int word_index = __ffs(vcpu->irq_summary);
> -	int bit_index = __ffs(vcpu->irq_pending[word_index]);
> -	int irq = word_index * BITS_PER_LONG + bit_index; -
> -	clear_bit(bit_index, &vcpu->irq_pending[word_index]);
> -	if (!vcpu->irq_pending[word_index])
> -		clear_bit(word_index, &vcpu->irq_summary);
> -
> -	if (vcpu->rmode.active) {
> -		inject_rmode_irq(vcpu, irq);
> -		return;
> -	}
> -	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
> -			irq | INTR_TYPE_EXT_INTR |
> INTR_INFO_VALID_MASK);
> -}
> -
> -
> -static void do_interrupt_requests(struct kvm_vcpu *vcpu,
> -				       struct kvm_run *kvm_run)
> -{
> -	u32 cpu_based_vm_exec_control;
> +	int pending = 0;
> 
> 	vcpu->interrupt_window_open =
> 		((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
> 		 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
> 
> 	if (vcpu->interrupt_window_open &&
> -	    vcpu->irq_summary &&
> -	    !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) &
> INTR_INFO_VALID_MASK))
> +	    !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) &
> INTR_INFO_VALID_MASK)) {
> 		/*
> -		 * If interrupts enabled, and not blocked by
> sti or mov ss. Good.
> +		 * If interrupts enabled, and not blocked by
> sti or mov ss.
> +		 * Good.
> 		 */
> -		kvm_do_inject_irq(vcpu);
> +		struct kvm_irqack_data ack;
> +		int r = 0;
> +
> +		memset(&ack, 0, sizeof(ack));
> +
> +		switch (pin) {
> +		case kvm_irqpin_localint:
> +			r = kvm_vcpu_irq_pop(vcpu, &ack);
> +			break;
> +		case kvm_irqpin_extint:
> +			printk(KERN_WARNING "KVM:
> external-interrupts not " \
> +			       "handled yet\n");
> +			__clear_bit(pin, &vcpu->irq.pending);
> +			break;
> +		case kvm_irqpin_nmi:
> +			/*
> +			 * FIXME: Someday we will handle this using the
> +			 * specific VMX NMI features.  For now,
> just inject
> +			 * the NMI as a standard interrupt on vector 2
> +			 */
> +			ack.flags |= KVM_IRQACKDATA_VECTOR_VALID;
> +			ack.vector = 2;
> +			__clear_bit(pin, &vcpu->irq.pending);
> +			break;
> +		default:
> +			panic("KVM: unknown interrupt pin
> raised: %d\n", pin);
> +			break;
> +		}
> +
> +		BUG_ON(r < 0);
> 
> -	cpu_based_vm_exec_control =
> vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
> -	if (!vcpu->interrupt_window_open &&
> -	    (vcpu->irq_summary || kvm_run->request_interrupt_window))
> +		if (ack.flags & KVM_IRQACKDATA_VECTOR_VALID) {
> +			if (vcpu->rmode.active)
> +				inject_rmode_irq(vcpu, ack.vector);
> +			else
> +				vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
> +					     ack.vector |
> +					     INTR_TYPE_EXT_INTR |
> +					     INTR_INFO_VALID_MASK);
> +		}
> +	}
> +
> +	/*
> +	 * Re-read the pending interrupt state.  If anything is still
> +	 * pending we need to cause an exit on the next window +
*/
> +	pending = __kvm_vcpu_irq_pending(vcpu);
> +
> +	if (test_bit(pin, &pending) ||
> kvm_run->request_interrupt_window) {
> 		/*
> -		 * Interrupts blocked.  Wait for unblock.
> +		 * Trigger a VMEXIT on the next IRQ window
> 		 */
> -		cpu_based_vm_exec_control |=
> CPU_BASED_VIRTUAL_INTR_PENDING;
> -	else
> -		cpu_based_vm_exec_control &=
> ~CPU_BASED_VIRTUAL_INTR_PENDING;
> -	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
> cpu_based_vm_exec_control);
> +		u32 cbvec = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
> +		cbvec |= CPU_BASED_VIRTUAL_INTR_PENDING;
> +		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cbvec);
> +	}
> +}
> +
> +static void clear_pending_controls(struct kvm_vcpu *vcpu,
> +				  struct kvm_run *kvm_run)
> +{
> +	u32 cbvec = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); +
> + 	if (kvm_run->request_interrupt_window)
> + 		cbvec |= CPU_BASED_VIRTUAL_INTR_PENDING;
> + 	else
> + 		cbvec &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
> +
> +	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cbvec);
> +}
> +
> +static void do_interrupt_requests(struct kvm_vcpu *vcpu,
> +				  struct kvm_run *kvm_run)
> +{
> +	int pending = __kvm_vcpu_irq_pending(vcpu);
> +
> +	clear_pending_controls(vcpu, kvm_run);
> +
> +	while (pending) {
> +		kvm_irqpin_t pin = __fls(pending);
> +
> +		switch (pin) {
> +		case kvm_irqpin_localint:
> +		case kvm_irqpin_extint:
> +		case kvm_irqpin_nmi:
> +			do_intr_requests(vcpu, kvm_run, pin);
> +			break;
> +		case kvm_irqpin_smi:
> +			/* ignored (for now) */
> +			printk(KERN_WARNING "KVM: dropping
> unhandled SMI\n");
> +			__clear_bit(pin, &vcpu->irq.pending);
> +			break;
> +		case kvm_irqpin_invalid:
> +			/* drop */
> +			break;
> +		default:
> +			panic("KVM: unknown interrupt pin
> raised: %d\n", pin);
> +			break;
> +		}
> +
> +		__clear_bit(pin, &pending);
> +	}
> }
> 
> static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
> @@ -1554,9 +1626,13 @@ static int handle_exception(struct
> kvm_vcpu *vcpu, struct kvm_run *kvm_run)
> 	}
> 
> 	if (is_external_interrupt(vect_info)) {
> +		/*
> +		 * An exception was taken while we were trying
> to inject an
> +		 * IRQ.  We must defer the injection of the vector until
+		 * the
> next window. +		 */
> 		int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
> -		set_bit(irq, vcpu->irq_pending);
> -		set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
> +		kvm_vcpu_irq_push(vcpu, irq);
> 	}
> 
> 	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */
> @@ -1869,11 +1945,16 @@ static int handle_wrmsr(struct
> kvm_vcpu *vcpu, struct kvm_run *kvm_run)
> static void post_kvm_run_save(struct kvm_vcpu *vcpu,
> 			      struct kvm_run *kvm_run)
> {
> + 	struct kvm_irqack_data ack;
> + 	kvm_irqdevice_ack(&vcpu->irq.dev, KVM_IRQACK_FLAG_PEEK, &ack); +
> 	kvm_run->if_flag = (vmcs_readl(GUEST_RFLAGS) &
> X86_EFLAGS_IF) != 0;
> 	kvm_run->cr8 = vcpu->cr8;
> 	kvm_run->apic_base = vcpu->apic_base;
> -	kvm_run->ready_for_interrupt_injection =
> (vcpu->interrupt_window_open &&
> -
> vcpu->irq_summary == 0);
> +	kvm_run->ready_for_interrupt_injection =
> +		(vcpu->interrupt_window_open &&
> +		 !kvm_vcpu_irq_pending(vcpu) &&
> +		 !(ack.flags & KVM_IRQACKDATA_NEXT_VALID));
> }
> 
> static int handle_interrupt_window(struct kvm_vcpu *vcpu,
> @@ -1884,7 +1965,7 @@ static int
> handle_interrupt_window(struct kvm_vcpu *vcpu,
> 	 * possible
> 	 */
> 	if (kvm_run->request_interrupt_window &&
> -	    !vcpu->irq_summary) {
> +	    !kvm_vcpu_irq_pending(vcpu)) {
> 		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
> 		++vcpu->stat.irq_window_exits;
> 		return 0;
> @@ -1895,7 +1976,7 @@ static int
> handle_interrupt_window(struct kvm_vcpu *vcpu,
> static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run
> 	*kvm_run) { skip_emulated_instruction(vcpu);
> -	if (vcpu->irq_summary)
> +	if (kvm_vcpu_irq_pending(vcpu))
> 		return 1;
> 
> 	kvm_run->exit_reason = KVM_EXIT_HLT;
> @@ -1965,7 +2046,7 @@ static int kvm_handle_exit(struct
> kvm_run *kvm_run, struct kvm_vcpu *vcpu)
> static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
> 					  struct kvm_run *kvm_run)
> {
> -	return (!vcpu->irq_summary &&
> +	return (!kvm_vcpu_irq_pending(vcpu) &&
> 		kvm_run->request_interrupt_window &&
> 		vcpu->interrupt_window_open &&
> 		(vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
> @@ -1981,9 +2062,17 @@ preempted:
> 		kvm_guest_debug_pre(vcpu);
> 
> again:
> +	spin_lock(&vcpu->irq.lock);
> +
> +	/*
> +	 * We must inject interrupts (if any) while the irq.lock +
* is
> held +	 */
> 	if (!vcpu->mmio_read_completed)
> 		do_interrupt_requests(vcpu, kvm_run);
> 
> +	spin_unlock(&vcpu->irq.lock);
> +
> 	vmx_save_host_state(vcpu);
> 	kvm_load_guest_fpu(vcpu);
> 
> 
> 
> ---------------------------------------------------------------
> ----------
> This SF.net email is sponsored by DB2 Express
> Download DB2 Express C - the FREE version of DB2 express and take
> control of your XML. No limits. Just data. Click to get it now.
> http://sourceforge.net/powerbar/db2/
> _______________________________________________
> kvm-devel mailing list
> kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org
> https://lists.sourceforge.net/lists/listinfo/kvm-devel

-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 0/9] in-kernel APIC v9 (kernel side)
       [not found] ` <20070531180005.1810.23884.stgit-5CR4LY5GPkvLDviKLk5550HKjMygAv58XqFh9Ls21Oc@public.gmane.org>
                     ` (10 preceding siblings ...)
  2007-06-03  9:28   ` Avi Kivity
@ 2007-06-07  8:20   ` Dong, Eddie
  11 siblings, 0 replies; 21+ messages in thread
From: Dong, Eddie @ 2007-06-07  8:20 UTC (permalink / raw)
  To: Gregory Haskins, kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

kvm-devel-bounces-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org wrote:
> Incorporates v8 plus the following changes:
> 
> 1) Fix for hang on AMD
> 2) Fixes issue where irq-windows are inaccurately reported to
> userspace 3) Fixed issue where irq-window-exiting requests can be
> ignored in some cases
> 
> Note that we no longer need the backlog.patch to handle a
> corner cases now.
> 
> As before, this has been tested on 32 bit XP w/ACPI and 64 bit
> windows.  It offers a 17% performance improvement over git HEAD in my
> testing.  Note that I
> am not able to fully verify that this works on AMD, as even
> git-head does not
> work on my system.  I am able to verify that it no longer
> hangs the kernel
> hard.  The guest hangs, but it hangs without my patches as
> well.  Perhaps
> someone with a known good environment on AMD can verify for me?
> 
> I am being pulled off of my KVM work for a little while, so I
> will not be able
> to contribute again until further notice. If there are any
> remaining issues
> that need to be addressed and someone wants to carry the
> torch, feel free to
> do so.  Otherwise, I will pick up the effort to get this
> merged in when I am
> able to return to KVM.
> 
> Thanks all for the feedback/comments/suggestions through all
> of this.  It has
> been very fun and quite a learning experience.
> 
Greg:
	Here are some detailed comments regarding the LAPIC device model.
	1: IRQ abstraction layer
	vcpu->irq.pending holds the abstract processor interrupt request
(called localint, extint, nmi etc. in V09).  But the API kvm_vcpu_intr
only sets the interrupt request; it never clears it.  So far I have only
noticed that when an interrupt request is popped or injected, the
vcpu->irq.pending bit may be cleared. But there is a case where the guest
raises the TPR bar and a localint should be cleared. The same holds for
extint when the PIC IMR is masked.
	In Xen, since there is no notion of an abstract processor interrupt
request, the VMM checks interrupts directly from the PIC and APIC, so Xen
doesn't have this problem. I guess Windows will fail here.

	2: APIC timer
	a: V09 uses an hrtimer for the LAPIC timer; apic->timer.last_update
is updated every time __apic_timer_fn is invoked when the APIC timer
fires. This introduces an accumulated drift, since the actual fire time
is already some nanoseconds later than the expected time.
	Xen solves this issue by increasing apic->timer.last_update by
the PERIOD, i.e. APIC_BUS_CYCLE_NS * apic->timer.divide_count *
APIC_TMICT.
	b: It seems the current approach starts the hrtimer whenever
APIC_TMICT is updated. Should we check APIC_LVT to see if it is masked
here (instead of doing so in its callback function, __apic_timer_fn)?
Also, why is APIC_TMCCT updated here? I think TMCCT is reloaded only when
it reaches 0 and the LVTT works in periodic mode.
	c: I didn't see the LVTT mask status reflected in the hrtimer
cancel/start; am I missing something?

	3:  Assume a scenario where there is a valid
IDT_VECTORING_INFO_FIELD; the following code (after the patch) in
handle_exception pushes back the failed interrupt vector, i.e.
vcpu->irq.deferred.

........
        if (is_external_interrupt(vect_info)) {
                /*
                 * An exception was taken while we were trying to inject
an
                 * IRQ.  We must defer the injection of the vector until
                 * the next window.
                 */
                int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
                kvm_vcpu_irq_push(vcpu, irq);
        }


	a:  Now, the abstracted processor interrupt, i.e.
vcpu->irq.pending, could be 0; __kvm_vcpu_irq_pending, invoked at the
beginning of do_interrupt_requests, will set kvm_irqpin_localint in the
abstracted processor interrupt (vcpu->irq.pending). But suppose at the
same time we get an external IRQ, i.e. vcpu->irq.pending is set with both
localint & extint.
	From the following code, kvm_irqpin_extint has a higher priority
than kvm_irqpin_localint.

        while (pending) {
                kvm_irqpin_t pin = __fls(pending);

                switch (pin) {
                case kvm_irqpin_localint:
                case kvm_irqpin_extint:
                case kvm_irqpin_nmi:
                        do_intr_requests(vcpu, kvm_run, pin);
                        break;
		..............

	Now in do_intr_requests, we get an external IRQ vector instead of
vcpu->irq.deferred from the following code, since pin=kvm_irqpin_extint.
That means we inject a new IRQ instead of the original failed IRQ from
IDT_VECTORING_INFO_FIELD.

		........
                switch (pin) {
                case kvm_irqpin_localint:
                        r = kvm_vcpu_irq_pop(vcpu, &ack);
                        break;
                case kvm_irqpin_extint:
                        r = kvm_irqdevice_ack(&vcpu->kvm->isa_irq, 0,
&ack);
                        if (!(ack.flags &
KVM_IRQACKDATA_VECTOR_PENDING))
                                __clear_bit(pin, &vcpu->irq.pending);
                        break;
                case kvm_irqpin_nmi:


	Anyway, due to SMP & the in-kernel APIC, I'd like to suggest we
move the IDT_VECTORING_INFO_FIELD handling into do_interrupt_requests,
like vmx_intr_assist in Xen, where physical IRQs are disabled.



thx, eddie

-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 0/9] in-kernel APIC v9 (kernel side)
@ 2007-06-11 11:56 Gregory Haskins
       [not found] ` <1181562984.4515.20.camel-5CR4LY5GPkvLDviKLk5550HKjMygAv58XqFh9Ls21Oc@public.gmane.org>
  0 siblings, 1 reply; 21+ messages in thread
From: Gregory Haskins @ 2007-06-11 11:56 UTC (permalink / raw)
  To: Dong, Eddie; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Hi Eddie,
  Back from vacation. Just catching up on email now....

On Thu, 2007-06-07 at 16:20 +0800, Dong, Eddie wrote:
>
> Greg:
> 	Here are some detail comments towarding the LAPIC device model.
> 	1: irqabstraction layer
> 	vcpu->irq.pending holds the abstract processor interrupt request
> (called localint, extint, nmi etc in V09).  But API kvm_vcpu_intr only
> set the interrupt request, no clear.

By design

>   So far I only noticed when an
> interrupt request is pop or injected,  vcpu->irq.pending bit may be
> cleared. But there is case when guest raise TPR bar, an localint could
> be cleared.

Yep, by design.  And it will be set again when the TPR bar is lowered.

>  Same for extint when PIC IMR is masked.

Yeah, we have a hole here.  If IMR is changed to mask a pending vector,
it is conceivable that it could remain pending in the kernel.  Note that
this hole was always in KVM, even before my patch.  Because of this I am
of the opinion that it doesn't need to be fixed as part of the LAPIC
work per se before merging.  However, that being said, we can lay the
groundwork to support this now by adding an "int level" to the structure
that is passed in the KVM_ISA_INTERRUPT ioctl.  Going forward, someone
can patch the QEMU::i8259 to send clear events when IMR changes.  

>  
> 	In Xen, since there is no notion of abstract processor interrupt
> request, VMM check interrupt directly from PIC and APIC, so Xen doesn't
> have problem. I guess Windows will fail here.

No, this is incorrect (and Windows boots fine even w/ACPI enabled, BTW).
For the LAPIC, we do check the LAPIC model directly (and therefore
consider TPR, etc).  The abstract interrupt mechanism simply tells the
vcpu that it needs to check the LAPIC.  It is not authoritative in
deciding if something actually gets injected.

For the PIC, we don't check directly (the PICs vectors are cached in the
kernel), but again note that this is how KVM has always worked.  I think
fixing the KVM_ISA_INTERRUPT::level mechanism + adding IMR-clear support
shores up any issue there, however. 

> 
> 	2: APIC timer
> 	a: V09 uses hrtimer for LAPIC timer, apic->timer.last_update is
> updated every time when __apic_timer_fn is invoked at time of the APIC
> timer fired. This impose an accumulated difference since the fire time
> is already some ns later after expected time.
> 	Xen solve this issue by increase apic->timer.last_update with
> the PERIOD, i.e. APIC_BUS_CYCLE_NS * apic->timer.divide_count *
> APIC_TMICT.
> 	b: Seems current approach starts hrtimer whenever APIC_TMICT is
> updated. Should we check APIC_LVT to see if it is masked here? (instead
> of doing in its callback function:__apic_timer_fn). Also why APIC_TMCCT
> is updated here? I think TMCCT is reloaded only when it reaches 0 and
> LVTT works in periodic mode.
> 	c: I didn't see LVTT mask status refelect the hrtimer
> cancel/start, do I miss something?

I inherited most of lapic.c from Dor, and I believe he inherited most of
it from an older version of Xen.  While I have come to understand much
of the inner workings of the LAPIC during the course of developing this
patch, the timer is still a relative enigma to me.  Therefore, I do not
have any comment as to the reasons why something was done here the way
it was, nor to the validity of the problems you are highlighting.
Perhaps Dor will know.

But that being said, patches against v09 to fix problems you see are
always welcome.


> 
> 	3:  Assume a senario there is an valid IDT_VECTORING_INFO_FIELD,
> following code(after patch) in handle_exception push back the failed
> interrupt vector, i.e. vcpu->irq.deferred.
> 
> ........
>         if (is_external_interrupt(vect_info)) {
>                 /*
>                  * An exception was taken while we were trying to inject
> an
>                  * IRQ.  We must defer the injection of the vector until
>                  * the next window.
>                  */
>                 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
>                 kvm_vcpu_irq_push(vcpu, irq);
>         }
> 
> 
> 	a:  Now, the abstracted processor interrupt, i.e.
> vcpu->irq.pending, could be 0, __kvm_vcpu_irq_pending invoked at
> beginning of do_interrupt_requests will set kvm_irqpin_localint in the
> abstracted processor interrupt (vcpu->irq.pending). But if at same time,
> we get an external IRQ, i.e. vcpu->irq.pending is set with both localint
> & extint.
> 	From following code , kvm_irqpin_extint has higher priority than
> kvm_irqpin_localint.

Yes, I understand this scenario.  It is the same problem I was trying to
describe earlier when I said extint/nmi can inadvertently get
prioritized over deferred.  I will fix this.

> 
>         while (pending) {
>                 kvm_irqpin_t pin = __fls(pending);
> 
>                 switch (pin) {
>                 case kvm_irqpin_localint:
>                 case kvm_irqpin_extint:
>                 case kvm_irqpin_nmi:
>                         do_intr_requests(vcpu, kvm_run, pin);
>                         break;
> 		..............
> 
> 	Now in do_intr_requests, we get an extirq vector instead of
> vcpu->irq.deferred from following code since pin=kvm_irqpin_extint. That
> means we inject an new irq instead of original failed irq in
> IDT_VECTORING_INFO_FIELD.
> 
> 		........
>                 switch (pin) {
>                 case kvm_irqpin_localint:
>                         r = kvm_vcpu_irq_pop(vcpu, &ack);
>                         break;
>                 case kvm_irqpin_extint:
>                         r = kvm_irqdevice_ack(&vcpu->kvm->isa_irq, 0,
> &ack);
>                         if (!(ack.flags &
> KVM_IRQACKDATA_VECTOR_PENDING))
>                                 __clear_bit(pin, &vcpu->irq.pending);
>                         break;
>                 case kvm_irqpin_nmi:
> 
> 
> 	Anyway due to SMP & in-kernel APIC, I'd like to suggest we move
> IDT_VECTORING_INFO_FIELD to do_interrupt_requests like vmx_intr_assist
> in Xen where physical IRQ is disabled.

I haven't looked at the new Xen code, but I will try to take a peek.
It's probably moot since I am confident that the fix I am suggesting
allows the deferred mechanism to work the way I intended, but I will
keep an open mind to alternative solutions.

> 
> 
> 
> thx, eddie


-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 0/9] in-kernel APIC v9 (kernel side)
       [not found] ` <1181562984.4515.20.camel-5CR4LY5GPkvLDviKLk5550HKjMygAv58XqFh9Ls21Oc@public.gmane.org>
@ 2007-06-11 14:41   ` Dong, Eddie
  2007-06-13 22:26   ` Dor Laor
  1 sibling, 0 replies; 21+ messages in thread
From: Dong, Eddie @ 2007-06-11 14:41 UTC (permalink / raw)
  To: Gregory Haskins; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Gregory Haskins wrote:
> Hi Eddie,
>  Back from vacation. Just catching up on email now....
> 
> On Thu, 2007-06-07 at 16:20 +0800, Dong, Eddie wrote:
>> 
>> Greg:
>> 	Here are some detail comments towarding the LAPIC device model.
1:
>> 	irqabstraction layer vcpu->irq.pending holds the abstract
processor
>> interrupt request (called localint, extint, nmi etc in V09).  But
>> API kvm_vcpu_intr only set the interrupt request, no clear.
> 
> By design
> 
>>   So far I only noticed when an
>> interrupt request is pop or injected,  vcpu->irq.pending bit may be
>> cleared. But there is case when guest raise TPR bar, an localint
>> could be cleared.
> 
> Yep, by design.  And it will be set again when the TPR bar is lowered.

Mmm, since the vcpu->irq.pending is not cleared, now KVM will think
there 
is an irq to inject before the TPR is lowered.


> 
>>  Same for extint when PIC IMR is masked.
> 
> Yeah, we have a hole here.  If IMR is changed to mask a pending
> vector, it is conceivable that it could remain pending in the kernel.
> Note that
> this hole was always in KVM, even before my patch.  Because of

No, in the original KVM, this kind of hole doesn't exist since all the
PIC logic is in Qemu. Each time Qemu wants to do KVM_INTERRUPT, Qemu
will check the eflag.if, interrupt window, etc. I.e. each
KVM_INTERRUPT-injected IRQ will be injected into the guest immediately.

> this I am
> of the opinion that it doesn't need to be fixed as part of the LAPIC
> work per se before merging.  However, that being said, we can lay the
> groundwork to support this now by adding an "int level" to the
> structure that is passed in the KVM_ISA_INTERRUPT ioctl.  Going
> forward, someone can patch the QEMU::i8259 to send clear events when
> IMR changes. 

Whether via a new API or by extending the previous API, the kernel
interrupt IRR should be able to be both set and cleared by Qemu :-)

> 
>> 
>> 	In Xen, since there is no notion of abstract processor interrupt
>> request, VMM check interrupt directly from PIC and APIC, so Xen
>> doesn't have problem. I guess Windows will fail here.
> 
> No, this is incorrect (and Windows boots fine even w/ACPI
> enabled, BTW).
> For the LAPIC, we do check the LAPIC model directly (and therefore
> consider TPR, etc).  The abstract interrupt mechanism simply tells the
> vcpu that it needs to check the LAPIC.  It is not authoritative in
> deciding if something actually gets injected.
> 
> For the PIC, we don't check directly (the PICs vectors are
> cached in the
> kernel), but again note that this is how KVM has always
> worked.  I think
> fixing the KVM_ISA_INTERRUPT::level mechanism + adding
> IMR-clear support
> shores up any issue there, however.
> 
>> 
>> 	2: APIC timer
>> 	a: V09 uses hrtimer for LAPIC timer, apic->timer.last_update is
>> updated every time when __apic_timer_fn is invoked at time of the
>> APIC timer fired. This impose an accumulated difference since the
>> fire time is already some ns later after expected time.
>> 	Xen solve this issue by increase apic->timer.last_update with
>> the PERIOD, i.e. APIC_BUS_CYCLE_NS * apic->timer.divide_count *
>> 	APIC_TMICT. b: Seems current approach starts hrtimer whenever
>> APIC_TMICT is updated. Should we check APIC_LVT to see if it is
>> masked here? (instead of doing in its callback
>> function:__apic_timer_fn). Also why APIC_TMCCT is updated here? I
>> 	think TMCCT is reloaded only when it reaches 0 and LVTT works in
>> periodic mode. c: I didn't see LVTT mask status refelect the hrtimer 
>> cancel/start, do I miss something?
> 
> I inherited most of lapic.c from Dor, and I believe he
> inherited most of
> it from an older version of Xen.  While I have come to understand much
> of the inner workings of the LAPIC during the course of developing
> this patch, the timer is still a relative enigma to me.  Therefore, I
> do not have any comment as to the reasons why something was done here
> the way it was, nor to the validity of the problems you are
> highlighting. Perhaps Dor will know. 
> 
> But that being said, patches against v09 to fix problems you see are
> always welcome. 
> 
> 
>> 
>> 	3:  Assume a senario there is an valid IDT_VECTORING_INFO_FIELD,
>> following code(after patch) in handle_exception push back the failed
>> interrupt vector, i.e. vcpu->irq.deferred.
>> 
>> ........
>>         if (is_external_interrupt(vect_info)) {
>>                 /*
>>                  * An exception was taken while we were trying to
>> inject an 
>>                  * IRQ.  We must defer the injection of the vector
>> until 
>>                  * the next window.
>>                  */
>>                 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
>>                 kvm_vcpu_irq_push(vcpu, irq);
>>         }
>> 
>> 
>> 	a:  Now, the abstracted processor interrupt, i.e.
>> vcpu->irq.pending, could be 0, __kvm_vcpu_irq_pending invoked at
>> beginning of do_interrupt_requests will set kvm_irqpin_localint in
>> the abstracted processor interrupt (vcpu->irq.pending). But if at
>> same time, we get an external IRQ, i.e. vcpu->irq.pending is set
>> 	with both localint & extint. From following code ,
>> kvm_irqpin_extint has higher priority than kvm_irqpin_localint.
> 
> Yes, I understand this scenario.  It is the same problem I was
> trying to
> describe earlier when I said extint/nmi can inadvertently get
> prioritized over deferred.  I will fix this.
> 
>> 
>>         while (pending) {
>>                 kvm_irqpin_t pin = __fls(pending);
>> 
>>                 switch (pin) {
>>                 case kvm_irqpin_localint:
>>                 case kvm_irqpin_extint:
>>                 case kvm_irqpin_nmi:
>>                         do_intr_requests(vcpu, kvm_run, pin);
>>                         break;
>> 		..............
>> 
>> 	Now in do_intr_requests, we get an extirq vector instead of
>> vcpu->irq.deferred from following code since pin=kvm_irqpin_extint.
>> That means we inject an new irq instead of original failed irq in
>> IDT_VECTORING_INFO_FIELD. 
>> 
>> 		........
>>                 switch (pin) {
>>                 case kvm_irqpin_localint:
>>                         r = kvm_vcpu_irq_pop(vcpu, &ack);
>>                         break;
>>                 case kvm_irqpin_extint:
>>                         r = kvm_irqdevice_ack(&vcpu->kvm->isa_irq,
>>                         0, &ack); if (!(ack.flags &
>> KVM_IRQACKDATA_VECTOR_PENDING))
>>                                 __clear_bit(pin, &vcpu->irq.pending);
>>                         break;
>>                 case kvm_irqpin_nmi:
>> 
>> 
>> 	Anyway due to SMP & in-kernel APIC, I'd like to suggest we move
>> IDT_VECTORING_INFO_FIELD to do_interrupt_requests like
>> vmx_intr_assist in Xen where physical IRQ is disabled.
> 
> I haven't looked at the new Xen code, but I will try to take a peek.
> Its probably moot since I am confident that the fix I am suggesting
> allows the deferred mechanism to work the way I intended, but I will
> keep an open mind to alternative solutions.

The Xen side doesn't support NMI yet though the patch for NMI is in hand
now :-(
Hopefully it will be out this week :-)

> 
>> 
>> 
>> 
>> thx, eddie

-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 0/9] in-kernel APIC v9 (kernel side)
@ 2007-06-11 15:40 Gregory Haskins
  0 siblings, 0 replies; 21+ messages in thread
From: Gregory Haskins @ 2007-06-11 15:40 UTC (permalink / raw)
  To: Dong, Eddie, kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

On Mon, 2007-06-11 at 22:41 +0800, Dong, Eddie wrote:
> Gregory Haskins wrote:
> > Hi Eddie,
> >  Back from vacation. Just catching up on email now....
> > 
> > On Thu, 2007-06-07 at 16:20 +0800, Dong, Eddie wrote:
> >> 
> >> Greg:
> >> 	Here are some detail comments towarding the LAPIC device model.
> 1:
> >> 	irqabstraction layer vcpu->irq.pending holds the abstract
> processor
> >> interrupt request (called localint, extint, nmi etc in V09).  But
> >> API kvm_vcpu_intr only set the interrupt request, no clear.
> > 
> > By design
> > 
> >>   So far I only noticed when an
> >> interrupt request is pop or injected,  vcpu->irq.pending bit may be
> >> cleared. But there is case when guest raise TPR bar, an localint
> >> could be cleared.
> > 
> > Yep, by design.  And it will be set again when the TPR bar is lowered.
> 
> Mmm, since the vcpu->irq.pending is not cleared, now KVM will think
> there 
> is an irq to inject before the TPR is lowered.

This is incorrect.  If an interrupt is pending but masked by PPR,
irq.pending is cleared by the !(ack.flags & VECTOR_PENDING) check and no
event is injected to the guest. Later when PPR is modified to unmask the
vector, we re-raise the irq.pending bit.  Worst case scenario is we
perform a single superfluous ack cycle which reveals there are no
eligible vectors to inject.  No harm, no foul.  We clear the irq.pending
and move on. As far as the emulation is concerned, proper behavior is
obtained. 

This methodology for clearing irq.pending was implemented early on in
the review process to make the clearing atomic with the acknowledgment.

> 
> 
> > 
> >>  Same for extint when PIC IMR is masked.
> > 
> > Yeah, we have a hole here.  If IMR is changed to mask a pending
> > vector, it is conceivable that it could remain pending in the kernel.
> > Note that
> > this hole was always in KVM, even before my patch.  Because of
> 
> No, in original KVM, this kind of hole doesn't exist since all the PIC 
> logic are in Qemu. Each time if Qemu want to do KVM_INTERRUPT,
> Qemu will check the eflag.if, interrupt window etc. I.e. each
> KVM_INTERRUPT
> injected IRQ will be injected to guest immediately.

I agree that the current code *masks* the issue such that it is not an
actual problem today, yes.  However, I disagree that there isn't a hole
there.  ;)

The userspace code is predicated on the lock-step nature of the
user/kernel interaction.  It assumed that the vector would be injected
on the next KVM_RUN and therefore there was nothing that would change
its eligibility for injecting (and thus it was not possible for IMR to
change out from under it).  As evidenced by my patch, immediate injection
may not always be true in the future. ;) This is one of the issues which
should be fixed, IMHO.

In addition, I think we need to go back to the synchronous "try_to_push"
model that I had originally (before v9) so that the state of the vcpu is
properly considered before we ack the 8259.

Between these two things, I think it will work as expected.

> 
> > this I am
> > of the opinion that it doesn't need to be fixed as part of the LAPIC
> > work per se before merging.  However, that being said, we can lay the
> > groundwork to support this now by adding an "int level" to the
> > structure that is passed in the KVM_ISA_INTERRUPT ioctl.  Going
> > forward, someone can patch the QEMU::i8259 to send clear events when
> > IMR changes. 
> 
> No matter a new API or extend previous API, anyway the kernel interrupt
> IRR should
> be able to be set and cleared  by Qemu :-)
> 
> > 
> >> 
> >> 	In Xen, since there is no notion of abstract processor interrupt
> >> request, VMM check interrupt directly from PIC and APIC, so Xen
> >> doesn't have problem. I guess Windows will fail here.
> > 
> > No, this is incorrect (and Windows boots fine even w/ACPI
> > enabled, BTW).
> > For the LAPIC, we do check the LAPIC model directly (and therefore
> > consider TPR, etc).  The abstract interrupt mechanism simply tells the
> > vcpu that it needs to check the LAPIC.  It is not authoritative in
> > deciding if something actually gets injected.
> > 
> > For the PIC, we don't check directly (the PICs vectors are
> > cached in the
> > kernel), but again note that this is how KVM has always
> > worked.  I think
> > fixing the KVM_ISA_INTERRUPT::level mechanism + adding
> > IMR-clear support
> > shores up any issue there, however.
> > 
> >> 
> >> 	2: APIC timer
> >> 	a: V09 uses hrtimer for LAPIC timer, apic->timer.last_update is
> >> updated every time when __apic_timer_fn is invoked at time of the
> >> APIC timer fired. This impose an accumulated difference since the
> >> fire time is already some ns later after expected time.
> >> 	Xen solve this issue by increase apic->timer.last_update with
> >> the PERIOD, i.e. APIC_BUS_CYCLE_NS * apic->timer.divide_count *
> >> 	APIC_TMICT. b: Seems current approach starts hrtimer whenever
> >> APIC_TMICT is updated. Should we check APIC_LVT to see if it is
> >> masked here? (instead of doing in its callback
> >> function:__apic_timer_fn). Also why APIC_TMCCT is updated here? I
> >> 	think TMCCT is reloaded only when it reaches 0 and LVTT works in
> >> periodic mode. c: I didn't see LVTT mask status refelect the hrtimer 
> >> cancel/start, do I miss something?
> > 
> > I inherited most of lapic.c from Dor, and I believe he
> > inherited most of
> > it from an older version of Xen.  While I have come to understand much
> > of the inner workings of the LAPIC during the course of developing
> > this patch, the timer is still a relative enigma to me.  Therefore, I
> > do not have any comment as to the reasons why something was done here
> > the way it was, nor to the validity of the problems you are
> > highlighting. Perhaps Dor will know. 
> > 
> > But that being said, patches against v09 to fix problems you see are
> > always welcome. 
> > 
> > 
> >> 
> >> 	3:  Assume a senario there is an valid IDT_VECTORING_INFO_FIELD,
> >> following code(after patch) in handle_exception push back the failed
> >> interrupt vector, i.e. vcpu->irq.deferred.
> >> 
> >> ........
> >>         if (is_external_interrupt(vect_info)) {
> >>                 /*
> >>                  * An exception was taken while we were trying to
> >> inject an 
> >>                  * IRQ.  We must defer the injection of the vector
> >> until 
> >>                  * the next window.
> >>                  */
> >>                 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
> >>                 kvm_vcpu_irq_push(vcpu, irq);
> >>         }
> >> 
> >> 
> >> 	a:  Now, the abstracted processor interrupt, i.e.
> >> vcpu->irq.pending, could be 0, __kvm_vcpu_irq_pending invoked at
> >> beginning of do_interrupt_requests will set kvm_irqpin_localint in
> >> the abstracted processor interrupt (vcpu->irq.pending). But if at
> >> same time, we get an external IRQ, i.e. vcpu->irq.pending is set
> >> 	with both localint & extint. From following code ,
> >> kvm_irqpin_extint has higher priority than kvm_irqpin_localint.
> > 
> > Yes, I understand this scenario.  It is the same problem I was
> > trying to
> > describe earlier when I said extint/nmi can inadvertently get
> > prioritized over deferred.  I will fix this.
> > 
> >> 
> >>         while (pending) {
> >>                 kvm_irqpin_t pin = __fls(pending);
> >> 
> >>                 switch (pin) {
> >>                 case kvm_irqpin_localint:
> >>                 case kvm_irqpin_extint:
> >>                 case kvm_irqpin_nmi:
> >>                         do_intr_requests(vcpu, kvm_run, pin);
> >>                         break;
> >> 		..............
> >> 
> >> 	Now in do_intr_requests, we get an extirq vector instead of
> >> vcpu->irq.deferred from following code since pin=kvm_irqpin_extint.
> >> That means we inject an new irq instead of original failed irq in
> >> IDT_VECTORING_INFO_FIELD. 
> >> 
> >> 		........
> >>                 switch (pin) {
> >>                 case kvm_irqpin_localint:
> >>                         r = kvm_vcpu_irq_pop(vcpu, &ack);
> >>                         break;
> >>                 case kvm_irqpin_extint:
> >>                         r = kvm_irqdevice_ack(&vcpu->kvm->isa_irq,
> >>                         0, &ack); if (!(ack.flags &
> >> KVM_IRQACKDATA_VECTOR_PENDING))
> >>                                 __clear_bit(pin, &vcpu->irq.pending);
> >>                         break;
> >>                 case kvm_irqpin_nmi:
> >> 
> >> 
> >> 	Anyway due to SMP & in-kernel APIC, I'd like to suggest we move
> >> IDT_VECTORING_INFO_FIELD to do_interrupt_requests like
> >> vmx_intr_assist in Xen where physical IRQ is disabled.
> > 
> > I haven't looked at the new Xen code, but I will try to take a peek.
> > Its probably moot since I am confident that the fix I am suggesting
> > allows the deferred mechanism to work the way I intended, but I will
> > keep an open mind to alternative solutions.
> 
> The Xen side doesn't support NMI yet though the patch for NMI is in hand
> now :-(
> Hopefully it will be out this week :-)
> 
> > 
> >> 
> >> 
> >> 
> >> thx, eddie


-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 0/9] in-kernel APIC v9 (kernel side)
       [not found] ` <1181562984.4515.20.camel-5CR4LY5GPkvLDviKLk5550HKjMygAv58XqFh9Ls21Oc@public.gmane.org>
  2007-06-11 14:41   ` Dong, Eddie
@ 2007-06-13 22:26   ` Dor Laor
  1 sibling, 0 replies; 21+ messages in thread
From: Dor Laor @ 2007-06-13 22:26 UTC (permalink / raw)
  To: Gregory Haskins, Dong, Eddie; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

>> 	2: APIC timer
>> 	a: V09 uses hrtimer for LAPIC timer, apic->timer.last_update is
>> updated every time when __apic_timer_fn is invoked at time of the
APIC
>> timer fired. This impose an accumulated difference since the fire
time
>> is already some ns later after expected time.
>> 	Xen solve this issue by increase apic->timer.last_update with
>> the PERIOD, i.e. APIC_BUS_CYCLE_NS * apic->timer.divide_count *
>> APIC_TMICT.

In theory you're right, but in practice, if many hrtimers pop between
TMCCT reads, then this is insignificant.
But I tend to agree, since this calculation is done anyway in the
timer function.

>> 	b: Seems current approach starts hrtimer whenever APIC_TMICT is
>> updated. Should we check APIC_LVT to see if it is masked here?
(instead
>> of doing in its callback function:__apic_timer_fn). Also why
APIC_TMCCT

I'm not sure the spec says anything about not running the timer while
the
LVTT is masked. Since it is checked anyway in the timer_fn it is
practically identical. This also frees us from starting/stopping the
timer on mask changes.

>> is updated here? I think TMCCT is reloaded only when it reaches 0 and
>> LVTT works in periodic mode.

Good catch. As Greg pointed out below, it is code he inherited from me
and I inherited from Xen.
Did you check the APIC spec w.r.t. the TMCCT value on TMICT updates?


>> 	c: I didn't see LVTT mask status refelect the hrtimer
>> cancel/start, do I miss something?

Since the timer_fn checks the mask, it should be covered as explained
above.
Do you see any issues with it?

>I inherited most of lapic.c from Dor, and I believe he inherited most
of
>it from an older version of Xen.  While I have come to understand much
>of the inner workings of the LAPIC during the course of developing this
>patch, the timer is still a relative enigma to me.  Therefore, I do not
>have any comment as to the reasons why something was done here the way
>it was, nor to the validity of the problems you are highlighting.
>Perhaps Dor will know.

HTH ;)

>
>But that being said, patches against v09 to fix problems you see are
>always welcome.
>

-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 9/9] KVM: Adds support for halting in the kernel
       [not found]     ` <20070531180934.1810.45024.stgit-5CR4LY5GPkvLDviKLk5550HKjMygAv58XqFh9Ls21Oc@public.gmane.org>
@ 2007-06-19 14:48       ` Avi Kivity
  0 siblings, 0 replies; 21+ messages in thread
From: Avi Kivity @ 2007-06-19 14:48 UTC (permalink / raw)
  To: Gregory Haskins; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Gregory Haskins wrote:
> Halting in userspace requires a relatively cumbersome mechanism to signal the
> halted VCPU.  Implementing halt in kernel should be relatively straight
> forward and it eliminates the need for the signaling
>
>   

Merging this one in, found some nits:

> +/*
>   * This function is invoked whenever we want to interrupt a vcpu that is
>   * currently executing in guest-mode.  It currently is a no-op because
>   * the simple delivery of the IPI to execute this function accomplishes our
> @@ -2481,6 +2556,16 @@ static void kvm_vcpu_intr(struct kvm_irqsink *this,
>  			BUG_ON(direct_ipi == smp_processor_id());
>  			++vcpu->stat.guest_preempt;
>  		}
> +
> +		/*
> +		 * If the CPU is halted it will be waiting for a wake-up
> +		 */
> +		if (waitqueue_active(&vcpu->irq.wq)) {
>   

Why do the check?  The only reason I can see is to keep the stats 
correct.  Otherwise we can do the body of the if unconditionally.

> +			wake_up_interruptible_sync(&vcpu->irq.wq);
> +			set_tsk_need_resched(current);
>   

This is unneeded?  I'd expect wake_up_interruptible_sync() to take care 
of any rescheduling needed.

> +			++vcpu->stat.halt_wakeup;
> +		}
> +
>   


-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2007-06-19 14:48 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-05-31 18:08 [PATCH 0/9] in-kernel APIC v9 (kernel side) Gregory Haskins
     [not found] ` <20070531180005.1810.23884.stgit-5CR4LY5GPkvLDviKLk5550HKjMygAv58XqFh9Ls21Oc@public.gmane.org>
2007-05-31 18:08   ` [PATCH 1/9] KVM: Adds support for in-kernel mmio handlers Gregory Haskins
2007-05-31 18:08   ` [PATCH 2/9] KVM: VMX - fix interrupt checking on light-exit Gregory Haskins
2007-05-31 18:09   ` [PATCH 3/9] KVM: Add irqdevice object Gregory Haskins
     [not found]     ` <20070531180903.1810.87474.stgit-5CR4LY5GPkvLDviKLk5550HKjMygAv58XqFh9Ls21Oc@public.gmane.org>
2007-06-04  7:13       ` Dong, Eddie
2007-05-31 18:09   ` [PATCH 4/9] KVM: Adds ability to preempt an executing VCPU Gregory Haskins
2007-05-31 18:09   ` [PATCH 5/9] KVM: Add support for in-kernel LAPIC model Gregory Haskins
2007-05-31 18:09   ` [PATCH 6/9] KVM: Adds support for real NMI injection on VMX processors Gregory Haskins
     [not found]     ` <20070531180919.1810.30009.stgit-5CR4LY5GPkvLDviKLk5550HKjMygAv58XqFh9Ls21Oc@public.gmane.org>
2007-06-01  2:43       ` [PATCH 6/9] KVM: Adds support for real NMI injection onVMX processors Li, Xin B
2007-05-31 18:09   ` [PATCH 7/9] KVM: Adds basic plumbing to support TPR shadow features Gregory Haskins
2007-05-31 18:09   ` [PATCH 8/9] KVM: Add statistics from interrupt subsystem Gregory Haskins
2007-05-31 18:09   ` [PATCH 9/9] KVM: Adds support for halting in the kernel Gregory Haskins
     [not found]     ` <20070531180934.1810.45024.stgit-5CR4LY5GPkvLDviKLk5550HKjMygAv58XqFh9Ls21Oc@public.gmane.org>
2007-06-19 14:48       ` Avi Kivity
2007-06-02 22:04   ` [PATCH 0/9] in-kernel APIC v9 (kernel side) Dor Laor
2007-06-03  9:28   ` Avi Kivity
     [not found]     ` <466289A4.9000201-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-06-03 15:39       ` Avi Kivity
2007-06-07  8:20   ` Dong, Eddie
  -- strict thread matches above, loose matches on Subject: below --
2007-06-11 11:56 Gregory Haskins
     [not found] ` <1181562984.4515.20.camel-5CR4LY5GPkvLDviKLk5550HKjMygAv58XqFh9Ls21Oc@public.gmane.org>
2007-06-11 14:41   ` Dong, Eddie
2007-06-13 22:26   ` Dor Laor
2007-06-11 15:40 Gregory Haskins

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox