* irqdevice INTR example
@ 2007-04-12 4:02 Gregory Haskins
[not found] ` <461D7702.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
0 siblings, 1 reply; 13+ messages in thread
From: Gregory Haskins @ 2007-04-12 4:02 UTC (permalink / raw)
To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
[-- Attachment #1: Type: text/plain, Size: 571 bytes --]
Hi All,
Attached are the first three patches in my queue. The first two you are likely familiar with at this point (though I have made some more of the requested changes to 02-irqdevice.patch). The last item (03-preemptible-cpu.patch) adds an implementation to the previously unused kvm_vcpu_intr() callback. This acts as a functional example of the INTR callback mechanism as Avi requested. Note that the work related to IF/NMI/TPR classification of interrupts happens later in my queue and is not mature enough to share yet, but hopefully soon.
Thoughts?
-Greg
[-- Attachment #2: 01-mmio_handler.patch --]
[-- Type: text/plain, Size: 5730 bytes --]
KVM: Adds support for in-kernel mmio handlers
From: <>
Signed-off-by: Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
---
drivers/kvm/kvm.h | 31 ++++++++++++++++++
drivers/kvm/kvm_main.c | 82 +++++++++++++++++++++++++++++++++++++++++-------
2 files changed, 101 insertions(+), 12 deletions(-)
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index fceeb84..181099f 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -236,6 +236,36 @@ struct kvm_pio_request {
int rep;
};
+struct kvm_io_device {
+ unsigned long (*read)(struct kvm_io_device *this,
+ gpa_t addr,
+ int length);
+ void (*write)(struct kvm_io_device *this,
+ gpa_t addr,
+ int length,
+ unsigned long val);
+ int (*in_range)(struct kvm_io_device *this, gpa_t addr);
+
+ void *private;
+};
+
+/*
+ * It would be nice to use something smarter than a linear search, TBD...
+ * Thankfully we don't expect many devices to register (famous last words :),
+ * so until then it will suffice. At least it's abstracted so we can change
+ * it in one place.
+ */
+struct kvm_io_bus {
+ int dev_count;
+#define NR_IOBUS_DEVS 6
+ struct kvm_io_device *devs[NR_IOBUS_DEVS];
+};
+
+void kvm_io_bus_init(struct kvm_io_bus *bus);
+struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
+void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
+ struct kvm_io_device *dev);
+
struct kvm_vcpu {
struct kvm *kvm;
union {
@@ -345,6 +375,7 @@ struct kvm {
unsigned long rmap_overflow;
struct list_head vm_list;
struct file *filp;
+ struct kvm_io_bus mmio_bus;
};
struct kvm_stat {
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 4473174..c3c0059 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -294,6 +294,7 @@ static struct kvm *kvm_create_vm(void)
spin_lock_init(&kvm->lock);
INIT_LIST_HEAD(&kvm->active_mmu_pages);
+ kvm_io_bus_init(&kvm->mmio_bus);
for (i = 0; i < KVM_MAX_VCPUS; ++i) {
struct kvm_vcpu *vcpu = &kvm->vcpus[i];
@@ -1015,12 +1016,25 @@ static int emulator_write_std(unsigned long addr,
return X86EMUL_UNHANDLEABLE;
}
+static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
+ gpa_t addr)
+{
+ /*
+ * Note that it's important to have this wrapper function because
+ * in the very near future we will be checking for MMIOs against
+ * the LAPIC as well as the general MMIO bus
+ */
+ return kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
+}
+
static int emulator_read_emulated(unsigned long addr,
unsigned long *val,
unsigned int bytes,
struct x86_emulate_ctxt *ctxt)
{
- struct kvm_vcpu *vcpu = ctxt->vcpu;
+ struct kvm_vcpu *vcpu = ctxt->vcpu;
+ struct kvm_io_device *mmio_dev;
+ gpa_t gpa;
if (vcpu->mmio_read_completed) {
memcpy(val, vcpu->mmio_data, bytes);
@@ -1029,18 +1043,26 @@ static int emulator_read_emulated(unsigned long addr,
} else if (emulator_read_std(addr, val, bytes, ctxt)
== X86EMUL_CONTINUE)
return X86EMUL_CONTINUE;
- else {
- gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
- if (gpa == UNMAPPED_GVA)
- return X86EMUL_PROPAGATE_FAULT;
- vcpu->mmio_needed = 1;
- vcpu->mmio_phys_addr = gpa;
- vcpu->mmio_size = bytes;
- vcpu->mmio_is_write = 0;
+ gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
+ if (gpa == UNMAPPED_GVA)
+ return X86EMUL_PROPAGATE_FAULT;
- return X86EMUL_UNHANDLEABLE;
+ /*
+ * Is this MMIO handled locally?
+ */
+ mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
+ if (mmio_dev) {
+ *val = mmio_dev->read(mmio_dev, gpa, bytes);
+ return X86EMUL_CONTINUE;
}
+
+ vcpu->mmio_needed = 1;
+ vcpu->mmio_phys_addr = gpa;
+ vcpu->mmio_size = bytes;
+ vcpu->mmio_is_write = 0;
+
+ return X86EMUL_UNHANDLEABLE;
}
static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -1068,8 +1090,9 @@ static int emulator_write_emulated(unsigned long addr,
unsigned int bytes,
struct x86_emulate_ctxt *ctxt)
{
- struct kvm_vcpu *vcpu = ctxt->vcpu;
- gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
+ struct kvm_vcpu *vcpu = ctxt->vcpu;
+ struct kvm_io_device *mmio_dev;
+ gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
@@ -1077,6 +1100,15 @@ static int emulator_write_emulated(unsigned long addr,
if (emulator_write_phys(vcpu, gpa, val, bytes))
return X86EMUL_CONTINUE;
+ /*
+ * Is this MMIO handled locally?
+ */
+ mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
+ if (mmio_dev) {
+ mmio_dev->write(mmio_dev, gpa, bytes, val);
+ return X86EMUL_CONTINUE;
+ }
+
vcpu->mmio_needed = 1;
vcpu->mmio_phys_addr = gpa;
vcpu->mmio_size = bytes;
@@ -2911,6 +2943,32 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
return NOTIFY_OK;
}
+void kvm_io_bus_init(struct kvm_io_bus *bus)
+{
+ memset(bus, 0, sizeof(*bus));
+}
+
+struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
+{
+ int i;
+
+ for (i = 0; i < bus->dev_count; i++) {
+ struct kvm_io_device *pos = bus->devs[i];
+
+ if (pos->in_range(pos, addr))
+ return pos;
+ }
+
+ return NULL;
+}
+
+void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
+{
+ BUG_ON(bus->dev_count >= (NR_IOBUS_DEVS-1));
+
+ bus->devs[bus->dev_count++] = dev;
+}
+
static struct notifier_block kvm_cpu_notifier = {
.notifier_call = kvm_cpu_hotplug,
.priority = 20, /* must be > scheduler priority */
[-- Attachment #3: 02-irqdevice.patch --]
[-- Type: text/plain, Size: 23734 bytes --]
KVM: Add irqdevice object
From: <>
The current code is geared towards using a user-mode (A)PIC. This patch adds
an "irqdevice" abstraction, and implements a "userint" model to handle the
duties of the original code. Later, we can develop other irqdevice models
to handle objects like LAPIC, IOAPIC, i8259, etc, as appropriate
Signed-off-by: Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
---
drivers/kvm/Makefile | 2
drivers/kvm/irqdevice.h | 170 ++++++++++++++++++++++++++++++++++++++++
drivers/kvm/kvm.h | 9 +-
drivers/kvm/kvm_main.c | 57 ++++++++++---
drivers/kvm/svm.c | 33 ++++----
drivers/kvm/userint.c | 202 +++++++++++++++++++++++++++++++++++++++++++++++
drivers/kvm/vmx.c | 29 +++----
7 files changed, 449 insertions(+), 53 deletions(-)
diff --git a/drivers/kvm/Makefile b/drivers/kvm/Makefile
index c0a789f..540afbc 100644
--- a/drivers/kvm/Makefile
+++ b/drivers/kvm/Makefile
@@ -2,7 +2,7 @@
# Makefile for Kernel-based Virtual Machine module
#
-kvm-objs := kvm_main.o mmu.o x86_emulate.o
+kvm-objs := kvm_main.o mmu.o x86_emulate.o userint.o
obj-$(CONFIG_KVM) += kvm.o
kvm-intel-objs = vmx.o
obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/drivers/kvm/irqdevice.h b/drivers/kvm/irqdevice.h
new file mode 100644
index 0000000..fe284bc
--- /dev/null
+++ b/drivers/kvm/irqdevice.h
@@ -0,0 +1,170 @@
+/*
+ * Defines an interface for an abstract interrupt controller. The model
+ * consists of a unit with an arbitrary number of input lines (IRQ0-N), an
+ * output line (INTR), and methods for completing an interrupt-acknowledge
+ * cycle (INTA). A particular implementation of this model will define
+ * various policies, such as irq-to-vector translation, INTA/auto-EOI policy,
+ * etc.
+ *
+ * In addition, the INTR callback mechanism allows the unit to be "wired" to
+ * an interruptible source in a very flexible manner. For instance, an
+ * irqdevice could have its INTR wired to a VCPU (ala LAPIC), or another
+ * interrupt controller (ala cascaded i8259s)
+ *
+ * Copyright (C) 2007 Novell
+ *
+ * Authors:
+ * Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef __IRQDEVICE_H
+#define __IRQDEVICE_H
+
+#define KVM_IRQFLAGS_NMI (1 << 0)
+#define KVM_IRQFLAGS_PEEK (1 << 1)
+
+struct kvm_irqdevice;
+
+struct kvm_irqsink {
+ void (*raise_intr)(struct kvm_irqsink *this,
+ struct kvm_irqdevice *dev);
+
+ void *private;
+};
+
+struct kvm_irqdevice {
+ int (*pending)(struct kvm_irqdevice *this, int flags);
+ int (*read_vector)(struct kvm_irqdevice *this, int flags);
+ int (*set_pin)(struct kvm_irqdevice *this, int pin, int level);
+ int (*summary)(struct kvm_irqdevice *this, void *data);
+ void (*destructor)(struct kvm_irqdevice *this);
+
+ void *private;
+ struct kvm_irqsink sink;
+};
+
+/**
+ * kvm_irqdevice_init - initialize the kvm_irqdevice for use
+ * @dev: The device
+ *
+ * Description: Initialize the kvm_irqdevice for use. Should be called before
+ * calling any derived implementation init functions
+ *
+ * Returns: (void)
+ */
+static inline void kvm_irqdevice_init(struct kvm_irqdevice *dev)
+{
+ memset(dev, 0, sizeof(*dev));
+}
+
+/**
+ * kvm_irqdevice_pending - efficiently determines if an interrupt is pending
+ * @dev: The device
+ * @flags: Modifies the behavior as follows:
+ * [+ KVM_IRQFLAGS_NMI: Mask everything but NMIs]
+ *
+ * Description: Efficiently determines if an interrupt is pending on an
+ * irqdevice
+ *
+ * Returns: (int)
+ * [0 = no interrupts pending (per "flags" criteria)]
+ * [1 = one or more interrupts are pending]
+ */
+static inline int kvm_irqdevice_pending(struct kvm_irqdevice *dev, int flags)
+{
+ return dev->pending(dev, flags);
+}
+
+/**
+ * kvm_irqdevice_read_vector - read the highest priority vector from the device
+ * @dev: The device
+ * @flags: Modifies the behavior as follows:
+ * [+ KVM_IRQFLAGS_NMI: Mask everything but NMIs]
+ * [+ KVM_IRQFLAGS_PEEK: Do not auto-acknowledge interrupt]
+ *
+ * Description: Read the highest priority pending vector from the device,
+ * potentially invoking auto-EOI depending on device policy
+ *
+ * Returns: (int)
+ * [ -1 = no interrupts pending (per "flags" criteria)]
+ * [>=0 = the highest pending vector]
+ */
+static inline int kvm_irqdevice_read_vector(struct kvm_irqdevice *dev,
+ int flags)
+{
+ return dev->read_vector(dev, flags);
+}
+
+/**
+ * kvm_irqdevice_set_pin - allows the caller to assert/deassert an IRQ
+ * @dev: The device
+ * @pin: The input pin to alter
+ * @level: The value to set (1 = assert, 0 = deassert)
+ *
+ * Description: Allows the caller to assert/deassert an IRQ input pin to the
+ * device according to device policy.
+ *
+ * Returns: (int)
+ * [-1 = failure]
+ * [ 0 = success]
+ */
+static inline int kvm_irqdevice_set_pin(struct kvm_irqdevice *dev, int pin,
+ int level)
+{
+ return dev->set_pin(dev, pin, level);
+}
+
+/**
+ * kvm_irqdevice_summary - loads a summary bitmask
+ * @dev: The device
+ * @data: A pointer to a region capable of holding a 256 bit bitmap
+ *
+ * Description: Loads a summary bitmask of all pending vectors (0-255)
+ *
+ * Returns: (int)
+ * [-1 = failure]
+ * [ 0 = success]
+ */
+static inline int kvm_irqdevice_summary(struct kvm_irqdevice *dev, void *data)
+{
+ return dev->summary(dev, data);
+}
+
+/**
+ * kvm_irqdevice_register_sink - registers a kvm_irqsink object
+ * @dev: The device
+ * @sink: The sink to register. Data will be copied so building object from
+ * transient storage is ok.
+ *
+ * Description: Registers a kvm_irqsink object as an INTR callback
+ *
+ * Returns: (void)
+ */
+static inline void kvm_irqdevice_register_sink(struct kvm_irqdevice *dev,
+ const struct kvm_irqsink *sink)
+{
+ dev->sink = *sink;
+}
+
+/*
+ * kvm_irqdevice_raise_intr - invokes a registered INTR callback
+ * @dev: The device
+ *
+ * Description: Invokes a registered INTR callback (if present). This
+ * function is meant to be used privately by an irqdevice
+ * implementation.
+ *
+ * Returns: (void)
+ */
+static inline void kvm_irqdevice_raise_intr(struct kvm_irqdevice *dev)
+{
+ struct kvm_irqsink *sink = &dev->sink;
+ if (sink->raise_intr)
+ sink->raise_intr(sink, dev);
+}
+
+#endif /* __IRQDEVICE_H */
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index 181099f..58966d9 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -13,6 +13,7 @@
#include <linux/mm.h>
#include "vmx.h"
+#include "irqdevice.h"
#include <linux/kvm.h>
#include <linux/kvm_para.h>
@@ -157,6 +158,8 @@ struct vmcs {
struct kvm_vcpu;
+int kvm_userint_init(struct kvm_irqdevice *dev);
+
/*
* x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
* 32-bit). The kvm_mmu structure abstracts the details of the current mmu
@@ -266,6 +269,8 @@ struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
struct kvm_io_device *dev);
+#define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long)
+
struct kvm_vcpu {
struct kvm *kvm;
union {
@@ -278,9 +283,7 @@ struct kvm_vcpu {
u64 host_tsc;
struct kvm_run *run;
int interrupt_window_open;
- unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
-#define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long)
- unsigned long irq_pending[NR_IRQ_WORDS];
+ struct kvm_irqdevice irq_dev;
unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */
unsigned long rip; /* needs vcpu_load_rsp_rip() */
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index c3c0059..7e00412 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -1989,8 +1989,7 @@ static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
sregs->efer = vcpu->shadow_efer;
sregs->apic_base = vcpu->apic_base;
- memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
- sizeof sregs->interrupt_bitmap);
+ kvm_irqdevice_summary(&vcpu->irq_dev, &sregs->interrupt_bitmap);
vcpu_put(vcpu);
@@ -2044,13 +2043,11 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
if (mmu_reset_needed)
kvm_mmu_reset_context(vcpu);
- memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
- sizeof vcpu->irq_pending);
- vcpu->irq_summary = 0;
- for (i = 0; i < NR_IRQ_WORDS; ++i)
- if (vcpu->irq_pending[i])
- __set_bit(i, &vcpu->irq_summary);
-
+ /* walk the interrupt-bitmap and inject an IRQ for each bit found */
+ for (i = 0; i < 256; ++i)
+ if (test_bit(i, &sregs->interrupt_bitmap[0]))
+ kvm_irqdevice_set_pin(&vcpu->irq_dev, i, 1);
+
set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
@@ -2210,14 +2207,8 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
{
if (irq->irq < 0 || irq->irq >= 256)
return -EINVAL;
- vcpu_load(vcpu);
-
- set_bit(irq->irq, vcpu->irq_pending);
- set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
- vcpu_put(vcpu);
-
- return 0;
+ return kvm_irqdevice_set_pin(&vcpu->irq_dev, irq->irq, 1);
}
static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
@@ -2319,6 +2310,36 @@ out1:
}
/*
+ * This function will be invoked whenever the vcpu->irq_dev raises its INTR
+ * line
+ */
+static void kvm_vcpu_intr(struct kvm_irqsink *this,
+ struct kvm_irqdevice *dev)
+{
+ /*
+ * Our irq device is requesting to interrupt the vcpu. If it is
+ * currently running, we should inject a host IPI to force a VMEXIT
+ */
+
+ /*
+ * FIXME: Implement this or the CPU won't notice the interrupt until
+ * the next natural VMEXIT. Note that this is how the system
+ * has always worked, so nothing is broken here. This is a future
+ * enhancement
+ */
+}
+
+static void kvm_vcpu_irqsink_init(struct kvm_vcpu *vcpu)
+{
+ struct kvm_irqsink sink = {
+ .raise_intr = kvm_vcpu_intr,
+ .private = vcpu
+ };
+
+ kvm_irqdevice_register_sink(&vcpu->irq_dev, &sink);
+}
+
+/*
* Creates some virtual cpus. Good luck creating more than one.
*/
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
@@ -2364,6 +2385,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
if (r < 0)
goto out_free_vcpus;
+ kvm_irqdevice_init(&vcpu->irq_dev);
+ kvm_vcpu_irqsink_init(vcpu);
+ kvm_userint_init(&vcpu->irq_dev);
+
kvm_arch_ops->vcpu_load(vcpu);
r = kvm_mmu_setup(vcpu);
if (r >= 0)
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
index b7e1410..e59a548 100644
--- a/drivers/kvm/svm.c
+++ b/drivers/kvm/svm.c
@@ -108,20 +108,16 @@ static unsigned get_addr_size(struct kvm_vcpu *vcpu)
static inline u8 pop_irq(struct kvm_vcpu *vcpu)
{
- int word_index = __ffs(vcpu->irq_summary);
- int bit_index = __ffs(vcpu->irq_pending[word_index]);
- int irq = word_index * BITS_PER_LONG + bit_index;
-
- clear_bit(bit_index, &vcpu->irq_pending[word_index]);
- if (!vcpu->irq_pending[word_index])
- clear_bit(word_index, &vcpu->irq_summary);
- return irq;
+ return kvm_irqdevice_read_vector(&vcpu->irq_dev, 0);
}
static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
{
- set_bit(irq, vcpu->irq_pending);
- set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
+ /* FIXME: We probably want to reserve the "set_pin" verb for
+ * actual interrupt requests, not for putting back something
+ * previously pending. Let's revisit this
+ */
+ kvm_irqdevice_set_pin(&vcpu->irq_dev, irq, 1);
}
static inline void clgi(void)
@@ -1092,7 +1088,7 @@ static int halt_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 1;
skip_emulated_instruction(vcpu);
- if (vcpu->irq_summary)
+ if (kvm_irqdevice_pending(&vcpu->irq_dev, 0))
return 1;
kvm_run->exit_reason = KVM_EXIT_HLT;
@@ -1263,7 +1259,7 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu,
* possible
*/
if (kvm_run->request_interrupt_window &&
- !vcpu->irq_summary) {
+ !kvm_irqdevice_pending(&vcpu->irq_dev, 0)) {
++kvm_stat.irq_window_exits;
kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
return 0;
@@ -1399,7 +1395,8 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
(!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
(vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF));
- if (vcpu->interrupt_window_open && vcpu->irq_summary)
+ if (vcpu->interrupt_window_open &&
+ kvm_irqdevice_pending(&vcpu->irq_dev, 0))
/*
* If interrupts enabled, and not blocked by sti or mov ss. Good.
*/
@@ -1409,7 +1406,8 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
* Interrupts blocked. Wait for unblock.
*/
if (!vcpu->interrupt_window_open &&
- (vcpu->irq_summary || kvm_run->request_interrupt_window)) {
+ (kvm_irqdevice_pending(&vcpu->irq_dev, 0) ||
+ kvm_run->request_interrupt_window)) {
control->intercept |= 1ULL << INTERCEPT_VINTR;
} else
control->intercept &= ~(1ULL << INTERCEPT_VINTR);
@@ -1418,8 +1416,9 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
static void post_kvm_run_save(struct kvm_vcpu *vcpu,
struct kvm_run *kvm_run)
{
- kvm_run->ready_for_interrupt_injection = (vcpu->interrupt_window_open &&
- vcpu->irq_summary == 0);
+ kvm_run->ready_for_interrupt_injection =
+ (vcpu->interrupt_window_open &&
+ !kvm_irqdevice_pending(&vcpu->irq_dev, 0));
kvm_run->if_flag = (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF) != 0;
kvm_run->cr8 = vcpu->cr8;
kvm_run->apic_base = vcpu->apic_base;
@@ -1434,7 +1433,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
struct kvm_run *kvm_run)
{
- return (!vcpu->irq_summary &&
+ return (!kvm_irqdevice_pending(&vcpu->irq_dev, 0) &&
kvm_run->request_interrupt_window &&
vcpu->interrupt_window_open &&
(vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF));
diff --git a/drivers/kvm/userint.c b/drivers/kvm/userint.c
new file mode 100644
index 0000000..8363060
--- /dev/null
+++ b/drivers/kvm/userint.c
@@ -0,0 +1,202 @@
+/*
+ * User Interrupts IRQ device
+ *
+ * This acts as an extension of an interrupt controller that exists elsewhere
+ * (typically in userspace/QEMU). Because this PIC is a pseudo device that
+ * is downstream from a real emulated PIC, the "IRQ-to-vector" mapping has
+ * already occurred. Therefore, this PIC has the following unusual properties:
+ *
+ * 1) It has 256 "pins" which are literal vectors (i.e. no translation)
+ * 2) It only supports "auto-EOI" behavior since it is expected that the
+ * upstream emulated PIC will handle the real EOIs (if applicable)
+ * 3) It only listens to "asserts" on the pins (deasserts are dropped)
+ * because it's an auto-EOI device anyway.
+ *
+ * Copyright (C) 2007 Novell
+ *
+ * bitarray code based on original vcpu->irq_pending code,
+ * Copyright (C) 2007 Qumranet
+ *
+ * Authors:
+ * Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "kvm.h"
+
+/*
+ *----------------------------------------------------------------------
+ * optimized bitarray object - works like bitarrays in bitops, but uses
+ * a summary field to accelerate lookups. Assumes external locking
+ *---------------------------------------------------------------------
+ */
+
+struct bitarray {
+ unsigned long summary; /* 1 per word in pending */
+ unsigned long pending[NR_IRQ_WORDS];
+};
+
+static inline int bitarray_pending(struct bitarray *this)
+{
+ return this->summary ? 1 : 0;
+}
+
+static inline int bitarray_findhighest(struct bitarray *this)
+{
+ if (!this->summary)
+ return -1;
+ else {
+ int word_index = __fls(this->summary);
+ int bit_index = __fls(this->pending[word_index]);
+
+ return word_index * BITS_PER_LONG + bit_index;
+ }
+}
+
+static inline void bitarray_set(struct bitarray *this, int nr)
+{
+ __set_bit(nr, &this->pending);
+ __set_bit(nr / BITS_PER_LONG, &this->summary);
+}
+
+static inline void bitarray_clear(struct bitarray *this, int nr)
+{
+ int word = nr / BITS_PER_LONG;
+
+ __clear_bit(nr, &this->pending);
+ if (!this->pending[word])
+ __clear_bit(word, &this->summary);
+}
+
+static inline int bitarray_test(struct bitarray *this, int nr)
+{
+ return test_bit(nr, &this->pending);
+}
+
+/*
+ *----------------------------------------------------------------------
+ * userint interface - provides the actual kvm_irqdevice implementation
+ *---------------------------------------------------------------------
+ */
+
+struct kvm_userint {
+ spinlock_t lock;
+ struct bitarray irq_pending;
+ int nmi_pending;
+};
+
+static int userint_pending(struct kvm_irqdevice *this, int flags)
+{
+ struct kvm_userint *s = (struct kvm_userint*)this->private;
+ int ret;
+
+ spin_lock_irq(&s->lock);
+
+ if (flags & KVM_IRQFLAGS_NMI)
+ ret = s->nmi_pending;
+ else
+ ret = bitarray_pending(&s->irq_pending);
+
+ spin_unlock_irq(&s->lock);
+
+ return ret;
+}
+
+static int userint_read_vector(struct kvm_irqdevice *this, int flags)
+{
+ struct kvm_userint *s = (struct kvm_userint*)this->private;
+ int irq;
+
+ spin_lock_irq(&s->lock);
+
+ /*
+ * NMIs take priority, so if there is an NMI pending, or
+ * if we are filtering out NMIs, only consider them
+ */
+ if (s->nmi_pending || (flags & KVM_IRQFLAGS_NMI))
+ irq = s->nmi_pending ? 2 : -1;
+ else
+ irq = bitarray_findhighest(&s->irq_pending);
+
+ if ((irq > -1) && !(flags & KVM_IRQFLAGS_PEEK)) {
+ /*
+ * If the "peek" flag is not set, automatically clear the
+ * interrupt as the EOI mechanism (if any) will take place
+ * in userspace
+ */
+ bitarray_clear(&s->irq_pending, irq);
+ if (irq == 2)
+ s->nmi_pending = 0;
+ }
+
+ spin_unlock_irq(&s->lock);
+
+ return irq;
+}
+
+static int userint_set_pin(struct kvm_irqdevice* this, int irq, int level)
+{
+ struct kvm_userint *s = (struct kvm_userint*)this->private;
+
+ if (!level)
+ return 0; /* We don't care about deasserts */
+
+ spin_lock_irq(&s->lock);
+
+ /*
+ * Update the local state
+ */
+ bitarray_set(&s->irq_pending, irq);
+ if (irq == 2)
+ s->nmi_pending = 1;
+
+ spin_unlock_irq(&s->lock);
+
+ /*
+ * And then alert the higher layer software we have changes
+ */
+ kvm_irqdevice_raise_intr(this);
+
+ return 0;
+}
+
+static int userint_summary(struct kvm_irqdevice* this, void *data)
+{
+ struct kvm_userint *s = (struct kvm_userint*)this->private;
+
+ spin_lock_irq(&s->lock);
+ memcpy(data, s->irq_pending.pending, sizeof s->irq_pending.pending);
+ spin_unlock_irq(&s->lock);
+
+ return 0;
+}
+
+static void userint_destructor(struct kvm_irqdevice *this)
+{
+ kfree(this->private);
+}
+
+int kvm_userint_init(struct kvm_irqdevice *dev)
+{
+ struct kvm_userint *s;
+
+ s = kzalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ spin_lock_init(&s->lock);
+
+ dev->pending = userint_pending;
+ dev->read_vector = userint_read_vector;
+ dev->set_pin = userint_set_pin;
+ dev->summary = userint_summary;
+ dev->destructor = userint_destructor;
+
+ dev->private = s;
+
+ return 0;
+}
+
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 61a6116..a0fdf02 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -1219,13 +1219,8 @@ static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq)
static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
{
- int word_index = __ffs(vcpu->irq_summary);
- int bit_index = __ffs(vcpu->irq_pending[word_index]);
- int irq = word_index * BITS_PER_LONG + bit_index;
-
- clear_bit(bit_index, &vcpu->irq_pending[word_index]);
- if (!vcpu->irq_pending[word_index])
- clear_bit(word_index, &vcpu->irq_summary);
+ int irq = kvm_irqdevice_read_vector(&vcpu->irq_dev, 0);
+ BUG_ON(irq < 0);
if (vcpu->rmode.active) {
inject_rmode_irq(vcpu, irq);
@@ -1246,7 +1241,7 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
if (vcpu->interrupt_window_open &&
- vcpu->irq_summary &&
+ kvm_irqdevice_pending(&vcpu->irq_dev, 0) &&
!(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
/*
* If interrupts enabled, and not blocked by sti or mov ss. Good.
@@ -1255,7 +1250,8 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
if (!vcpu->interrupt_window_open &&
- (vcpu->irq_summary || kvm_run->request_interrupt_window))
+ (kvm_irqdevice_pending(&vcpu->irq_dev, 0) ||
+ kvm_run->request_interrupt_window))
/*
* Interrupts blocked. Wait for unblock.
*/
@@ -1314,8 +1310,8 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (is_external_interrupt(vect_info)) {
int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
- set_bit(irq, vcpu->irq_pending);
- set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
+ /* FIXME: Is this right? */
+ kvm_irqdevice_set_pin(&vcpu->irq_dev, irq, 1);
}
if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */
@@ -1619,8 +1615,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
kvm_run->if_flag = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) != 0;
kvm_run->cr8 = vcpu->cr8;
kvm_run->apic_base = vcpu->apic_base;
- kvm_run->ready_for_interrupt_injection = (vcpu->interrupt_window_open &&
- vcpu->irq_summary == 0);
+ kvm_run->ready_for_interrupt_injection =
+ (vcpu->interrupt_window_open &&
+ !kvm_irqdevice_pending(&vcpu->irq_dev, 0));
}
static int handle_interrupt_window(struct kvm_vcpu *vcpu,
@@ -1631,7 +1628,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
* possible
*/
if (kvm_run->request_interrupt_window &&
- !vcpu->irq_summary) {
+ !kvm_irqdevice_pending(&vcpu->irq_dev, 0)) {
kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
++kvm_stat.irq_window_exits;
return 0;
@@ -1642,7 +1639,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
skip_emulated_instruction(vcpu);
- if (vcpu->irq_summary)
+ if (kvm_irqdevice_pending(&vcpu->irq_dev, 0))
return 1;
kvm_run->exit_reason = KVM_EXIT_HLT;
@@ -1713,7 +1710,7 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
struct kvm_run *kvm_run)
{
- return (!vcpu->irq_summary &&
+ return (!kvm_irqdevice_pending(&vcpu->irq_dev, 0) &&
kvm_run->request_interrupt_window &&
vcpu->interrupt_window_open &&
(vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
[-- Attachment #4: 03-preemptible-cpu.patch --]
[-- Type: text/plain, Size: 10834 bytes --]
KVM: Preemptible VCPU
From: <>
This adds support for interrupting an executing CPU
Signed-off-by: Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
---
drivers/kvm/Makefile | 2 -
drivers/kvm/condvar.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++++
drivers/kvm/condvar.h | 36 ++++++++++++++++
drivers/kvm/kvm.h | 12 +++++
drivers/kvm/kvm_main.c | 47 ++++++++++++++++++---
drivers/kvm/svm.c | 35 +++++++++++++++
drivers/kvm/vmx.c | 35 +++++++++++++++
7 files changed, 270 insertions(+), 6 deletions(-)
diff --git a/drivers/kvm/Makefile b/drivers/kvm/Makefile
index 540afbc..b3bef0e 100644
--- a/drivers/kvm/Makefile
+++ b/drivers/kvm/Makefile
@@ -2,7 +2,7 @@
# Makefile for Kernel-based Virtual Machine module
#
-kvm-objs := kvm_main.o mmu.o x86_emulate.o userint.o
+kvm-objs := kvm_main.o mmu.o x86_emulate.o userint.o condvar.o
obj-$(CONFIG_KVM) += kvm.o
kvm-intel-objs = vmx.o
obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/drivers/kvm/condvar.c b/drivers/kvm/condvar.c
new file mode 100644
index 0000000..87e464a
--- /dev/null
+++ b/drivers/kvm/condvar.c
@@ -0,0 +1,109 @@
+/*
+ * Condition Variable
+ *
+ * Copyright (C) 2007, Novell
+ *
+ * Authors:
+ * Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "condvar.h"
+
+void condvar_init(struct condvar *cv)
+{
+ wait_queue_head_t __head = __WAIT_QUEUE_HEAD_INITIALIZER(cv->queue);
+
+ memset(cv, 0, sizeof(*cv));
+ cv->queue = __head;
+}
+EXPORT_SYMBOL_GPL(condvar_init);
+
+/*
+ * Assumes the lock is already held
+ */
+int condvar_wait(struct condvar *cv, void *l, long timeout)
+{
+ DEFINE_WAIT(__wait);
+ int _ret = 0;
+
+ BUG_ON(!cv->lock_ops);
+
+ /*
+ * first place ourselves on the waitqueue before releasing the lock
+ */
+ prepare_to_wait(&cv->queue, &__wait, TASK_UNINTERRUPTIBLE);
+
+ /*
+ * now actually release the lock to unblock any potential signalers
+ */
+ cv->lock_ops->unlock(l);
+
+ /*
+ * finally, reschedule until we are re-awoken
+ */
+ if (timeout > -1)
+ schedule_timeout(timeout);
+ else
+ schedule();
+ finish_wait(&cv->queue, &__wait);
+
+ /*
+ * if we get here, it's because someone signaled us.
+ * reacquire the lock
+ */
+ cv->lock_ops->lock(l);
+
+ return _ret;
+}
+EXPORT_SYMBOL_GPL(condvar_wait);
+
+/*
+ * Assumes the lock is already held
+ */
+int condvar_signal(struct condvar *cv)
+{
+ wake_up(&cv->queue);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(condvar_signal);
+
+/*
+ *------------------------------------------------------------------------
+ * spinlock_condvar
+ *
+ * spinlock_lock/unlock can sometimes be implemented as macros, so
+ * assigning them as function pointers directly is probably not going to
+ * work. Therefore we need these lightweight wrappers
+ *------------------------------------------------------------------------
+ */
+
+static void spinlock_condvar_lock(void *l)
+{
+ spinlock_t *lock = (spinlock_t*)l;
+
+ spin_lock(lock);
+}
+
+static void spinlock_condvar_unlock(void *l)
+{
+ spinlock_t *lock = (spinlock_t*)l;
+
+ spin_unlock(lock);
+}
+
+static struct cv_lock_ops spinlock_ops = {
+ .lock = spinlock_condvar_lock,
+ .unlock = spinlock_condvar_unlock
+};
+
+void spinlock_condvar_init(struct condvar *cv)
+{
+ condvar_init(cv);
+
+ cv->lock_ops = &spinlock_ops;
+}
+
diff --git a/drivers/kvm/condvar.h b/drivers/kvm/condvar.h
new file mode 100644
index 0000000..58ed523
--- /dev/null
+++ b/drivers/kvm/condvar.h
@@ -0,0 +1,36 @@
+/*
+ * Condition Variable
+ *
+ * Copyright (C) 2007, Novell
+ *
+ * Authors:
+ * Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+struct cv_lock_ops {
+ void (*lock)(void *);
+ void (*unlock)(void *);
+};
+
+struct condvar {
+ wait_queue_head_t queue;
+ struct cv_lock_ops *lock_ops;
+};
+
+void condvar_init(struct condvar *cv);
+int condvar_wait(struct condvar *cv, void *l, long timeout);
+int condvar_signal(struct condvar *cv);
+
+/*
+ *------------------------------------------------------------------------
+ * spinlock_condvar
+ *------------------------------------------------------------------------
+ */
+
+void spinlock_condvar_init(struct condvar *cv);
+
+
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index 58966d9..703ffe0 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -14,6 +14,7 @@
#include "vmx.h"
#include "irqdevice.h"
+#include "condvar.h"
#include <linux/kvm.h>
#include <linux/kvm_para.h>
@@ -271,6 +272,16 @@ void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
#define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long)
+/*
+ * structure for maintaining info for interrupting an executing VCPU
+ */
+struct kvm_vcpu_irq {
+ spinlock_t lock;
+ struct condvar cv;
+ struct task_struct *task;
+ int pending;
+};
+
struct kvm_vcpu {
struct kvm *kvm;
union {
@@ -284,6 +295,7 @@ struct kvm_vcpu {
struct kvm_run *run;
int interrupt_window_open;
struct kvm_irqdevice irq_dev;
+ struct kvm_vcpu_irq irq;
unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */
unsigned long rip; /* needs vcpu_load_rsp_rip() */
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 7e00412..ea3609e 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -299,6 +299,11 @@ static struct kvm *kvm_create_vm(void)
struct kvm_vcpu *vcpu = &kvm->vcpus[i];
mutex_init(&vcpu->mutex);
+
+ memset(&vcpu->irq, 0, sizeof(vcpu->irq));
+ spin_lock_init(&vcpu->irq.lock);
+ spinlock_condvar_init(&vcpu->irq.cv);
+
vcpu->cpu = -1;
vcpu->kvm = kvm;
vcpu->mmu.root_hpa = INVALID_PAGE;
@@ -2320,13 +2325,45 @@ static void kvm_vcpu_intr(struct kvm_irqsink *this,
* Our irq device is requesting to interrupt the vcpu. If it is
* currently running, we should inject a host IPI to force a VMEXIT
*/
-
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu*)this->private;
+
/*
- * FIXME: Implement this or the CPU wont notice the interrupt until
- * the next natural VMEXIT. Note that this is how the system
- * has always worked, so nothing is broken here. This is a future
- * enhancement
+ * HACK ALERT!
+ *
+ * We want to send a virtual interrupt signal to the task that owns
+ * the guest. However, the signal will only force a VMEXIT (via
+ * a reschedule IPI) if the task is currently in GUEST mode. There
+ * is a race condition between the time that we mark the vcpu as
+ * running and the time the system actually enters guest mode. Since
+ * there doesn't appear to be any way to help with this situation from
+ * the VT hardware, we are forced to wait to make sure the guest
+ * actually gets interrupted in a reasonable amount of time. If it
+ * does not, we assume that the IPI failed because it was too early
+ * and must try again until it does.
+ *
+ * This condvar/spinlock/timeout/retry scheme eliminates the race in a safe
+ * manner, at the expense of making the INTR delivery synchronous
*/
+ spin_lock(&vcpu->irq.lock);
+
+ if (vcpu->irq.task) {
+ struct timespec tmo = {
+ .tv_sec = 0,
+ .tv_nsec = 100000 /* 100us */
+ };
+
+ BUG_ON(vcpu->irq.task == current);
+
+ while (vcpu->irq.task) {
+ send_sig(SIGSTOP, vcpu->irq.task, 0);
+ condvar_wait(&vcpu->irq.cv, &vcpu->irq.lock,
+ timespec_to_jiffies(&tmo));
+ }
+
+ vcpu->irq.pending = 1;
+ }
+
+ spin_unlock(&vcpu->irq.lock);
}
static void kvm_vcpu_irqsink_init(struct kvm_vcpu *vcpu)
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
index e59a548..6bc2fb1 100644
--- a/drivers/kvm/svm.c
+++ b/drivers/kvm/svm.c
@@ -1463,9 +1463,25 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
int r;
again:
+ spin_lock(&vcpu->irq.lock);
+
+ /*
+ * Setting vcpu->irq.task signals to outsiders that the VMCB is
+ * effectively in GUEST mode, and therefore must be signalled
+ * to transition the task back to HOST mode if any new interrupts
+ * arrive.
+ */
+ vcpu->irq.task = current;
+
+ /*
+ * We also must inject interrupts (if any) while the irq_lock
+ * is held
+ */
if (!vcpu->mmio_read_completed)
do_interrupt_requests(vcpu, kvm_run);
+ spin_unlock(&vcpu->irq.lock);
+
clgi();
pre_svm_run(vcpu);
@@ -1617,6 +1633,25 @@ again:
reload_tss(vcpu);
/*
+ * Signal that we have transitioned back to host mode
+ */
+ spin_lock(&vcpu->irq.lock);
+
+ vcpu->irq.task = NULL;
+ condvar_signal(&vcpu->irq.cv);
+
+ /*
+ * If irq.pending is asserted, someone undoubtedly has sent us a SIGSTOP
+ * signal. Counter it with a SIGCONT
+ */
+ if(vcpu->irq.pending) {
+ send_sig(SIGCONT, current, 0);
+ vcpu->irq.pending = 0;
+ }
+
+ spin_unlock(&vcpu->irq.lock);
+
+ /*
* Profile KVM exit RIPs:
*/
if (unlikely(prof_on == KVM_PROFILING))
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index a0fdf02..f7b716b 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -1748,9 +1748,25 @@ again:
vmcs_writel(HOST_GS_BASE, segment_base(gs_sel));
#endif
+ spin_lock(&vcpu->irq.lock);
+
+ /*
+ * Setting vcpu->irq.task signals to outsiders that the VMCS is
+ * effectively in GUEST mode, and therefore must be signalled
+ * to transition the task back to HOST mode if any new interrupts
+ * arrive.
+ */
+ vcpu->irq.task = current;
+
+ /*
+ * We also must inject interrupts (if any) while the irq_lock
+ * is held
+ */
if (!vcpu->mmio_read_completed)
do_interrupt_requests(vcpu, kvm_run);
+ spin_unlock(&vcpu->irq.lock);
+
if (vcpu->guest_debug.enabled)
kvm_guest_debug_pre(vcpu);
@@ -1911,6 +1927,25 @@ again:
asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
+ /*
+ * Signal that we have transitioned back to host mode
+ */
+ spin_lock(&vcpu->irq.lock);
+
+ vcpu->irq.task = NULL;
+ condvar_signal(&vcpu->irq.cv);
+
+ /*
+ * If irq.pending is asserted, someone undoubtedly has sent us a SIGSTOP
+ * signal. Counter it with a SIGCONT
+ */
+ if(vcpu->irq.pending) {
+ send_sig(SIGCONT, current, 0);
+ vcpu->irq.pending = 0;
+ }
+
+ spin_unlock(&vcpu->irq.lock);
+
if (fail) {
kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
kvm_run->fail_entry.hardware_entry_failure_reason
[-- Attachment #5: series --]
[-- Type: application/octet-stream, Size: 143 bytes --]
# This series applies on GIT commit 0ea6eecef44923d66409a49d71e4fa87fa0f5bed
01-mmio_handler.patch
02-irqdevice.patch
03-preemptible-cpu.patch
[-- Attachment #6: Type: text/plain, Size: 345 bytes --]
-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
[-- Attachment #7: Type: text/plain, Size: 186 bytes --]
_______________________________________________
kvm-devel mailing list
kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org
https://lists.sourceforge.net/lists/listinfo/kvm-devel
^ permalink raw reply related [flat|nested] 13+ messages in thread[parent not found: <461D7702.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>]
* Re: irqdevice INTR example [not found] ` <461D7702.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org> @ 2007-04-12 8:02 ` Avi Kivity [not found] ` <461DE791.1040707-atKUWr5tajBWk0Htik3J/w@public.gmane.org> 0 siblings, 1 reply; 13+ messages in thread From: Avi Kivity @ 2007-04-12 8:02 UTC (permalink / raw) To: Gregory Haskins; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f Gregory Haskins wrote: > Hi All, > Attached are the first three patches in my queue. The first two you are likely familiar with at this point (though I have made some more of the requested changes to 02-irqdevice.patch). The last item (03-preemptible-cpu.patch) adds an implementation to the previously unused kvm_vcpu_intr() callback. This acts as a functional example of the INTR callback mechanism as Avi requested. Note that the work related to IF/NMI/TPR classification of interrupts happens later in my queue and is not mature enough to share yet, but hopefully soon. > > > KVM: Preemptible VCPU > > From: <> > > This adds support for interrupting an executing CPU > > diff --git a/drivers/kvm/condvar.c b/drivers/kvm/condvar.c > new file mode 100644 > index 0000000..87e464a > --- /dev/null > +++ b/drivers/kvm/condvar.c > @@ -0,0 +1,109 @@ > +/* > + * Condition Variable > + * > + * Copyright (C) 2007, Novell > + * > + * Authors: > + * Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org> > + * > + * This work is licensed under the terms of the GNU GPL, version 2. See > + * the COPYING file in the top-level directory. > + * > + */ > If you want condition variables, activate your cryogenically-cooled suit and post on it on lkml. This cannot be added to the kernel via kvm. > + > /* > - * FIXME: Implement this or the CPU wont notice the interrupt until > - * the next natural VMEXIT. Note that this is how the system > - * has always worked, so nothing is broken here. This is a future > - * enhancement > + * HACK ALERT! 
> + * > + * We want to send a virtual interrupt signal to the task that owns > + * the guest. However, the signal will only force a VMEXIT (via > + * a reschedule IPI) if the task is currently in GUEST mode. There > + * is a race condition between the time that we mark the vcpu as > + * running and the time the system actually enter guest mode. Since > + * there doesnt appear to be any way to help with this situation from > + * the VT hardware, we are forced to wait to make sure the guest > We support AMD too. > + * actually gets interrupted in a reasonable amount of time. If it > + * does not, we assume that the IPI failed because it was too early > + * and must try again until it does. > Waiting seems totally broken. I'm not even sure what we are waiting for -- certainly you can't wait for re-entry into guest mode, since there's not guarantee that's coming. Many times the guest will be sleeping on a hlt instruction waiting for an interrupt. If the guest is in guest mode, an IPI is right. If it is elsewhere, we need to use one of the many (alas) standard wakeup mechanisms to wake up userspace and let it know the vcpu needs service. Qemu uses signals, but I think that making the vcpu fd writable is more generic. For qemu, we can attach a signal to the fd (using fcntl(2)); that has the benefit of leaving the choice of which signal to use to userspace. > + * > + * This condvar/spinlock/timeout/retry eliminate the race in a safe > + * manner, at the expense of making the INTR delivery synchronous > */ > + spin_lock(&vcpu->irq.lock); > + > + if (vcpu->irq.task) { > + struct timespec tmo = { > + .tv_sec = 0, > + .tv_nsec = 100000 /* 100us */ > + }; > + > + BUG_ON(vcpu->irq.task == current); > + > + while (vcpu->irq.task) { > + send_sig(SIGSTOP, vcpu->irq.task, 0); > SIGSTOP?! I lost the plot here. -- error compiling committee.c: too many arguments to function ------------------------------------------------------------------------- Take Surveys. Earn Cash. 
Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys-and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV ^ permalink raw reply [flat|nested] 13+ messages in thread
[parent not found: <461DE791.1040707-atKUWr5tajBWk0Htik3J/w@public.gmane.org>]
* Re: irqdevice INTR example [not found] ` <461DE791.1040707-atKUWr5tajBWk0Htik3J/w@public.gmane.org> @ 2007-04-12 8:18 ` Christoph Hellwig 2007-04-12 11:55 ` Gregory Haskins 1 sibling, 0 replies; 13+ messages in thread From: Christoph Hellwig @ 2007-04-12 8:18 UTC (permalink / raw) To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f On Thu, Apr 12, 2007 at 11:02:25AM +0300, Avi Kivity wrote: > If you want condition variables, activate your cryogenically-cooled suit > and post on it on lkml. This cannot be added to the kernel via kvm. Better just rip it out. The code would be a lot simpler with opencoded waitqueue use, and faster aswell due to getting rid of the indirect calls. e.g. this: + if (vcpu->irq.task) { + struct timespec tmo = { + .tv_sec = 0, + .tv_nsec = 100000 /* 100us */ + }; + + BUG_ON(vcpu->irq.task == current); + + while (vcpu->irq.task) { + send_sig(SIGSTOP, vcpu->irq.task, 0); + condvar_wait(&vcpu->irq.cv, &vcpu->irq.lock, + timespec_to_jiffies(&tmo)); + } + + vcpu->irq.pending = 1; + } + would simply become if (vcpu->irq.task) { struct timespec tmo = { .tv_sec = 0, .tv_nsec = 100000 /* 100us */ }; BUG_ON(vcpu->irq.task == current); for (;;) { send_sig(SIGSTOP, vcpu->irq.task, 0); prepare_to_wait(&vcpu->irq.wq, &wait, TASK_UNINTERRUPTIBLE); if (!vcpu->irq.task) break; spin_unlock(&vcpu->irq.lock); schedule_timeout(timespec_to_jiffies(&tmo)); spin_lock(&vcpu->irq.lock); } finish_wait(&vcpu->irq.wq, &wait, TASK_UNINTERRUPTIBLE); vcpu->irq.pending = 1; } with equal behaviour. although the send_sig in the loop and the very short sleep time still look rather odd to me. ------------------------------------------------------------------------- Take Surveys. Earn Cash. 
Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys-and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: irqdevice INTR example [not found] ` <461DE791.1040707-atKUWr5tajBWk0Htik3J/w@public.gmane.org> 2007-04-12 8:18 ` Christoph Hellwig @ 2007-04-12 11:55 ` Gregory Haskins [not found] ` <461DE5C9.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org> 1 sibling, 1 reply; 13+ messages in thread From: Gregory Haskins @ 2007-04-12 11:55 UTC (permalink / raw) To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f >>> On Thu, Apr 12, 2007 at 4:02 AM, in message <461DE791.1040707-atKUWr5tajBWk0Htik3J/w@public.gmane.org>, Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote: > Gregory Haskins wrote: >> Hi All, >> Attached are the first three patches in my queue. The first two you are > likely familiar with at this point (though I have made some more of the > requested changes to 02- irqdevice.patch). The last item > (03- preemptible- cpu.patch) adds an implementation to the previously unused > kvm_vcpu_intr() callback. This acts as a functional example of the INTR > callback mechanism as Avi requested. Note that the work related to > IF/NMI/TPR classification of interrupts happens later in my queue and is not > mature enough to share yet, but hopefully soon. >> >> >> KVM: Preemptible VCPU >> >> From: <> >> >> This adds support for interrupting an executing CPU >> >> diff -- git a/drivers/kvm/condvar.c b/drivers/kvm/condvar.c >> new file mode 100644 >> index 0000000..87e464a >> --- /dev/null >> +++ b/drivers/kvm/condvar.c >> @@ - 0,0 +1,109 @@ >> +/* >> + * Condition Variable >> + * >> + * Copyright (C) 2007, Novell >> + * >> + * Authors: >> + * Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org> >> + * >> + * This work is licensed under the terms of the GNU GPL, version 2. See >> + * the COPYING file in the top- level directory. >> + * >> + */ >> > > If you want condition variables, activate your cryogenically- cooled suit > and post on it on lkml. This cannot be added to the kernel via kvm. Hehe. 
I wasn't aware of the contention surrounding them but that does explain why they are notably missing. I will rip it out this code in favor of a pure waitqueue as Christof suggested to avoid more controversy. > > > >> + >> /* >> - * FIXME: Implement this or the CPU wont notice the interrupt until >> - * the next natural VMEXIT. Note that this is how the system >> - * has always worked, so nothing is broken here. This is a future >> - * enhancement >> + * HACK ALERT! >> + * >> + * We want to send a virtual interrupt signal to the task that owns >> + * the guest. However, the signal will only force a VMEXIT (via >> + * a reschedule IPI) if the task is currently in GUEST mode. There >> + * is a race condition between the time that we mark the vcpu as >> + * running and the time the system actually enter guest mode. Since >> + * there doesnt appear to be any way to help with this situation from >> + * the VT hardware, we are forced to wait to make sure the guest >> > > We support AMD too. My patch actually supports both, and I didn't mean to infer that AMD was skipped. I was speaking generally when I said "VT". Is there a better term to refer to the concept of VMX/SVN? > >> + * actually gets interrupted in a reasonable amount of time. If it >> + * does not, we assume that the IPI failed because it was too early >> + * and must try again until it does. >> > > Waiting seems totally broken. I'm not even sure what we are waiting for > -- certainly you can't wait for re- entry into guest mode, since there's > not guarantee that's coming. Actually, there is ;) (unless I missed something) If you look at the other side of this implementation in vcpu_XXX_run(), I only set the vcpu->irq.task when the vcpu is just about to enter guest mode. And I clear it when it leaves. In other words, the kvn_vcpu_intr() function is a no-op if the vcpu is not in guest mode (or close to it). 
> If it is elsewhere, we need to use one of the many (alas) standard wakeup mechanisms to > wake up userspace and let it know the vcpu needs service. My code assumes we only need to kick the vcpu if its in guest mode because it figures it will just pick up the interrupt on the next run cycle otherwise. I haven't studied QEMU deep enough to know that it can suspend the execution of the run-cycle...i just thought it looped indefinitely. This makes sense now that I think about it because something like hlt should cause a suspension of CPU activity until further notice. If we need to be able to signal the vcpu *anytime* I will have to go back and refactor some things. > > Qemu uses signals, but I think that making the vcpu fd writable is more > generic. For qemu, we can attach a signal to the fd (using fcntl(2)); > that has the benefit of leaving the choice of which signal to use to > userspace. Ack, I will look into this design. > >> + * >> + * This condvar/spinlock/timeout/retry eliminate the race in a safe >> + * manner, at the expense of making the INTR delivery synchronous >> */ >> + spin_lock(&vcpu- >irq.lock); >> + >> + if (vcpu- >irq.task) { >> + struct timespec tmo = { >> + .tv_sec = 0, >> + .tv_nsec = 100000 /* 100us */ >> + }; >> + >> + BUG_ON(vcpu- >irq.task == current); >> + >> + while (vcpu- >irq.task) { >> + send_sig(SIGSTOP, vcpu- >irq.task, 0); >> > > SIGSTOP?! I lost the plot here. Well, I needed a signal to send that would reliably IPI the guest. I originally wrote the code to send RT signal 33 (which I defined as KVM_SIGNAL_VIRTUAL_INTERRUPT). In both cases, the design calls for sending a signal (purely for the IPI function of the kernel) and then to remove the signal before userspace sees it. I had some trouble with the code that used dequeue_signal() to strip out the RT signal. So I then changed to sending SIGSTOP to IPI, and SIGCONT to effectively strip out the pending SIGSTOP. Keep in mind that this is really just proof-of-concept code. 
The general idea of this POC is to demonstrate how I envisioned the irqdevice API to be used to deliver interrupts even once concurrent access is possible (e.g., SMP/PV). The details of which signal are used, etc, are incidental for now until we agree that the irqdevice API approach is acceptable. Regards, -Greg ------------------------------------------------------------------------- Take Surveys. Earn Cash. Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys-and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV ^ permalink raw reply [flat|nested] 13+ messages in thread
[parent not found: <461DE5C9.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>]
* Re: irqdevice INTR example [not found] ` <461DE5C9.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org> @ 2007-04-12 12:49 ` Avi Kivity [not found] ` <461E2AD5.7070905-atKUWr5tajBWk0Htik3J/w@public.gmane.org> 0 siblings, 1 reply; 13+ messages in thread From: Avi Kivity @ 2007-04-12 12:49 UTC (permalink / raw) To: Gregory Haskins; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f Gregory Haskins wrote: >>> >>> >> If you want condition variables, activate your cryogenically- cooled suit >> and post on it on lkml. This cannot be added to the kernel via kvm. >> > > Hehe. I wasn't aware of the contention surrounding them but that does explain why they are notably missing. I will rip it out this code in favor of a pure waitqueue as Christof suggested to avoid more controversy. > > Actually, I am in favor of having well-defined synchronization primitives. The only issues I see with condition variables are: - there are many variants of mutexes in the kernel (mutex, spinlock, spinlock with irqs disabled, ...), so many condvar variants are needed. - they're very close to waitqueues, so perhaps there's not much justification I'm no synchronization primitive expert, though. My comment had nothing to do with my opinion on condition variables or your implementation thereof. kvm is simply not the place to introduce them. >> >> >>> + >>> /* >>> - * FIXME: Implement this or the CPU wont notice the interrupt until >>> - * the next natural VMEXIT. Note that this is how the system >>> - * has always worked, so nothing is broken here. This is a future >>> - * enhancement >>> + * HACK ALERT! >>> + * >>> + * We want to send a virtual interrupt signal to the task that owns >>> + * the guest. However, the signal will only force a VMEXIT (via >>> + * a reschedule IPI) if the task is currently in GUEST mode. There >>> + * is a race condition between the time that we mark the vcpu as >>> + * running and the time the system actually enter guest mode. 
Since >>> + * there doesnt appear to be any way to help with this situation from >>> + * the VT hardware, we are forced to wait to make sure the guest >>> >>> >> We support AMD too. >> > > My patch actually supports both, and I didn't mean to infer that AMD was skipped. I was speaking generally when I said "VT". Is there a better term to refer to the concept of VMX/SVN? > I referred to the comment. Maybe just "the hardware"? > >>> + * actually gets interrupted in a reasonable amount of time. If it >>> + * does not, we assume that the IPI failed because it was too early >>> + * and must try again until it does. >>> >>> >> Waiting seems totally broken. I'm not even sure what we are waiting for >> -- certainly you can't wait for re- entry into guest mode, since there's >> not guarantee that's coming. >> > > Actually, there is ;) (unless I missed something) If you look at the other side of this implementation in vcpu_XXX_run(), I only set the vcpu->irq.task when the vcpu is just about to enter guest mode. And I clear it when it leaves. In other words, the kvn_vcpu_intr() function is a no-op if the vcpu is not in guest mode (or close to it). > > Ah, ok -- I misunderstood the whole thing. The way to avoid the race is to disable interrupts before entering the guest. This way the IPI is delayed until you enter guest mode: irq_disable(); spin_lock(); vcpu->guest_mode = 1; check_whether_an_irq_is_pending_and_if_so_inject_it(); spin_unlock(); asm ("vm_entry_sequence"); vcpu->guest_mode = 0; irq_enable(); // currently in the vm entry sequence // recheck here? If the interrupt happens before the spin_lock(), we'll get a non-ipi wakeup and then see it in check_whether(). If it happens after it we'll get an IPI which will be ignored until we're snugly in guest mode. >> If it is elsewhere, we need to use one of the many (alas) standard wakeup mechanisms to >> wake up userspace and let it know the vcpu needs service. 
>> > > My code assumes we only need to kick the vcpu if its in guest mode because it figures it will just pick up the interrupt on the next run cycle otherwise. I haven't studied QEMU deep enough to know that it can suspend the execution of the run-cycle...i just thought it looped indefinitely. In general I find it useful to pretend there are many userspaces being written for kvm, otherwise we get locked into qemu's current mode of operation. > This makes sense now that I think about it because something like hlt should cause a suspension of CPU activity until further notice. An alternative is to handle hlt in the kernel in a place where we're ready for the IPI wakeup. The downside to that is that we have to be prepared for external wakeup sources as well (signal, poll, aio... messy). > >>> + * >>> + * This condvar/spinlock/timeout/retry eliminate the race in a safe >>> + * manner, at the expense of making the INTR delivery synchronous >>> */ >>> + spin_lock(&vcpu- >irq.lock); >>> + >>> + if (vcpu- >irq.task) { >>> + struct timespec tmo = { >>> + .tv_sec = 0, >>> + .tv_nsec = 100000 /* 100us */ >>> + }; >>> + >>> + BUG_ON(vcpu- >irq.task == current); >>> + >>> + while (vcpu- >irq.task) { >>> + send_sig(SIGSTOP, vcpu- >irq.task, 0); >>> >>> >> SIGSTOP?! I lost the plot here. >> > > Well, I needed a signal to send that would reliably IPI the guest. I originally wrote the code to send RT signal 33 (which I defined as KVM_SIGNAL_VIRTUAL_INTERRUPT). In both cases, the design calls for sending a signal (purely for the IPI function of the kernel) and then to remove the signal before userspace sees it. I had some trouble with the code that used dequeue_signal() to strip out the RT signal. So I then changed to sending SIGSTOP to IPI, and SIGCONT to effectively strip out the pending SIGSTOP. > > It's best not to use signals internally. Qemu relies on them and we have to support them, but in kernel we should use existing kernel mechanisms. 
> Keep in mind that this is really just proof-of-concept code. The general idea of this POC is to demonstrate how I envisioned the irqdevice API to be used to deliver interrupts even once concurrent access is possible (e.g., SMP/PV). The details of which signal are used, etc, are incidental for now until we agree that the irqdevice API approach is acceptable. > I was interested in how ->pending() and ->read_vector() and the raise callback interact, but got distracted by the, err, uniqueness of the signal thing. -- error compiling committee.c: too many arguments to function ------------------------------------------------------------------------- Take Surveys. Earn Cash. Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys-and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV ^ permalink raw reply [flat|nested] 13+ messages in thread
[parent not found: <461E2AD5.7070905-atKUWr5tajBWk0Htik3J/w@public.gmane.org>]
* Re: irqdevice INTR example [not found] ` <461E2AD5.7070905-atKUWr5tajBWk0Htik3J/w@public.gmane.org> @ 2007-04-12 13:43 ` Gregory Haskins [not found] ` <461DFF1C.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org> 0 siblings, 1 reply; 13+ messages in thread From: Gregory Haskins @ 2007-04-12 13:43 UTC (permalink / raw) To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f [-- Attachment #1: Type: text/plain, Size: 4568 bytes --] I have attached a new version of the patch which eliminates the condition variable (if only by name, anyway ;) >>> On Thu, Apr 12, 2007 at 8:49 AM, in message <461E2AD5.7070905-atKUWr5tajBWk0Htik3J/w@public.gmane.org>, Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote: > > > Actually, I am in favor of having well- defined synchronization > primitives. The only issues I see with condition variables are: > > - there are many variants of mutexes in the kernel (mutex, spinlock, > spinlock with irqs disabled, ...), so many condvar variants are needed. Yeah, I tried to address that with lock_ops, but I agree. > - they're very close to waitqueues, so perhaps there's not much > justification Agreed they are very close. I won't go into my opinion of waitqueues vs cond-vars in this forum ;) > > I'm no synchronization primitive expert, though. My comment had nothing > to do with my opinion on condition variables or your implementation > thereof. kvm is simply not the place to introduce them. No problem. I think your argument is a good one. > > I referred to the comment. Maybe just "the hardware"? Done > > Ah, ok -- I misunderstood the whole thing. The way to avoid the race is > to disable interrupts before entering the guest. 
This way the IPI is > delayed until you enter guest mode: > > irq_disable(); > > spin_lock(); > vcpu- >guest_mode = 1; > check_whether_an_irq_is_pending_and_if_so_inject_it(); > spin_unlock(); > > asm ("vm_entry_sequence"); > > vcpu- >guest_mode = 0; > > irq_enable(); // currently in the vm entry sequence > > // recheck here? > > If the interrupt happens before the spin_lock(), we'll get a non- ipi > wakeup and then see it in check_whether(). If it happens after it we'll > get an IPI which will be ignored until we're snugly in guest mode. When I first read this I thought "whoa! you want to disable interrupts during the whole time we are in GUEST mode?" But then it dawned on me what you are suggesting: Interrupts would be re-enabled after the context switch because we re-load the processor state? Cool! The thing I can't wrap my head around is what happens when the guest has IF=0 and and external interrupt comes in? Would we still exit? But that aside, a problem that I see is that (IIUC) IPIs use NMIs not EXT-INTs. Assuming that is right, I suppose we might be able to do a similar trick except we also disable NMIs first (is this possible/recommended/forbidden?). > > In general I find it useful to pretend there are many userspaces being > written for kvm, otherwise we get locked into qemu's current mode of > operation. Sounds reasonable. I will start to do the same. > >> This makes sense now that I think about it because something like hlt should > cause a suspension of CPU activity until further notice. > > An alternative is to handle hlt in the kernel in a place where we're > ready for the IPI wakeup. The downside to that is that we have to be > prepared for external wakeup sources as well (signal, poll, aio... messy). Hmmm...interesting. I wonder if there are advantages that make this worth exploring. Oh well, I will back burner these thoughts until the SMP/PV/APIC stuff is sorted out. > > It's best not to use signals internally. 
Qemu relies on them and we > have to support them, but in kernel we should use existing kernel > mechanisms. But Avi, this was *your* idea to use signals ;) But in all seriousness, I don't know if I have a choice. I have two requirements which constrain me: 1) I need an IPI to cause a VMEXIT 2) I need to support 2.6.16, strongly preferable as a loadable module. According to my research (which is undoubtedly not 100% definitive), 2.6.16 or even the newer kernels do not export most of the IPI facilities. send_sig() happens to be exported and it happens to invoke a reschedule_IPI under the hood, so its convenient. If there is another way to get access to the IPI facility without playing games with signals, I am all ears. But until then I don't know what else to do. If I had the luxury of modifying the kernel source, we could just export what we needed and be done with it. > > I was interested in how - >pending() and - >read_vector() and the raise > callback interact, but got distracted by the, err, uniqueness of the > signal thing. You still haven't weighed in here ;) Hopefully you have a clearer picture of what I was trying to do now at least. Whether you agree or not is another matter. 
Regards, -Greg [-- Attachment #2: preemptible-cpu-2.patch --] [-- Type: text/plain, Size: 6675 bytes --] KVM: Preemptible VCPU From: <> This adds support for interrupting an executing CPU Signed-off-by: Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org> --- drivers/kvm/kvm.h | 11 ++++++++++ drivers/kvm/kvm_main.c | 54 ++++++++++++++++++++++++++++++++++++++++++++---- drivers/kvm/svm.c | 35 +++++++++++++++++++++++++++++++ drivers/kvm/vmx.c | 35 +++++++++++++++++++++++++++++++ 4 files changed, 130 insertions(+), 5 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 58966d9..70d1bb9 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -271,6 +271,16 @@ void kvm_io_bus_register_dev(struct kvm_io_bus *bus, #define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long) +/* + * structure for maintaining info for interrupting an executing VCPU + */ +struct kvm_vcpu_irq { + spinlock_t lock; + wait_queue_head_t wq; + struct task_struct *task; + int pending; +}; + struct kvm_vcpu { struct kvm *kvm; union { @@ -284,6 +294,7 @@ struct kvm_vcpu { struct kvm_run *run; int interrupt_window_open; struct kvm_irqdevice irq_dev; + struct kvm_vcpu_irq irq; unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ unsigned long rip; /* needs vcpu_load_rsp_rip() */ diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 7e00412..1cf4060 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -299,6 +299,11 @@ static struct kvm *kvm_create_vm(void) struct kvm_vcpu *vcpu = &kvm->vcpus[i]; mutex_init(&vcpu->mutex); + + memset(&vcpu->irq, 0, sizeof(vcpu->irq)); + spin_lock_init(&vcpu->irq.lock); + init_waitqueue_head(&vcpu->irq.wq); + vcpu->cpu = -1; vcpu->kvm = kvm; vcpu->mmu.root_hpa = INVALID_PAGE; @@ -2320,13 +2325,52 @@ static void kvm_vcpu_intr(struct kvm_irqsink *this, * Our irq device is requesting to interrupt the vcpu. 
If it is * currently running, we should inject a host IPI to force a VMEXIT */ - + struct kvm_vcpu *vcpu = (struct kvm_vcpu*)this->private; + /* - * FIXME: Implement this or the CPU wont notice the interrupt until - * the next natural VMEXIT. Note that this is how the system - * has always worked, so nothing is broken here. This is a future - * enhancement + * HACK ALERT! + * + * We want to send a virtual interrupt signal to the task that owns + * the guest. However, the signal will only force a VMEXIT (via + * a reschedule IPI) if the task is currently in GUEST mode. There + * is a race condition between the time that we mark the vcpu as + * running and the time the system actually enter guest mode. Since + * there doesnt appear to be any way to help with this situation from + * the hardware, we are forced to wait to make sure the guest + * actually gets interrupted in a reasonable amount of time. If it + * does not, we assume that the IPI failed because it was too early + * and must try again until it does. 
+ * + * This condvar/spinlock/timeout/retry eliminate the race in a safe + * manner, at the expense of making the INTR delivery synchronous */ + spin_lock(&vcpu->irq.lock); + + if (vcpu->irq.task) { + struct timespec tmo = { + .tv_sec = 0, + .tv_nsec = 100000 /* 100us */ + }; + + BUG_ON(vcpu->irq.task == current); + + while (vcpu->irq.task) { + DEFINE_WAIT(__wait); + + send_sig(SIGSTOP, vcpu->irq.task, 0); + + prepare_to_wait(&vcpu->irq.wq, &__wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&vcpu->irq.lock); + schedule_timeout(timespec_to_jiffies(&tmo)); + spin_lock(&vcpu->irq.lock); + finish_wait(&vcpu->irq.wq, &__wait); + } + + vcpu->irq.pending = 1; + } + + spin_unlock(&vcpu->irq.lock); } static void kvm_vcpu_irqsink_init(struct kvm_vcpu *vcpu) diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index e59a548..41765bd 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -1463,9 +1463,25 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int r; again: + spin_lock(&vcpu->irq.lock); + + /* + * Setting vcpu->task signals to outsiders that the VMCS is + * effectively in GUEST mode, and therefore must be signalled + * to transition the task back to HOST mode if any new interrupts + * arrive. + */ + vcpu->irq.task = current; + + /* + * We also must inject interrupts (if any) while the irq_lock + * is held + */ if (!vcpu->mmio_read_completed) do_interrupt_requests(vcpu, kvm_run); + spin_unlock(&vcpu->irq.lock); + clgi(); pre_svm_run(vcpu); @@ -1617,6 +1633,25 @@ again: reload_tss(vcpu); /* + * Signal that we have transitioned back to host mode + */ + spin_lock(&vcpu->irq.lock); + + vcpu->irq.task = NULL; + wake_up(&vcpu->irq.wq); + + /* + * If irqpending is asserted someone undoubtedly has sent us a SIGSTOP + * signal. 
Counter it with a SIGCONT + */ + if(vcpu->irq.pending) { + send_sig(SIGCONT, current, 0); + vcpu->irq.pending = 0; + } + + spin_unlock(&vcpu->irq.lock); + + /* * Profile KVM exit RIPs: */ if (unlikely(prof_on == KVM_PROFILING)) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index a0fdf02..1d5ce85 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1748,9 +1748,25 @@ again: vmcs_writel(HOST_GS_BASE, segment_base(gs_sel)); #endif + spin_lock(&vcpu->irq.lock); + + /* + * Setting vcpu->task signals to outsiders that the VMCS is + * effectively in GUEST mode, and therefore must be signalled + * to transition the task back to HOST mode if any new interrupts + * arrive. + */ + vcpu->irq.task = current; + + /* + * We also must inject interrupts (if any) while the irq_lock + * is held + */ if (!vcpu->mmio_read_completed) do_interrupt_requests(vcpu, kvm_run); + spin_unlock(&vcpu->irq.lock); + if (vcpu->guest_debug.enabled) kvm_guest_debug_pre(vcpu); @@ -1911,6 +1927,25 @@ again: asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); + /* + * Signal that we have transitioned back to host mode + */ + spin_lock(&vcpu->irq.lock); + + vcpu->irq.task = NULL; + wake_up(&vcpu->irq.wq); + + /* + * If irqpending is asserted someone undoubtedly has sent us a SIGSTOP + * signal. Counter it with a SIGCONT + */ + if(vcpu->irq.pending) { + send_sig(SIGCONT, current, 0); + vcpu->irq.pending = 0; + } + + spin_unlock(&vcpu->irq.lock); + if (fail) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason [-- Attachment #3: Type: text/plain, Size: 345 bytes --] ------------------------------------------------------------------------- Take Surveys. Earn Cash. 
Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys-and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV [-- Attachment #4: Type: text/plain, Size: 186 bytes --] _______________________________________________ kvm-devel mailing list kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org https://lists.sourceforge.net/lists/listinfo/kvm-devel ^ permalink raw reply related [flat|nested] 13+ messages in thread
[parent not found: <461DFF1C.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>]
* Re: irqdevice INTR example [not found] ` <461DFF1C.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org> @ 2007-04-12 14:14 ` Avi Kivity [not found] ` <461E3EDB.3080002-atKUWr5tajBWk0Htik3J/w@public.gmane.org> 0 siblings, 1 reply; 13+ messages in thread From: Avi Kivity @ 2007-04-12 14:14 UTC (permalink / raw) To: Gregory Haskins; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f Gregory Haskins wrote: > >> Ah, ok -- I misunderstood the whole thing. The way to avoid the race is >> to disable interrupts before entering the guest. This way the IPI is >> delayed until you enter guest mode: >> >> irq_disable(); >> >> spin_lock(); >> vcpu- >guest_mode = 1; >> check_whether_an_irq_is_pending_and_if_so_inject_it(); >> spin_unlock(); >> >> asm ("vm_entry_sequence"); >> >> vcpu- >guest_mode = 0; >> >> irq_enable(); // currently in the vm entry sequence >> >> // recheck here? >> >> If the interrupt happens before the spin_lock(), we'll get a non- ipi >> wakeup and then see it in check_whether(). If it happens after it we'll >> get an IPI which will be ignored until we're snugly in guest mode. >> > > When I first read this I thought "whoa! you want to disable interrupts during the whole time we are in GUEST mode?" But then it dawned on me what you are suggesting: Interrupts would be re-enabled after the context switch because we re-load the processor state? Cool! The thing I can't wrap my head around is what happens when the guest has IF=0 and and external interrupt comes in? Would we still exit? > It's really subtle. With respect to interrupts, VT^H^Hthe hardware provides an override over IF. If an interrupt happens while this override is enabled, we exit guest mode regardless of the state of IF. In effect interrupts are enabled but _delivery_ is disabled. Once we exit guest mode, interrupts become disabled again until we set IF explicitly. > But that aside, a problem that I see is that (IIUC) IPIs use NMIs not EXT-INTs. 
Assuming that is right, I suppose we might be able to do a similar trick except we also disable NMIs first (is this possible/recommended/forbidden?). > > IPIs are maskable, but are not exactly EXT-INT, I think. >> It's best not to use signals internally. Qemu relies on them and we >> have to support them, but in kernel we should use existing kernel >> mechanisms. >> > > But Avi, this was *your* idea to use signals ;) But in all seriousness, I don't know if I have a choice. I have two requirements which constrain me: > > 1) I need an IPI to cause a VMEXIT > kick_process() can do that in guest mode. Or even smp_call_function_single(). Outside guest mode, we do need a signal, but having it requested via fcntl() seems more flexible to me. Starting out with a hardcoded signal (and no special treatment for guest mode) is okay to get things rolling. > 2) I need to support 2.6.16, strongly preferable as a loadable module. > > According to my research (which is undoubtedly not 100% definitive), 2.6.16 or even the newer kernels do not export most of the IPI facilities. send_sig() happens to be exported and it happens to invoke a reschedule_IPI under the hood, so its convenient. If there is another way to get access to the IPI facility without playing games with signals, I am all ears. But until then I don't know what else to do. If I had the luxury of modifying the kernel source, we could just export what we needed and be done with it. > > Look at the magic that is kernel/external-module-compat.h and the 'sync' target of kernel/Makefile. The idea is that mainline kernel code is as clear and as use-the-bleedingest-edge-apis as we want, with the external module code brutally hacked into something that will compile on older kernels. We go as far back as 2.6.9 IIRC. >> I was interested in how - >pending() and - >read_vector() and the raise >> callback interact, but got distracted by the, err, uniqueness of the >> signal thing. 
>> > > You still haven't weighed in here ;) Hopefully you have a clearer picture of what I was trying to do now at least. Whether you agree or not is another matter. > My eyes were clouded by the signal thing :) -- error compiling committee.c: too many arguments to function ------------------------------------------------------------------------- Take Surveys. Earn Cash. Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys-and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV ^ permalink raw reply [flat|nested] 13+ messages in thread
[parent not found: <461E3EDB.3080002-atKUWr5tajBWk0Htik3J/w@public.gmane.org>]
* Re: irqdevice INTR example [not found] ` <461E3EDB.3080002-atKUWr5tajBWk0Htik3J/w@public.gmane.org> @ 2007-04-12 16:01 ` Gregory Haskins 2007-04-13 13:05 ` Fwd: " Gregory Haskins [not found] ` <461E1F73.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org> 0 siblings, 2 replies; 13+ messages in thread From: Gregory Haskins @ 2007-04-12 16:01 UTC (permalink / raw) To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f [-- Attachment #1: Type: text/plain, Size: 729 bytes --] >>> On Thu, Apr 12, 2007 at 10:14 AM, in message <461E3EDB.3080002-atKUWr5tajBWk0Htik3J/w@public.gmane.org>, Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote: > > It's really subtle. > > With respect to interrupts, VT^H^Hthe hardware provides an override over > IF. If an interrupt happens while this override is enabled, we exit > guest mode regardless of the state of IF. In effect interrupts are > enabled but _delivery_ is disabled. Once we exit guest mode, interrupts > become disabled again until we set IF explicitly. Heres a version that implements that idea. 
Note that its still a POC, as the kick_process still needs to be exported and handled in extern-module-compat (or whatever its called) [-- Attachment #2: preemptible-cpu-3.patch --] [-- Type: text/plain, Size: 8176 bytes --] KVM: Preemptible VCPU From: <> This adds support for interrupting an executing CPU Signed-off-by: Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org> --- drivers/kvm/kvm.h | 14 +++++++++++++ drivers/kvm/kvm_main.c | 50 ++++++++++++++++++++++++++++++++++++++++++------ drivers/kvm/svm.c | 43 +++++++++++++++++++++++++++++++++++++++++ drivers/kvm/vmx.c | 47 +++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 146 insertions(+), 8 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 58966d9..56a3d58 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -271,6 +271,19 @@ void kvm_io_bus_register_dev(struct kvm_io_bus *bus, #define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long) +#define KVM_SIGNAL_VIRTUAL_INTERRUPT 33 /* Hardcoded for now */ + +/* + * structure for maintaining info for interrupting an executing VCPU + */ +struct kvm_vcpu_irq { + spinlock_t lock; + struct task_struct *task; + int signo; + int guest_mode; + int pending; +}; + struct kvm_vcpu { struct kvm *kvm; union { @@ -284,6 +297,7 @@ struct kvm_vcpu { struct kvm_run *run; int interrupt_window_open; struct kvm_irqdevice irq_dev; + struct kvm_vcpu_irq irq; unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ unsigned long rip; /* needs vcpu_load_rsp_rip() */ diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 7e00412..d2a9025 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -299,6 +299,14 @@ static struct kvm *kvm_create_vm(void) struct kvm_vcpu *vcpu = &kvm->vcpus[i]; mutex_init(&vcpu->mutex); + + memset(&vcpu->irq, 0, sizeof(vcpu->irq)); + spin_lock_init(&vcpu->irq.lock); + /* + * This should be settable by userspace someday + */ + vcpu->irq.signo = KVM_SIGNAL_VIRTUAL_INTERRUPT; + 
vcpu->cpu = -1; vcpu->kvm = kvm; vcpu->mmu.root_hpa = INVALID_PAGE; @@ -1866,6 +1874,19 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_arch_ops->decache_regs(vcpu); } + /* + * Make sure the current task is accurately recorded. Most of the + * time, it will be so we first check if its necessary without taking + * the lock. If there is a mismatch, we must aquire the lock and + * check again to eliminate races + */ + if (unlikely(vcpu->irq.task != current)) { + spin_lock(&vcpu->irq.lock); + if (vcpu->irq.task != current) + vcpu->irq.task = current; + spin_unlock(&vcpu->irq.lock); + } + r = kvm_arch_ops->run(vcpu, kvm_run); out: @@ -2320,13 +2341,30 @@ static void kvm_vcpu_intr(struct kvm_irqsink *this, * Our irq device is requesting to interrupt the vcpu. If it is * currently running, we should inject a host IPI to force a VMEXIT */ + struct kvm_vcpu *vcpu = (struct kvm_vcpu*)this->private; + + spin_lock(&vcpu->irq.lock); - /* - * FIXME: Implement this or the CPU wont notice the interrupt until - * the next natural VMEXIT. Note that this is how the system - * has always worked, so nothing is broken here. This is a future - * enhancement - */ + if (vcpu->irq.task) { + if (vcpu->irq.guest_mode) + /* + * If we are in guest mode, we can optimize the IPI + * by only waking the vcpu. + */ + kick_process(vcpu->irq.task); + else + /* + * If we are not in guest mode, we must assume that + * we could be blocked anywhere, including userspace. 
+ * Send a signal to give everyone a chance to get + * notification + */ + send_sig(vcpu->irq.signo, vcpu->irq.task, 0); + + vcpu->irq.pending = 1; + } + + spin_unlock(&vcpu->irq.lock); } static void kvm_vcpu_irqsink_init(struct kvm_vcpu *vcpu) diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index e59a548..641e8e4 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -1461,11 +1461,37 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) u16 gs_selector; u16 ldt_selector; int r; + unsigned long irq_flags; again: + /* + * We disable interrupts until the next VMEXIT to eliminate a race + * condition for delivery of virtual interrutps. Note that this is + * probably not as bad as it sounds, as interrupts will invoke a + * VMEXIT once transitioned to GUEST mode (and thus exit this lock + * scope) even if they are disabled. + * + * FIXME: Do we need to do anything additional to mask IPI/NMIs? + */ + local_irq_save(irq_flags); + + spin_lock(&vcpu->irq.lock); + + /* + * There are optimizations we can make when signaling interrupts + * if we know the VCPU is in GUEST mode, so mark that here + */ + vcpu->irq.guest_mode = 1; + + /* + * We also must inject interrupts (if any) while the irq_lock + * is held + */ if (!vcpu->mmio_read_completed) do_interrupt_requests(vcpu, kvm_run); + spin_unlock(&vcpu->irq.lock); + clgi(); pre_svm_run(vcpu); @@ -1597,6 +1623,13 @@ again: #endif : "cc", "memory" ); + /* + * FIXME: We'd like to turn on interrupts ASAP, but is this so early + * that we will mess up the state of the CPU before we fully + * transition from guest to host? 
+ */ + local_irq_restore(irq_flags); + fx_save(vcpu->guest_fx_image); fx_restore(vcpu->host_fx_image); @@ -1617,6 +1650,16 @@ again: reload_tss(vcpu); /* + * Signal that we have transitioned back to host mode + */ + spin_lock(&vcpu->irq.lock); + + vcpu->irq.guest_mode = 0; + vcpu->irq.pending = 0; + + spin_unlock(&vcpu->irq.lock); + + /* * Profile KVM exit RIPs: */ if (unlikely(prof_on == KVM_PROFILING)) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index a0fdf02..b67ad9e 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1722,6 +1722,7 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) u16 fs_sel, gs_sel, ldt_sel; int fs_gs_ldt_reload_needed; int r; + unsigned long irq_flags; again: /* @@ -1748,11 +1749,36 @@ again: vmcs_writel(HOST_GS_BASE, segment_base(gs_sel)); #endif + if (vcpu->guest_debug.enabled) + kvm_guest_debug_pre(vcpu); + + /* + * We disable interrupts until the next VMEXIT to eliminate a race + * condition for delivery of virtual interrutps. Note that this is + * probably not as bad as it sounds, as interrupts will invoke a + * VMEXIT once transitioned to GUEST mode (and thus exit this lock + * scope) even if they are disabled. + * + * FIXME: Do we need to do anything additional to mask IPI/NMIs? 
+ */ + local_irq_save(irq_flags); + + spin_lock(&vcpu->irq.lock); + + /* + * There are optimizations we can make when signaling interrupts + * if we know the VCPU is in GUEST mode, so mark that here + */ + vcpu->irq.guest_mode = 1; + + /* + * We also must inject interrupts (if any) while the irq.lock + * is held + */ if (!vcpu->mmio_read_completed) do_interrupt_requests(vcpu, kvm_run); - if (vcpu->guest_debug.enabled) - kvm_guest_debug_pre(vcpu); + spin_unlock(&vcpu->irq.lock); fx_save(vcpu->host_fx_image); fx_restore(vcpu->guest_fx_image); @@ -1880,6 +1906,13 @@ again: : "cc", "memory" ); /* + * FIXME: We'd like to turn on interrupts ASAP, but is this so early + * that we will mess up the state of the CPU before we fully + * transition from guest to host? + */ + local_irq_restore(irq_flags); + + /* * Reload segment selectors ASAP. (it's needed for a functional * kernel: x86 relies on having __KERNEL_PDA in %fs and x86_64 * relies on having 0 in %gs for the CPU PDA to work.) @@ -1911,6 +1944,16 @@ again: asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); + /* + * Signal that we have transitioned back to host mode + */ + spin_lock(&vcpu->irq.lock); + + vcpu->irq.guest_mode = 0; + vcpu->irq.pending = 0; + + spin_unlock(&vcpu->irq.lock); + if (fail) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason [-- Attachment #3: Type: text/plain, Size: 345 bytes --] ------------------------------------------------------------------------- Take Surveys. Earn Cash. 
Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys-and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV [-- Attachment #4: Type: text/plain, Size: 186 bytes --] _______________________________________________ kvm-devel mailing list kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org https://lists.sourceforge.net/lists/listinfo/kvm-devel ^ permalink raw reply related [flat|nested] 13+ messages in thread
* Fwd: Re: irqdevice INTR example 2007-04-12 16:01 ` Gregory Haskins @ 2007-04-13 13:05 ` Gregory Haskins [not found] ` <461E1F73.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org> 1 sibling, 0 replies; 13+ messages in thread From: Gregory Haskins @ 2007-04-13 13:05 UTC (permalink / raw) To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f [-- Attachment #1: Type: text/plain, Size: 1462 bytes --] >>> On Thu, Apr 12, 2007 at 12:01 PM, in message <461E1F73.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>, Gregory Haskins wrote: > >>> On Thu, Apr 12, 2007 at 10:14 AM, in message <461E3EDB.3080002-atKUWr5tajBWk0Htik3J/w@public.gmane.org>, > Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote: >> >> It's really subtle. >> >> With respect to interrupts, VT^H^Hthe hardware provides an override over >> IF. If an interrupt happens while this override is enabled, we exit >> guest mode regardless of the state of IF. In effect interrupts are >> enabled but _delivery_ is disabled. Once we exit guest mode, interrupts >> become disabled again until we set IF explicitly. > > Heres a version that implements that idea. Note that its still a POC, as > the kick_process still needs to be exported and handled in > extern-module-compat (or whatever its called) Here are more refinements to this logic which make it actually usable. It turns out that smp_call_function_single() is exported under at least 2.6.16 (perhaps even earlier, not sure) so I refactored the code to use that instead of kick_process() (thanks for the hint, Avi). I also added checks so that the current task doesn't unnecessarily signal itself (this covers the case we would hit today with the interrupt controller still in QEMU). Also, we check to see if there are any signals pending before VMENTER as an optimization. Comments please. 
Regards, -Greg [-- Attachment #2: preemptible-cpu-4.patch --] [-- Type: text/plain, Size: 9875 bytes --] KVM: Preemptible VCPU From: <> This adds support for interrupting an executing CPU Signed-off-by: Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org> --- drivers/kvm/kvm.h | 14 +++++++++ drivers/kvm/kvm_main.c | 77 ++++++++++++++++++++++++++++++++++++++++++++---- drivers/kvm/svm.c | 54 ++++++++++++++++++++++++++++++++++ drivers/kvm/vmx.c | 58 +++++++++++++++++++++++++++++++++++- 4 files changed, 195 insertions(+), 8 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 58966d9..56a3d58 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -271,6 +271,19 @@ void kvm_io_bus_register_dev(struct kvm_io_bus *bus, #define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long) +#define KVM_SIGNAL_VIRTUAL_INTERRUPT 33 /* Hardcoded for now */ + +/* + * structure for maintaining info for interrupting an executing VCPU + */ +struct kvm_vcpu_irq { + spinlock_t lock; + struct task_struct *task; + int signo; + int guest_mode; + int pending; +}; + struct kvm_vcpu { struct kvm *kvm; union { @@ -284,6 +297,7 @@ struct kvm_vcpu { struct kvm_run *run; int interrupt_window_open; struct kvm_irqdevice irq_dev; + struct kvm_vcpu_irq irq; unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ unsigned long rip; /* needs vcpu_load_rsp_rip() */ diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 7e00412..3e12c9c 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -299,6 +299,14 @@ static struct kvm *kvm_create_vm(void) struct kvm_vcpu *vcpu = &kvm->vcpus[i]; mutex_init(&vcpu->mutex); + + memset(&vcpu->irq, 0, sizeof(vcpu->irq)); + spin_lock_init(&vcpu->irq.lock); + /* + * This should be settable by userspace someday + */ + vcpu->irq.signo = KVM_SIGNAL_VIRTUAL_INTERRUPT; + vcpu->cpu = -1; vcpu->kvm = kvm; vcpu->mmu.root_hpa = INVALID_PAGE; @@ -1866,6 +1874,19 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, 
struct kvm_run *kvm_run) kvm_arch_ops->decache_regs(vcpu); } + /* + * Make sure the current task is accurately recorded. Most of the + * time, it will be so we first check if its necessary without taking + * the lock. If there is a mismatch, we must aquire the lock and + * check again to eliminate races + */ + if (unlikely(vcpu->irq.task != current)) { + spin_lock(&vcpu->irq.lock); + if (vcpu->irq.task != current) + vcpu->irq.task = current; + spin_unlock(&vcpu->irq.lock); + } + r = kvm_arch_ops->run(vcpu, kvm_run); out: @@ -2310,6 +2331,20 @@ out1: } /* + * This function is invoked whenever we want to interrupt a vcpu that is + * currently executing in guest-mode. It currently is a no-op because + * the simple delivery of the IPI to execute this function accomplishes our + * goal: To cause a VMEXIT. We pass the vcpu (which contains the + * vcpu->irq.task, etc) for future use + */ +static void kvm_vcpu_guest_intr(void *info) +{ +#ifdef NOT_YET + struct kvm_vcpu *vcpu = (struct kvm_vcpu*)info; +#endif +} + +/* * This function will be invoked whenever the vcpu->irq_dev raises its INTR * line */ @@ -2320,13 +2355,43 @@ static void kvm_vcpu_intr(struct kvm_irqsink *this, * Our irq device is requesting to interrupt the vcpu. If it is * currently running, we should inject a host IPI to force a VMEXIT */ + struct kvm_vcpu *vcpu = (struct kvm_vcpu*)this->private; + + spin_lock(&vcpu->irq.lock); - /* - * FIXME: Implement this or the CPU wont notice the interrupt until - * the next natural VMEXIT. Note that this is how the system - * has always worked, so nothing is broken here. This is a future - * enhancement - */ + if (vcpu->irq.task && (vcpu->irq.task != current)) { + if (vcpu->irq.guest_mode) { + /* + * If we are in guest mode, we can optimize the IPI + * by executing a function on the owning processor. + */ + int cpu; + + /* + * Not sure if disabling preemption is needed + * since we are in a spin-lock anyway? 
The + * kick_process() code does this so I copied it + */ + preempt_disable(); + cpu = task_cpu(vcpu->irq.task); + BUG_ON(cpu == smp_processor_id()); + smp_call_function_single(cpu, + kvm_vcpu_guest_intr, + vcpu, 0, 0); + preempt_enable(); + } else + /* + * If we are not in guest mode, we must assume that + * we could be blocked anywhere, including userspace. + * Send a signal to give everyone a chance to get + * notification + */ + send_sig(vcpu->irq.signo, vcpu->irq.task, 0); + + vcpu->irq.pending = 1; + } + + spin_unlock(&vcpu->irq.lock); } static void kvm_vcpu_irqsink_init(struct kvm_vcpu *vcpu) diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index e59a548..aa92a7c 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -1461,11 +1461,48 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) u16 gs_selector; u16 ldt_selector; int r; + unsigned long irq_flags; again: + /* + * We disable interrupts until the next VMEXIT to eliminate a race + * condition for delivery of virtual interrutps. Note that this is + * probably not as bad as it sounds, as interrupts will still invoke + * a VMEXIT once transitioned to GUEST mode (and thus exit this lock + * scope) even if they are disabled. + * + * FIXME: Do we need to do anything additional to mask IPI/NMIs? + */ + local_irq_save(irq_flags); + + spin_lock(&vcpu->irq.lock); + + /* + * If there are any signals pending (virtual interrupt related or + * otherwise), don't even bother trying to enter guest mode... 
+ */ + if (signal_pending(current)) { + kvm_run->exit_reason = KVM_EXIT_INTR; + spin_unlock(&vcpu->irq.lock); + local_irq_restore(irq_flags); + return -EINTR; + } + + /* + * There are optimizations we can make when signaling interrupts + * if we know the VCPU is in GUEST mode, so mark that here + */ + vcpu->irq.guest_mode = 1; + + /* + * We also must inject interrupts (if any) while the irq_lock + * is held + */ if (!vcpu->mmio_read_completed) do_interrupt_requests(vcpu, kvm_run); + spin_unlock(&vcpu->irq.lock); + clgi(); pre_svm_run(vcpu); @@ -1597,6 +1634,13 @@ again: #endif : "cc", "memory" ); + /* + * FIXME: We'd like to turn on interrupts ASAP, but is this so early + * that we will mess up the state of the CPU before we fully + * transition from guest to host? + */ + local_irq_restore(irq_flags); + fx_save(vcpu->guest_fx_image); fx_restore(vcpu->host_fx_image); @@ -1617,6 +1661,16 @@ again: reload_tss(vcpu); /* + * Signal that we have transitioned back to host mode + */ + spin_lock(&vcpu->irq.lock); + + vcpu->irq.guest_mode = 0; + vcpu->irq.pending = 0; + + spin_unlock(&vcpu->irq.lock); + + /* * Profile KVM exit RIPs: */ if (unlikely(prof_on == KVM_PROFILING)) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index a0fdf02..7d5a650 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1722,6 +1722,7 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) u16 fs_sel, gs_sel, ldt_sel; int fs_gs_ldt_reload_needed; int r; + unsigned long irq_flags; again: /* @@ -1748,11 +1749,47 @@ again: vmcs_writel(HOST_GS_BASE, segment_base(gs_sel)); #endif + if (vcpu->guest_debug.enabled) + kvm_guest_debug_pre(vcpu); + + /* + * We disable interrupts until the next VMEXIT to eliminate a race + * condition for delivery of virtual interrutps. Note that this is + * probably not as bad as it sounds, as interrupts will still invoke + * a VMEXIT once transitioned to GUEST mode (and thus exit this lock + * scope) even if they are disabled. 
+ * + * FIXME: Do we need to do anything additional to mask IPI/NMIs? + */ + local_irq_save(irq_flags); + + spin_lock(&vcpu->irq.lock); + + /* + * If there are any signals pending (virtual interrupt related or + * otherwise), don't even bother trying to enter guest mode... + */ + if (signal_pending(current)) { + kvm_run->exit_reason = KVM_EXIT_INTR; + spin_unlock(&vcpu->irq.lock); + local_irq_restore(irq_flags); + return -EINTR; + } + + /* + * There are optimizations we can make when signaling interrupts + * if we know the VCPU is in GUEST mode, so mark that here + */ + vcpu->irq.guest_mode = 1; + + /* + * We also must inject interrupts (if any) while the irq.lock + * is held + */ if (!vcpu->mmio_read_completed) do_interrupt_requests(vcpu, kvm_run); - if (vcpu->guest_debug.enabled) - kvm_guest_debug_pre(vcpu); + spin_unlock(&vcpu->irq.lock); fx_save(vcpu->host_fx_image); fx_restore(vcpu->guest_fx_image); @@ -1880,6 +1917,13 @@ again: : "cc", "memory" ); /* + * FIXME: We'd like to turn on interrupts ASAP, but is this so early + * that we will mess up the state of the CPU before we fully + * transition from guest to host? + */ + local_irq_restore(irq_flags); + + /* * Reload segment selectors ASAP. (it's needed for a functional * kernel: x86 relies on having __KERNEL_PDA in %fs and x86_64 * relies on having 0 in %gs for the CPU PDA to work.) @@ -1911,6 +1955,16 @@ again: asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); + /* + * Signal that we have transitioned back to host mode + */ + spin_lock(&vcpu->irq.lock); + + vcpu->irq.guest_mode = 0; + vcpu->irq.pending = 0; + + spin_unlock(&vcpu->irq.lock); + if (fail) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason [-- Attachment #3: Type: text/plain, Size: 345 bytes --] ------------------------------------------------------------------------- Take Surveys. Earn Cash. 
Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys-and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV [-- Attachment #4: Type: text/plain, Size: 186 bytes --] _______________________________________________ kvm-devel mailing list kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org https://lists.sourceforge.net/lists/listinfo/kvm-devel ^ permalink raw reply related [flat|nested] 13+ messages in thread
[parent not found: <461E1F73.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>]
* Re: irqdevice INTR example [not found] ` <461E1F73.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org> @ 2007-04-14 14:30 ` Avi Kivity [not found] ` <4620E56A.7040207-atKUWr5tajBWk0Htik3J/w@public.gmane.org> 0 siblings, 1 reply; 13+ messages in thread From: Avi Kivity @ 2007-04-14 14:30 UTC (permalink / raw) To: Gregory Haskins; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f Gregory Haskins wrote: >> It's really subtle. >> >> With respect to interrupts, VT^H^Hthe hardware provides an override over >> IF. If an interrupt happens while this override is enabled, we exit >> guest mode regardless of the state of IF. In effect interrupts are >> enabled but _delivery_ is disabled. Once we exit guest mode, interrupts >> become disabled again until we set IF explicitly. >> > > Heres a version that implements that idea. Note that its still a POC, as the kick_process still needs to be exported and handled in extern-module-compat (or whatever its called) > Hopefully Ingo will ack such a patch, as he suggested kick_process() originally. > @@ -1866,6 +1874,19 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) > kvm_arch_ops->decache_regs(vcpu); > } > > + /* > + * Make sure the current task is accurately recorded. Most of the > + * time, it will be so we first check if its necessary without taking > + * the lock. If there is a mismatch, we must aquire the lock and > + * check again to eliminate races > + */ > + if (unlikely(vcpu->irq.task != current)) { > + spin_lock(&vcpu->irq.lock); > + if (vcpu->irq.task != current) > + vcpu->irq.task = current; > + spin_unlock(&vcpu->irq.lock); > + } > + > Isn't this equivalent to vcpu->irq.task = current ? btw, these double-check things are broken, see http://www.cs.umd.edu/~pugh/java/memoryModel/DoubleCheckedLocking.html. It will usually work on x86, and can be made to work elsewhere with memory barriers, but it's less simple than it appears. 
> + * FIXME: Do we need to do anything additional to mask IPI/NMIs? > I'm fairly certain you don't. I keep hearing about the deadlockability of sending ipis with interrupts disabled. > + */ > + local_irq_save(irq_flags); > + > + spin_lock(&vcpu->irq.lock); > + > + /* > + * There are optimizations we can make when signaling interrupts > + * if we know the VCPU is in GUEST mode, so mark that here > + */ > + vcpu->irq.guest_mode = 1; > + > + /* > + * We also must inject interrupts (if any) while the irq_lock > + * is held > + */ > if (!vcpu->mmio_read_completed) > do_interrupt_requests(vcpu, kvm_run); > > + spin_unlock(&vcpu->irq.lock); > + > clgi(); > stgi() / clgi() is what implements interrupt handling under svm. I think you got this right, but please tell me you read the relevant sections in the manual a few times. > @@ -1617,6 +1650,16 @@ again: > reload_tss(vcpu); > > /* > + * Signal that we have transitioned back to host mode > + */ > + spin_lock(&vcpu->irq.lock); > + > + vcpu->irq.guest_mode = 0; > + vcpu->irq.pending = 0; > + > + spin_unlock(&vcpu->irq.lock); > + > If an interrupt happened just before here, then we need to check it, otherwise it's lost. Also an opportunity to see ->read_vector in action, which was the purpose of the exercise IIRC. > > asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); > > + /* > + * Signal that we have transitioned back to host mode > + */ > + spin_lock(&vcpu->irq.lock); > + > + vcpu->irq.guest_mode = 0; > + vcpu->irq.pending = 0; > + > + spin_unlock(&vcpu->irq.lock); > + > if (fail) { > kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; > kvm_run->fail_entry.hardware_entry_failure_reason > Ditto. You also want spin_lock_irq() when interrupts are enabled, since some callers will lock it in interrupt context. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. 
------------------------------------------------------------------------- This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ ^ permalink raw reply [flat|nested] 13+ messages in thread
[parent not found: <4620E56A.7040207-atKUWr5tajBWk0Htik3J/w@public.gmane.org>]
* Re: irqdevice INTR example [not found] ` <4620E56A.7040207-atKUWr5tajBWk0Htik3J/w@public.gmane.org> @ 2007-04-15 22:32 ` Gregory Haskins 2007-04-15 23:32 ` Gregory Haskins [not found] ` <46226FBC.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org> 0 siblings, 2 replies; 13+ messages in thread From: Gregory Haskins @ 2007-04-15 22:32 UTC (permalink / raw) To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f >>> On Sat, Apr 14, 2007 at 10:30 AM, in message <4620E56A.7040207-atKUWr5tajBWk0Htik3J/w@public.gmane.org>, Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> wrote: > Gregory Haskins wrote: >>> It's really subtle. >>> >>> With respect to interrupts, VT^H^Hthe hardware provides an override over >>> IF. If an interrupt happens while this override is enabled, we exit >>> guest mode regardless of the state of IF. In effect interrupts are >>> enabled but _delivery_ is disabled. Once we exit guest mode, interrupts >>> become disabled again until we set IF explicitly. >>> >> >> Heres a version that implements that idea. Note that its still a POC, as > the kick_process still needs to be exported and handled in > extern-module-compat (or whatever its called) >> > > Hopefully Ingo will ack such a patch, as he suggested kick_process() > originally. Out of curiosity, did you review my kick_process() version or the newer smp_call_function_single() based version of the patch? > > >> @@ -1866,6 +1874,19 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, > struct kvm_run *kvm_run) >> kvm_arch_ops->decache_regs(vcpu); >> } >> >> + /* >> + * Make sure the current task is accurately recorded. Most of the >> + * time, it will be so we first check if its necessary without taking >> + * the lock. 
If there is a mismatch, we must aquire the lock and >> + * check again to eliminate races >> + */ >> + if (unlikely(vcpu->irq.task != current)) { >> + spin_lock(&vcpu->irq.lock); >> + if (vcpu->irq.task != current) >> + vcpu->irq.task = current; >> + spin_unlock(&vcpu->irq.lock); >> + } >> + >> > > Isn't this equivalent to > > vcpu->irq.task = current Yeah, good point. I originally had a printk in there to report when it changes, thus the conditional. I didn't collapse it when I took the printk out. I suppose you could argue that updating the variable indiscriminately will unnecessarily dirty a cache-line, but I doubt it would matter in the grand scheme of things. > > ? > > btw, these double-check things are broken, see > http://www.cs.umd.edu/~pugh/java/memoryModel/DoubleCheckedLocking.html. > It will usually work on x86, and can be made to work elsewhere with > memory barriers, but it's less simple than it appears. Interesting. Thanks for the link! > stgi() / clgi() is what implements interrupt handling under svm. I > think you got this right, but please tell me you read the relevant > sections in the manual a few times. Actually, no. I have read the VMX doc pretty thoroughly but I haven't looked at the SVM doc at all yet. I'm still drinking from a fire hose between VMX, KVM, QEMU etc. ;) I will certainly rectify this situation before I submit the patch for actual inclusion. If I do anything particularly egregious in SVM, pointers are more than welcome. > > >> @@ -1617,6 +1650,16 @@ again: >> reload_tss(vcpu); >> >> /* >> + * Signal that we have transitioned back to host mode >> + */ >> + spin_lock(&vcpu->irq.lock); >> + >> + vcpu->irq.guest_mode = 0; >> + vcpu->irq.pending = 0; >> + >> + spin_unlock(&vcpu->irq.lock); >> + >> > > If an interrupt happened just before here, then we need to check it, > otherwise it's lost. Also an opportunity to see ->read_vector in > action, which was the purpose of the exercise IIRC. 
I'm not sure if you noticed this, but the read_vector() call already happened as part of the do_interrupt_requests() call before the VMENTRY (assuming you have applied the previous patch in the series, of course ;). You would never "lose" the interrupt even if the interrupt came in before the critical section because it would be safely queued in the irqdevice model for the next pass through the run() loop. However, you do highlight a bug in this code that could cause the interrupt to be deferred indefinitely until a second interrupt came in if the processor happens to halt after this VMEXIT. I.e., since a signal isn't being delivered while in irq.guest_mode = 1, we do need to handle this at this layer by looping back into the CPU. I will fix this. Good find! > >> >> asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); >> >> + /* >> + * Signal that we have transitioned back to host mode >> + */ >> + spin_lock(&vcpu->irq.lock); >> + >> + vcpu->irq.guest_mode = 0; >> + vcpu->irq.pending = 0; >> + >> + spin_unlock(&vcpu->irq.lock); >> + >> if (fail) { >> kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; >> kvm_run->fail_entry.hardware_entry_failure_reason >> > > Ditto. > > You also want spin_lock_irq() when interrupts are enabled, since some > callers will lock it in interrupt context. > Good catch. I forgot to update the locks after the changeover to disabling IRQs. ------------------------------------------------------------------------- This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: irqdevice INTR example 2007-04-15 22:32 ` Gregory Haskins @ 2007-04-15 23:32 ` Gregory Haskins [not found] ` <46226FBC.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org> 1 sibling, 0 replies; 13+ messages in thread From: Gregory Haskins @ 2007-04-15 23:32 UTC (permalink / raw) To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f [-- Attachment #1: Type: text/plain, Size: 1308 bytes --] >>> On Sun, Apr 15, 2007 at 6:32 PM, in message <46226FBC.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>, Gregory Haskins wrote: > However, you do highlight a bug in this code that could cause the interrupt > to be deferred indefinitely until a second interrupt came in if the processor > happens to halt after this VMEXIT. I.e., since a signal isn't being delivered while > in irq.guest_mode = 1, we do need to handle this at this layer by looping back > into the CPU. I think I misspoke. I inspected the code after writing this and realized that I think it was fine the way it was. The point of confusion I think was the irq.pending flag, which really wasn't doing anything useful in this context. But other than the superfluous variable, I think the design would work as it was written. Basically, the calls to pending/read_vector would indicate the appropriate vector to inject...so all we really need is a "kick" via the IPI to knock the task out of GUEST mode (if applicable). Once that happens, the normal processing will pick up on the posted vector, even if a HLT were in progress. I removed the irq.pending flag for clarity. If I truly need it downstream in my patch series I will add it there. Attached is an updated patch incorporating the latest feedback. 
Regards, -Greg [-- Attachment #2: preemptible-cpu-5.patch --] [-- Type: text/plain, Size: 9769 bytes --] KVM: Preemptible VCPU From: <> This adds support for interrupting an executing CPU Signed-off-by: Gregory Haskins <ghaskins-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org> --- drivers/kvm/kvm.h | 13 +++++++++ drivers/kvm/kvm_main.c | 68 ++++++++++++++++++++++++++++++++++++++++++++---- drivers/kvm/svm.c | 51 ++++++++++++++++++++++++++++++++++++ drivers/kvm/vmx.c | 55 +++++++++++++++++++++++++++++++++++++-- 4 files changed, 179 insertions(+), 8 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 58966d9..87357cc 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -271,6 +271,18 @@ void kvm_io_bus_register_dev(struct kvm_io_bus *bus, #define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long) +#define KVM_SIGNAL_VIRTUAL_INTERRUPT 33 /* Hardcoded for now */ + +/* + * structure for maintaining info for interrupting an executing VCPU + */ +struct kvm_vcpu_irq { + spinlock_t lock; + struct task_struct *task; + int signo; + int guest_mode; +}; + struct kvm_vcpu { struct kvm *kvm; union { @@ -284,6 +296,7 @@ struct kvm_vcpu { struct kvm_run *run; int interrupt_window_open; struct kvm_irqdevice irq_dev; + struct kvm_vcpu_irq irq; unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ unsigned long rip; /* needs vcpu_load_rsp_rip() */ diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 7e00412..6d7178b 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -299,6 +299,14 @@ static struct kvm *kvm_create_vm(void) struct kvm_vcpu *vcpu = &kvm->vcpus[i]; mutex_init(&vcpu->mutex); + + memset(&vcpu->irq, 0, sizeof(vcpu->irq)); + spin_lock_init(&vcpu->irq.lock); + /* + * This should be settable by userspace someday + */ + vcpu->irq.signo = KVM_SIGNAL_VIRTUAL_INTERRUPT; + vcpu->cpu = -1; vcpu->kvm = kvm; vcpu->mmu.root_hpa = INVALID_PAGE; @@ -1838,6 +1846,7 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct 
kvm_run *kvm_run) { int r; sigset_t sigsaved; + unsigned long irqsaved; vcpu_load(vcpu); @@ -1866,6 +1875,10 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_arch_ops->decache_regs(vcpu); } + spin_lock_irqsave(&vcpu->irq.lock, irqsaved); + vcpu->irq.task = current; + spin_unlock_irqrestore(&vcpu->irq.lock, irqsaved); + r = kvm_arch_ops->run(vcpu, kvm_run); out: @@ -2310,6 +2323,20 @@ out1: } /* + * This function is invoked whenever we want to interrupt a vcpu that is + * currently executing in guest-mode. It currently is a no-op because + * the simple delivery of the IPI to execute this function accomplishes our + * goal: To cause a VMEXIT. We pass the vcpu (which contains the + * vcpu->irq.task, etc) for future use + */ +static void kvm_vcpu_guest_intr(void *info) +{ +#ifdef NOT_YET + struct kvm_vcpu *vcpu = (struct kvm_vcpu*)info; +#endif +} + +/* * This function will be invoked whenever the vcpu->irq_dev raises its INTR * line */ @@ -2320,13 +2347,42 @@ static void kvm_vcpu_intr(struct kvm_irqsink *this, * Our irq device is requesting to interrupt the vcpu. If it is * currently running, we should inject a host IPI to force a VMEXIT */ + struct kvm_vcpu *vcpu = (struct kvm_vcpu*)this->private; + unsigned long flags; + + spin_lock_irqsave(&vcpu->irq.lock, flags); - /* - * FIXME: Implement this or the CPU wont notice the interrupt until - * the next natural VMEXIT. Note that this is how the system - * has always worked, so nothing is broken here. This is a future - * enhancement - */ + if (vcpu->irq.task && (vcpu->irq.task != current)) { + if (vcpu->irq.guest_mode) { + /* + * If we are in guest mode, we can optimize the IPI + * by executing a function on the owning processor. + */ + int cpu; + + /* + * Not sure if disabling preemption is needed + * since we are in a spin-lock anyway? 
The + * kick_process() code does this so I copied it + */ + preempt_disable(); + cpu = task_cpu(vcpu->irq.task); + BUG_ON(cpu == smp_processor_id()); + smp_call_function_single(cpu, + kvm_vcpu_guest_intr, + vcpu, 0, 0); + preempt_enable(); + } else + /* + * If we are not in guest mode, we must assume that + * we could be blocked anywhere, including userspace. + * Send a signal to give everyone a chance to get + * notification + */ + send_sig(vcpu->irq.signo, vcpu->irq.task, 0); + } + + spin_unlock_irqrestore(&vcpu->irq.lock, flags); } static void kvm_vcpu_irqsink_init(struct kvm_vcpu *vcpu) diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index e59a548..f5cdffe 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -1461,11 +1461,48 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) u16 gs_selector; u16 ldt_selector; int r; + unsigned long irq_flags; again: + /* + * We disable interrupts until the next VMEXIT to eliminate a race + * condition for delivery of virtual interrutps. Note that this is + * probably not as bad as it sounds, as interrupts will still invoke + * a VMEXIT once transitioned to GUEST mode (and thus exit this lock + * scope) even if they are disabled. + * + * FIXME: Do we need to do anything additional to mask IPI/NMIs? + */ + local_irq_save(irq_flags); + + spin_lock(&vcpu->irq.lock); + + /* + * If there are any signals pending (virtual interrupt related or + * otherwise), don't even bother trying to enter guest mode... 
+ */ + if (signal_pending(current)) { + kvm_run->exit_reason = KVM_EXIT_INTR; + spin_unlock(&vcpu->irq.lock); + local_irq_restore(irq_flags); + return -EINTR; + } + + /* + * There are optimizations we can make when signaling interrupts + * if we know the VCPU is in GUEST mode, so mark that here + */ + vcpu->irq.guest_mode = 1; + + /* + * We also must inject interrupts (if any) while the irq_lock + * is held + */ if (!vcpu->mmio_read_completed) do_interrupt_requests(vcpu, kvm_run); + spin_unlock(&vcpu->irq.lock); + clgi(); pre_svm_run(vcpu); @@ -1597,6 +1634,13 @@ again: #endif : "cc", "memory" ); + /* + * FIXME: We'd like to turn on interrupts ASAP, but is this so early + * that we will mess up the state of the CPU before we fully + * transition from guest to host? + */ + local_irq_restore(irq_flags); + fx_save(vcpu->guest_fx_image); fx_restore(vcpu->host_fx_image); @@ -1617,6 +1661,13 @@ again: reload_tss(vcpu); /* + * Signal that we have transitioned back to host mode + */ + spin_lock_irqsave(&vcpu->irq.lock, irq_flags); + vcpu->irq.guest_mode = 0; + spin_unlock_irqrestore(&vcpu->irq.lock, irq_flags); + + /* * Profile KVM exit RIPs: */ if (unlikely(prof_on == KVM_PROFILING)) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index a0fdf02..654c4d6 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1722,6 +1722,7 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) u16 fs_sel, gs_sel, ldt_sel; int fs_gs_ldt_reload_needed; int r; + unsigned long irq_flags; again: /* @@ -1748,11 +1749,47 @@ again: vmcs_writel(HOST_GS_BASE, segment_base(gs_sel)); #endif + if (vcpu->guest_debug.enabled) + kvm_guest_debug_pre(vcpu); + + /* + * We disable interrupts until the next VMEXIT to eliminate a race + * condition for delivery of virtual interrutps. Note that this is + * probably not as bad as it sounds, as interrupts will still invoke + * a VMEXIT once transitioned to GUEST mode (and thus exit this lock + * scope) even if they are disabled. 
+ * + * FIXME: Do we need to do anything additional to mask IPI/NMIs? + */ + local_irq_save(irq_flags); + + spin_lock(&vcpu->irq.lock); + + /* + * If there are any signals pending (virtual interrupt related or + * otherwise), don't even bother trying to enter guest mode... + */ + if (signal_pending(current)) { + kvm_run->exit_reason = KVM_EXIT_INTR; + spin_unlock(&vcpu->irq.lock); + local_irq_restore(irq_flags); + return -EINTR; + } + + /* + * There are optimizations we can make when signaling interrupts + * if we know the VCPU is in GUEST mode, so mark that here + */ + vcpu->irq.guest_mode = 1; + + /* + * We also must inject interrupts (if any) while the irq.lock + * is held + */ if (!vcpu->mmio_read_completed) do_interrupt_requests(vcpu, kvm_run); - if (vcpu->guest_debug.enabled) - kvm_guest_debug_pre(vcpu); + spin_unlock(&vcpu->irq.lock); fx_save(vcpu->host_fx_image); fx_restore(vcpu->guest_fx_image); @@ -1880,6 +1917,13 @@ again: : "cc", "memory" ); /* + * FIXME: We'd like to turn on interrupts ASAP, but is this so early + * that we will mess up the state of the CPU before we fully + * transition from guest to host? + */ + local_irq_restore(irq_flags); + + /* * Reload segment selectors ASAP. (it's needed for a functional * kernel: x86 relies on having __KERNEL_PDA in %fs and x86_64 * relies on having 0 in %gs for the CPU PDA to work.) 
@@ -1911,6 +1955,13 @@ again: asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); + /* + * Signal that we have transitioned back to host mode + */ + spin_lock_irqsave(&vcpu->irq.lock, irq_flags); + vcpu->irq.guest_mode = 0; + spin_unlock_irqrestore(&vcpu->irq.lock, irq_flags); + if (fail) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason [-- Attachment #3: Type: text/plain, Size: 286 bytes --] ------------------------------------------------------------------------- This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ [-- Attachment #4: Type: text/plain, Size: 186 bytes --] _______________________________________________ kvm-devel mailing list kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org https://lists.sourceforge.net/lists/listinfo/kvm-devel ^ permalink raw reply related [flat|nested] 13+ messages in thread
[parent not found: <46226FBC.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>]
* Re: irqdevice INTR example [not found] ` <46226FBC.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org> @ 2007-04-16 5:46 ` Avi Kivity 0 siblings, 0 replies; 13+ messages in thread From: Avi Kivity @ 2007-04-16 5:46 UTC (permalink / raw) To: Gregory Haskins; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f Gregory Haskins wrote: > Out of curiosity, did you review my kick_process() version or the newer smp_call_function_single() based version of the patch? > > The former. >> >>> @@ -1866,6 +1874,19 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, >>> >> struct kvm_run *kvm_run) >> >>> kvm_arch_ops->decache_regs(vcpu); >>> } >>> >>> + /* >>> + * Make sure the current task is accurately recorded. Most of the >>> + * time, it will be so we first check if its necessary without taking >>> + * the lock. If there is a mismatch, we must aquire the lock and >>> + * check again to eliminate races >>> + */ >>> + if (unlikely(vcpu->irq.task != current)) { >>> + spin_lock(&vcpu->irq.lock); >>> + if (vcpu->irq.task != current) >>> + vcpu->irq.task = current; >>> + spin_unlock(&vcpu->irq.lock); >>> + } >>> + >>> >>> >> Isn't this equivalent to >> >> vcpu->irq.task = current >> > > Yeah, good point. I originally had a printk in there to report when it changes, thus the conditional. I didn't collapse it when I took the printk out. I suppose you could argue that updating the variable indiscriminately will unnecessarily dirty a cache-line, but I doubt it would matter in the grand scheme of things. > > The vcpu cachelines are already dirty. Please repost the patchset (one patch per message) so I can see it together. Thanks. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. ------------------------------------------------------------------------- This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. 
http://sourceforge.net/powerbar/db2/ ^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2007-04-16 5:46 UTC | newest]
Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-04-12 4:02 irqdevice INTR example Gregory Haskins
[not found] ` <461D7702.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
2007-04-12 8:02 ` Avi Kivity
[not found] ` <461DE791.1040707-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-04-12 8:18 ` Christoph Hellwig
2007-04-12 11:55 ` Gregory Haskins
[not found] ` <461DE5C9.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
2007-04-12 12:49 ` Avi Kivity
[not found] ` <461E2AD5.7070905-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-04-12 13:43 ` Gregory Haskins
[not found] ` <461DFF1C.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
2007-04-12 14:14 ` Avi Kivity
[not found] ` <461E3EDB.3080002-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-04-12 16:01 ` Gregory Haskins
2007-04-13 13:05 ` Fwd: " Gregory Haskins
[not found] ` <461E1F73.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
2007-04-14 14:30 ` Avi Kivity
[not found] ` <4620E56A.7040207-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-04-15 22:32 ` Gregory Haskins
2007-04-15 23:32 ` Gregory Haskins
[not found] ` <46226FBC.BA47.005A.0-Et1tbQHTxzrQT0dZR+AlfA@public.gmane.org>
2007-04-16 5:46 ` Avi Kivity
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox