From: "Michael S. Tsirkin" <mst@redhat.com>
To: kvm@vger.kernel.org
Subject: [PATCHv1 dont apply] RFC: kvm eoi PV using shared memory
Date: Sun, 15 Apr 2012 19:18:58 +0300 [thread overview]
Message-ID: <20120415161857.GA8710@redhat.com> (raw)
In-Reply-To: <20120410132756.GA14101@redhat.com>
I got lots of useful feedback from v0 so I thought
sending out a brain dump again would be a good idea.
This is mainly to show how I'm trying to address the
comments I got from the previous round. Flames/feedback
are wellcome!
Changes from v0:
- Tweaked setup MSRs a bit
- Keep ISR bit set. Before reading ISR, test EOI in guest memory and clear
- Check priority for nested interrupts, we can
enable optimization if new one is high priority
- Disable optimization for any interrupt handled by ioapic
(This is because ioapic handles notifiers and pic and it generally gets messy.
It's possible that we can optimize some ioapic-handled
edge interrupts - but is it worth it?)
- A less intrusive change for guest apic (0 overhead without kvm)
---
I took a stub at implementing PV EOI using shared memory.
This should reduce the number of exits an interrupt
causes as much as by half.
A partially complete draft for both host and guest parts
is below.
The idea is simple: there's a bit, per APIC, in guest memory,
that tells the guest that it does not need EOI.
We set it before injecting an interrupt and clear
before injecting a nested one. Guest tests it using
a test and clear operation - this is necessary
so that host can detect interrupt nesting -
and if set, it can skip the EOI MSR.
There's a new MSR to set the address of said register
in guest memory. Otherwise not much changed:
- Guest EOI is not required
- Register is tested ISR is automatically cleared before injection
qemu support is incomplete - mostly for feature negotiation.
need to add some trace points to enable profiling.
No testing was done beyond compiling the kernel.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index b97596e..c9c70ea 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -26,11 +26,13 @@
#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
/* Technically wrong, but this avoids compilation errors on some gcc
versions. */
-#define BITOP_ADDR(x) "=m" (*(volatile long *) (x))
+#define BITOP_ADDR_CONSTRAINT "=m"
#else
-#define BITOP_ADDR(x) "+m" (*(volatile long *) (x))
+#define BITOP_ADDR_CONSTRAINT "+m"
#endif
+#define BITOP_ADDR(x) BITOP_ADDR_CONSTRAINT (*(volatile long *) (x))
+
#define ADDR BITOP_ADDR(addr)
/*
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e216ba0..3d09ef1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -481,6 +481,12 @@ struct kvm_vcpu_arch {
u64 length;
u64 status;
} osvw;
+
+ struct {
+ u64 msr_val;
+ struct gfn_to_hva_cache data;
+ bool pending;
+ } eoi;
};
struct kvm_lpage_info {
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 734c376..164376a 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -22,6 +22,7 @@
#define KVM_FEATURE_CLOCKSOURCE2 3
#define KVM_FEATURE_ASYNC_PF 4
#define KVM_FEATURE_STEAL_TIME 5
+#define KVM_FEATURE_EOI 6
/* The last 8 bits are used to indicate how to interpret the flags field
* in pvclock structure. If no bits are set, all flags are ignored.
@@ -37,6 +38,8 @@
#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
#define MSR_KVM_STEAL_TIME 0x4b564d03
+#define MSR_KVM_EOI_EN 0x4b564d04
+#define MSR_KVM_EOI_DISABLED 0x0L
struct kvm_steal_time {
__u64 steal;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index b8ba6e4..450aae4 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -39,6 +39,7 @@
#include <asm/desc.h>
#include <asm/tlbflush.h>
#include <asm/idle.h>
+#include <asm/apic.h>
static int kvmapf = 1;
@@ -290,6 +291,33 @@ static void kvm_register_steal_time(void)
cpu, __pa(st));
}
+/* TODO: needs to be early? aligned? */
+static DEFINE_EARLY_PER_CPU(u8, apic_eoi, 0);
+
+/* Our own copy of __test_and_clear_bit to make sure
+ * it is done with a single instruction */
+static inline int kvm_test_and_clear_bit(int nr, volatile u8* addr)
+{
+ int oldbit;
+
+ asm volatile("btr %2,%1\n\t"
+ "sbb %0,%0"
+ : "=r" (oldbit),
+ BITOP_ADDR_CONSTRAINT (*(volatile u8 *) (addr))
+ : "Ir" (nr));
+ return oldbit;
+}
+
+static void (*kvm_guest_native_apic_write)(u32 reg, u32 val);
+static void kvm_guest_apic_write(u32 reg, u32 val)
+{
+ if (reg == APIC_EOI &&
+ kvm_test_and_clear_bit(0, &__get_cpu_var(apic_eoi)))
+ return;
+
+ kvm_guest_native_apic_write(reg, val);
+}
+
void __cpuinit kvm_guest_cpu_init(void)
{
if (!kvm_para_available())
@@ -307,11 +335,18 @@ void __cpuinit kvm_guest_cpu_init(void)
smp_processor_id());
}
+
+ if (kvm_para_has_feature(KVM_FEATURE_EOI)) {
+ kvm_guest_native_apic_write = apic->write;
+ apic->write = kvm_guest_apic_write;
+ wrmsrl(MSR_KVM_EOI_EN, __pa(&__get_cpu_var(apic_eoi)));
+ }
+
if (has_steal_clock)
kvm_register_steal_time();
}
-static void kvm_pv_disable_apf(void *unused)
+static void kvm_pv_disable_apf(void)
{
if (!__get_cpu_var(apf_reason).enabled)
return;
@@ -323,11 +358,18 @@ static void kvm_pv_disable_apf(void *unused)
smp_processor_id());
}
+static void kvm_pv_guest_cpu_reboot(void *unused)
+{
+ if (kvm_para_has_feature(KVM_FEATURE_EOI))
+ wrmsrl(MSR_KVM_EOI_EN, MSR_KVM_EOI_DISABLED);
+ kvm_pv_disable_apf();
+}
+
static int kvm_pv_reboot_notify(struct notifier_block *nb,
unsigned long code, void *unused)
{
if (code == SYS_RESTART)
- on_each_cpu(kvm_pv_disable_apf, NULL, 1);
+ on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
return NOTIFY_DONE;
}
@@ -378,7 +420,9 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy)
static void kvm_guest_cpu_offline(void *dummy)
{
kvm_disable_steal_time();
- kvm_pv_disable_apf(NULL);
+ if (kvm_para_has_feature(KVM_FEATURE_EOI))
+ wrmsrl(MSR_KVM_EOI_EN, MSR_KVM_EOI_DISABLED);
+ kvm_pv_disable_apf();
apf_task_wake_all();
}
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 9fed5be..7d00d2d 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -408,6 +408,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
(1 << KVM_FEATURE_NOP_IO_DELAY) |
(1 << KVM_FEATURE_CLOCKSOURCE2) |
(1 << KVM_FEATURE_ASYNC_PF) |
+ (1 << KVM_FEATURE_EOI) |
(1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
if (sched_info_on())
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 7e06ba1..07bdfab 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -48,7 +48,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
if (!irqchip_in_kernel(v->kvm))
return v->arch.interrupt.pending;
- if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */
+ if (kvm_apic_has_interrupt(v) < 0) { /* LAPIC */
if (kvm_apic_accept_pic_intr(v)) {
s = pic_irqchip(v->kvm); /* PIC */
return s->output;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 992b4ea..c2118ab 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -270,6 +270,54 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
irq->level, irq->trig_mode);
}
+static int eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
+{
+
+ return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.eoi.data, &val,
+ sizeof(val));
+}
+
+static int eoi_get_user(struct kvm_vcpu *vcpu, u8 *val)
+{
+
+ return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.eoi.data, val,
+ sizeof(*val));
+}
+
+static inline bool eoi_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.eoi.msr_val != MSR_KVM_EOI_DISABLED;
+}
+
+static bool eoi_get_pending(struct kvm_vcpu *vcpu)
+{
+ u8 val;
+ if (eoi_get_user(vcpu, &val) < 0)
+ apic_debug("Can't read EOI MSR value: 0x%llx\n",
+ (unsigned long long)vcpi->arch.eoi.msr_val);
+ return val & 0x1;
+}
+
+static void eoi_set_pending(struct kvm_vcpu *vcpu)
+{
+ if (eoi_put_user(vcpu, 0x1) < 0) {
+ apic_debug("Can't set EOI MSR value: 0x%llx\n",
+ (unsigned long long)vcpi->arch.eoi.msr_val);
+ return;
+ }
+ vcpu->arch.eoi.pending = true;
+}
+
+static void eoi_clr_pending(struct kvm_vcpu *vcpu)
+{
+ if (eoi_put_user(vcpu, 0x0) < 0) {
+ apic_debug("Can't clear EOI MSR value: 0x%llx\n",
+ (unsigned long long)vcpi->arch.eoi.msr_val);
+ return;
+ }
+ vcpu->arch.eoi.pending = false;
+}
+
static inline int apic_find_highest_isr(struct kvm_lapic *apic)
{
int result;
@@ -280,6 +328,20 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic)
return result;
}
+static void apic_update_isr(struct kvm_lapic *apic)
+{
+ int vector;
+ if (!eoi_enabled(apic->vcpu) ||
+ !apic->vcpu->arch.eoi.pending ||
+ eoi_get_pending(apic->vcpu))
+ return;
+ apic->vcpu->arch.eoi.pending = false;
+ vector = apic_find_highest_isr(apic);
+ if (vector == -1)
+ return;
+ apic_clear_vector(vector, apic->regs + APIC_ISR);
+}
+
static void apic_update_ppr(struct kvm_lapic *apic)
{
u32 tpr, isrv, ppr, old_ppr;
@@ -287,6 +349,7 @@ static void apic_update_ppr(struct kvm_lapic *apic)
old_ppr = apic_get_reg(apic, APIC_PROCPRI);
tpr = apic_get_reg(apic, APIC_TASKPRI);
+ apic_update_isr(apic);
isr = apic_find_highest_isr(apic);
isrv = (isr != -1) ? isr : 0;
@@ -492,6 +555,8 @@ static void apic_set_eoi(struct kvm_lapic *apic)
if (vector == -1)
return;
+ if (eoi_enabled(apic->vcpu))
+ eoi_clr_pending(apic->vcpu);
apic_clear_vector(vector, apic->regs + APIC_ISR);
apic_update_ppr(apic);
@@ -1085,6 +1150,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
atomic_set(&apic->lapic_timer.pending, 0);
if (kvm_vcpu_is_bsp(vcpu))
vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
+ vcpu->arch.eoi.msr_val = MSR_KVM_EOI_DISABLED;
+ vcpu->arch.eoi.pending = false;
apic_update_ppr(apic);
vcpu->arch.apic_arb_prio = 0;
@@ -1210,9 +1277,10 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
apic_update_ppr(apic);
highest_irr = apic_find_highest_irr(apic);
- if ((highest_irr == -1) ||
- ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
+ if (highest_irr == -1)
return -1;
+ if (((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
+ return -2;
return highest_irr;
}
@@ -1244,9 +1312,20 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
int vector = kvm_apic_has_interrupt(vcpu);
struct kvm_lapic *apic = vcpu->arch.apic;
- if (vector == -1)
+ /* Detect interrupt nesting and disable EOI optimization */
+ if (eoi_enabled(vcpu) && vector == -2)
+ eoi_clr_pending(vcpu);
+
+ if (vector < 0)
return -1;
+ if (eoi_enabled(vcpu)) {
+ if (kvm_ioapic_handles_vector(vcpu->kvm, vector))
+ eoi_clr_pending(vcpu);
+ else
+ eoi_set_pending(vcpu);
+ }
+
apic_set_vector(vector, apic->regs + APIC_ISR);
apic_update_ppr(apic);
apic_clear_irr(vector, apic);
@@ -1261,6 +1340,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
MSR_IA32_APICBASE_BASE;
kvm_apic_set_version(vcpu);
+ if (eoi_enabled(apic->vcpu))
+ apic->vcpu->arch.eoi.pending = eoi_get_pending(apic->vcpu);
apic_update_ppr(apic);
hrtimer_cancel(&apic->lapic_timer.timer);
update_divide_count(apic);
@@ -1392,3 +1473,18 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
return 0;
}
+
+int kvm_pv_enable_apic_eoi(struct kvm_vcpu *vcpu, u64 data)
+{
+ if (data == MSR_KVM_EOI_DISABLED) {
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ if (apic && apic_enabled(apic))
+ apic_update_isr(apic);
+ } else if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.eoi.data,
+ data))
+ return 1;
+
+ vcpu->arch.eoi.msr_val = data;
+ vcpu->arch.eoi.pending = false;
+ return 0;
+}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 6f4ce25..042dace 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -60,4 +60,6 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
{
return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
}
+
+int kvm_pv_enable_apic_eoi(struct kvm_vcpu *vcpu, u64 data);
#endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4044ce0..31d6762 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1627,6 +1627,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if (kvm_pv_enable_async_pf(vcpu, data))
return 1;
break;
+ case MSR_KVM_EOI_EN:
+ if (kvm_pv_enable_apic_eoi(vcpu, data))
+ return 1;
+ break;
case MSR_KVM_STEAL_TIME:
if (unlikely(!sched_info_on()))
next prev parent reply other threads:[~2012-04-15 16:18 UTC|newest]
Thread overview: 32+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-04-10 13:27 [PATCHv0 dont apply] RFC: kvm eoi PV using shared memory Michael S. Tsirkin
2012-04-10 14:03 ` Avi Kivity
2012-04-10 14:26 ` Michael S. Tsirkin
2012-04-10 14:33 ` Avi Kivity
2012-04-10 14:53 ` Michael S. Tsirkin
2012-04-10 15:00 ` Avi Kivity
2012-04-10 15:14 ` Michael S. Tsirkin
2012-04-10 16:08 ` Avi Kivity
2012-04-10 17:06 ` Michael S. Tsirkin
2012-04-10 17:59 ` Gleb Natapov
2012-04-10 19:30 ` Michael S. Tsirkin
2012-04-10 19:33 ` Gleb Natapov
2012-04-10 19:40 ` Michael S. Tsirkin
2012-04-10 19:42 ` Gleb Natapov
2012-04-15 16:18 ` Michael S. Tsirkin [this message]
2012-04-16 10:08 ` [PATCHv1 " Gleb Natapov
2012-04-16 11:09 ` Michael S. Tsirkin
2012-04-16 11:24 ` Gleb Natapov
2012-04-16 12:18 ` Michael S. Tsirkin
2012-04-16 12:30 ` Gleb Natapov
2012-04-16 13:13 ` Michael S. Tsirkin
2012-04-16 15:10 ` Gleb Natapov
2012-04-16 16:33 ` Michael S. Tsirkin
2012-04-16 17:51 ` Gleb Natapov
2012-04-16 19:01 ` Michael S. Tsirkin
2012-04-17 8:45 ` Gleb Natapov
2012-04-16 17:24 ` Michael S. Tsirkin
2012-04-16 17:37 ` Gleb Natapov
2012-04-16 18:56 ` Michael S. Tsirkin
2012-04-17 8:59 ` Gleb Natapov
2012-04-17 9:24 ` Avi Kivity
2012-04-17 9:22 ` Avi Kivity
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20120415161857.GA8710@redhat.com \
--to=mst@redhat.com \
--cc=kvm@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.