From: Yunhong Jiang <yunhong.jiang@linux.intel.com>
To: kvm@vger.kernel.org
Cc: rkrcmar@redhat.com, pbonzini@redhat.com
Subject: [RFC PATCH 4/5] Utilize the vmx preemption timer for tsc deadline timer
Date: Thu, 19 May 2016 18:45:02 -0700 [thread overview]
Message-ID: <1463708703-19208-5-git-send-email-yunhong.jiang@linux.intel.com> (raw)
In-Reply-To: <1463708703-19208-1-git-send-email-yunhong.jiang@linux.intel.com>
From: Yunhong Jiang <yunhong.jiang@gmail.com>
Utilizing the VMX preemption timer for tsc deadline timer
virtualization. The VMX preemption timer is armed when the vCPU is
running, and a VMExit will happen if the virtual TSC deadline timer
expires.
When the vCPU thread is scheduled out, the tsc deadline timer
virtualization will be switched to use the current solution, i.e. use
the timer for it. It's switched back to VMX preemption timer when the
vCPU thread is scheduled in.
This solution avoids the complex OS's hrtimer system, and also the host
timer interrupt handling cost, with a preemption_timer VMexit. It fits
well for some NFV usage scenario, when the vCPU is bound to a pCPU and
the pCPU is isolated, or some similar scenario.
However, it possibly has impact if the vCPU thread is scheduled in/out
very frequently, because it switches from/to the hrtimer emulation a lot.
Signed-off-by: Yunhong Jiang <yunhong.jiang@intel.com>
---
arch/x86/kvm/lapic.c | 108 +++++++++++++++++++++++++++++++++++++++++++++++++--
arch/x86/kvm/lapic.h | 10 +++++
arch/x86/kvm/vmx.c | 26 +++++++++++++
arch/x86/kvm/x86.c | 6 +++
4 files changed, 147 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 12c416929d9c..c9e32bf1a613 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1320,7 +1320,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
__delay(tsc_deadline - guest_tsc);
}
-static void start_sw_tscdeadline(struct kvm_lapic *apic)
+static void start_sw_tscdeadline(struct kvm_lapic *apic, int no_expire)
{
u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
u64 ns = 0;
@@ -1337,7 +1337,8 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
now = apic->lapic_timer.timer.base->get_time();
guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
- if (likely(tscdeadline > guest_tsc)) {
+ /* Do not trigger the apic_timer if invoked from sched_out */
+ if (no_expire || likely(tscdeadline > guest_tsc)) {
ns = (tscdeadline - guest_tsc) * 1000000ULL;
do_div(ns, this_tsc_khz);
expire = ktime_add_ns(now, ns);
@@ -1396,9 +1397,110 @@ static void start_apic_timer(struct kvm_lapic *apic)
ktime_to_ns(ktime_add_ns(now,
apic->lapic_timer.period)));
} else if (apic_lvtt_tscdeadline(apic)) {
- start_sw_tscdeadline(apic);
+ /* lapic timer in tsc deadline mode */
+ if (hw_emul_timer(apic)) {
+ if (unlikely(!apic->lapic_timer.tscdeadline ||
+ !apic->vcpu->arch.virtual_tsc_khz))
+ return;
+
+ /* Expired timer will be checked on vcpu_run() */
+ apic->lapic_timer.hw_emulation = HWEMUL_ENABLED;
+ } else
+ start_sw_tscdeadline(apic, 0);
+ }
+}
+
+void switch_to_hw_lapic_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (apic->lapic_timer.hw_emulation)
+ return;
+
+ if (apic_lvtt_tscdeadline(apic) &&
+ !atomic_read(&apic->lapic_timer.pending)) {
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ /* In case the timer triggered in above small window */
+ if (!atomic_read(&apic->lapic_timer.pending))
+ apic->lapic_timer.hw_emulation = HWEMUL_ENABLED;
+ }
+}
+EXPORT_SYMBOL_GPL(switch_to_hw_lapic_timer);
+
+void switch_to_sw_lapic_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (!apic->lapic_timer.hw_emulation)
+ return;
+
+ if (apic->lapic_timer.hw_emulation == HWEMUL_INJECTED)
+ kvm_x86_ops->clear_hwemul_timer(vcpu);
+ apic->lapic_timer.hw_emulation = 0;
+
+ if (atomic_read(&apic->lapic_timer.pending))
+ return;
+
+ /* Don't trigger the apic_timer_expired() for deadlock */
+ start_sw_tscdeadline(apic, 1);
+}
+EXPORT_SYMBOL_GPL(switch_to_sw_lapic_timer);
+
+/*
+ * Check the hwemul timer status.
+ * -1: hwemul timer is not enabled
+ * >0: hwemul timer is not expired yet, the return is the delta tsc
+ * 0: hwemul timer expired already
+ */
+int check_apic_hwemul_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (apic->lapic_timer.hw_emulation) {
+ u64 tscdeadline = apic->lapic_timer.tscdeadline;
+ u64 guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+
+ if (tscdeadline <= guest_tsc)
+ return 0;
+ else
+ return (tscdeadline - guest_tsc);
+ }
+ return -1;
+}
+
+int inject_expired_hwemul_timer(struct kvm_vcpu *vcpu)
+{
+ if (!check_apic_hwemul_timer(vcpu)) {
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (apic->lapic_timer.hw_emulation == HWEMUL_INJECTED)
+ kvm_x86_ops->clear_hwemul_timer(vcpu);
+ apic->lapic_timer.hw_emulation = 0;
+ atomic_inc(&apic->lapic_timer.pending);
+ kvm_set_pending_timer(vcpu);
+ return 1;
}
+
+ return 0;
+}
+
+int inject_pending_hwemul_timer(struct kvm_vcpu *vcpu)
+{
+ s64 hwemultsc;
+
+ /*
+ * check_apic_hwemul_timer() returns -1 when hw emulation is not
+ * enabled; the result must be kept in a signed type, otherwise
+ * "hwemultsc >= 0" is always true and the preemption timer gets
+ * armed even with hw emulation disabled.
+ */
+ hwemultsc = check_apic_hwemul_timer(vcpu);
+ /* Just before vmentry, so inject even if expired */
+ if (hwemultsc >= 0) {
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ kvm_x86_ops->set_hwemul_timer(vcpu, hwemultsc);
+ apic->lapic_timer.hw_emulation = HWEMUL_INJECTED;
+ return 1;
+ }
+
+ return 0;
}
+EXPORT_SYMBOL_GPL(inject_pending_hwemul_timer);
static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
{
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 891c6da7d4aa..5037d7bf609a 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -12,6 +12,10 @@
#define KVM_APIC_SHORT_MASK 0xc0000
#define KVM_APIC_DEST_MASK 0x800
+#define HWEMUL_ENABLED 1
+/* The VMCS has been set for the vmx preemption timer */
+#define HWEMUL_INJECTED 2
+
struct kvm_timer {
struct hrtimer timer;
s64 period; /* unit: ns */
@@ -20,6 +24,7 @@ struct kvm_timer {
u64 tscdeadline;
u64 expired_tscdeadline;
atomic_t pending; /* accumulated triggered timers */
+ int hw_emulation;
};
struct kvm_lapic {
@@ -212,4 +217,9 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu);
int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
const unsigned long *bitmap, u32 bitmap_size);
+void switch_to_sw_lapic_timer(struct kvm_vcpu *vcpu);
+void switch_to_hw_lapic_timer(struct kvm_vcpu *vcpu);
+int check_apic_hwemul_timer(struct kvm_vcpu *vcpu);
+int inject_expired_hwemul_timer(struct kvm_vcpu *vcpu);
+int inject_pending_hwemul_timer(struct kvm_vcpu *vcpu);
#endif
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5475a7699ee5..f3659ab45b30 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7572,6 +7572,23 @@ static int handle_pcommit(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (apic->lapic_timer.hw_emulation != HWEMUL_INJECTED)
+ printk(KERN_WARNING "Preemption timer w/o hwemulation\n");
+
+ if (!atomic_read(&apic->lapic_timer.pending)) {
+ atomic_inc(&apic->lapic_timer.pending);
+ kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+ }
+
+ apic->lapic_timer.hw_emulation = 0;
+ vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_VMX_PREEMPTION_TIMER);
+ return 1;
+}
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -7623,6 +7640,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_XRSTORS] = handle_xrstors,
[EXIT_REASON_PML_FULL] = handle_pml_full,
[EXIT_REASON_PCOMMIT] = handle_pcommit,
+ [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
};
static const int kvm_vmx_max_exit_handlers =
@@ -8674,6 +8692,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
+ inject_pending_hwemul_timer(vcpu);
+
if (vmx->guest_pkru_valid)
__write_pkru(vmx->guest_pkru);
@@ -10693,10 +10713,16 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
if (ple_gap)
shrink_ple_window(vcpu);
+ if (vmx_hwemul_timer(vcpu))
+ switch_to_hw_lapic_timer(vcpu);
}
static void vmx_sched_out(struct kvm_vcpu *vcpu)
{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (apic->lapic_timer.hw_emulation)
+ switch_to_sw_lapic_timer(vcpu);
}
static void vmx_slot_enable_log_dirty(struct kvm *kvm,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5776473be362..a613bcfda59a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6608,6 +6608,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
local_irq_disable();
+ inject_expired_hwemul_timer(vcpu);
+
if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
|| need_resched() || signal_pending(current)) {
vcpu->mode = OUTSIDE_GUEST_MODE;
@@ -6773,6 +6775,10 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
break;
clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
+
+ /* Inject the hwemul timer if expired to avoid one VMExit */
+ inject_expired_hwemul_timer(vcpu);
+
if (kvm_cpu_has_pending_timer(vcpu))
kvm_inject_pending_timer_irqs(vcpu);
--
1.8.3.1
next prev parent reply other threads:[~2016-05-20 1:53 UTC|newest]
Thread overview: 29+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-05-20 1:44 [RFC PATCH 0/5] Utilizing VMX preemption for timer virtualization Yunhong Jiang
2016-05-20 1:44 ` [RFC PATCH 1/5] Add the kvm sched_out hook Yunhong Jiang
2016-05-20 1:45 ` [RFC PATCH 2/5] Utilize the vmx preemption timer Yunhong Jiang
2016-05-20 9:45 ` Paolo Bonzini
2016-05-20 1:45 ` [RFC PATCH 3/5] Separate the start_sw_tscdeadline Yunhong Jiang
2016-05-20 10:16 ` Paolo Bonzini
2016-05-20 1:45 ` Yunhong Jiang [this message]
2016-05-20 10:34 ` [RFC PATCH 4/5] Utilize the vmx preemption timer for tsc deadline timer Paolo Bonzini
2016-05-20 22:06 ` Jiang, Yunhong
2016-05-21 12:38 ` Paolo Bonzini
2016-05-22 0:21 ` Wanpeng Li
2016-05-23 22:58 ` yunhong jiang
2016-05-24 0:53 ` Wanpeng Li
2016-05-24 0:55 ` yunhong jiang
2016-05-24 1:16 ` Wanpeng Li
2016-05-24 1:20 ` yunhong jiang
2016-05-24 1:32 ` Wanpeng Li
2016-05-20 1:45 ` [RFC PATCH 5/5] Adding trace for the hwemul_timer Yunhong Jiang
2016-05-20 10:28 ` Paolo Bonzini
2016-05-20 6:03 ` [RFC PATCH 0/5] Utilizing VMX preemption for timer virtualization Jan Kiszka
2016-05-20 9:41 ` Paolo Bonzini
2016-05-20 21:50 ` Jiang, Yunhong
2016-05-20 18:18 ` Marcelo Tosatti
2016-05-20 18:21 ` Marcelo Tosatti
2016-05-20 20:49 ` Paolo Bonzini
2016-05-20 22:27 ` Jiang, Yunhong
2016-05-20 23:53 ` yunhong jiang
2016-05-20 22:18 ` Jiang, Yunhong
2016-05-21 0:45 ` Marcelo Tosatti
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1463708703-19208-5-git-send-email-yunhong.jiang@linux.intel.com \
--to=yunhong.jiang@linux.intel.com \
--cc=kvm@vger.kernel.org \
--cc=pbonzini@redhat.com \
--cc=rkrcmar@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox