From: Yunhong Jiang <yunhong.jiang@linux.intel.com>
To: kvm@vger.kernel.org
Cc: mtosatti@redhat.com, rkrcmar@redhat.com, pbonzini@redhat.com,
kernellwp@gmail.com
Subject: [RFC PATCH V2 4/4] Utilize the vmx preemption timer for tsc deadline timer
Date: Tue, 24 May 2016 15:27:32 -0700
Message-ID: <1464128852-14138-5-git-send-email-yunhong.jiang@linux.intel.com>
In-Reply-To: <1464128852-14138-1-git-send-email-yunhong.jiang@linux.intel.com>
From: Yunhong Jiang <yunhong.jiang@gmail.com>
Utilize the VMX preemption timer for TSC deadline timer
virtualization. The VMX preemption timer is armed while the vCPU is
running, and a VM exit happens if the virtual TSC deadline timer
expires.

When the vCPU thread is scheduled out, TSC deadline timer
virtualization is switched back to the current solution, i.e. hrtimer
emulation, and is switched to the VMX preemption timer again when the
vCPU thread is scheduled in.

This solution avoids both the complexity of the OS hrtimer system and
the cost of host timer interrupt handling, replacing them with a single
preemption-timer VM exit. It fits well for NFV-style scenarios in which
a vCPU is bound to a pCPU and the pCPU is isolated, and for similar
setups.

However, it may have a negative impact if the vCPU thread is scheduled
in and out very frequently, because each reschedule switches between
the hrtimer emulation and the preemption timer.
Signed-off-by: Yunhong Jiang <yunhong.jiang@intel.com>
---
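A quick summary for reviewers (a sketch of the intended flow, not
normative documentation): the hv_timer_state field added below moves
through a small state machine. The state names are the ones introduced
by this patch.

    HV_TIMER_NOT_USED
        -> HV_TIMER_NEEDS_ARMING   when start_apic_timer() runs in TSC
                                   deadline mode, or on sched-in via
                                   switch_to_hv_lapic_timer()
    HV_TIMER_NEEDS_ARMING
        -> HV_TIMER_ARMED          on VM entry, via
                                   kvm_lapic_arm_hv_timer()
    HV_TIMER_ARMED
        -> HV_TIMER_NOT_USED       on a preemption-timer VM exit
                                   (kvm_lapic_expired_hv_timer()), or on
                                   sched-out via switch_to_sw_lapic_timer()
                                   (which also clears a pending
                                   HV_TIMER_NEEDS_ARMING)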
arch/x86/kvm/lapic.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++--
arch/x86/kvm/lapic.h | 11 ++++++
arch/x86/kvm/trace.h | 22 ++++++++++++
arch/x86/kvm/vmx.c | 8 +++++
arch/x86/kvm/x86.c | 4 +++
5 files changed, 139 insertions(+), 3 deletions(-)
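Also for reviewers, a minimal sketch of how the guest-TSC delta handed
to kvm_x86_ops->set_hv_timer (the hook introduced earlier in this
series, patch 2/4) maps onto the hardware preemption-timer count. Per
the SDM, the VMX preemption timer counts down at the TSC rate divided
by 2^rate, where rate is bits 4:0 of MSR_IA32_VMX_MISC.
example_set_hv_timer() and its rate parameter are illustrative names
only, not identifiers used by this series.

    static void example_set_hv_timer(u64 delta_tsc, unsigned int rate)
    {
            /*
             * TSC ticks -> preemption-timer ticks.  The truncation can
             * make the timer fire up to 2^rate - 1 TSC ticks early; a
             * real implementation must absorb that, e.g. the way
             * wait_lapic_expire() busy-waits out early expirations.
             */
            u64 count = delta_tsc >> rate;

            /*
             * The VMCS field is 32 bits wide; clamp very long
             * deadlines and let the timer be re-armed on a later
             * VM entry.
             */
            vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
                         (u32) min_t(u64, count, U32_MAX));
    }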
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index f1cf8a5ede11..93679db7ce0f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1313,7 +1313,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
__delay(tsc_deadline - guest_tsc);
}
-static void start_sw_tscdeadline(struct kvm_lapic *apic)
+static void start_sw_tscdeadline(struct kvm_lapic *apic, int no_expire)
{
u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
u64 ns = 0;
@@ -1330,7 +1330,7 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
now = apic->lapic_timer.timer.base->get_time();
guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
- if (likely(tscdeadline > guest_tsc)) {
+ if (no_expire || likely(tscdeadline > guest_tsc)) {
ns = (tscdeadline - guest_tsc) * 1000000ULL;
do_div(ns, this_tsc_khz);
expire = ktime_add_ns(now, ns);
@@ -1343,6 +1343,85 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
local_irq_restore(flags);
}
+void kvm_lapic_arm_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u64 tscdeadline, guest_tsc;
+
+ if (apic->lapic_timer.hv_timer_state == HV_TIMER_NOT_USED)
+ return;
+
+ tscdeadline = apic->lapic_timer.tscdeadline;
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+
+ if (tscdeadline >= guest_tsc)
+ kvm_x86_ops->set_hv_timer(vcpu, tscdeadline - guest_tsc);
+ else
+ kvm_x86_ops->set_hv_timer(vcpu, 0);
+
+ apic->lapic_timer.hv_timer_state = HV_TIMER_ARMED;
+ trace_kvm_hv_timer_state(vcpu->vcpu_id,
+ apic->lapic_timer.hv_timer_state);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_arm_hv_timer);
+
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ WARN_ON(apic->lapic_timer.hv_timer_state != HV_TIMER_ARMED);
+ WARN_ON(swait_active(&vcpu->wq));
+ kvm_x86_ops->cancel_hv_timer(vcpu);
+ apic->lapic_timer.hv_timer_state = HV_TIMER_NOT_USED;
+ apic_timer_expired(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
+
+void switch_to_hv_lapic_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ WARN_ON(apic->lapic_timer.hv_timer_state != HV_TIMER_NOT_USED);
+
+ if (apic_lvtt_tscdeadline(apic) &&
+ !atomic_read(&apic->lapic_timer.pending)) {
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ /* In case the timer fired in the small window above */
+ if (!atomic_read(&apic->lapic_timer.pending)) {
+ apic->lapic_timer.hv_timer_state =
+ HV_TIMER_NEEDS_ARMING;
+ trace_kvm_hv_timer_state(vcpu->vcpu_id,
+ apic->lapic_timer.hv_timer_state);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(switch_to_hv_lapic_timer);
+
+void switch_to_sw_lapic_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ /* Possibly the TSC deadline timer is not enabled yet */
+ if (apic->lapic_timer.hv_timer_state == HV_TIMER_NOT_USED)
+ return;
+
+ if (apic->lapic_timer.hv_timer_state == HV_TIMER_ARMED)
+ kvm_x86_ops->cancel_hv_timer(vcpu);
+ apic->lapic_timer.hv_timer_state = HV_TIMER_NOT_USED;
+
+ if (atomic_read(&apic->lapic_timer.pending))
+ return;
+
+ /*
+ * Don't call apic_timer_expired() here: the swake_up() from
+ * apic_timer_expired() would try to take the run queue lock,
+ * which is already held since we are in the context switch
+ * path, and that would deadlock.
+ */
+ start_sw_tscdeadline(apic, 1);
+}
+EXPORT_SYMBOL_GPL(switch_to_sw_lapic_timer);
+
static void start_apic_timer(struct kvm_lapic *apic)
{
ktime_t now;
@@ -1389,7 +1468,19 @@ static void start_apic_timer(struct kvm_lapic *apic)
ktime_to_ns(ktime_add_ns(now,
apic->lapic_timer.period)));
} else if (apic_lvtt_tscdeadline(apic)) {
- start_sw_tscdeadline(apic);
+ /* lapic timer in tsc deadline mode */
+ if (kvm_x86_ops->set_hv_timer) {
+ if (unlikely(!apic->lapic_timer.tscdeadline ||
+ !apic->vcpu->arch.virtual_tsc_khz))
+ return;
+
+ /* Expired timer will be checked on vcpu_run() */
+ apic->lapic_timer.hv_timer_state =
+ HV_TIMER_NEEDS_ARMING;
+ trace_kvm_hv_timer_state(apic->vcpu->vcpu_id,
+ apic->lapic_timer.hv_timer_state);
+ } else
+ start_sw_tscdeadline(apic, 0);
}
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 891c6da7d4aa..dc4fd8eea04d 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -12,6 +12,12 @@
#define KVM_APIC_SHORT_MASK 0xc0000
#define KVM_APIC_DEST_MASK 0x800
+enum {
+ HV_TIMER_NOT_USED,
+ HV_TIMER_NEEDS_ARMING,
+ HV_TIMER_ARMED,
+};
+
struct kvm_timer {
struct hrtimer timer;
s64 period; /* unit: ns */
@@ -20,6 +26,7 @@ struct kvm_timer {
u64 tscdeadline;
u64 expired_tscdeadline;
atomic_t pending; /* accumulated triggered timers */
+ int hv_timer_state;
};
struct kvm_lapic {
@@ -212,4 +219,8 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu);
int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
const unsigned long *bitmap, u32 bitmap_size);
+void switch_to_sw_lapic_timer(struct kvm_vcpu *vcpu);
+void switch_to_hv_lapic_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_arm_hv_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
#endif
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 8de925031b5c..8e1b5e9c2e78 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -6,6 +6,7 @@
#include <asm/svm.h>
#include <asm/clocksource.h>
#include <asm/pvclock-abi.h>
+#include <lapic.h>
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvm
@@ -1348,6 +1349,27 @@ TRACE_EVENT(kvm_avic_unaccelerated_access,
__entry->vec)
);
+#define kvm_trace_symbol_hv_timer \
+ {HV_TIMER_NOT_USED, "no"}, \
+ {HV_TIMER_NEEDS_ARMING, "need_arming"}, \
+ {HV_TIMER_ARMED, "armed"}
+
+TRACE_EVENT(kvm_hv_timer_state,
+ TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_state),
+ TP_ARGS(vcpu_id, hv_timer_state),
+ TP_STRUCT__entry(
+ __field(unsigned int, vcpu_id)
+ __field(unsigned int, hv_timer_state)
+ ),
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->hv_timer_state = hv_timer_state;
+ ),
+ TP_printk("vcpu_id %x hv_timer %s\n",
+ __entry->vcpu_id,
+ __print_symbolic(__entry->hv_timer_state,
+ kvm_trace_symbol_hv_timer))
+ );
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2b29afa61715..dc0b8cae02b8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7576,6 +7576,11 @@ static int handle_pcommit(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+{
+ kvm_lapic_expired_hv_timer(vcpu);
+ return 1;
+}
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -7627,6 +7632,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_XRSTORS] = handle_xrstors,
[EXIT_REASON_PML_FULL] = handle_pml_full,
[EXIT_REASON_PCOMMIT] = handle_pcommit,
+ [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
};
static const int kvm_vmx_max_exit_handlers =
@@ -8678,6 +8684,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
+ kvm_lapic_arm_hv_timer(vcpu);
+
if (vmx->guest_pkru_valid)
__write_pkru(vmx->guest_pkru);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 269d576520ba..f4b50608568a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7724,11 +7724,15 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
+ if (kvm_x86_ops->set_hv_timer)
+ switch_to_hv_lapic_timer(vcpu);
kvm_x86_ops->sched_in(vcpu, cpu);
}
void kvm_arch_sched_out(struct kvm_vcpu *vcpu)
{
+ if (kvm_x86_ops->set_hv_timer)
+ switch_to_sw_lapic_timer(vcpu);
}
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
--
1.8.3.1