* [patch 2/2] KVM: x86: add option to advance tscdeadline hrtimer expiration
2014-12-10 16:53 [patch 0/2] KVM: " Marcelo.Tosatti
@ 2014-12-10 16:53 ` Marcelo.Tosatti
2014-12-10 17:08 ` Paolo Bonzini
0 siblings, 1 reply; 21+ messages in thread
From: Marcelo.Tosatti @ 2014-12-10 16:53 UTC (permalink / raw)
To: kvm
Cc: Luiz Capitulino, Rik van Riel, Paolo Bonzini, Radim Krcmar,
Marcelo Tosatti
[-- Attachment #1: lapic-timer-advance --]
[-- Type: text/plain, Size: 7031 bytes --]
For the hrtimer which emulates the tscdeadline timer in the guest,
add an option to advance expiration, and busy spin on VM-entry waiting
for the actual expiration time to elapse.
This allows achieving low latencies in cyclictest (or any scenario
which requires strict timing regarding timer expiration).
Reduces cyclictest avg latency by 50%.
Note: this option requires tuning to find the appropriate value
for a particular hardware/guest combination. One method is to measure the
average delay between apic_timer_fn and VM-entry.
Another method is to start with 1000ns, and increase the value
in say 500ns increments until avg cyclictest numbers stop decreasing.
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Index: kvm/arch/x86/kvm/lapic.c
===================================================================
--- kvm.orig/arch/x86/kvm/lapic.c
+++ kvm/arch/x86/kvm/lapic.c
@@ -33,6 +33,7 @@
#include <asm/page.h>
#include <asm/current.h>
#include <asm/apicdef.h>
+#include <asm/delay.h>
#include <linux/atomic.h>
#include <linux/jump_label.h>
#include "kvm_cache_regs.h"
@@ -1073,6 +1074,7 @@ static void apic_timer_expired(struct kv
{
struct kvm_vcpu *vcpu = apic->vcpu;
wait_queue_head_t *q = &vcpu->wq;
+ struct kvm_timer *ktimer = &apic->lapic_timer;
/*
* Note: KVM_REQ_PENDING_TIMER is implicitly checked in
@@ -1087,11 +1089,59 @@ static void apic_timer_expired(struct kv
if (waitqueue_active(q))
wake_up_interruptible(q);
+
+ if (ktimer->timer_mode_mask == APIC_LVT_TIMER_TSCDEADLINE)
+ ktimer->expired_tscdeadline = ktimer->tscdeadline;
+}
+
+static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 reg = kvm_apic_get_reg(apic, APIC_LVTT);
+
+ if (kvm_apic_hw_enabled(apic)) {
+ int vec = reg & APIC_VECTOR_MASK;
+
+ if (kvm_x86_ops->test_posted_interrupt)
+ return kvm_x86_ops->test_posted_interrupt(vcpu, vec);
+ else {
+ if (apic_test_vector(vec, apic->regs + APIC_ISR))
+ return true;
+ }
+ }
+ return false;
+}
+
+void wait_lapic_expire(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u64 guest_tsc, tsc_deadline;
+
+ if (!kvm_vcpu_has_lapic(vcpu))
+ return;
+
+ if (!apic_lvtt_tscdeadline(apic))
+ return;
+
+ if (!lapic_timer_int_injected(vcpu))
+ return;
+
+ tsc_deadline = apic->lapic_timer.expired_tscdeadline;
+ guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+
+ while (guest_tsc < tsc_deadline) {
+ int delay = min(tsc_deadline - guest_tsc, 1000ULL);
+
+ ndelay(delay);
+ guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+ }
}
static void start_apic_timer(struct kvm_lapic *apic)
{
ktime_t now;
+ struct kvm_arch *kvm_arch = &apic->vcpu->kvm->arch;
+
atomic_set(&apic->lapic_timer.pending, 0);
if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
@@ -1137,6 +1187,7 @@ static void start_apic_timer(struct kvm_
/* lapic timer in tsc deadline mode */
u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
u64 ns = 0;
+ ktime_t expire;
struct kvm_vcpu *vcpu = apic->vcpu;
unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
unsigned long flags;
@@ -1149,10 +1200,14 @@ static void start_apic_timer(struct kvm_
now = apic->lapic_timer.timer.base->get_time();
guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
if (likely(tscdeadline > guest_tsc)) {
+ u32 advance = kvm_arch->lapic_tscdeadline_advance_ns;
+
ns = (tscdeadline - guest_tsc) * 1000000ULL;
do_div(ns, this_tsc_khz);
+ expire = ktime_add_ns(now, ns);
+ expire = ktime_sub_ns(expire, advance);
hrtimer_start(&apic->lapic_timer.timer,
- ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
+ expire, HRTIMER_MODE_ABS);
} else
apic_timer_expired(apic);
Index: kvm/arch/x86/kvm/lapic.h
===================================================================
--- kvm.orig/arch/x86/kvm/lapic.h
+++ kvm/arch/x86/kvm/lapic.h
@@ -14,6 +14,7 @@ struct kvm_timer {
u32 timer_mode;
u32 timer_mode_mask;
u64 tscdeadline;
+ u64 expired_tscdeadline;
atomic_t pending; /* accumulated triggered timers */
};
@@ -170,4 +171,6 @@ static inline bool kvm_apic_has_events(s
bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
+void wait_lapic_expire(struct kvm_vcpu *vcpu);
+
#endif
Index: kvm/arch/x86/kvm/x86.c
===================================================================
--- kvm.orig/arch/x86/kvm/x86.c
+++ kvm/arch/x86/kvm/x86.c
@@ -2742,6 +2742,7 @@ int kvm_vm_ioctl_check_extension(struct
case KVM_CAP_ASSIGN_DEV_IRQ:
case KVM_CAP_PCI_2_3:
#endif
+ case KVM_CAP_TSCDEADLINE_ADVANCE:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -4079,6 +4080,22 @@ long kvm_arch_vm_ioctl(struct file *filp
r = 0;
break;
}
+ case KVM_SET_TSCDEADLINE_ADVANCE: {
+ struct kvm_tscdeadline_advance adv;
+
+ r = -EFAULT;
+ if (copy_from_user(&adv, argp, sizeof(adv)))
+ goto out;
+
+ /* cap at 50us to avoid spinning for too long */
+ r = -EINVAL;
+ if (adv.timer_advance > 50000)
+ goto out;
+
+ kvm->arch.lapic_tscdeadline_advance_ns = adv.timer_advance;
+ r = 0;
+ break;
+ }
default:
r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
@@ -6311,6 +6328,7 @@ static int vcpu_enter_guest(struct kvm_v
}
trace_kvm_entry(vcpu->vcpu_id);
+ wait_lapic_expire(vcpu);
kvm_x86_ops->run(vcpu);
/*
Index: kvm/arch/x86/include/asm/kvm_host.h
===================================================================
--- kvm.orig/arch/x86/include/asm/kvm_host.h
+++ kvm/arch/x86/include/asm/kvm_host.h
@@ -612,6 +612,8 @@ struct kvm_arch {
u64 hv_hypercall;
u64 hv_tsc_page;
+ u32 lapic_tscdeadline_advance_ns;
+
#ifdef CONFIG_KVM_MMU_AUDIT
int audit_point;
#endif
Index: kvm/arch/x86/include/uapi/asm/kvm.h
===================================================================
--- kvm.orig/arch/x86/include/uapi/asm/kvm.h
+++ kvm/arch/x86/include/uapi/asm/kvm.h
@@ -277,6 +277,11 @@ struct kvm_reinject_control {
__u8 reserved[31];
};
+struct kvm_tscdeadline_advance {
+ __u32 timer_advance;
+ __u32 reserved[3];
+};
+
/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */
#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001
#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002
Index: kvm/include/uapi/linux/kvm.h
===================================================================
--- kvm.orig/include/uapi/linux/kvm.h
+++ kvm/include/uapi/linux/kvm.h
@@ -753,6 +753,7 @@ struct kvm_ppc_smmu_info {
#define KVM_CAP_PPC_FIXUP_HCALL 103
#define KVM_CAP_PPC_ENABLE_HCALL 104
#define KVM_CAP_CHECK_EXTENSION_VM 105
+#define KVM_CAP_TSCDEADLINE_ADVANCE 106
#ifdef KVM_CAP_IRQ_ROUTING
@@ -1053,6 +1054,8 @@ struct kvm_s390_ucas_mapping {
#define KVM_GET_DEVICE_ATTR _IOW(KVMIO, 0xe2, struct kvm_device_attr)
#define KVM_HAS_DEVICE_ATTR _IOW(KVMIO, 0xe3, struct kvm_device_attr)
+#define KVM_SET_TSCDEADLINE_ADVANCE _IOW(KVMIO, 0xe4, struct kvm_tscdeadline_advance)
+
/*
* ioctls for vcpu fds
*/
^ permalink raw reply [flat|nested] 21+ messages in thread
* [patch 2/2] KVM: x86: add option to advance tscdeadline hrtimer expiration
2014-12-10 17:06 [patch 0/2] KVM: add option to advance tscdeadline hrtimer expiration (v2) Marcelo Tosatti
@ 2014-12-10 17:06 ` Marcelo Tosatti
2014-12-10 17:11 ` Rik van Riel
0 siblings, 1 reply; 21+ messages in thread
From: Marcelo Tosatti @ 2014-12-10 17:06 UTC (permalink / raw)
To: kvm
Cc: Luiz Capitulino, Rik van Riel, Paolo Bonzini, Radim Krcmar,
Marcelo Tosatti
[-- Attachment #1: lapic-timer-advance --]
[-- Type: text/plain, Size: 7031 bytes --]
For the hrtimer which emulates the tscdeadline timer in the guest,
add an option to advance expiration, and busy spin on VM-entry waiting
for the actual expiration time to elapse.
This allows achieving low latencies in cyclictest (or any scenario
which requires strict timing regarding timer expiration).
Reduces cyclictest avg latency by 50%.
Note: this option requires tuning to find the appropriate value
for a particular hardware/guest combination. One method is to measure the
average delay between apic_timer_fn and VM-entry.
Another method is to start with 1000ns, and increase the value
in say 500ns increments until avg cyclictest numbers stop decreasing.
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Index: kvm/arch/x86/kvm/lapic.c
===================================================================
--- kvm.orig/arch/x86/kvm/lapic.c
+++ kvm/arch/x86/kvm/lapic.c
@@ -33,6 +33,7 @@
#include <asm/page.h>
#include <asm/current.h>
#include <asm/apicdef.h>
+#include <asm/delay.h>
#include <linux/atomic.h>
#include <linux/jump_label.h>
#include "kvm_cache_regs.h"
@@ -1073,6 +1074,7 @@ static void apic_timer_expired(struct kv
{
struct kvm_vcpu *vcpu = apic->vcpu;
wait_queue_head_t *q = &vcpu->wq;
+ struct kvm_timer *ktimer = &apic->lapic_timer;
/*
* Note: KVM_REQ_PENDING_TIMER is implicitly checked in
@@ -1087,11 +1089,59 @@ static void apic_timer_expired(struct kv
if (waitqueue_active(q))
wake_up_interruptible(q);
+
+ if (ktimer->timer_mode_mask == APIC_LVT_TIMER_TSCDEADLINE)
+ ktimer->expired_tscdeadline = ktimer->tscdeadline;
+}
+
+static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 reg = kvm_apic_get_reg(apic, APIC_LVTT);
+
+ if (kvm_apic_hw_enabled(apic)) {
+ int vec = reg & APIC_VECTOR_MASK;
+
+ if (kvm_x86_ops->test_posted_interrupt)
+ return kvm_x86_ops->test_posted_interrupt(vcpu, vec);
+ else {
+ if (apic_test_vector(vec, apic->regs + APIC_ISR))
+ return true;
+ }
+ }
+ return false;
+}
+
+void wait_lapic_expire(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u64 guest_tsc, tsc_deadline;
+
+ if (!kvm_vcpu_has_lapic(vcpu))
+ return;
+
+ if (!apic_lvtt_tscdeadline(apic))
+ return;
+
+ if (!lapic_timer_int_injected(vcpu))
+ return;
+
+ tsc_deadline = apic->lapic_timer.expired_tscdeadline;
+ guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+
+ while (guest_tsc < tsc_deadline) {
+ int delay = min(tsc_deadline - guest_tsc, 1000ULL);
+
+ ndelay(delay);
+ guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+ }
}
static void start_apic_timer(struct kvm_lapic *apic)
{
ktime_t now;
+ struct kvm_arch *kvm_arch = &apic->vcpu->kvm->arch;
+
atomic_set(&apic->lapic_timer.pending, 0);
if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
@@ -1137,6 +1187,7 @@ static void start_apic_timer(struct kvm_
/* lapic timer in tsc deadline mode */
u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
u64 ns = 0;
+ ktime_t expire;
struct kvm_vcpu *vcpu = apic->vcpu;
unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
unsigned long flags;
@@ -1149,10 +1200,14 @@ static void start_apic_timer(struct kvm_
now = apic->lapic_timer.timer.base->get_time();
guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
if (likely(tscdeadline > guest_tsc)) {
+ u32 advance = kvm_arch->lapic_tscdeadline_advance_ns;
+
ns = (tscdeadline - guest_tsc) * 1000000ULL;
do_div(ns, this_tsc_khz);
+ expire = ktime_add_ns(now, ns);
+ expire = ktime_sub_ns(expire, advance);
hrtimer_start(&apic->lapic_timer.timer,
- ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
+ expire, HRTIMER_MODE_ABS);
} else
apic_timer_expired(apic);
Index: kvm/arch/x86/kvm/lapic.h
===================================================================
--- kvm.orig/arch/x86/kvm/lapic.h
+++ kvm/arch/x86/kvm/lapic.h
@@ -14,6 +14,7 @@ struct kvm_timer {
u32 timer_mode;
u32 timer_mode_mask;
u64 tscdeadline;
+ u64 expired_tscdeadline;
atomic_t pending; /* accumulated triggered timers */
};
@@ -170,4 +171,6 @@ static inline bool kvm_apic_has_events(s
bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
+void wait_lapic_expire(struct kvm_vcpu *vcpu);
+
#endif
Index: kvm/arch/x86/kvm/x86.c
===================================================================
--- kvm.orig/arch/x86/kvm/x86.c
+++ kvm/arch/x86/kvm/x86.c
@@ -2742,6 +2742,7 @@ int kvm_vm_ioctl_check_extension(struct
case KVM_CAP_ASSIGN_DEV_IRQ:
case KVM_CAP_PCI_2_3:
#endif
+ case KVM_CAP_TSCDEADLINE_ADVANCE:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -4079,6 +4080,22 @@ long kvm_arch_vm_ioctl(struct file *filp
r = 0;
break;
}
+ case KVM_SET_TSCDEADLINE_ADVANCE: {
+ struct kvm_tscdeadline_advance adv;
+
+ r = -EFAULT;
+ if (copy_from_user(&adv, argp, sizeof(adv)))
+ goto out;
+
+ /* cap at 50us to avoid spinning for too long */
+ r = -EINVAL;
+ if (adv.timer_advance > 50000)
+ goto out;
+
+ kvm->arch.lapic_tscdeadline_advance_ns = adv.timer_advance;
+ r = 0;
+ break;
+ }
default:
r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
@@ -6311,6 +6328,7 @@ static int vcpu_enter_guest(struct kvm_v
}
trace_kvm_entry(vcpu->vcpu_id);
+ wait_lapic_expire(vcpu);
kvm_x86_ops->run(vcpu);
/*
Index: kvm/arch/x86/include/asm/kvm_host.h
===================================================================
--- kvm.orig/arch/x86/include/asm/kvm_host.h
+++ kvm/arch/x86/include/asm/kvm_host.h
@@ -612,6 +612,8 @@ struct kvm_arch {
u64 hv_hypercall;
u64 hv_tsc_page;
+ u32 lapic_tscdeadline_advance_ns;
+
#ifdef CONFIG_KVM_MMU_AUDIT
int audit_point;
#endif
Index: kvm/arch/x86/include/uapi/asm/kvm.h
===================================================================
--- kvm.orig/arch/x86/include/uapi/asm/kvm.h
+++ kvm/arch/x86/include/uapi/asm/kvm.h
@@ -277,6 +277,11 @@ struct kvm_reinject_control {
__u8 reserved[31];
};
+struct kvm_tscdeadline_advance {
+ __u32 timer_advance;
+ __u32 reserved[3];
+};
+
/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */
#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001
#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002
Index: kvm/include/uapi/linux/kvm.h
===================================================================
--- kvm.orig/include/uapi/linux/kvm.h
+++ kvm/include/uapi/linux/kvm.h
@@ -753,6 +753,7 @@ struct kvm_ppc_smmu_info {
#define KVM_CAP_PPC_FIXUP_HCALL 103
#define KVM_CAP_PPC_ENABLE_HCALL 104
#define KVM_CAP_CHECK_EXTENSION_VM 105
+#define KVM_CAP_TSCDEADLINE_ADVANCE 106
#ifdef KVM_CAP_IRQ_ROUTING
@@ -1053,6 +1054,8 @@ struct kvm_s390_ucas_mapping {
#define KVM_GET_DEVICE_ATTR _IOW(KVMIO, 0xe2, struct kvm_device_attr)
#define KVM_HAS_DEVICE_ATTR _IOW(KVMIO, 0xe3, struct kvm_device_attr)
+#define KVM_SET_TSCDEADLINE_ADVANCE _IOW(KVMIO, 0xe4, struct kvm_tscdeadline_advance)
+
/*
* ioctls for vcpu fds
*/
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [patch 2/2] KVM: x86: add option to advance tscdeadline hrtimer expiration
2014-12-10 16:53 ` [patch 2/2] KVM: x86: " Marcelo.Tosatti
@ 2014-12-10 17:08 ` Paolo Bonzini
2014-12-10 17:34 ` Marcelo Tosatti
0 siblings, 1 reply; 21+ messages in thread
From: Paolo Bonzini @ 2014-12-10 17:08 UTC (permalink / raw)
To: Marcelo.Tosatti, kvm
Cc: Luiz Capitulino, Rik van Riel, Radim Krcmar, Marcelo Tosatti
On 10/12/2014 17:53, Marcelo.Tosatti@amt.cnet wrote:
> For the hrtimer which emulates the tscdeadline timer in the guest,
> add an option to advance expiration, and busy spin on VM-entry waiting
> for the actual expiration time to elapse.
>
> This allows achieving low latencies in cyclictest (or any scenario
> which requires strict timing regarding timer expiration).
>
> Reduces cyclictest avg latency by 50%.
>
> Note: this option requires tuning to find the appropriate value
> for a particular hardware/guest combination. One method is to measure the
> average delay between apic_timer_fn and VM-entry.
> Another method is to start with 1000ns, and increase the value
> in say 500ns increments until avg cyclictest numbers stop decreasing.
>
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
What is the latency value that you find in practice, for both
apic_timer_fn to vmentry? Or for apic_timer_fn to just before vmrun?
Let's start with a kvm-unit-tests patch to measure this value.
We can then decide whether to hardcode a small default value (e.g.
1000-3000) and make it a module parameter? Or perhaps start with a
higher value (twice what you find in practice?) and adjust it towards a
target every time wait_lapic_expire is called. But in order to judge
the correct approach, I need to see the numbers.
Paolo
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [patch 2/2] KVM: x86: add option to advance tscdeadline hrtimer expiration
2014-12-10 17:06 ` [patch 2/2] KVM: x86: add option to advance tscdeadline hrtimer expiration Marcelo Tosatti
@ 2014-12-10 17:11 ` Rik van Riel
0 siblings, 0 replies; 21+ messages in thread
From: Rik van Riel @ 2014-12-10 17:11 UTC (permalink / raw)
To: Marcelo Tosatti, kvm; +Cc: Luiz Capitulino, Paolo Bonzini, Radim Krcmar
On 12/10/2014 12:06 PM, Marcelo Tosatti wrote:
> For the hrtimer which emulates the tscdeadline timer in the guest,
> add an option to advance expiration, and busy spin on VM-entry waiting
> for the actual expiration time to elapse.
>
> This allows achieving low latencies in cyclictest (or any scenario
> which requires strict timing regarding timer expiration).
>
> Reduces cyclictest avg latency by 50%.
>
> Note: this option requires tuning to find the appropriate value
> for a particular hardware/guest combination. One method is to measure the
> average delay between apic_timer_fn and VM-entry.
> Another method is to start with 1000ns, and increase the value
> in say 500ns increments until avg cyclictest numbers stop decreasing.
It would be good to document how this is used, in the changelog.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [patch 2/2] KVM: x86: add option to advance tscdeadline hrtimer expiration
2014-12-10 17:08 ` Paolo Bonzini
@ 2014-12-10 17:34 ` Marcelo Tosatti
2014-12-10 17:53 ` Paolo Bonzini
0 siblings, 1 reply; 21+ messages in thread
From: Marcelo Tosatti @ 2014-12-10 17:34 UTC (permalink / raw)
To: Paolo Bonzini
Cc: Marcelo.Tosatti, kvm, Luiz Capitulino, Rik van Riel, Radim Krcmar
On Wed, Dec 10, 2014 at 06:08:14PM +0100, Paolo Bonzini wrote:
>
>
> On 10/12/2014 17:53, Marcelo.Tosatti@amt.cnet wrote:
> > For the hrtimer which emulates the tscdeadline timer in the guest,
> > add an option to advance expiration, and busy spin on VM-entry waiting
> > for the actual expiration time to elapse.
> >
> > This allows achieving low latencies in cyclictest (or any scenario
> > which requires strict timing regarding timer expiration).
> >
> > Reduces cyclictest avg latency by 50%.
> >
> > Note: this option requires tuning to find the appropriate value
> > for a particular hardware/guest combination. One method is to measure the
> > average delay between apic_timer_fn and VM-entry.
> > Another method is to start with 1000ns, and increase the value
> > in say 500ns increments until avg cyclictest numbers stop decreasing.
> >
> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>
> What is the latency value that you find in practice, for both
> apic_timer_fn to vmentry? Or for apic_timer_fn to just before vmrun?
7us between apic_timer_fn and kvm_entry tracepoint.
> Let's start with a kvm-unit-tests patch to measure this value.
I can, but kvm-unit-test register state will not be similar to
actual guest state (think host/guest state loading).
What is the advantage of using a kvm-unit-test test rather
than cyclictest in the guest ?
> We can then decide whether to hardcode a small default value (e.g.
> 1000-3000) and make it a module parameter? Or perhaps start with a
> higher value (twice what you find in practice?) and adjust it towards a
> target every time wait_lapic_expire is called. But in order to judge
> the correct approach, I need to see the numbers.
Problem with automatic adjustment is: what is the correct target?
You want faster instances of apic_timer_fn->vm-entry to spin a bit,
and allow slow instances of apic_timer_fn->vm-entry to have
an effective advancement.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [patch 2/2] KVM: x86: add option to advance tscdeadline hrtimer expiration
2014-12-10 17:34 ` Marcelo Tosatti
@ 2014-12-10 17:53 ` Paolo Bonzini
0 siblings, 0 replies; 21+ messages in thread
From: Paolo Bonzini @ 2014-12-10 17:53 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: kvm, Luiz Capitulino, Rik van Riel, Radim Krcmar
On 10/12/2014 18:34, Marcelo Tosatti wrote:
>> > Let's start with a kvm-unit-tests patch to measure this value.
> I can, but kvm-unit-test register state will not be similar to
> actual guest state (think host/guest state loading).
7us is about 20000 clock cycles. A lightweight vmexit is an order of
magnitude less expensive, and half of the vmexit overhead is the VMRUN
instruction itself. All in all, the host/guest state loading should not
matter (or should matter little).
> What is the advantage of using a kvm-unit-test test rather
> than cyclictest in the guest ?
That it starts in <3 seconds, and that you can vary the timer frequency
in order to measure jitter in addition to latency.
>> We can then decide whether to hardcode a small default value (e.g.
>> 1000-3000) and make it a module parameter? Or perhaps start with a
>> higher value (twice what you find in practice?) and adjust it towards a
>> target every time wait_lapic_expire is called. But in order to judge
>> the correct approach, I need to see the numbers.
>
> Problem with automatic adjustment is: what is the correct target?
We cannot say without seeing the numbers, particularly the jitter. This
is why I want to see numbers for varying frequencies (from 100us to 10ms
per tick, say).
> You want faster instances of apic_timer_fn->vm-entry to spin a bit,
> and allow slow instances of apic_timer_fn->vm-entry to have
> an effective advancement.
If it is small enough, you can make the timer a little "early" (increase
advance) by a small amount on every delivered interrupt. This prepares
for a slow instance.
And you can make the timer less "early" (decrease advance) by some
percentage of what you had to wait on every wait_lapic_expire, if you
had to wait more than a given threshold. This avoids that you wait too
much on consecutive fast instances.
Paolo
^ permalink raw reply [flat|nested] 21+ messages in thread
* [patch 0/2] KVM: add option to advance tscdeadline hrtimer expiration (v3)
@ 2014-12-10 20:57 Marcelo Tosatti
2014-12-10 20:57 ` [patch 1/2] KVM: x86: add method to test PIR bitmap vector Marcelo Tosatti
2014-12-10 20:57 ` [patch 2/2] KVM: x86: add option to advance tscdeadline hrtimer expiration Marcelo Tosatti
0 siblings, 2 replies; 21+ messages in thread
From: Marcelo Tosatti @ 2014-12-10 20:57 UTC (permalink / raw)
To: kvm; +Cc: Luiz Capitulino, Rik van Riel, Paolo Bonzini, Radim Krcmar
See patches for details.
v2:
- fix email address.
v3:
- use module parameter for configuration of value (Paolo/Radim)
^ permalink raw reply [flat|nested] 21+ messages in thread
* [patch 1/2] KVM: x86: add method to test PIR bitmap vector
2014-12-10 20:57 [patch 0/2] KVM: add option to advance tscdeadline hrtimer expiration (v3) Marcelo Tosatti
@ 2014-12-10 20:57 ` Marcelo Tosatti
2014-12-10 20:57 ` [patch 2/2] KVM: x86: add option to advance tscdeadline hrtimer expiration Marcelo Tosatti
1 sibling, 0 replies; 21+ messages in thread
From: Marcelo Tosatti @ 2014-12-10 20:57 UTC (permalink / raw)
To: kvm
Cc: Luiz Capitulino, Rik van Riel, Paolo Bonzini, Radim Krcmar,
Marcelo Tosatti
[-- Attachment #1: add-test-pir --]
[-- Type: text/plain, Size: 2342 bytes --]
kvm_x86_ops->test_posted_interrupt() returns true/false depending
whether 'vector' is set.
Next patch makes use of this interface.
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Index: kvm/arch/x86/include/asm/kvm_host.h
===================================================================
--- kvm.orig/arch/x86/include/asm/kvm_host.h
+++ kvm/arch/x86/include/asm/kvm_host.h
@@ -743,6 +743,7 @@ struct kvm_x86_ops {
void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
+ bool (*test_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*get_tdp_level)(void);
Index: kvm/arch/x86/kvm/vmx.c
===================================================================
--- kvm.orig/arch/x86/kvm/vmx.c
+++ kvm/arch/x86/kvm/vmx.c
@@ -435,6 +435,11 @@ static int pi_test_and_set_pir(int vecto
return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
}
+static int pi_test_pir(int vector, struct pi_desc *pi_desc)
+{
+ return test_bit(vector, (unsigned long *)pi_desc->pir);
+}
+
struct vcpu_vmx {
struct kvm_vcpu vcpu;
unsigned long host_rsp;
@@ -5939,6 +5944,7 @@ static __init int hardware_setup(void)
else {
kvm_x86_ops->hwapic_irr_update = NULL;
kvm_x86_ops->deliver_posted_interrupt = NULL;
+ kvm_x86_ops->test_posted_interrupt = NULL;
kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
}
@@ -6960,6 +6966,13 @@ static int handle_invvpid(struct kvm_vcp
return 1;
}
+static bool vmx_test_pir(struct kvm_vcpu *vcpu, int vector)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ return pi_test_pir(vector, &vmx->pi_desc);
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -9374,6 +9387,7 @@ static struct kvm_x86_ops vmx_x86_ops =
.hwapic_isr_update = vmx_hwapic_isr_update,
.sync_pir_to_irr = vmx_sync_pir_to_irr,
.deliver_posted_interrupt = vmx_deliver_posted_interrupt,
+ .test_posted_interrupt = vmx_test_pir,
.set_tss_addr = vmx_set_tss_addr,
.get_tdp_level = get_ept_level,
^ permalink raw reply [flat|nested] 21+ messages in thread
* [patch 2/2] KVM: x86: add option to advance tscdeadline hrtimer expiration
2014-12-10 20:57 [patch 0/2] KVM: add option to advance tscdeadline hrtimer expiration (v3) Marcelo Tosatti
2014-12-10 20:57 ` [patch 1/2] KVM: x86: add method to test PIR bitmap vector Marcelo Tosatti
@ 2014-12-10 20:57 ` Marcelo Tosatti
2014-12-10 23:37 ` Paolo Bonzini
2014-12-12 18:35 ` Radim Krcmar
1 sibling, 2 replies; 21+ messages in thread
From: Marcelo Tosatti @ 2014-12-10 20:57 UTC (permalink / raw)
To: kvm
Cc: Luiz Capitulino, Rik van Riel, Paolo Bonzini, Radim Krcmar,
Marcelo Tosatti
[-- Attachment #1: lapic-timer-advance --]
[-- Type: text/plain, Size: 5206 bytes --]
For the hrtimer which emulates the tscdeadline timer in the guest,
add an option to advance expiration, and busy spin on VM-entry waiting
for the actual expiration time to elapse.
This allows achieving low latencies in cyclictest (or any scenario
which requires strict timing regarding timer expiration).
Reduces cyclictest avg latency by 50%.
Note: this option requires tuning to find the appropriate value
for a particular hardware/guest combination. One method is to measure the
average delay between apic_timer_fn and VM-entry.
Another method is to start with 1000ns, and increase the value
in say 500ns increments until avg cyclictest numbers stop decreasing.
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Index: kvm/arch/x86/kvm/lapic.c
===================================================================
--- kvm.orig/arch/x86/kvm/lapic.c
+++ kvm/arch/x86/kvm/lapic.c
@@ -33,6 +33,7 @@
#include <asm/page.h>
#include <asm/current.h>
#include <asm/apicdef.h>
+#include <asm/delay.h>
#include <linux/atomic.h>
#include <linux/jump_label.h>
#include "kvm_cache_regs.h"
@@ -1073,6 +1074,7 @@ static void apic_timer_expired(struct kv
{
struct kvm_vcpu *vcpu = apic->vcpu;
wait_queue_head_t *q = &vcpu->wq;
+ struct kvm_timer *ktimer = &apic->lapic_timer;
/*
* Note: KVM_REQ_PENDING_TIMER is implicitly checked in
@@ -1087,11 +1089,58 @@ static void apic_timer_expired(struct kv
if (waitqueue_active(q))
wake_up_interruptible(q);
+
+ if (ktimer->timer_mode_mask == APIC_LVT_TIMER_TSCDEADLINE)
+ ktimer->expired_tscdeadline = ktimer->tscdeadline;
+}
+
+static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 reg = kvm_apic_get_reg(apic, APIC_LVTT);
+
+ if (kvm_apic_hw_enabled(apic)) {
+ int vec = reg & APIC_VECTOR_MASK;
+
+ if (kvm_x86_ops->test_posted_interrupt)
+ return kvm_x86_ops->test_posted_interrupt(vcpu, vec);
+ else {
+ if (apic_test_vector(vec, apic->regs + APIC_ISR))
+ return true;
+ }
+ }
+ return false;
+}
+
+void wait_lapic_expire(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u64 guest_tsc, tsc_deadline;
+
+ if (!kvm_vcpu_has_lapic(vcpu))
+ return;
+
+ if (!apic_lvtt_tscdeadline(apic))
+ return;
+
+ if (!lapic_timer_int_injected(vcpu))
+ return;
+
+ tsc_deadline = apic->lapic_timer.expired_tscdeadline;
+ guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+
+ while (guest_tsc < tsc_deadline) {
+ int delay = min(tsc_deadline - guest_tsc, 1000ULL);
+
+ ndelay(delay);
+ guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+ }
}
static void start_apic_timer(struct kvm_lapic *apic)
{
ktime_t now;
+
atomic_set(&apic->lapic_timer.pending, 0);
if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
@@ -1137,6 +1186,7 @@ static void start_apic_timer(struct kvm_
/* lapic timer in tsc deadline mode */
u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
u64 ns = 0;
+ ktime_t expire;
struct kvm_vcpu *vcpu = apic->vcpu;
unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
unsigned long flags;
@@ -1151,8 +1201,10 @@ static void start_apic_timer(struct kvm_
if (likely(tscdeadline > guest_tsc)) {
ns = (tscdeadline - guest_tsc) * 1000000ULL;
do_div(ns, this_tsc_khz);
+ expire = ktime_add_ns(now, ns);
+ expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
hrtimer_start(&apic->lapic_timer.timer,
- ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
+ expire, HRTIMER_MODE_ABS);
} else
apic_timer_expired(apic);
Index: kvm/arch/x86/kvm/lapic.h
===================================================================
--- kvm.orig/arch/x86/kvm/lapic.h
+++ kvm/arch/x86/kvm/lapic.h
@@ -14,6 +14,7 @@ struct kvm_timer {
u32 timer_mode;
u32 timer_mode_mask;
u64 tscdeadline;
+ u64 expired_tscdeadline;
atomic_t pending; /* accumulated triggered timers */
};
@@ -170,4 +171,6 @@ static inline bool kvm_apic_has_events(s
bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
+void wait_lapic_expire(struct kvm_vcpu *vcpu);
+
#endif
Index: kvm/arch/x86/kvm/x86.c
===================================================================
--- kvm.orig/arch/x86/kvm/x86.c
+++ kvm/arch/x86/kvm/x86.c
@@ -108,6 +108,10 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz)
static u32 tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
+/* lapic timer advance (tscdeadline mode only) in nanoseconds */
+unsigned int lapic_timer_advance_ns = 0;
+module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
+
static bool backwards_tsc_observed = false;
#define KVM_NR_SHARED_MSRS 16
@@ -6311,6 +6315,7 @@ static int vcpu_enter_guest(struct kvm_v
}
trace_kvm_entry(vcpu->vcpu_id);
+ wait_lapic_expire(vcpu);
kvm_x86_ops->run(vcpu);
/*
Index: kvm/arch/x86/kvm/x86.h
===================================================================
--- kvm.orig/arch/x86/kvm/x86.h
+++ kvm/arch/x86/kvm/x86.h
@@ -170,5 +170,7 @@ extern u64 kvm_supported_xcr0(void);
extern unsigned int min_timer_period_us;
+extern unsigned int lapic_timer_advance_ns;
+
extern struct static_key kvm_no_apic_vcpu;
#endif
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [patch 2/2] KVM: x86: add option to advance tscdeadline hrtimer expiration
2014-12-10 20:57 ` [patch 2/2] KVM: x86: add option to advance tscdeadline hrtimer expiration Marcelo Tosatti
@ 2014-12-10 23:37 ` Paolo Bonzini
2014-12-11 3:07 ` Marcelo Tosatti
2014-12-12 18:35 ` Radim Krcmar
1 sibling, 1 reply; 21+ messages in thread
From: Paolo Bonzini @ 2014-12-10 23:37 UTC (permalink / raw)
To: Marcelo Tosatti, kvm; +Cc: Luiz Capitulino, Rik van Riel, Radim Krcmar
On 10/12/2014 21:57, Marcelo Tosatti wrote:
> For the hrtimer which emulates the tscdeadline timer in the guest,
> add an option to advance expiration, and busy spin on VM-entry waiting
> for the actual expiration time to elapse.
>
> This allows achieving low latencies in cyclictest (or any scenario
> which requires strict timing regarding timer expiration).
>
> Reduces cyclictest avg latency by 50%.
>
> Note: this option requires tuning to find the appropriate value
> for a particular hardware/guest combination. One method is to measure the
> average delay between apic_timer_fn and VM-entry.
> Another method is to start with 1000ns, and increase the value
> in say 500ns increments until avg cyclictest numbers stop decreasing.
What values are you using in practice for the parameter?
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>
> Index: kvm/arch/x86/kvm/lapic.c
> ===================================================================
> --- kvm.orig/arch/x86/kvm/lapic.c
> +++ kvm/arch/x86/kvm/lapic.c
> @@ -33,6 +33,7 @@
> #include <asm/page.h>
> #include <asm/current.h>
> #include <asm/apicdef.h>
> +#include <asm/delay.h>
> #include <linux/atomic.h>
> #include <linux/jump_label.h>
> #include "kvm_cache_regs.h"
> @@ -1073,6 +1074,7 @@ static void apic_timer_expired(struct kv
> {
> struct kvm_vcpu *vcpu = apic->vcpu;
> wait_queue_head_t *q = &vcpu->wq;
> + struct kvm_timer *ktimer = &apic->lapic_timer;
>
> /*
> * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
> @@ -1087,11 +1089,58 @@ static void apic_timer_expired(struct kv
>
> if (waitqueue_active(q))
> wake_up_interruptible(q);
> +
> + if (ktimer->timer_mode_mask == APIC_LVT_TIMER_TSCDEADLINE)
> + ktimer->expired_tscdeadline = ktimer->tscdeadline;
> +}
> +
> +static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
> +{
> + struct kvm_lapic *apic = vcpu->arch.apic;
> + u32 reg = kvm_apic_get_reg(apic, APIC_LVTT);
> +
> + if (kvm_apic_hw_enabled(apic)) {
> + int vec = reg & APIC_VECTOR_MASK;
> +
> + if (kvm_x86_ops->test_posted_interrupt)
> + return kvm_x86_ops->test_posted_interrupt(vcpu, vec);
> + else {
> + if (apic_test_vector(vec, apic->regs + APIC_ISR))
> + return true;
> + }
One branch here is testing IRR, the other is testing ISR. I think
testing ISR is right; on APICv, the above test will cause a busy wait
during a higher-priority task (or during an interrupt service routine
for the timer itself), just because the timer interrupt was delivered.
So, on APICv, if the interrupt is in PIR but it has bits 7:4 <=
PPR[7:4], you have a problem. :( There is no APICv hook that lets you
get a vmexit when the PPR becomes low enough.
> + }
> + return false;
> +}
> +
> +void wait_lapic_expire(struct kvm_vcpu *vcpu)
> +{
> + struct kvm_lapic *apic = vcpu->arch.apic;
> + u64 guest_tsc, tsc_deadline;
> +
> + if (!kvm_vcpu_has_lapic(vcpu))
> + return;
> +
> + if (!apic_lvtt_tscdeadline(apic))
> + return;
This test is wrong, I think. You need to check whether the timer
interrupt was a TSC deadline interrupt. Instead, you are checking
whether the current mode is TSC-deadline. This can be different if the
interrupt could not be delivered immediately after it was received.
This is easy to fix: replace the first two tests with
"apic->lapic_timer.expired_tscdeadline != 0" and...
> + if (!lapic_timer_int_injected(vcpu))
> + return;
> + tsc_deadline = apic->lapic_timer.expired_tscdeadline;
... set apic->lapic_timer.expired_tscdeadline to 0 here.
But I'm not sure how to solve the above problem with APICv. That's a
pity. Knowing what values you use in practice for the parameter, would
also make it easier to understand the problem. Please report that
together with the graphs produced by the unit test you added.
Paolo
> + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
> +
> + while (guest_tsc < tsc_deadline) {
> + int delay = min(tsc_deadline - guest_tsc, 1000ULL);
> +
> + ndelay(delay);
> + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
> + }
> }
>
> static void start_apic_timer(struct kvm_lapic *apic)
> {
> ktime_t now;
> +
> atomic_set(&apic->lapic_timer.pending, 0);
>
> if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
> @@ -1137,6 +1186,7 @@ static void start_apic_timer(struct kvm_
> /* lapic timer in tsc deadline mode */
> u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
> u64 ns = 0;
> + ktime_t expire;
> struct kvm_vcpu *vcpu = apic->vcpu;
> unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
> unsigned long flags;
> @@ -1151,8 +1201,10 @@ static void start_apic_timer(struct kvm_
> if (likely(tscdeadline > guest_tsc)) {
> ns = (tscdeadline - guest_tsc) * 1000000ULL;
> do_div(ns, this_tsc_khz);
> + expire = ktime_add_ns(now, ns);
> + expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
> hrtimer_start(&apic->lapic_timer.timer,
> - ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
> + expire, HRTIMER_MODE_ABS);
> } else
> apic_timer_expired(apic);
>
> Index: kvm/arch/x86/kvm/lapic.h
> ===================================================================
> --- kvm.orig/arch/x86/kvm/lapic.h
> +++ kvm/arch/x86/kvm/lapic.h
> @@ -14,6 +14,7 @@ struct kvm_timer {
> u32 timer_mode;
> u32 timer_mode_mask;
> u64 tscdeadline;
> + u64 expired_tscdeadline;
> atomic_t pending; /* accumulated triggered timers */
> };
>
> @@ -170,4 +171,6 @@ static inline bool kvm_apic_has_events(s
>
> bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
>
> +void wait_lapic_expire(struct kvm_vcpu *vcpu);
> +
> #endif
> Index: kvm/arch/x86/kvm/x86.c
> ===================================================================
> --- kvm.orig/arch/x86/kvm/x86.c
> +++ kvm/arch/x86/kvm/x86.c
> @@ -108,6 +108,10 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz)
> static u32 tsc_tolerance_ppm = 250;
> module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
>
> +/* lapic timer advance (tscdeadline mode only) in nanoseconds */
> +unsigned int lapic_timer_advance_ns = 0;
> +module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
> +
> static bool backwards_tsc_observed = false;
>
> #define KVM_NR_SHARED_MSRS 16
> @@ -6311,6 +6315,7 @@ static int vcpu_enter_guest(struct kvm_v
> }
>
> trace_kvm_entry(vcpu->vcpu_id);
> + wait_lapic_expire(vcpu);
> kvm_x86_ops->run(vcpu);
>
> /*
> Index: kvm/arch/x86/kvm/x86.h
> ===================================================================
> --- kvm.orig/arch/x86/kvm/x86.h
> +++ kvm/arch/x86/kvm/x86.h
> @@ -170,5 +170,7 @@ extern u64 kvm_supported_xcr0(void);
>
> extern unsigned int min_timer_period_us;
>
> +extern unsigned int lapic_timer_advance_ns;
> +
> extern struct static_key kvm_no_apic_vcpu;
> #endif
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [patch 2/2] KVM: x86: add option to advance tscdeadline hrtimer expiration
2014-12-10 23:37 ` Paolo Bonzini
@ 2014-12-11 3:07 ` Marcelo Tosatti
2014-12-11 18:58 ` Paolo Bonzini
2014-12-11 20:48 ` Andy Lutomirski
0 siblings, 2 replies; 21+ messages in thread
From: Marcelo Tosatti @ 2014-12-11 3:07 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: kvm, Luiz Capitulino, Rik van Riel, Radim Krcmar
On Thu, Dec 11, 2014 at 12:37:57AM +0100, Paolo Bonzini wrote:
>
>
> On 10/12/2014 21:57, Marcelo Tosatti wrote:
> > For the hrtimer which emulates the tscdeadline timer in the guest,
> > add an option to advance expiration, and busy spin on VM-entry waiting
> > for the actual expiration time to elapse.
> >
> > This allows achieving low latencies in cyclictest (or any scenario
> > which requires strict timing regarding timer expiration).
> >
> > Reduces cyclictest avg latency by 50%.
> >
> > Note: this option requires tuning to find the appropriate value
> > for a particular hardware/guest combination. One method is to measure the
> > average delay between apic_timer_fn and VM-entry.
> > Another method is to start with 1000ns, and increase the value
> > in say 500ns increments until avg cyclictest numbers stop decreasing.
>
> What values are you using in practice for the parameter?
7us.
> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> >
> > Index: kvm/arch/x86/kvm/lapic.c
> > ===================================================================
> > --- kvm.orig/arch/x86/kvm/lapic.c
> > +++ kvm/arch/x86/kvm/lapic.c
> > @@ -33,6 +33,7 @@
> > #include <asm/page.h>
> > #include <asm/current.h>
> > #include <asm/apicdef.h>
> > +#include <asm/delay.h>
> > #include <linux/atomic.h>
> > #include <linux/jump_label.h>
> > #include "kvm_cache_regs.h"
> > @@ -1073,6 +1074,7 @@ static void apic_timer_expired(struct kv
> > {
> > struct kvm_vcpu *vcpu = apic->vcpu;
> > wait_queue_head_t *q = &vcpu->wq;
> > + struct kvm_timer *ktimer = &apic->lapic_timer;
> >
> > /*
> > * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
> > @@ -1087,11 +1089,58 @@ static void apic_timer_expired(struct kv
> >
> > if (waitqueue_active(q))
> > wake_up_interruptible(q);
> > +
> > + if (ktimer->timer_mode_mask == APIC_LVT_TIMER_TSCDEADLINE)
> > + ktimer->expired_tscdeadline = ktimer->tscdeadline;
> > +}
> > +
> > +static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
> > +{
> > + struct kvm_lapic *apic = vcpu->arch.apic;
> > + u32 reg = kvm_apic_get_reg(apic, APIC_LVTT);
> > +
> > + if (kvm_apic_hw_enabled(apic)) {
> > + int vec = reg & APIC_VECTOR_MASK;
> > +
> > + if (kvm_x86_ops->test_posted_interrupt)
> > + return kvm_x86_ops->test_posted_interrupt(vcpu, vec);
> > + else {
> > + if (apic_test_vector(vec, apic->regs + APIC_ISR))
> > + return true;
> > + }
>
> One branch here is testing IRR, the other is testing ISR. I think
> testing ISR is right; on APICv, the above test will cause a busy wait
> during a higher-priority task (or during an interrupt service routine
> for the timer itself), just because the timer interrupt was delivered.
Yes.
> So, on APICv, if the interrupt is in PIR but it has bits 7:4 <=
> PPR[7:4], you have a problem. :( There is no APICv hook that lets you
> get a vmexit when the PPR becomes low enough.
Well, you simply exit earlier and busy spin for VM-exit
time.
For Linux guests, there is no problem.
> > + }
> > + return false;
> > +}
> > +
> > +void wait_lapic_expire(struct kvm_vcpu *vcpu)
> > +{
> > + struct kvm_lapic *apic = vcpu->arch.apic;
> > + u64 guest_tsc, tsc_deadline;
> > +
> > + if (!kvm_vcpu_has_lapic(vcpu))
> > + return;
> > +
> > + if (!apic_lvtt_tscdeadline(apic))
> > + return;
>
> This test is wrong, I think. You need to check whether the timer
> interrupt was a TSC deadline interrupt. Instead, you are checking
> whether the current mode is TSC-deadline. This can be different if the
> interrupt could not be delivered immediately after it was received.
> This is easy to fix: replace the first two tests with
> "apic->lapic_timer.expired_tscdeadline != 0" and...
Yes.
> > + if (!lapic_timer_int_injected(vcpu))
> > + return;
> > + tsc_deadline = apic->lapic_timer.expired_tscdeadline;
>
> ... set apic->lapic_timer.expired_tscdeadline to 0 here.
>
> But I'm not sure how to solve the above problem with APICv. That's a
> pity. Knowing what values you use in practice for the parameter, would
> also make it easier to understand the problem. Please report that
> together with the graphs produced by the unit test you added.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [patch 2/2] KVM: x86: add option to advance tscdeadline hrtimer expiration
2014-12-11 3:07 ` Marcelo Tosatti
@ 2014-12-11 18:58 ` Paolo Bonzini
2014-12-11 20:48 ` Andy Lutomirski
1 sibling, 0 replies; 21+ messages in thread
From: Paolo Bonzini @ 2014-12-11 18:58 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: kvm, Luiz Capitulino, Rik van Riel, Radim Krcmar
On 11/12/2014 04:07, Marcelo Tosatti wrote:
>> > So, on APICv, if the interrupt is in PIR but it has bits 7:4 <=
>> > PPR[7:4], you have a problem. :( There is no APICv hook that lets you
>> > get a vmexit when the PPR becomes low enough.
> Well, you simply exit earlier and busy spin for VM-exit
> time.
Okay, given how small the jitter is in your plots, a small busy wait is
not the end of the world even if it caused a very short priority
inversion. Can you add a tracepoint that triggers when you do the busy
wait, and possibly a kvm stat that counts good (no busy wait) vs. bad
(busy wait) invocations of wait_lapic_expire?
Thanks,
Paolo
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [patch 2/2] KVM: x86: add option to advance tscdeadline hrtimer expiration
2014-12-11 3:07 ` Marcelo Tosatti
2014-12-11 18:58 ` Paolo Bonzini
@ 2014-12-11 20:48 ` Andy Lutomirski
2014-12-11 20:58 ` Marcelo Tosatti
2014-12-11 21:10 ` Paolo Bonzini
1 sibling, 2 replies; 21+ messages in thread
From: Andy Lutomirski @ 2014-12-11 20:48 UTC (permalink / raw)
To: Marcelo Tosatti, Paolo Bonzini
Cc: kvm, Luiz Capitulino, Rik van Riel, Radim Krcmar
On 12/10/2014 07:07 PM, Marcelo Tosatti wrote:
> On Thu, Dec 11, 2014 at 12:37:57AM +0100, Paolo Bonzini wrote:
>>
>>
>> On 10/12/2014 21:57, Marcelo Tosatti wrote:
>>> For the hrtimer which emulates the tscdeadline timer in the guest,
>>> add an option to advance expiration, and busy spin on VM-entry waiting
>>> for the actual expiration time to elapse.
>>>
>>> This allows achieving low latencies in cyclictest (or any scenario
>>> which requires strict timing regarding timer expiration).
>>>
>>> Reduces cyclictest avg latency by 50%.
>>>
>>> Note: this option requires tuning to find the appropriate value
>>> for a particular hardware/guest combination. One method is to measure the
>>> average delay between apic_timer_fn and VM-entry.
>>> Another method is to start with 1000ns, and increase the value
>>> in say 500ns increments until avg cyclictest numbers stop decreasing.
>>
>> What values are you using in practice for the parameter?
>
> 7us.
It takes 7us to get from TSC deadline expiration to the *start* of
vmresume? That seems rather extreme.
Is it possible that almost all of that latency is from deadline
expiration to C-state exit? If so, can we teach the timer code to wake
up early to account for that? We're supposed to know our idle exit
latency these days.
--Andy
>
>>> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>>>
>>> Index: kvm/arch/x86/kvm/lapic.c
>>> ===================================================================
>>> --- kvm.orig/arch/x86/kvm/lapic.c
>>> +++ kvm/arch/x86/kvm/lapic.c
>>> @@ -33,6 +33,7 @@
>>> #include <asm/page.h>
>>> #include <asm/current.h>
>>> #include <asm/apicdef.h>
>>> +#include <asm/delay.h>
>>> #include <linux/atomic.h>
>>> #include <linux/jump_label.h>
>>> #include "kvm_cache_regs.h"
>>> @@ -1073,6 +1074,7 @@ static void apic_timer_expired(struct kv
>>> {
>>> struct kvm_vcpu *vcpu = apic->vcpu;
>>> wait_queue_head_t *q = &vcpu->wq;
>>> + struct kvm_timer *ktimer = &apic->lapic_timer;
>>>
>>> /*
>>> * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
>>> @@ -1087,11 +1089,58 @@ static void apic_timer_expired(struct kv
>>>
>>> if (waitqueue_active(q))
>>> wake_up_interruptible(q);
>>> +
>>> + if (ktimer->timer_mode_mask == APIC_LVT_TIMER_TSCDEADLINE)
>>> + ktimer->expired_tscdeadline = ktimer->tscdeadline;
>>> +}
>>> +
>>> +static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
>>> +{
>>> + struct kvm_lapic *apic = vcpu->arch.apic;
>>> + u32 reg = kvm_apic_get_reg(apic, APIC_LVTT);
>>> +
>>> + if (kvm_apic_hw_enabled(apic)) {
>>> + int vec = reg & APIC_VECTOR_MASK;
>>> +
>>> + if (kvm_x86_ops->test_posted_interrupt)
>>> + return kvm_x86_ops->test_posted_interrupt(vcpu, vec);
>>> + else {
>>> + if (apic_test_vector(vec, apic->regs + APIC_ISR))
>>> + return true;
>>> + }
>>
>> One branch here is testing IRR, the other is testing ISR. I think
>> testing ISR is right; on APICv, the above test will cause a busy wait
>> during a higher-priority task (or during an interrupt service routine
>> for the timer itself), just because the timer interrupt was delivered.
>
> Yes.
>
>> So, on APICv, if the interrupt is in PIR but it has bits 7:4 <=
>> PPR[7:4], you have a problem. :( There is no APICv hook that lets you
>> get a vmexit when the PPR becomes low enough.
>
> Well, you simply exit earlier and busy spin for VM-exit
> time.
>
> For Linux guests, there is no problem.
>
>>> + }
>>> + return false;
>>> +}
>>> +
>>> +void wait_lapic_expire(struct kvm_vcpu *vcpu)
>>> +{
>>> + struct kvm_lapic *apic = vcpu->arch.apic;
>>> + u64 guest_tsc, tsc_deadline;
>>> +
>>> + if (!kvm_vcpu_has_lapic(vcpu))
>>> + return;
>>> +
>>> + if (!apic_lvtt_tscdeadline(apic))
>>> + return;
>>
>> This test is wrong, I think. You need to check whether the timer
>> interrupt was a TSC deadline interrupt. Instead, you are checking
>> whether the current mode is TSC-deadline. This can be different if the
>> interrupt could not be delivered immediately after it was received.
>> This is easy to fix: replace the first two tests with
>> "apic->lapic_timer.expired_tscdeadline != 0" and...
>
> Yes.
>
>>> + if (!lapic_timer_int_injected(vcpu))
>>> + return;
>>> + tsc_deadline = apic->lapic_timer.expired_tscdeadline;
>>
>> ... set apic->lapic_timer.expired_tscdeadline to 0 here.
>>
>> But I'm not sure how to solve the above problem with APICv. That's a
>> pity. Knowing what values you use in practice for the parameter, would
>> also make it easier to understand the problem. Please report that
>> together with the graphs produced by the unit test you added.
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [patch 2/2] KVM: x86: add option to advance tscdeadline hrtimer expiration
2014-12-11 20:48 ` Andy Lutomirski
@ 2014-12-11 20:58 ` Marcelo Tosatti
2014-12-11 21:07 ` Andy Lutomirski
2014-12-11 21:10 ` Paolo Bonzini
1 sibling, 1 reply; 21+ messages in thread
From: Marcelo Tosatti @ 2014-12-11 20:58 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Paolo Bonzini, kvm, Luiz Capitulino, Rik van Riel, Radim Krcmar
On Thu, Dec 11, 2014 at 12:48:36PM -0800, Andy Lutomirski wrote:
> On 12/10/2014 07:07 PM, Marcelo Tosatti wrote:
> > On Thu, Dec 11, 2014 at 12:37:57AM +0100, Paolo Bonzini wrote:
> >>
> >>
> >> On 10/12/2014 21:57, Marcelo Tosatti wrote:
> >>> For the hrtimer which emulates the tscdeadline timer in the guest,
> >>> add an option to advance expiration, and busy spin on VM-entry waiting
> >>> for the actual expiration time to elapse.
> >>>
> >>> This allows achieving low latencies in cyclictest (or any scenario
> >>> which requires strict timing regarding timer expiration).
> >>>
> >>> Reduces cyclictest avg latency by 50%.
> >>>
> >>> Note: this option requires tuning to find the appropriate value
> >>> for a particular hardware/guest combination. One method is to measure the
> >>> average delay between apic_timer_fn and VM-entry.
> >>> Another method is to start with 1000ns, and increase the value
> >>> in say 500ns increments until avg cyclictest numbers stop decreasing.
> >>
> >> What values are you using in practice for the parameter?
> >
> > 7us.
>
>
> It takes 7us to get from TSC deadline expiration to the *start* of
> vmresume? That seems rather extreme.
>
> Is it possible that almost all of that latency is from deadline
> expiration to C-state exit? If so, can we teach the timer code to wake
> up early to account for that? We're supposed to know our idle exit
> latency these days.
7us includes:
idle thread wakeup
idle schedout
ksoftirqd schedin
ksoftirqd schedout
qemu schedin
vm-entry
C-states are disabled of course.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [patch 2/2] KVM: x86: add option to advance tscdeadline hrtimer expiration
2014-12-11 20:58 ` Marcelo Tosatti
@ 2014-12-11 21:07 ` Andy Lutomirski
2014-12-11 21:37 ` Rik van Riel
0 siblings, 1 reply; 21+ messages in thread
From: Andy Lutomirski @ 2014-12-11 21:07 UTC (permalink / raw)
To: Marcelo Tosatti
Cc: Paolo Bonzini, kvm list, Luiz Capitulino, Rik van Riel,
Radim Krcmar
On Thu, Dec 11, 2014 at 12:58 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> On Thu, Dec 11, 2014 at 12:48:36PM -0800, Andy Lutomirski wrote:
>> On 12/10/2014 07:07 PM, Marcelo Tosatti wrote:
>> > On Thu, Dec 11, 2014 at 12:37:57AM +0100, Paolo Bonzini wrote:
>> >>
>> >>
>> >> On 10/12/2014 21:57, Marcelo Tosatti wrote:
>> >>> For the hrtimer which emulates the tscdeadline timer in the guest,
>> >>> add an option to advance expiration, and busy spin on VM-entry waiting
>> >>> for the actual expiration time to elapse.
>> >>>
>> >>> This allows achieving low latencies in cyclictest (or any scenario
>> >>> which requires strict timing regarding timer expiration).
>> >>>
>> >>> Reduces cyclictest avg latency by 50%.
>> >>>
>> >>> Note: this option requires tuning to find the appropriate value
>> >>> for a particular hardware/guest combination. One method is to measure the
>> >>> average delay between apic_timer_fn and VM-entry.
>> >>> Another method is to start with 1000ns, and increase the value
>> >>> in say 500ns increments until avg cyclictest numbers stop decreasing.
>> >>
>> >> What values are you using in practice for the parameter?
>> >
>> > 7us.
>>
>>
>> It takes 7us to get from TSC deadline expiration to the *start* of
>> vmresume? That seems rather extreme.
>>
>> Is it possible that almost all of that latency is from deadline
>> expiration to C-state exit? If so, can we teach the timer code to wake
>> up early to account for that? We're supposed to know our idle exit
>> latency these days.
>
> 7us includes:
>
> idle thread wakeup
> idle schedout
> ksoftirqd schedin
> ksoftirqd schedout
> qemu schedin
> vm-entry
Is there some secret way to get perf to profile this? Like some way
to tell perf to only record samples after the IRQ fires and before
vmresume? Because 7 us seems waaaaay slower than it should be for
this.
Yes, Rik, I know that we're wasting all kinds of time doing dumb
things with xstate, but that shouldn't be anywhere near 7us on modern
hardware, unless we're actually taking a device-not-available
exception in there. :) There might be a whopping six xstate
operations, but those should be 60ns each, perhaps, if the state is
dirty?
Maybe everything misses cache?
>
> C-states are disabled of course.
>
:)
--Andy
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [patch 2/2] KVM: x86: add option to advance tscdeadline hrtimer expiration
2014-12-11 20:48 ` Andy Lutomirski
2014-12-11 20:58 ` Marcelo Tosatti
@ 2014-12-11 21:10 ` Paolo Bonzini
2014-12-11 21:16 ` Andy Lutomirski
1 sibling, 1 reply; 21+ messages in thread
From: Paolo Bonzini @ 2014-12-11 21:10 UTC (permalink / raw)
To: Andy Lutomirski, Marcelo Tosatti
Cc: kvm, Luiz Capitulino, Rik van Riel, Radim Krcmar
On 11/12/2014 21:48, Andy Lutomirski wrote:
> On 12/10/2014 07:07 PM, Marcelo Tosatti wrote:
>> On Thu, Dec 11, 2014 at 12:37:57AM +0100, Paolo Bonzini wrote:
>>>
>>>
>>> On 10/12/2014 21:57, Marcelo Tosatti wrote:
>>>> For the hrtimer which emulates the tscdeadline timer in the guest,
>>>> add an option to advance expiration, and busy spin on VM-entry waiting
>>>> for the actual expiration time to elapse.
>>>>
>>>> This allows achieving low latencies in cyclictest (or any scenario
>>>> which requires strict timing regarding timer expiration).
>>>>
>>>> Reduces cyclictest avg latency by 50%.
>>>>
>>>> Note: this option requires tuning to find the appropriate value
>>>> for a particular hardware/guest combination. One method is to measure the
>>>> average delay between apic_timer_fn and VM-entry.
>>>> Another method is to start with 1000ns, and increase the value
>>>> in say 500ns increments until avg cyclictest numbers stop decreasing.
>>>
>>> What values are you using in practice for the parameter?
>>
>> 7us.
>
> It takes 7us to get from TSC deadline expiration to the *start* of
> vmresume? That seems rather extreme.
No, to the end. 7us is 21000 clock cycles, and the vmexit+vmentry alone
costs about 1300.
> Is it possible that almost all of that latency is from deadline
> expiration to C-state exit?
No, I don't think so. Marcelo confirmed that C-states are disabled, but
anyway none of the C-state latency matches Marcelo's data: C1 is really
small (1 us), C1e is too large (~10 us).
To see the effect of C-state exit, go to the plots I made on a normal
laptop and see latency jumping up to 200000 or 400000 cycles
(respectively 70 and 140 us, corresponding to C3 and C6 latencies of 60
and 80 us).
> If so, can we teach the timer code to wake
> up early to account for that?
What, it doesn't already do that?
Paolo
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [patch 2/2] KVM: x86: add option to advance tscdeadline hrtimer expiration
2014-12-11 21:10 ` Paolo Bonzini
@ 2014-12-11 21:16 ` Andy Lutomirski
2014-12-11 21:27 ` Marcelo Tosatti
0 siblings, 1 reply; 21+ messages in thread
From: Andy Lutomirski @ 2014-12-11 21:16 UTC (permalink / raw)
To: Paolo Bonzini
Cc: Marcelo Tosatti, kvm list, Luiz Capitulino, Rik van Riel,
Radim Krcmar
On Thu, Dec 11, 2014 at 1:10 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 11/12/2014 21:48, Andy Lutomirski wrote:
>> On 12/10/2014 07:07 PM, Marcelo Tosatti wrote:
>>> On Thu, Dec 11, 2014 at 12:37:57AM +0100, Paolo Bonzini wrote:
>>>>
>>>>
>>>> On 10/12/2014 21:57, Marcelo Tosatti wrote:
>>>>> For the hrtimer which emulates the tscdeadline timer in the guest,
>>>>> add an option to advance expiration, and busy spin on VM-entry waiting
>>>>> for the actual expiration time to elapse.
>>>>>
>>>>> This allows achieving low latencies in cyclictest (or any scenario
>>>>> which requires strict timing regarding timer expiration).
>>>>>
>>>>> Reduces cyclictest avg latency by 50%.
>>>>>
>>>>> Note: this option requires tuning to find the appropriate value
>>>>> for a particular hardware/guest combination. One method is to measure the
>>>>> average delay between apic_timer_fn and VM-entry.
>>>>> Another method is to start with 1000ns, and increase the value
>>>>> in say 500ns increments until avg cyclictest numbers stop decreasing.
>>>>
>>>> What values are you using in practice for the parameter?
>>>
>>> 7us.
>>
>> It takes 7us to get from TSC deadline expiration to the *start* of
>> vmresume? That seems rather extreme.
>
> No, to the end. 7us is 21000 clock cycles, and the vmexit+vmentry alone
> costs about 1300.
>
I suspect that something's massively wrong with context switching,
then -- it deserves to be considerably faster than that. The
architecturally expensive bits are vmresume, interrupt delivery, and
iret, but iret is only ~300 cycles and interrupt delivery should be
under 1k cycles.
Throw in a few hundred more cycles for whatever wrmsr idiocy is going
on somewhere in the process, and we're still nowhere near 21k cycles.
>> Is it possible that almost all of that latency is from deadline
>> expiration to C-state exit?
>
> No, I don't think so. Marcelo confirmed that C-states are disabled, bt
> anyway none of the C-state latency matches Marcelo's data: C1 is really
> small (1 us), C1e is too large (~10 us).
>
> To see the effect of C-state exit, go to the plots I made on a normal
> laptop and see latency jumping up to 200000 or 400000 cycles
> (respectively 70 and 140 us, corresponding to C3 and C6 latencies of 60
> and 80 us).
>
>> If so, can we teach the timer code to wake
>> up early to account for that?
>
> What, it doesn't already do that?
No clue. My one machine that actually cares about this has C states
rather heavily tuned, so I wouldn't notice.
--Andy
>
> Paolo
--
Andy Lutomirski
AMA Capital Management, LLC
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [patch 2/2] KVM: x86: add option to advance tscdeadline hrtimer expiration
2014-12-11 21:16 ` Andy Lutomirski
@ 2014-12-11 21:27 ` Marcelo Tosatti
2014-12-11 21:29 ` Marcelo Tosatti
0 siblings, 1 reply; 21+ messages in thread
From: Marcelo Tosatti @ 2014-12-11 21:27 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Paolo Bonzini, kvm list, Luiz Capitulino, Rik van Riel,
Radim Krcmar
On Thu, Dec 11, 2014 at 01:16:52PM -0800, Andy Lutomirski wrote:
> On Thu, Dec 11, 2014 at 1:10 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> >
> >
> > On 11/12/2014 21:48, Andy Lutomirski wrote:
> >> On 12/10/2014 07:07 PM, Marcelo Tosatti wrote:
> >>> On Thu, Dec 11, 2014 at 12:37:57AM +0100, Paolo Bonzini wrote:
> >>>>
> >>>>
> >>>> On 10/12/2014 21:57, Marcelo Tosatti wrote:
> >>>>> For the hrtimer which emulates the tscdeadline timer in the guest,
> >>>>> add an option to advance expiration, and busy spin on VM-entry waiting
> >>>>> for the actual expiration time to elapse.
> >>>>>
> >>>>> This allows achieving low latencies in cyclictest (or any scenario
> >>>>> which requires strict timing regarding timer expiration).
> >>>>>
> >>>>> Reduces cyclictest avg latency by 50%.
> >>>>>
> >>>>> Note: this option requires tuning to find the appropriate value
> >>>>> for a particular hardware/guest combination. One method is to measure the
> >>>>> average delay between apic_timer_fn and VM-entry.
> >>>>> Another method is to start with 1000ns, and increase the value
> >>>>> in say 500ns increments until avg cyclictest numbers stop decreasing.
> >>>>
> >>>> What values are you using in practice for the parameter?
> >>>
> >>> 7us.
> >>
> >> It takes 7us to get from TSC deadline expiration to the *start* of
> >> vmresume? That seems rather extreme.
> >
> > No, to the end. 7us is 21000 clock cycles, and the vmexit+vmentry alone
> > costs about 1300.
> >
>
> I suspect that something's massively wrong with context switching,
> then -- it deserves to be considerably faster than that. The
> architecturally expensive bits are vmresume, interrupt delivery, and
> iret, but iret is only ~300 cycles and interrupt delivery should be
> under 1k cycles.
>
> Throw in a few hundred more cycles for whatever wrmsr idiocy is going
> on somewhere in the process, and we're still nowhere near 21k cycles.
<idle>-0 [003] d..h2.. 1991756745496752: apic_timer_fn
<-__run_hrtimer
<idle>-0 [003] dN.h2.. 1991756745498732: tick_program_event <-hrtimer_interrupt
<idle>-0 [003] d...3.. 1991756745502112: sched_switch: prev_comm=swapper/3 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=qemu-system-x86 next_pid=20114 next_prio=98
<idle>-0 [003] d...2.. 1991756745502592: __context_tracking_task_switch <-__schedule
qemu-system-x86-20114 [003] ....1.. 1991756745503916: kvm_arch_vcpu_load <-kvm_sched_in
qemu-system-x86-20114 [003] ....... 1991756745505320: kvm_cpu_has_pending_timer <-kvm_vcpu_block
qemu-system-x86-20114 [003] ....... 1991756745506260: kvm_cpu_has_pending_timer <-kvm_arch_vcpu_ioctl_run
qemu-system-x86-20114 [003] ....... 1991756745507812: kvm_apic_accept_events <-kvm_arch_vcpu_ioctl_run
qemu-system-x86-20114 [003] ....... 1991756745508100: kvm_cpu_has_pending_timer <-kvm_arch_vcpu_ioctl_run
qemu-system-x86-20114 [003] ....... 1991756745508872: kvm_apic_accept_events <-vcpu_enter_guest
qemu-system-x86-20114 [003] ....1.. 1991756745510040: vmx_save_host_state <-vcpu_enter_guest
qemu-system-x86-20114 [003] d...2.. 1991756745511876: kvm_entry: vcpu 1
1991756745511876 - 1991756745496752 = 15124
The timestamps are TSC reads.
This is patched to run without ksoftirqd. Consider:
The LAPIC is programmed to the next earliest event by hrtimer_interrupt.
VM-entry is processing KVM_REQ_DEACTIVATE_FPU, KVM_REQ_EVENT.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [patch 2/2] KVM: x86: add option to advance tscdeadline hrtimer expiration
2014-12-11 21:27 ` Marcelo Tosatti
@ 2014-12-11 21:29 ` Marcelo Tosatti
0 siblings, 0 replies; 21+ messages in thread
From: Marcelo Tosatti @ 2014-12-11 21:29 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Paolo Bonzini, kvm list, Luiz Capitulino, Rik van Riel,
Radim Krcmar
On Thu, Dec 11, 2014 at 07:27:17PM -0200, Marcelo Tosatti wrote:
> On Thu, Dec 11, 2014 at 01:16:52PM -0800, Andy Lutomirski wrote:
> > On Thu, Dec 11, 2014 at 1:10 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> > >
> > >
> > > On 11/12/2014 21:48, Andy Lutomirski wrote:
> > >> On 12/10/2014 07:07 PM, Marcelo Tosatti wrote:
> > >>> On Thu, Dec 11, 2014 at 12:37:57AM +0100, Paolo Bonzini wrote:
> > >>>>
> > >>>>
> > >>>> On 10/12/2014 21:57, Marcelo Tosatti wrote:
> > >>>>> For the hrtimer which emulates the tscdeadline timer in the guest,
> > >>>>> add an option to advance expiration, and busy spin on VM-entry waiting
> > >>>>> for the actual expiration time to elapse.
> > >>>>>
> > >>>>> This allows achieving low latencies in cyclictest (or any scenario
> > >>>>> which requires strict timing regarding timer expiration).
> > >>>>>
> > >>>>> Reduces cyclictest avg latency by 50%.
> > >>>>>
> > >>>>> Note: this option requires tuning to find the appropriate value
> > >>>>> for a particular hardware/guest combination. One method is to measure the
> > >>>>> average delay between apic_timer_fn and VM-entry.
> > >>>>> Another method is to start with 1000ns, and increase the value
> > >>>>> in say 500ns increments until avg cyclictest numbers stop decreasing.
> > >>>>
> > >>>> What values are you using in practice for the parameter?
> > >>>
> > >>> 7us.
> > >>
> > >> It takes 7us to get from TSC deadline expiration to the *start* of
> > >> vmresume? That seems rather extreme.
> > >
> > > No, to the end. 7us is 21000 clock cycles, and the vmexit+vmentry alone
> > > costs about 1300.
> > >
> >
> > I suspect that something's massively wrong with context switching,
> > then -- it deserves to be considerably faster than that. The
> > architecturally expensive bits are vmresume, interrupt delivery, and
> > iret, but iret is only ~300 cycles and interrupt delivery should be
> > under 1k cycles.
> >
> > Throw in a few hundred more cycles for whatever wrmsr idiocy is going
> > on somewhere in the process, and we're still nowhere near 21k cycles.
>
>
> <idle>-0 [003] d..h2.. 1991756745496752: apic_timer_fn
> <-__run_hrtimer
> <idle>-0 [003] dN.h2.. 1991756745498732: tick_program_event <-hrtimer_interrupt
> <idle>-0 [003] d...3.. 1991756745502112: sched_switch: prev_comm=swapper/3 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=qemu-system-x86 next_pid=20114 next_prio=98
> <idle>-0 [003] d...2.. 1991756745502592: __context_tracking_task_switch <-__schedule
> qemu-system-x86-20114 [003] ....1.. 1991756745503916: kvm_arch_vcpu_load <-kvm_sched_in
> qemu-system-x86-20114 [003] ....... 1991756745505320: kvm_cpu_has_pending_timer <-kvm_vcpu_block
> qemu-system-x86-20114 [003] ....... 1991756745506260: kvm_cpu_has_pending_timer <-kvm_arch_vcpu_ioctl_run
> qemu-system-x86-20114 [003] ....... 1991756745507812: kvm_apic_accept_events <-kvm_arch_vcpu_ioctl_run
> qemu-system-x86-20114 [003] ....... 1991756745508100: kvm_cpu_has_pending_timer <-kvm_arch_vcpu_ioctl_run
> qemu-system-x86-20114 [003] ....... 1991756745508872: kvm_apic_accept_events <-vcpu_enter_guest
> qemu-system-x86-20114 [003] ....1.. 1991756745510040: vmx_save_host_state <-vcpu_enter_guest
> qemu-system-x86-20114 [003] d...2.. 1991756745511876: kvm_entry: vcpu 1
>
>
> 1991756745511876 - 1991756745496752 = 15124
>
> The timestamps are TSC reads.
>
> This is patched to run without ksoftirqd. Consider:
>
> The LAPIC is programmed to the next earliest event by hrtimer_interrupt.
> VM-entry is processing KVM_REQ_DEACTIVATE_FPU, KVM_REQ_EVENT.
>
model : 58
model name : Intel(R) Core(TM) i5-3470S CPU @ 2.90GHz
stepping : 9
microcode : 0x1b
cpu MHz : 2873.492
cache size : 6144 KB
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [patch 2/2] KVM: x86: add option to advance tscdeadline hrtimer expiration
2014-12-11 21:07 ` Andy Lutomirski
@ 2014-12-11 21:37 ` Rik van Riel
0 siblings, 0 replies; 21+ messages in thread
From: Rik van Riel @ 2014-12-11 21:37 UTC (permalink / raw)
To: Andy Lutomirski, Marcelo Tosatti
Cc: Paolo Bonzini, kvm list, Luiz Capitulino, Radim Krcmar
On 12/11/2014 04:07 PM, Andy Lutomirski wrote:
> On Thu, Dec 11, 2014 at 12:58 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>> On Thu, Dec 11, 2014 at 12:48:36PM -0800, Andy Lutomirski wrote:
>>> On 12/10/2014 07:07 PM, Marcelo Tosatti wrote:
>>>> On Thu, Dec 11, 2014 at 12:37:57AM +0100, Paolo Bonzini wrote:
>>>>>
>>>>>
>>>>> On 10/12/2014 21:57, Marcelo Tosatti wrote:
>>>>>> For the hrtimer which emulates the tscdeadline timer in the guest,
>>>>>> add an option to advance expiration, and busy spin on VM-entry waiting
>>>>>> for the actual expiration time to elapse.
>>>>>>
>>>>>> This allows achieving low latencies in cyclictest (or any scenario
>>>>>> which requires strict timing regarding timer expiration).
>>>>>>
>>>>>> Reduces cyclictest avg latency by 50%.
>>>>>>
>>>>>> Note: this option requires tuning to find the appropriate value
>>>>>> for a particular hardware/guest combination. One method is to measure the
>>>>>> average delay between apic_timer_fn and VM-entry.
>>>>>> Another method is to start with 1000ns, and increase the value
>>>>>> in say 500ns increments until avg cyclictest numbers stop decreasing.
>>>>>
>>>>> What values are you using in practice for the parameter?
>>>>
>>>> 7us.
>>>
>>>
>>> It takes 7us to get from TSC deadline expiration to the *start* of
>>> vmresume? That seems rather extreme.
>>>
>>> Is it possible that almost all of that latency is from deadline
>>> expiration to C-state exit? If so, can we teach the timer code to wake
>>> up early to account for that? We're supposed to know our idle exit
>>> latency these days.
>>
>> 7us includes:
>>
>> idle thread wakeup
>> idle schedout
>> ksoftirqd schedin
>> ksoftirqd schedout
>> qemu schedin
>> vm-entry
>
> Is there some secret way to get perf to profile this? Like some way
> to tell perf to only record samples after the IRQ fires and before
> vmresume? Because 7 us seems waaaaay slower than it should be for
> this.
>
> Yes, Rik, I know that we're wasting all kinds of time doing dumb
> things with xstate, but that shouldn't be anywhere near 7us on modern
> hardware, unless we're actually taking a device-not-available
> exception in there. :) There might be a whopping six xstate
> operations, but those should be 60ns each, perhaps, if the state is
> dirty?
>
> Maybe everything misses cache?
I don't expect the xstate stuff to be more than about half
a microsecond, with cache misses and failure to optimize
XSAVEOPT / XRSTOR across vmenter/vmexit.
We get another microsecond or so from not trapping from the
guest to the host every time the guest accesses the FPU for
the first time. Marcelo already has a hack for that in his
tree, and my series merely has something that achieves the
same in an automated (and hopefully upstreamable) way.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [patch 2/2] KVM: x86: add option to advance tscdeadline hrtimer expiration
2014-12-10 20:57 ` [patch 2/2] KVM: x86: add option to advance tscdeadline hrtimer expiration Marcelo Tosatti
2014-12-10 23:37 ` Paolo Bonzini
@ 2014-12-12 18:35 ` Radim Krcmar
1 sibling, 0 replies; 21+ messages in thread
From: Radim Krcmar @ 2014-12-12 18:35 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: kvm, Luiz Capitulino, Rik van Riel, Paolo Bonzini
2014-12-10 15:57-0500, Marcelo Tosatti:
> For the hrtimer which emulates the tscdeadline timer in the guest,
> add an option to advance expiration, and busy spin on VM-entry waiting
> for the actual expiration time to elapse.
>
> This allows achieving low latencies in cyclictest (or any scenario
> which requires strict timing regarding timer expiration).
>
> Reduces cyclictest avg latency by 50%.
>
> Note: this option requires tuning to find the appropriate value
> for a particular hardware/guest combination. One method is to measure the
> average delay between apic_timer_fn and VM-entry.
> Another method is to start with 1000ns, and increase the value
> in say 500ns increments until avg cyclictest numbers stop decreasing.
>
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>
> Index: kvm/arch/x86/kvm/lapic.c
> ===================================================================
> --- kvm.orig/arch/x86/kvm/lapic.c
> +++ kvm/arch/x86/kvm/lapic.c
> @@ -1087,11 +1089,58 @@ static void apic_timer_expired(struct kv
>
> if (waitqueue_active(q))
> wake_up_interruptible(q);
> +
> + if (ktimer->timer_mode_mask == APIC_LVT_TIMER_TSCDEADLINE)
timer_mode != timer_mode_mask. (Please use apic_lvtt_tscdeadline().)
(So the code never waited for tsc_deadline >= guest_tsc ...
I suppose it was possible to achieve lower latencies thanks to that.)
> +void wait_lapic_expire(struct kvm_vcpu *vcpu)
> +{
> + struct kvm_lapic *apic = vcpu->arch.apic;
> + u64 guest_tsc, tsc_deadline;
> +
> + if (!kvm_vcpu_has_lapic(vcpu))
> + return;
> +
> + if (!apic_lvtt_tscdeadline(apic))
> + return;
(It is better to check expired_tscdeadline here and zero it later.)
> +
> + if (!lapic_timer_int_injected(vcpu))
> + return;
> +
> + tsc_deadline = apic->lapic_timer.expired_tscdeadline;
> + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
> +
> + while (guest_tsc < tsc_deadline) {
> + int delay = min(tsc_deadline - guest_tsc, 1000ULL);
> +
> + ndelay(delay);
The delay is in nanoseconds, but you feed it a difference in TSC.
(And usually overestimate the time; cpu_relax() loop seems easiest.)
^ permalink raw reply [flat|nested] 21+ messages in thread
end of thread, other threads:[~2014-12-12 18:35 UTC | newest]
Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-12-10 20:57 [patch 0/2] KVM: add option to advance tscdeadline hrtimer expiration (v3) Marcelo Tosatti
2014-12-10 20:57 ` [patch 1/2] KVM: x86: add method to test PIR bitmap vector Marcelo Tosatti
2014-12-10 20:57 ` [patch 2/2] KVM: x86: add option to advance tscdeadline hrtimer expiration Marcelo Tosatti
2014-12-10 23:37 ` Paolo Bonzini
2014-12-11 3:07 ` Marcelo Tosatti
2014-12-11 18:58 ` Paolo Bonzini
2014-12-11 20:48 ` Andy Lutomirski
2014-12-11 20:58 ` Marcelo Tosatti
2014-12-11 21:07 ` Andy Lutomirski
2014-12-11 21:37 ` Rik van Riel
2014-12-11 21:10 ` Paolo Bonzini
2014-12-11 21:16 ` Andy Lutomirski
2014-12-11 21:27 ` Marcelo Tosatti
2014-12-11 21:29 ` Marcelo Tosatti
2014-12-12 18:35 ` Radim Krcmar
-- strict thread matches above, loose matches on Subject: below --
2014-12-10 17:06 [patch 0/2] KVM: add option to advance tscdeadline hrtimer expiration (v2) Marcelo Tosatti
2014-12-10 17:06 ` [patch 2/2] KVM: x86: add option to advance tscdeadline hrtimer expiration Marcelo Tosatti
2014-12-10 17:11 ` Rik van Riel
2014-12-10 16:53 [patch 0/2] KVM: " Marcelo.Tosatti
2014-12-10 16:53 ` [patch 2/2] KVM: x86: " Marcelo.Tosatti
2014-12-10 17:08 ` Paolo Bonzini
2014-12-10 17:34 ` Marcelo Tosatti
2014-12-10 17:53 ` Paolo Bonzini
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox