* [patch] kvmclock fix
@ 2009-02-03 17:02 Gerd Hoffmann
2009-02-03 17:14 ` Glauber Costa
2009-02-04 16:17 ` Marcelo Tosatti
0 siblings, 2 replies; 5+ messages in thread
From: Gerd Hoffmann @ 2009-02-03 17:02 UTC (permalink / raw)
To: KVM list
[-- Attachment #1: Type: text/plain, Size: 276 bytes --]
Hi,
Here is a patch which fixes the kvmclock on multicore systems without
constant_tsc.
I'm not that happy with the current form as the notifier duplicates code
from tsc.c. I don't see an easy way around that though.
Suggestions? Other review comments?
thanks,
Gerd
[-- Attachment #2: 0001-kvm-test-patch.patch --]
[-- Type: text/plain, Size: 5922 bytes --]
>From 42dec55d19261bfd31097c1800341fbcafc0d336 Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Fri, 30 Jan 2009 23:52:46 +0100
Subject: [PATCH] kvm test patch
---
arch/x86/kvm/x86.c | 109 +++++++++++++++++++++++++++++++++++++++++++---
include/linux/kvm_host.h | 1 +
2 files changed, 104 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cc17546..8b02e0d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -36,6 +36,7 @@
#include <linux/highmem.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
+#include <linux/cpufreq.h>
#include <asm/uaccess.h>
#include <asm/msr.h>
@@ -586,6 +587,8 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *
hv_clock->tsc_to_system_mul);
}
+static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
+
static void kvm_write_guest_time(struct kvm_vcpu *v)
{
struct timespec ts;
@@ -596,9 +599,9 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
if ((!vcpu->time_page))
return;
- if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) {
- kvm_set_time_scale(tsc_khz, &vcpu->hv_clock);
- vcpu->hv_clock_tsc_khz = tsc_khz;
+ if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) {
+ kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock);
+ vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz);
}
/* Keep irq disabled to prevent changes to the clock */
@@ -629,6 +632,17 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
}
+static int kvm_request_guest_time_update(struct kvm_vcpu *v)
+{
+ struct kvm_vcpu_arch *vcpu = &v->arch;
+
+ if (!vcpu->time_page)
+ return 0;
+ if (test_and_set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests))
+ return 0;
+ return 1;
+}
+
static bool msr_mtrr_valid(unsigned msr)
{
switch (msr) {
@@ -758,7 +772,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
vcpu->arch.time_page = NULL;
}
- kvm_write_guest_time(vcpu);
+ kvm_request_guest_time_update(vcpu);
break;
}
default:
@@ -1062,7 +1076,7 @@ out:
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
kvm_x86_ops->vcpu_load(vcpu, cpu);
- kvm_write_guest_time(vcpu);
+ kvm_request_guest_time_update(vcpu);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -2585,9 +2599,84 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
}
EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
+static void bounce_off(void *info)
+{
+ /* nothing */
+}
+
+static unsigned int ref_freq;
+static unsigned long loops_per_jiffy_ref;
+static unsigned long tsc_khz_ref;
+
+static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct cpufreq_freqs *freq = data;
+ unsigned long *lpj, dummy;
+ struct kvm *kvm;
+ struct kvm_vcpu *vcpu;
+ int i, guest_mode;
+
+ if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC))
+ return 0;
+
+ lpj = &dummy;
+ if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+#ifdef CONFIG_SMP
+ lpj = &cpu_data(freq->cpu).loops_per_jiffy;
+#else
+ lpj = &boot_cpu_data.loops_per_jiffy;
+#endif
+
+ if (!ref_freq) {
+ ref_freq = freq->old;
+ loops_per_jiffy_ref = *lpj;
+ tsc_khz_ref = tsc_khz;
+ }
+
+ if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
+ (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
+ (val == CPUFREQ_RESUMECHANGE)) {
+ *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
+ per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
+ }
+
+ guest_mode = 0;
+ spin_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+ vcpu = kvm->vcpus[i];
+ if (!vcpu)
+ continue;
+ if (vcpu->cpu != freq->cpu)
+ continue;
+ if (!kvm_request_guest_time_update(vcpu))
+ continue;
+ if (vcpu->guest_mode)
+ guest_mode++;
+ }
+ }
+ spin_unlock(&kvm_lock);
+ if (freq->old < freq->new && guest_mode) {
+ /*
+ * Upscaling frequency while guest runs. Must make
+ * sure kvmclock is updated before cpufreq actually
+ * changes the frequency, otherwise we risk the guest
+ * sees time go backwards. Send interrupt to kick cpu
+ * out of guest context.
+ */
+ smp_call_function_single(freq->cpu, bounce_off, NULL, 1);
+ }
+ return 0;
+}
+
+static struct notifier_block kvmclock_cpufreq_notifier_block = {
+ .notifier_call = kvmclock_cpufreq_notifier
+};
+
int kvm_arch_init(void *opaque)
{
- int r;
+ int r, cpu;
struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
if (kvm_x86_ops) {
@@ -2618,6 +2707,12 @@ int kvm_arch_init(void *opaque)
kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0, 0);
+
+ for_each_possible_cpu(cpu)
+ per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+ cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+
return 0;
out:
@@ -2943,6 +3038,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (vcpu->requests) {
if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
__kvm_migrate_timers(vcpu);
+ if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
+ kvm_write_guest_time(vcpu);
if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
kvm_mmu_sync_roots(vcpu);
if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ec49d0b..5d116a7 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -37,6 +37,7 @@
#define KVM_REQ_PENDING_TIMER 5
#define KVM_REQ_UNHALT 6
#define KVM_REQ_MMU_SYNC 7
+#define KVM_REQ_KVMCLOCK_UPDATE 8
#define KVM_USERSPACE_IRQ_SOURCE_ID 0
--
1.6.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [patch] kvmclock fix
2009-02-03 17:02 [patch] kvmclock fix Gerd Hoffmann
@ 2009-02-03 17:14 ` Glauber Costa
2009-02-03 17:19 ` Gerd Hoffmann
2009-02-04 16:17 ` Marcelo Tosatti
1 sibling, 1 reply; 5+ messages in thread
From: Glauber Costa @ 2009-02-03 17:14 UTC (permalink / raw)
To: Gerd Hoffmann; +Cc: KVM list
> +static unsigned int ref_freq;
> +static unsigned long loops_per_jiffy_ref;
> +static unsigned long tsc_khz_ref;
Don't these need to be percpu too?
Otherwise we could be calculating the new frequency based on a foreign
reference.
> +
> +static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
> + void *data)
> +{
> + struct cpufreq_freqs *freq = data;
> + unsigned long *lpj, dummy;
> + struct kvm *kvm;
> + struct kvm_vcpu *vcpu;
> + int i, guest_mode;
> +
> + if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC))
> + return 0;
> +
> + lpj = &dummy;
> + if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> +#ifdef CONFIG_SMP
> + lpj = &cpu_data(freq->cpu).loops_per_jiffy;
> +#else
> + lpj = &boot_cpu_data.loops_per_jiffy;
> +#endif
> +
> + if (!ref_freq) {
> + ref_freq = freq->old;
> + loops_per_jiffy_ref = *lpj;
> + tsc_khz_ref = tsc_khz;
> + }
> +
> + if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
> + (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
> + (val == CPUFREQ_RESUMECHANGE)) {
> + *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
> + per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
I believe a possible approach is to move the parts that deal with the tsc
calculation to tsc.c, and convert all uses
of tsc_khz to a per_cpu variable. A global tsc_khz does not make sense anyway.
Then we'll have a specific kvm notifier that just forces the guest out
of its execution so we can update
the clock state.
--
Glauber Costa.
"Free as in Freedom"
http://glommer.net
"The less confident you are, the more serious you have to act."
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [patch] kvmclock fix
2009-02-03 17:14 ` Glauber Costa
@ 2009-02-03 17:19 ` Gerd Hoffmann
0 siblings, 0 replies; 5+ messages in thread
From: Gerd Hoffmann @ 2009-02-03 17:19 UTC (permalink / raw)
To: Glauber Costa; +Cc: KVM list
Glauber Costa wrote:
>> +static unsigned int ref_freq;
>> +static unsigned long loops_per_jiffy_ref;
>> +static unsigned long tsc_khz_ref;
>
> Don't these need to be percpu too?
>
> Otherwise we could be calculating the new frequency based on a foreign
> reference.
I doubt you can have an SMP system with cpus having different freqs.
Also tsc.c does the same.
> I believe a possible approach is to move those that refer to tsc
> calculation to tsc.c, and make all uses
> of tsc_khz comply to a per_cpu use. A global tsc_khz does not make sense anyway.
>
> Then we'll have a specific kvm notifier that just forces the guest out
> of its execution so we can update
> the clock state.
Then we have an ordering issue with the two notifier calls. I don't
think there is a way to force a specific call order.
cheers,
Gerd
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [patch] kvmclock fix
2009-02-03 17:02 [patch] kvmclock fix Gerd Hoffmann
2009-02-03 17:14 ` Glauber Costa
@ 2009-02-04 16:17 ` Marcelo Tosatti
2009-02-04 16:43 ` Gerd Hoffmann
1 sibling, 1 reply; 5+ messages in thread
From: Marcelo Tosatti @ 2009-02-04 16:17 UTC (permalink / raw)
To: Gerd Hoffmann; +Cc: KVM list
On Tue, Feb 03, 2009 at 06:02:31PM +0100, Gerd Hoffmann wrote:
> Hi,
>
> Here is a patch which fixes the kvmclock on multicore systems without
> constant_tsc.
>
> I'm not that happy with the current form as the notifier duplicates code
> from tsc.c. I don't see an easy way around that though.
>
> Suggestions? Other review comments?
>
> thanks,
> Gerd
> + guest_mode = 0;
> + spin_lock(&kvm_lock);
> + list_for_each_entry(kvm, &vm_list, vm_list) {
> + for (i = 0; i < KVM_MAX_VCPUS; ++i) {
> + vcpu = kvm->vcpus[i];
> + if (!vcpu)
> + continue;
> + if (vcpu->cpu != freq->cpu)
> + continue;
> + if (!kvm_request_guest_time_update(vcpu))
> + continue;
> + if (vcpu->guest_mode)
> + guest_mode++;
This can race if the target vcpu has tested for vcpu->requests but not
set guest_mode yet, which is acceptable for the IRQs but for this case
it is not?
Better to always send the IPI if the bit was unset and cpu_self !=
target_cpu.
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [patch] kvmclock fix
2009-02-04 16:17 ` Marcelo Tosatti
@ 2009-02-04 16:43 ` Gerd Hoffmann
0 siblings, 0 replies; 5+ messages in thread
From: Gerd Hoffmann @ 2009-02-04 16:43 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: KVM list
Hi,
> This can race if the target vcpu has tested for vcpu->requests but not
> set guest_mode yet,
Right. New patch coming in a minute.
cheers,
Gerd
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2009-02-04 16:43 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-02-03 17:02 [patch] kvmclock fix Gerd Hoffmann
2009-02-03 17:14 ` Glauber Costa
2009-02-03 17:19 ` Gerd Hoffmann
2009-02-04 16:17 ` Marcelo Tosatti
2009-02-04 16:43 ` Gerd Hoffmann
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox