diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 1cc9d42..688df87 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -79,7 +79,7 @@ obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o obj-$(CONFIG_KVM_GUEST) += kvm.o obj-$(CONFIG_KVM_CLOCK) += kvmclock.o -obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o +obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o pvclock.o ifdef CONFIG_INPUT_PCSPKR obj-y += pcspeaker.o diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index ddee040..476b7c7 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -37,17 +38,9 @@ early_param("no-kvmclock", parse_no_kvmclock); /* The hypervisor will put information about time periodically here */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock); -#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field - -static inline u64 kvm_get_delta(u64 last_tsc) -{ - int cpu = smp_processor_id(); - u64 delta = native_read_tsc() - last_tsc; - return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE; -} static struct kvm_wall_clock wall_clock; -static cycle_t kvm_clock_read(void); + /* * The wallclock is the time of day when we booted. Since then, some time may * have elapsed since the hypervisor wrote the data. So we try to account for @@ -55,35 +48,19 @@ static cycle_t kvm_clock_read(void); */ unsigned long kvm_get_wallclock(void) { - u32 wc_sec, wc_nsec; - u64 delta; + struct kvm_vcpu_time_info *vcpu_time; struct timespec ts; - int version, nsec; int low, high; low = (int)__pa(&wall_clock); high = ((u64)__pa(&wall_clock) >> 32); - - delta = kvm_clock_read(); - native_write_msr(MSR_KVM_WALL_CLOCK, low, high); - do { - version = wall_clock.wc_version; - rmb(); - wc_sec = wall_clock.wc_sec; - wc_nsec = wall_clock.wc_nsec; - rmb(); - } while ((wall_clock.wc_version != version) || (version & 1)); - - delta = kvm_clock_read() - delta; - delta += wc_nsec; - nsec = do_div(delta, NSEC_PER_SEC); - set_normalized_timespec(&ts, wc_sec + delta, nsec); - /* - * Of all mechanisms of time adjustment I've tested, this one - * was the champion! - */ - return ts.tv_sec + 1; + + vcpu_time = &get_cpu_var(hv_clock); + pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); + put_cpu_var(hv_clock); + + return ts.tv_sec; } int kvm_set_wallclock(unsigned long now) @@ -91,28 +68,17 @@ int kvm_set_wallclock(unsigned long now) return 0; } -/* - * This is our read_clock function. The host puts an tsc timestamp each time - * it updates a new time. Without the tsc adjustment, we can have a situation - * in which a vcpu starts to run earlier (smaller system_time), but probes - * time later (compared to another vcpu), leading to backwards time - */ static cycle_t kvm_clock_read(void) { - u64 last_tsc, now; - int cpu; + struct kvm_vcpu_time_info *src; + cycle_t ret; - preempt_disable(); - cpu = smp_processor_id(); - - last_tsc = get_clock(cpu, tsc_timestamp); - now = get_clock(cpu, system_time); - - now += kvm_get_delta(last_tsc); - preempt_enable(); - - return now; + src = &get_cpu_var(hv_clock); + ret = pvclock_clocksource_read(src); + put_cpu_var(hv_clock); + return ret; } + static struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_read, diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c new file mode 100644 index 0000000..2da148d --- /dev/null +++ b/arch/x86/kernel/pvclock.c @@ -0,0 +1,144 @@ +/* paravirtual clock -- common code used by kvm/xen + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include +#include +#include + +/* + * These are perodically updated + * xen: magic shared_info page + * kvm: gpa registered via msr + * and then copied here. + */ +struct pvclock_shadow_time { + u64 tsc_timestamp; /* TSC at last update of time vals. */ + u64 system_timestamp; /* Time, in nanosecs, since boot. */ + u32 tsc_to_nsec_mul; + int tsc_shift; + u32 version; +}; + +static DEFINE_PER_CPU(struct pvclock_shadow_time, shadow_time); + +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. + */ +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) +{ + u64 product; +#ifdef __i386__ + u32 tmp1, tmp2; +#endif + + if (shift < 0) + delta >>= -shift; + else + delta <<= shift; + +#ifdef __i386__ + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "xor %5,%5 ; " + "add %4,%%eax ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); +#elif __x86_64__ + __asm__ ( + "mul %%rdx ; shrd $32,%%rdx,%%rax" + : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); +#else +#error implement me! +#endif + + return product; +} + +static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) +{ + u64 delta = native_read_tsc() - shadow->tsc_timestamp; + return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); +} + +/* + * Reads a consistent set of time-base values from hypervisor, + * into a shadow data area. + */ +static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, + struct kvm_vcpu_time_info *src) +{ + do { + dst->version = src->version; + rmb(); /* fetch version before data */ + dst->tsc_timestamp = src->tsc_timestamp; + dst->system_timestamp = src->system_time; + dst->tsc_to_nsec_mul = src->tsc_to_system_mul; + dst->tsc_shift = src->tsc_shift; + rmb(); /* test version after fetching data */ + } while ((src->version & 1) | (dst->version ^ src->version)); + + return dst->version; +} + +/* + * This is our read_clock function. The host puts an tsc timestamp each time + * it updates a new time. Without the tsc adjustment, we can have a situation + * in which a vcpu starts to run earlier (smaller system_time), but probes + * time later (compared to another vcpu), leading to backwards time + */ +cycle_t pvclock_clocksource_read(struct kvm_vcpu_time_info *src) +{ + struct pvclock_shadow_time *shadow = &get_cpu_var(shadow_time); + cycle_t ret; + + pvclock_get_time_values(shadow, src); + ret = shadow->system_timestamp + pvclock_get_nsec_offset(shadow); + + put_cpu_var(shadow_time); + return ret; +} + +void pvclock_read_wallclock(struct kvm_wall_clock *wall_clock, + struct kvm_vcpu_time_info *vcpu_time, + struct timespec *ts) +{ + u32 version; + u64 delta; + struct timespec now; + + /* get wallclock at system boot */ + do { + version = wall_clock->wc_version; + rmb(); /* fetch version before time */ + now.tv_sec = wall_clock->wc_sec; + now.tv_nsec = wall_clock->wc_nsec; + rmb(); /* fetch time before checking version */ + } while ((wall_clock->wc_version & 1) || (version != wall_clock->wc_version)); + + delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */ + delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; + + now.tv_nsec = do_div(delta, NSEC_PER_SEC); + now.tv_sec = delta; + + set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); +} diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0ce5563..bdd3128 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -493,7 +493,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { static int version; struct kvm_wall_clock wc; - struct timespec wc_ts; + struct timespec now,sys,boot; if (!wall_clock) return; @@ -502,9 +502,16 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); - wc_ts = current_kernel_time(); - wc.wc_sec = wc_ts.tv_sec; - wc.wc_nsec = wc_ts.tv_nsec; +#if 0 + /* Hmm, getboottime() isn't exported to modules ... */ + getboottime(&boot); +#else + now = current_kernel_time(); + ktime_get_ts(&sys); + boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys)); +#endif + wc.wc_sec = boot.tv_sec; + wc.wc_nsec = boot.tv_nsec; wc.wc_version = version; kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); @@ -544,13 +551,51 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); + sizeof(vcpu->hv_clock)); kunmap_atomic(shared_kaddr, KM_USER0); mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); } +static uint32_t div_frac(uint32_t dividend, uint32_t divisor) +{ + uint32_t quotient, remainder; + + __asm__ ( "divl %4" + : "=a" (quotient), "=d" (remainder) + : "0" (0), "1" (dividend), "r" (divisor) ); + return quotient; +} + +static void kvm_set_time_scale(uint32_t tsc_khz, struct kvm_vcpu_time_info *hv_clock) +{ + uint64_t nsecs = 1000000000LL; + int32_t shift = 0; + uint64_t tps64; + uint32_t tps32; + + tps64 = tsc_khz * 1000LL; + while (tps64 > nsecs*2) { + tps64 >>= 1; + shift--; + } + + tps32 = (uint32_t)tps64; + while (tps32 <= (uint32_t)nsecs) { + tps32 <<= 1; + shift++; + } + + hv_clock->tsc_shift = shift; + hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); + +#if 0 + printk(KERN_DEBUG "%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", + __FUNCTION__, tsc_khz, hv_clock->tsc_shift, + hv_clock->tsc_to_system_mul); +#endif +} int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { @@ -599,9 +644,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) /* ...but clean it before doing the actual write */ vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); - vcpu->arch.hv_clock.tsc_to_system_mul = - clocksource_khz2mult(tsc_khz, 22); - vcpu->arch.hv_clock.tsc_shift = 22; + kvm_set_time_scale(tsc_khz, &vcpu->arch.hv_clock); down_read(¤t->mm->mmap_sem); vcpu->arch.time_page = diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index c39e1a5..3d5f945 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -13,6 +13,7 @@ #include #include +#include #include #include @@ -30,17 +31,6 @@ static cycle_t xen_clocksource_read(void); -/* These are perodically updated in shared_info, and then copied here. */ -struct shadow_time_info { - u64 tsc_timestamp; /* TSC at last update of time vals. */ - u64 system_timestamp; /* Time, in nanosecs, since boot. */ - u32 tsc_to_nsec_mul; - int tsc_shift; - u32 version; -}; - -static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); - /* runstate info updated by Xen */ static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); @@ -230,95 +220,14 @@ unsigned long xen_cpu_khz(void) return xen_khz; } -/* - * Reads a consistent set of time-base values from Xen, into a shadow data - * area. - */ -static unsigned get_time_values_from_xen(void) -{ - struct vcpu_time_info *src; - struct shadow_time_info *dst; - - /* src is shared memory with the hypervisor, so we need to - make sure we get a consistent snapshot, even in the face of - being preempted. */ - src = &__get_cpu_var(xen_vcpu)->time; - dst = &__get_cpu_var(shadow_time); - - do { - dst->version = src->version; - rmb(); /* fetch version before data */ - dst->tsc_timestamp = src->tsc_timestamp; - dst->system_timestamp = src->system_time; - dst->tsc_to_nsec_mul = src->tsc_to_system_mul; - dst->tsc_shift = src->tsc_shift; - rmb(); /* test version after fetching data */ - } while ((src->version & 1) | (dst->version ^ src->version)); - - return dst->version; -} - -/* - * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, - * yielding a 64-bit result. - */ -static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) -{ - u64 product; -#ifdef __i386__ - u32 tmp1, tmp2; -#endif - - if (shift < 0) - delta >>= -shift; - else - delta <<= shift; - -#ifdef __i386__ - __asm__ ( - "mul %5 ; " - "mov %4,%%eax ; " - "mov %%edx,%4 ; " - "mul %5 ; " - "xor %5,%5 ; " - "add %4,%%eax ; " - "adc %5,%%edx ; " - : "=A" (product), "=r" (tmp1), "=r" (tmp2) - : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); -#elif __x86_64__ - __asm__ ( - "mul %%rdx ; shrd $32,%%rdx,%%rax" - : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); -#else -#error implement me! -#endif - - return product; -} - -static u64 get_nsec_offset(struct shadow_time_info *shadow) -{ - u64 now, delta; - now = native_read_tsc(); - delta = now - shadow->tsc_timestamp; - return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); -} - static cycle_t xen_clocksource_read(void) { - struct shadow_time_info *shadow = &get_cpu_var(shadow_time); + struct vcpu_time_info *src; cycle_t ret; - unsigned version; - - do { - version = get_time_values_from_xen(); - barrier(); - ret = shadow->system_timestamp + get_nsec_offset(shadow); - barrier(); - } while (version != __get_cpu_var(xen_vcpu)->time.version); - - put_cpu_var(shadow_time); + src = &get_cpu_var(xen_vcpu)->time; + ret = pvclock_clocksource_read((void*)src); + put_cpu_var(xen_vcpu); return ret; } @@ -349,9 +258,14 @@ static void xen_read_wallclock(struct timespec *ts) unsigned long xen_get_wallclock(void) { + const struct shared_info *s = HYPERVISOR_shared_info; + struct kvm_wall_clock *wall_clock = (void*)&(s->wc_version); + struct vcpu_time_info *vcpu_time; struct timespec ts; - xen_read_wallclock(&ts); + vcpu_time = &get_cpu_var(xen_vcpu)->time; + pvclock_read_wallclock(wall_clock, (void*)vcpu_time, &ts); + put_cpu_var(xen_vcpu); return ts.tv_sec; } @@ -576,8 +490,6 @@ __init void xen_time_init(void) { int cpu = smp_processor_id(); - get_time_values_from_xen(); - clocksource_register(&xen_clocksource); if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { diff --git a/include/asm-x86/pvclock.h b/include/asm-x86/pvclock.h new file mode 100644 index 0000000..2b9812f --- /dev/null +++ b/include/asm-x86/pvclock.h @@ -0,0 +1,6 @@ +#include +#include +cycle_t pvclock_clocksource_read(struct kvm_vcpu_time_info *src); +void pvclock_read_wallclock(struct kvm_wall_clock *wall, + struct kvm_vcpu_time_info *vcpu, + struct timespec *ts);