diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index f6e44fc..b60bea8 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -248,6 +248,15 @@ config VMI (it could be used by other hypervisors in theory too, but is not at the moment), by linking the kernel to a GPL-ed ROM module provided by the hypervisor. +config KVM_CLOCK + bool "KVM paravirtualized clock" + depends on PARAVIRT && GENERIC_CLOCKEVENTS + help + Turning on this option will allow you to run a paravirtualized clock + when running over the KVM hypervisor. Instead of relying on a PIT + (or probably other) emulation by the underlying device model, the host + provides the guest with timing infrastructure, such as time of day, and + timer expiration. config ACPI_SRAT bool diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32 index ccea590..1718d86 100644 --- a/arch/x86/kernel/Makefile_32 +++ b/arch/x86/kernel/Makefile_32 @@ -43,6 +43,7 @@ obj-$(CONFIG_K8_NB) += k8.o obj-$(CONFIG_MGEODE_LX) += geode_32.o mfgpt_32.o obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o +obj-$(CONFIG_KVM_CLOCK) += kvmclock.o obj-$(CONFIG_PARAVIRT) += paravirt_32.o obj-y += pcspeaker.o diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c new file mode 100644 index 0000000..6817576 --- /dev/null +++ b/arch/x86/kernel/kvmclock.c @@ -0,0 +1,274 @@ +/* KVM paravirtual clock driver. A clocksource/clockevents implementation + Copyright (C) 2007 Glauber de Oliveira Costa, Red Hat Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define KVM_SCALE 22 + +static int kvmclock = 1; +static int kvmclockevt = 1; + +static int parse_no_kvmclock(char *arg) +{ + kvmclock = 0; + return 0; +} +early_param("no-kvmclock", parse_no_kvmclock); + +static int parse_no_kvmclockevt(char *arg) +{ + kvmclockevt = 0; + return 0; +} +early_param("no-kvmclockevents", parse_no_kvmclockevt); + +/* The hypervisor will put information about time periodically here */ +struct kvm_hv_clock hv_clock __attribute__((__aligned__(PAGE_SIZE))); + +/* + * The wallclock is the time of day when we booted. Since then, some time may + * have elapsed since the hypervisor wrote the data. So we try to account for + * that. Even if the tsc is not accurate, it gives us a more accurate timing + * than not adjusting at all + */ +unsigned long kvm_get_wallclock(void) +{ + u64 wc_sec, delta, last_tsc; + int version, nsec; + struct timespec ts; + + do { + version = hv_clock.version; + rmb(); + last_tsc = hv_clock.last_tsc; + rmb(); + wc_sec = hv_clock.wc_sec; + rmb(); + } while ((hv_clock.version != version) && !(version & 1)); + + rdtscll(delta); + delta = delta - last_tsc; + delta = (delta * hv_clock.tsc_mult) >> KVM_SCALE; + nsec = do_div(delta, NSEC_PER_SEC); + set_normalized_timespec(&ts, wc_sec + delta, nsec); + + /* + * Of all mechanisms of time adjustment I've tested, this one + * was the champion! + */ + return ts.tv_sec + 1; +} + +int kvm_set_wallclock(unsigned long now) +{ + return 0; +} + +/* + * This is our read_clock function. The host puts an tsc timestamp each time + * it updates a new time, and then we can use it to derive a slightly more + * precise notion of elapsed time, converted to nanoseconds. 
+ * + * If the platform provides a stable tsc, we just use it, and there is no need + * for the host to update anything. + */ +static cycle_t kvm_clock_read(void) { + + u64 delta, last_tsc, now; + u32 version; + + if (hv_clock.stable_tsc) { + rdtscll(last_tsc); + return last_tsc; + } + + do { + version = hv_clock.version; + rmb(); + last_tsc = hv_clock.last_tsc; + rmb(); + now = hv_clock.now_ns; + rmb(); + } while ((hv_clock.version != version) && !(version & 1)); + + delta = native_read_tsc() - last_tsc; + delta = (delta * hv_clock.tsc_mult) >> KVM_SCALE; + + return now + delta; +} + +static struct clocksource kvm_clock = { + .name = "kvm-clock", + .read = kvm_clock_read, + .rating = 400, + .mask = CLOCKSOURCE_MASK(64), + .mult = 1 << KVM_SCALE, + .shift = KVM_SCALE, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static void kvm_timer_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + WARN_ON(!irqs_disabled()); + + switch (mode) { + case CLOCK_EVT_MODE_ONESHOT: + /* this is what we want */ + break; + case CLOCK_EVT_MODE_RESUME: + break; + case CLOCK_EVT_MODE_PERIODIC: + WARN_ON(1); + break; + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + kvm_hypercall0(KVM_HCALL_STOP_ONESHOT); + break; + default: + break; + } +} + +/* + * Programming the next event is just a matter of asking the host + * to generate us an interrupt when the time expires. + */ +static int kvm_timer_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + ktime_t kt; + struct timespec ts; + + WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT); + kt = ktime_add_ns(ktime_get(), delta); + ts = ktime_to_timespec(kt); + + return kvm_hypercall2(KVM_HCALL_SET_ALARM, ts.tv_sec, ts.tv_nsec); +} + +/* This is our clockevents structure. 
We only support one shot operation */ +static struct clock_event_device kvm_clockevent = { + .name = "kvm-timer", + .features = CLOCK_EVT_FEAT_ONESHOT, + .shift = 0, + .mult = 1, + .max_delta_ns = 0xffffffff, + .min_delta_ns = 1000000, + .set_mode = kvm_timer_set_mode, + .set_next_event = kvm_timer_next_event, + .rating = 1000, + .irq = 0, + .cpumask = CPU_MASK_NONE, +}; + +unsigned long long kvm_sched_clock(void) +{ + return kvm_clock_read(); +} + + +static DEFINE_PER_CPU(struct clock_event_device, kvm_clock_evt); + +static irqreturn_t kvm_timer_interrupt(int irq, void *dev_id) +{ + struct clock_event_device *evt = &__get_cpu_var(kvm_clock_evt); + BUG_ON(!evt); + evt->event_handler(evt); + return IRQ_HANDLED; +} + +static struct irqaction irq0 = { + .handler = kvm_timer_interrupt, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL, + .mask = CPU_MASK_CPU0, + .name = "kvm-timer" +}; + +static void kvm_clockevents_init(int cpu) +{ + struct clock_event_device *evt; + evt = &per_cpu(kvm_clock_evt, cpu); + memcpy(evt, &kvm_clockevent, sizeof(kvm_clockevent)); + evt->cpumask = cpumask_of_cpu(cpu); + clockevents_register_device(evt); +} + +/* + * QEMU connects a pit timer, which keeps sending us interrupts. We disable + * it when the time comes + */ +void kvm_disable_pit(void) +{ + outb_p(0x30, PIT_MODE); +} + +static void kvm_time_init(void) +{ + /* + * If we let the normal APIC initialization code run, they will + * override our event handler, relying that the APIC will deliver + * the interrupts in the LOCAL_TIMER_VECTOR. The easy solution is + * keep the PIT running until then: Side note: irqs are still disabled + * here, so it safe to override yet another pvop. 
+ */ + + kvm_clockevents_init(smp_processor_id()); + + setup_irq(0, &irq0); +} + +void __init kvmclock_init(void) +{ + + unsigned long kvm_clock_info = __pa((unsigned long)&hv_clock); + + /* + * If we can't use the paravirt clock, just go with + * the usual timekeeping + */ + if (!kvm_para_available()) + return; + + if (kvm_hypercall1(KVM_HCALL_REGISTER_CLOCK, kvm_clock_info)) + return; + + if (hv_clock.stable_tsc) + kvm_clock.mult = hv_clock.tsc_mult; + + if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { + pv_time_ops.get_wallclock = kvm_get_wallclock; + pv_time_ops.set_wallclock = kvm_set_wallclock; + pv_time_ops.sched_clock = kvm_sched_clock; + clocksource_register(&kvm_clock); + } + + if (kvmclockevt && kvm_para_has_feature(KVM_FEATURE_CLOCKEVENTS)) { + pv_time_ops.time_init = kvm_time_init; + pv_apic_ops.setup_boot_clock = kvm_disable_pit; + } +} diff --git a/arch/x86/kernel/paravirt_32.c b/arch/x86/kernel/paravirt_32.c index 6a80d67..fa5d894 100644 --- a/arch/x86/kernel/paravirt_32.c +++ b/arch/x86/kernel/paravirt_32.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -50,6 +51,13 @@ char *memory_setup(void) return pv_init_ops.memory_setup(); } +static void paravirt_init_special_features(void) +{ +#ifdef CONFIG_KVM_CLOCK + kvmclock_init(); +#endif +} + /* Simple instruction patching code. 
*/ #define DEF_NATIVE(ops, name, code) \ extern const char start_##ops##_##name[], end_##ops##_##name[]; \ @@ -334,7 +342,7 @@ struct pv_info pv_info = { struct pv_init_ops pv_init_ops = { .patch = native_patch, .banner = default_banner, - .arch_setup = paravirt_nop, + .arch_setup = paravirt_init_special_features, .memory_setup = machine_specific_memory_setup, }; diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index cc0e914..f833395 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -546,6 +546,7 @@ void __init setup_arch(char **cmdline_p) unsigned long max_low_pfn; memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); + parse_early_param(); pre_setup_arch_hook(); early_cpu_init(); @@ -605,8 +606,6 @@ void __init setup_arch(char **cmdline_p) bss_resource.start = virt_to_phys(&__bss_start); bss_resource.end = virt_to_phys(&__bss_stop)-1; - parse_early_param(); - if (user_defined_memmap) { printk(KERN_INFO "user-defined physical RAM map:\n"); print_memory_map("user"); diff --git a/drivers/kvm/irq.c b/drivers/kvm/irq.c index 22bfeee..fe30099 100644 --- a/drivers/kvm/irq.c +++ b/drivers/kvm/irq.c @@ -33,6 +33,8 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) { struct kvm_pic *s; + if (v->timer_vector != -1) + return 1; if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */ if (kvm_apic_accept_pic_intr(v)) { s = pic_irqchip(v->kvm); /* PIC */ @@ -44,6 +46,12 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) } EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); +static int kvm_get_pvclock_interrupt(struct kvm_vcpu *v) +{ + int ret = v->timer_vector; + v->timer_vector = -1; + return ret; +} /* * Read pending interrupt vector and intack. 
*/ @@ -52,7 +60,9 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) struct kvm_pic *s; int vector; - vector = kvm_get_apic_interrupt(v); /* APIC */ + vector = kvm_get_pvclock_interrupt(v); + if (vector == -1) + vector = kvm_get_apic_interrupt(v); /* APIC */ if (vector == -1) { if (kvm_apic_accept_pic_intr(v)) { s = pic_irqchip(v->kvm); diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 08b5b21..3d2158c 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -362,6 +362,10 @@ struct kvm { struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; int nmemslots; struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS]; + + struct page *clock_page; + u64 clock_gpa; + int time_needs_update; /* * Hash table of struct kvm_mmu_page. */ diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 6f7b31e..14d7073 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -21,6 +21,7 @@ #include "segment_descriptor.h" #include "irq.h" +#include #include #include #include @@ -43,6 +44,7 @@ #include #include #include +#include #include #include @@ -1564,6 +1566,36 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_halt); +static struct kvm_hv_clock hv_clock; + +static void kvm_write_guest_time(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct timespec ts; + void *clock_addr; + + if ((!kvm->clock_page) || !(kvm->clock_gpa)) + return; + + clock_addr = kmap(kvm->clock_page); + + /* Updates version to the next odd number, indicating we're writing */ + hv_clock.version++; + /* Updating the tsc count is the first thing we do */ + kvm_get_msr(vcpu, MSR_IA32_TIME_STAMP_COUNTER, &hv_clock.last_tsc); + ktime_get_ts(&ts); + hv_clock.now_ns = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec; + hv_clock.wc_sec = get_seconds(); + hv_clock.version++; + WARN_ON(hv_clock.version & 1); + + memcpy(clock_addr, &hv_clock, sizeof(hv_clock)); + mark_page_dirty(kvm, kvm->clock_gpa >> PAGE_SHIFT); + kunmap(kvm->clock_page); + + kvm->time_needs_update = 0; +} + int 
kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; @@ -1584,7 +1616,37 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) a3 &= 0xFFFFFFFF; } + ret = 0; switch (nr) { + case KVM_HCALL_REGISTER_CLOCK: { + if (!irqchip_in_kernel(vcpu->kvm)) { + ret = -1; + break; + } + + vcpu->kvm->clock_gpa = a0; + vcpu->kvm->clock_page = gfn_to_page(vcpu->kvm, a0 >> PAGE_SHIFT); + + hv_clock.stable_tsc = check_tsc_unstable(); + hv_clock.tsc_mult = clocksource_khz2mult(tsc_khz, 22); + + kvm_write_guest_time(vcpu); + break; + } + case KVM_HCALL_SET_ALARM: { + ktime_t kt; + struct timespec ts; + + kt = ktime_set(a0, a1); + ts = ktime_to_timespec(kt); + hrtimer_start(&vcpu->oneshooter, kt, HRTIMER_MODE_ABS); + + ret = 0; + break; + } + case KVM_HCALL_STOP_ONESHOT: + hrtimer_cancel(&vcpu->oneshooter); + break; default: ret = -KVM_ENOSYS; break; @@ -2117,6 +2179,14 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu, vcpu->irq_summary == 0); } +void kvm_update_guest_time(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + + if (!(hv_clock.stable_tsc) && (kvm->time_needs_update)) + kvm_write_guest_time(vcpu); +} + static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; @@ -2177,6 +2247,7 @@ again: vcpu->guest_mode = 1; kvm_guest_enter(); + kvm_update_guest_time(vcpu); kvm_x86_ops->run(vcpu, kvm_run); vcpu->guest_mode = 0; @@ -2628,6 +2699,16 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu) return fd; } + +static enum hrtimer_restart kvm_clockdev_fn(struct hrtimer *timer) +{ + struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu, oneshooter); + + vcpu->timer_vector = 0x20; + + return HRTIMER_NORESTART; +} + /* * Creates some virtual cpus. Good luck creating more than one. 
*/ @@ -2643,6 +2724,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) if (IS_ERR(vcpu)) return PTR_ERR(vcpu); + vcpu->timer_vector = -1; preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); /* We do fxsave: this must be aligned. */ @@ -2665,6 +2747,9 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) kvm->vcpus[n] = vcpu; mutex_unlock(&kvm->lock); + hrtimer_init(&vcpu->oneshooter, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + vcpu->oneshooter.function = kvm_clockdev_fn; + /* Now it's all set up, let userspace reach it */ r = create_vcpu_fd(vcpu); if (r < 0) diff --git a/drivers/kvm/x86.h b/drivers/kvm/x86.h index 01452b5..27c65bc 100644 --- a/drivers/kvm/x86.h +++ b/drivers/kvm/x86.h @@ -83,6 +83,9 @@ struct kvm_vcpu { /* emulate context */ struct x86_emulate_ctxt emulate_ctxt; + struct hrtimer oneshooter; + int timer_vector; + }; static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h index c6f3fd8..8dec29f 100644 --- a/include/asm-x86/kvm_para.h +++ b/include/asm-x86/kvm_para.h @@ -10,9 +10,12 @@ * paravirtualization, the appropriate feature bit should be checked. */ #define KVM_CPUID_FEATURES 0x40000001 +#define KVM_FEATURE_CLOCKEVENTS 0 +#define KVM_FEATURE_CLOCKSOURCE 1 #ifdef __KERNEL__ #include +extern void kvmclock_init(void); /* This instruction is vmcall. On non-VT architectures, it will generate a * trap that we will then rewrite to the appropriate instruction. @@ -29,6 +32,26 @@ * noted by the particular hypercall. 
*/ +#define KVM_HCALL_REGISTER_CLOCK 0 +#define KVM_HCALL_SET_ALARM 1 +#define KVM_HCALL_STOP_ONESHOT 2 + +struct kvm_hv_clock { + u64 tsc_mult; + u64 now_ns; + /* That's the wall clock, not the water closet */ + u64 wc_sec; + u64 wc_nsec; + u64 last_tsc; + /* + * At first, we could use the tsc value as a marker, but Jeremy + * well noted that it will cause us locking problems on 32-bit systems, + * so we have a special version field + */ + u32 version; + u32 stable_tsc; /* use raw tsc for clock_read */ +}; + static inline long kvm_hypercall0(unsigned int nr) { long ret; diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h index e4db25f..4c2afd0 100644 --- a/include/linux/kvm_para.h +++ b/include/linux/kvm_para.h @@ -9,6 +9,10 @@ * - kvm_para_available */ +#define KVM_CPUID_FEATURES 0x40000001 +#define KVM_FEATURE_CLOCKEVENTS 0 +#define KVM_FEATURE_CLOCKSOURCE 1 + /* Return values for hypercalls */ #define KVM_ENOSYS 1000