public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/2 -v(many)] kvmclock
@ 2008-02-15 19:52 Glauber de Oliveira Costa
  2008-02-15 19:52 ` [PATCH 1/2] [PATCH] kvmclock - the host part Glauber de Oliveira Costa
  2008-02-17  9:48 ` [PATCH 0/2 -v(many)] kvmclock Avi Kivity
  0 siblings, 2 replies; 4+ messages in thread
From: Glauber de Oliveira Costa @ 2008-02-15 19:52 UTC (permalink / raw)
  To: kvm-devel; +Cc: chrisw, avi

I think this version addresses avi's last comments.
I'm not resending userspace since it is unchanged




-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH 1/2] [PATCH] kvmclock - the host part.
  2008-02-15 19:52 [PATCH 0/2 -v(many)] kvmclock Glauber de Oliveira Costa
@ 2008-02-15 19:52 ` Glauber de Oliveira Costa
  2008-02-15 19:52   ` [PATCH 2/2] [PATCH] kvmclock implementation, the guest part Glauber de Oliveira Costa
  2008-02-17  9:48 ` [PATCH 0/2 -v(many)] kvmclock Avi Kivity
  1 sibling, 1 reply; 4+ messages in thread
From: Glauber de Oliveira Costa @ 2008-02-15 19:52 UTC (permalink / raw)
  To: kvm-devel; +Cc: chrisw, avi, Glauber de Oliveira Costa

This is the host part of kvm clocksource implementation. As it does
not include clockevents, it is a fairly simple implementation. We
only have to register a per-vcpu area, and start writting to it periodically.

The area is binary compatible with xen, as we use the same shadow_info structure.

Signed-off-by: Glauber de Oliveira Costa <gcosta@redhat.com>
---
 arch/x86/kvm/x86.c         |   96 ++++++++++++++++++++++++++++++++++++++++++++
 include/asm-x86/kvm_host.h |    7 +++
 include/asm-x86/kvm_para.h |   25 +++++++++++
 include/linux/kvm.h        |    1 
 4 files changed, 128 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0c910c7..5dfc21f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -19,6 +19,7 @@ #include "segment_descriptor.h"
 #include "irq.h"
 #include "mmu.h"
 
+#include <linux/clocksource.h>
 #include <linux/kvm.h>
 #include <linux/fs.h>
 #include <linux/vmalloc.h>
@@ -424,7 +425,7 @@ static u32 msrs_to_save[] = {
 #ifdef CONFIG_X86_64
 	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
-	MSR_IA32_TIME_STAMP_COUNTER,
+	MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 };
 
 static unsigned num_msrs_to_save;
@@ -482,6 +483,70 @@ static int do_set_msr(struct kvm_vcpu *v
 	return kvm_set_msr(vcpu, index, *data);
 }
 
+static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
+{
+	static int version;
+	struct kvm_wall_clock wc;
+	struct timespec wc_ts;
+
+	if (!wall_clock)
+		return
+
+	mutex_lock(&kvm->lock);
+
+	version++;
+	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
+
+	wc_ts = current_kernel_time();
+	wc.wc_sec = wc_ts.tv_sec;
+	wc.wc_nsec = wc_ts.tv_nsec;
+	wc.wc_version = version;
+	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
+
+	version++;
+	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
+
+	mutex_unlock(&kvm->lock);
+}
+
+static void kvm_write_guest_time(struct kvm_vcpu *v)
+{
+	struct timespec ts;
+	unsigned long flags;
+	struct kvm_vcpu_arch *vcpu = &v->arch;
+	void *shared_kaddr;
+
+	if ((!vcpu->time_page))
+		return;
+
+	/* Keep irq disabled to prevent changes to the clock */
+	local_irq_save(flags);
+	kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
+			  &vcpu->hv_clock.tsc_timestamp);
+	ktime_get_ts(&ts);
+	local_irq_restore(flags);
+
+	/* With all the info we got, fill in the values */
+
+	vcpu->hv_clock.system_time = ts.tv_nsec +
+				     (NSEC_PER_SEC * (u64)ts.tv_sec);
+	/*
+	 * The interface expects us to write an even number signaling that the
+	 * update is finished. Since the guest won't see the intermediate
+	 * state, we just write "2" at the end
+	 */
+	vcpu->hv_clock.version = 2;
+
+	shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
+
+	memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
+		sizeof(vcpu->hv_clock));
+
+	kunmap_atomic(shared_kaddr, KM_USER0);
+
+	mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
+}
+
 
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
@@ -511,6 +576,27 @@ int kvm_set_msr_common(struct kvm_vcpu *
 	case MSR_IA32_MISC_ENABLE:
 		vcpu->arch.ia32_misc_enable_msr = data;
 		break;
+	case MSR_KVM_WALL_CLOCK:
+		vcpu->kvm->arch.wall_clock = data;
+		kvm_write_wall_clock(vcpu->kvm, data);
+		break;
+	case MSR_KVM_SYSTEM_TIME: {
+		vcpu->arch.time = data & PAGE_MASK;
+		vcpu->arch.time_offset = data & ~PAGE_MASK;
+
+		vcpu->arch.hv_clock.tsc_to_system_mul =
+					clocksource_khz2mult(tsc_khz, 22);
+		vcpu->arch.hv_clock.tsc_shift = 22;
+
+		down_read(&current->mm->mmap_sem);
+		vcpu->arch.time_page =
+				gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
+		up_read(&current->mm->mmap_sem);
+
+		if (is_error_page(vcpu->arch.time_page))
+			vcpu->arch.time_page = NULL;
+		break;
+	}
 	default:
 		pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
 		return 1;
@@ -569,6 +655,12 @@ int kvm_get_msr_common(struct kvm_vcpu *
 	case MSR_EFER:
 		data = vcpu->arch.shadow_efer;
 		break;
+	case MSR_KVM_WALL_CLOCK:
+		data = vcpu->kvm->arch.wall_clock;
+		break;
+	case MSR_KVM_SYSTEM_TIME:
+		data = vcpu->arch.time;
+		break;
 	default:
 		pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
 		return 1;
@@ -696,6 +788,7 @@ int kvm_dev_ioctl_check_extension(long e
 	case KVM_CAP_USER_MEMORY:
 	case KVM_CAP_SET_TSS_ADDR:
 	case KVM_CAP_EXT_CPUID:
+	case KVM_CAP_CLOCKSOURCE:
 		r = 1;
 		break;
 	case KVM_CAP_VAPIC:
@@ -771,6 +864,7 @@ out:
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	kvm_x86_ops->vcpu_load(vcpu, cpu);
+	kvm_write_guest_time(vcpu);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index da61255..0c429c8 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -261,6 +261,11 @@ #define VCPU_MP_STATE_HALTED            
 	/* emulate context */
 
 	struct x86_emulate_ctxt emulate_ctxt;
+
+	gpa_t time;
+	struct kvm_vcpu_time_info hv_clock;
+	unsigned int time_offset;
+	struct page *time_page;
 };
 
 struct kvm_mem_alias {
@@ -287,6 +292,8 @@ struct kvm_arch{
 	int round_robin_prev_vcpu;
 	unsigned int tss_addr;
 	struct page *apic_access_page;
+
+	gpa_t wall_clock;
 };
 
 struct kvm_vm_stat {
diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h
index c6f3fd8..481bf18 100644
--- a/include/asm-x86/kvm_para.h
+++ b/include/asm-x86/kvm_para.h
@@ -10,10 +10,35 @@ #define KVM_CPUID_SIGNATURE	0x40000000
  * paravirtualization, the appropriate feature bit should be checked.
  */
 #define KVM_CPUID_FEATURES	0x40000001
+#define KVM_FEATURE_CLOCKSOURCE 0
+
+#define MSR_KVM_WALL_CLOCK  0x11
+#define MSR_KVM_SYSTEM_TIME 0x12
 
 #ifdef __KERNEL__
 #include <asm/processor.h>
 
+/* xen binary-compatible interface. See xen headers for details */
+struct kvm_vcpu_time_info {
+	uint32_t version;
+	uint32_t pad0;
+	uint64_t tsc_timestamp;
+	uint64_t system_time;
+	uint32_t tsc_to_system_mul;
+	int8_t   tsc_shift;
+	int8_t	 pad[3];
+} __attribute__((__packed__)); /* 32 bytes */
+
+struct kvm_wall_clock {
+	uint32_t wc_version;
+	uint32_t wc_sec;
+	uint32_t wc_nsec;
+};
+
+
+extern void kvmclock_init(void);
+
+
 /* This instruction is vmcall.  On non-VT architectures, it will generate a
  * trap that we will then rewrite to the appropriate instruction.
  */
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index c1ec04f..94540b3 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -233,6 +233,7 @@ #define KVM_CAP_USER_MEMORY 3
 #define KVM_CAP_SET_TSS_ADDR 4
 #define KVM_CAP_VAPIC 6
 #define KVM_CAP_EXT_CPUID 7
+#define KVM_CAP_CLOCKSOURCE 8
 
 /*
  * ioctls for VM fds
-- 
1.4.2


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 2/2] [PATCH] kvmclock implementation, the guest part.
  2008-02-15 19:52 ` [PATCH 1/2] [PATCH] kvmclock - the host part Glauber de Oliveira Costa
@ 2008-02-15 19:52   ` Glauber de Oliveira Costa
  0 siblings, 0 replies; 4+ messages in thread
From: Glauber de Oliveira Costa @ 2008-02-15 19:52 UTC (permalink / raw)
  To: kvm-devel; +Cc: chrisw, avi, Glauber de Oliveira Costa

This is the guest part of kvm clock implementation
It does not do tsc-only timing, as tsc can have deltas
between cpus, and it did not seem worthy to me to keep
adjusting them.

We do use it, however, for fine-grained adjustment.

Other than that, time comes from the host.

Signed-off-by: Glauber de Oliveira Costa <gcosta@redhat.com>
---
 arch/x86/Kconfig           |   10 +++
 arch/x86/kernel/Makefile   |    1 
 arch/x86/kernel/kvmclock.c |  161 ++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/setup_32.c |    5 +
 arch/x86/kernel/setup_64.c |    5 +
 5 files changed, 182 insertions(+), 0 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3be2305..cc2bc37 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -372,6 +372,16 @@ config VMI
 	  at the moment), by linking the kernel to a GPL-ed ROM module
 	  provided by the hypervisor.
 
+config KVM_CLOCK
+	bool "KVM paravirtualized clock"
+	select PARAVIRT
+	help
+	  Turning on this option will allow you to run a paravirtualized clock
+	  when running over the KVM hypervisor. Instead of relying on a PIT
+	  (or probably other) emulation by the underlying device model, the host
+	  provides the guest with timing infrastructure such as time of day, and
+	  system time
+
 source "arch/x86/lguest/Kconfig"
 
 config PARAVIRT
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 76ec0f8..5b91b82 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_DEBUG_RODATA_TEST)	+= test_
 obj-$(CONFIG_DEBUG_NX_TEST)	+= test_nx.o
 
 obj-$(CONFIG_VMI)		+= vmi_32.o vmiclock_32.o
+obj-$(CONFIG_KVM_CLOCK)		+= kvmclock.o
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o paravirt_patch_$(BITS).o
 
 ifdef CONFIG_INPUT_PCSPKR
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
new file mode 100644
index 0000000..b8da3bf
--- /dev/null
+++ b/arch/x86/kernel/kvmclock.c
@@ -0,0 +1,161 @@
+/*  KVM paravirtual clock driver. A clocksource implementation
+    Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc.
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#include <linux/clocksource.h>
+#include <linux/kvm_para.h>
+#include <asm/arch_hooks.h>
+#include <asm/msr.h>
+#include <xen/interface/xen.h>
+
+#define KVM_SCALE 22
+
+static int kvmclock = 1;
+
+static int parse_no_kvmclock(char *arg)
+{
+	kvmclock = 0;
+	return 0;
+}
+early_param("no-kvmclock", parse_no_kvmclock);
+
+struct shared_info shared_info __attribute__((__aligned__(PAGE_SIZE)));
+
+/* The hypervisor will put information about time periodically here */
+static struct kvm_vcpu_time_info hv_clock[NR_CPUS];
+#define get_clock(cpu, field) hv_clock[cpu].field
+
+static inline u64 kvm_get_delta(u64 last_tsc)
+{
+	int cpu = smp_processor_id();
+	u64 delta = native_read_tsc() - last_tsc;
+	return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
+}
+
+static struct kvm_wall_clock wall_clock;
+static cycle_t kvm_clock_read(void);
+/*
+ * The wallclock is the time of day when we booted. Since then, some time may
+ * have elapsed since the hypervisor wrote the data. So we try to account for
+ * that with system time
+ */
+unsigned long kvm_get_wallclock(void)
+{
+	u32 wc_sec, wc_nsec;
+	u64 delta;
+	struct timespec ts;
+	int version, nsec;
+	int low, high;
+
+	low = (int)__pa(&wall_clock);
+	high = ((u64)__pa(&wall_clock) >> 32);
+
+	delta = kvm_clock_read();
+
+	native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
+	do {
+		version = wall_clock.wc_version;
+		rmb();
+		wc_sec = wall_clock.wc_sec;
+		wc_nsec = wall_clock.wc_nsec;
+		rmb();
+	} while ((wall_clock.wc_version != version) || (version & 1));
+
+	delta = kvm_clock_read() - delta;
+	delta += wc_nsec;
+	nsec = do_div(delta, NSEC_PER_SEC);
+	set_normalized_timespec(&ts, wc_sec + delta, nsec);
+	/*
+	 * Of all mechanisms of time adjustment I've tested, this one
+	 * was the champion!
+	 */
+	return ts.tv_sec + 1;
+}
+
+int kvm_set_wallclock(unsigned long now)
+{
+	return 0;
+}
+
+/*
+ * This is our read_clock function. The host puts an tsc timestamp each time
+ * it updates a new time. Without the tsc adjustment, we can have a situation
+ * in which a vcpu starts to run earlier (smaller system_time), but probes
+ * time later (compared to another vcpu), leading to backwards time
+ */
+static cycle_t kvm_clock_read(void)
+{
+	u64 last_tsc, now;
+	int cpu;
+
+	preempt_disable();
+	cpu = smp_processor_id();
+
+	last_tsc = get_clock(cpu, tsc_timestamp);
+	now = get_clock(cpu, system_time);
+
+	now += kvm_get_delta(last_tsc);
+	preempt_enable();
+
+	return now;
+}
+static struct clocksource kvm_clock = {
+	.name = "kvm-clock",
+	.read = kvm_clock_read,
+	.rating = 400,
+	.mask = CLOCKSOURCE_MASK(64),
+	.mult = 1 << KVM_SCALE,
+	.shift = KVM_SCALE,
+	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+static int kvm_register_clock(void)
+{
+	int cpu = smp_processor_id();
+	int low, high;
+	low = (int)__pa(&hv_clock[cpu]);
+	high = ((u64)__pa(&hv_clock[cpu]) >> 32);
+
+	return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
+}
+
+static void kvm_setup_secondary_clock(void)
+{
+	/*
+	 * Now that the first cpu already had this clocksource initialized,
+	 * we shouldn't fail.
+	 */
+	WARN_ON(kvm_register_clock());
+	/* ok, done with our trickery, call native */
+	setup_secondary_APIC_clock();
+}
+
+void __init kvmclock_init(void)
+{
+	if (!kvm_para_available())
+		return;
+
+	if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
+		if (kvm_register_clock())
+			return;
+		pv_time_ops.get_wallclock = kvm_get_wallclock;
+		pv_time_ops.set_wallclock = kvm_set_wallclock;
+		pv_time_ops.sched_clock = kvm_clock_read;
+		pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
+		clocksource_register(&kvm_clock);
+	}
+}
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 691ab4c..988da57 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -46,6 +46,7 @@ #include <linux/dmi.h>
 #include <linux/pfn.h>
 #include <linux/pci.h>
 #include <linux/init_ohci1394_dma.h>
+#include <linux/kvm_para.h>
 
 #include <video/edid.h>
 
@@ -772,6 +773,10 @@ #endif
 	if (mtrr_trim_uncached_memory(max_pfn))
 		max_low_pfn = setup_memory();
 
+#ifdef CONFIG_KVM_CLOCK
+	kvmclock_init();
+#endif
+
 #ifdef CONFIG_VMI
 	/*
 	 * Must be after max_low_pfn is determined, and before kernel
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index c0d8208..cdf9578 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -41,6 +41,7 @@ #include <linux/dma-mapping.h>
 #include <linux/ctype.h>
 #include <linux/uaccess.h>
 #include <linux/init_ohci1394_dma.h>
+#include <linux/kvm_para.h>
 
 #include <asm/mtrr.h>
 #include <asm/uaccess.h>
@@ -349,6 +350,10 @@ #endif
 
 	io_delay_init();
 
+#ifdef CONFIG_KVM_CLOCK
+	kvmclock_init();
+#endif
+
 #ifdef CONFIG_SMP
 	/* setup to use the early static init tables during kernel startup */
 	x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
-- 
1.4.2


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH 0/2 -v(many)] kvmclock
  2008-02-15 19:52 [PATCH 0/2 -v(many)] kvmclock Glauber de Oliveira Costa
  2008-02-15 19:52 ` [PATCH 1/2] [PATCH] kvmclock - the host part Glauber de Oliveira Costa
@ 2008-02-17  9:48 ` Avi Kivity
  1 sibling, 0 replies; 4+ messages in thread
From: Avi Kivity @ 2008-02-17  9:48 UTC (permalink / raw)
  To: Glauber de Oliveira Costa; +Cc: kvm-devel, chrisw

Glauber de Oliveira Costa wrote:
> I think this version addresses avi's last comments.
> I'm not resending userspace since it is unchanged
>   

Applied, thanks.  Added structure packing for kvm_wall_clock.

-- 
Any sufficiently difficult bug is indistinguishable from a feature.


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2008-02-17  9:48 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-02-15 19:52 [PATCH 0/2 -v(many)] kvmclock Glauber de Oliveira Costa
2008-02-15 19:52 ` [PATCH 1/2] [PATCH] kvmclock - the host part Glauber de Oliveira Costa
2008-02-15 19:52   ` [PATCH 2/2] [PATCH] kvmclock implementation, the guest part Glauber de Oliveira Costa
2008-02-17  9:48 ` [PATCH 0/2 -v(many)] kvmclock Avi Kivity

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox