qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: Marcelo Tosatti <mtosatti@redhat.com>
To: kvm@vger.kernel.org
Cc: qemu-devel@nongnu.org,
	Dr David Alan Gilbert <dgilbert@redhat.com>,
	Paolo Bonzini <pbonzini@redhat.com>,
	Juan Quintela <quintela@redhat.com>,
	Radim Krcmar <rkrcmar@redhat.com>,
	Eduardo Habkost <ehabkost@redhat.com>,
	Marcelo Tosatti <mtosatti@redhat.com>
Subject: [Qemu-devel] [qemu patch V4 2/2] kvmclock: reduce kvmclock difference on migration
Date: Sat, 10 Dec 2016 15:21:50 -0200	[thread overview]
Message-ID: <20161210172324.482367805@redhat.com> (raw)
In-Reply-To: 20161210172148.361793326@redhat.com

[-- Attachment #1: kvmclock-advance-clock.patch --]
[-- Type: text/plain, Size: 8220 bytes --]

Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
indicates that KVM_GET_CLOCK returns a value as seen by the guest at
that moment.

For new machine types, use this value rather than reading
from guest memory.

This reduces kvmclock difference on migration from 5s to 0.1s
(when max_downtime == 5s).

Note: pre_save contradicts the following comment
        /*
         * If the VM is stopped, declare the clock state valid to
         * avoid re-reading it on next vmsave (which would return
         * a different value). Will be reset when the VM is continued.
         */
But the comment is bogus: vm_state_change is never called twice in a row
with running=0 or running=1.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

---
 hw/i386/kvm/clock.c    |  107 ++++++++++++++++++++++++++++++++++++++++++-------
 include/hw/i386/pc.h   |    5 ++
 target-i386/kvm.c      |    7 +++
 target-i386/kvm_i386.h |    1 
 4 files changed, 106 insertions(+), 14 deletions(-)

v2: 
- improve variable names (Juan)
- consolidate code on kvm_get_clock function (Paolo)
- return mach_use_reliable_get_clock from needed function (Paolo)
v3: 
- simplify check for src_use_reliable_get_clock (Eduardo)
- move src_use_reliable_get_clock initialization to realize (Eduardo)

v4: 
- have kvm_get_clock and clock_is_reliable assignments synchronized (Eduardo)
- add comment to the reasoning of kvmclock_pre_save (Eduardo)


Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
===================================================================
--- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:07:11.220632761 -0200
+++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-12-10 13:57:58.115983966 -0200
@@ -36,6 +36,13 @@
 
     uint64_t clock;
     bool clock_valid;
+
+    /* whether machine type supports reliable get clock */
+    bool mach_use_reliable_get_clock;
+
+    /* whether the 'clock' value was obtained in a host with
+     * reliable KVM_GET_CLOCK */
+    bool clock_is_reliable;
 } KVMClockState;
 
 struct pvclock_vcpu_time_info {
@@ -81,6 +88,19 @@
     return nsec + time.system_time;
 }
 
+static uint64_t kvm_get_clock(void)
+{
+    struct kvm_clock_data data;
+    int ret;
+
+    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
+    if (ret < 0) {
+        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
+                abort();
+    }
+    return data.clock;
+}
+
 static void kvmclock_vm_state_change(void *opaque, int running,
                                      RunState state)
 {
@@ -91,15 +111,23 @@
 
     if (running) {
         struct kvm_clock_data data = {};
-        uint64_t time_at_migration = kvmclock_current_nsec(s);
+        uint64_t pvclock_via_mem = 0;
 
-        s->clock_valid = false;
+        /*
+         * If the host where s->clock was read did not support reliable
+         * KVM_GET_CLOCK, read kvmclock value from memory.
+         */
+        if (!s->clock_is_reliable) {
+            pvclock_via_mem = kvmclock_current_nsec(s);
+        }
 
-        /* We can't rely on the migrated clock value, just discard it */
-        if (time_at_migration) {
-            s->clock = time_at_migration;
+        /* We can't rely on the saved clock value, just discard it */
+        if (pvclock_via_mem) {
+            s->clock = pvclock_via_mem;
         }
 
+        s->clock_valid = false;
+
         data.clock = s->clock;
         ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
         if (ret < 0) {
@@ -120,8 +148,6 @@
             }
         }
     } else {
-        struct kvm_clock_data data;
-        int ret;
 
         if (s->clock_valid) {
             return;
@@ -129,13 +155,11 @@
 
         kvm_synchronize_all_tsc();
 
-        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
-        if (ret < 0) {
-            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
-            abort();
-        }
-        s->clock = data.clock;
-
+        s->clock = kvm_get_clock();
+        /* any code that sets s->clock needs to ensure clock_is_reliable
+         * is correctly set.
+         */
+        s->clock_is_reliable = kvm_has_adjust_clock_stable();
         /*
          * If the VM is stopped, declare the clock state valid to
          * avoid re-reading it on next vmsave (which would return
@@ -149,25 +173,80 @@
 {
     KVMClockState *s = KVM_CLOCK(dev);
 
+    if (kvm_has_adjust_clock_stable()) {
+        s->clock_is_reliable = true;
+    }
+
     qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
 }
 
+static bool kvmclock_clock_is_reliable_needed(void *opaque)
+{
+    KVMClockState *s = opaque;
+
+    return s->mach_use_reliable_get_clock;
+}
+
+static const VMStateDescription kvmclock_reliable_get_clock = {
+    .name = "kvmclock/clock_is_reliable",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = kvmclock_clock_is_reliable_needed,
+    .fields = (VMStateField[]) {
+        VMSTATE_BOOL(clock_is_reliable, KVMClockState),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+/*
+ * When migrating, read the clock just before migration,
+ * so that the guest clock counts during the events
+ * between:
+ *
+ *  * vm_stop()
+ *  *
+ *  * pre_save()
+ *
+ *  This reduces kvmclock difference on migration from 5s
+ *  to 0.1s (when max_downtime == 5s), because sending the
+ *  final pages of memory (which happens between vm_stop()
+ *  and pre_save()) takes max_downtime.
+ */
+static void kvmclock_pre_save(void *opaque)
+{
+    KVMClockState *s = opaque;
+
+    s->clock = kvm_get_clock();
+}
+
 static const VMStateDescription kvmclock_vmsd = {
     .name = "kvmclock",
     .version_id = 1,
     .minimum_version_id = 1,
+    .pre_save = kvmclock_pre_save,
     .fields = (VMStateField[]) {
         VMSTATE_UINT64(clock, KVMClockState),
         VMSTATE_END_OF_LIST()
+    },
+    .subsections = (const VMStateDescription * []) {
+        &kvmclock_reliable_get_clock,
+        NULL
     }
 };
 
+static Property kvmclock_properties[] = {
+    DEFINE_PROP_BOOL("x-mach-use-reliable-get-clock", KVMClockState,
+                      mach_use_reliable_get_clock, true),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
 static void kvmclock_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
 
     dc->realize = kvmclock_realize;
     dc->vmsd = &kvmclock_vmsd;
+    dc->props = kvmclock_properties;
 }
 
 static const TypeInfo kvmclock_info = {
Index: qemu-mig-advance-clock/include/hw/i386/pc.h
===================================================================
--- qemu-mig-advance-clock.orig/include/hw/i386/pc.h	2016-11-17 15:07:11.220632761 -0200
+++ qemu-mig-advance-clock/include/hw/i386/pc.h	2016-11-17 15:08:58.096791416 -0200
@@ -370,6 +370,11 @@
 #define PC_COMPAT_2_7 \
     HW_COMPAT_2_7 \
     {\
+        .driver   = "kvmclock",\
+        .property = "x-mach-use-reliable-get-clock",\
+        .value    = "off",\
+    },\
+    {\
         .driver   = TYPE_X86_CPU,\
         .property = "l3-cache",\
         .value    = "off",\
Index: qemu-mig-advance-clock/target-i386/kvm.c
===================================================================
--- qemu-mig-advance-clock.orig/target-i386/kvm.c	2016-11-17 15:07:11.221632762 -0200
+++ qemu-mig-advance-clock/target-i386/kvm.c	2016-11-17 15:07:15.867639659 -0200
@@ -117,6 +117,13 @@
     return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
 }
 
+bool kvm_has_adjust_clock_stable(void)
+{
+    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
+
+    return (ret == KVM_CLOCK_TSC_STABLE);
+}
+
 bool kvm_allows_irq0_override(void)
 {
     return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
Index: qemu-mig-advance-clock/target-i386/kvm_i386.h
===================================================================
--- qemu-mig-advance-clock.orig/target-i386/kvm_i386.h	2016-11-17 15:07:11.222632764 -0200
+++ qemu-mig-advance-clock/target-i386/kvm_i386.h	2016-11-17 15:07:15.867639659 -0200
@@ -17,6 +17,7 @@
 
 bool kvm_allows_irq0_override(void);
 bool kvm_has_smm(void);
+bool kvm_has_adjust_clock_stable(void);
 void kvm_synchronize_all_tsc(void);
 void kvm_arch_reset_vcpu(X86CPU *cs);
 void kvm_arch_do_init_vcpu(X86CPU *cs);

  parent reply	other threads:[~2016-12-10 17:25 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-12-10 17:21 [Qemu-devel] [qemu patch V4 0/2] improve kvmclock difference on migration Marcelo Tosatti
2016-12-10 17:21 ` [Qemu-devel] [qemu patch V4 1/2] kvm: sync linux headers Marcelo Tosatti
2016-12-10 17:21 ` Marcelo Tosatti [this message]
2016-12-12  7:36   ` [Qemu-devel] [qemu patch V4 2/2] kvmclock: reduce kvmclock difference on migration Pankaj Gupta
2016-12-12 11:22     ` Marcelo Tosatti
2016-12-12 14:24       ` Pankaj Gupta
2016-12-13  1:32         ` Pankaj Gupta
2016-12-12 18:01   ` Eduardo Habkost
2016-12-12 19:44     ` Marcelo Tosatti
2016-12-12 19:57       ` Eduardo Habkost
2016-12-16 10:03   ` Paolo Bonzini
2016-12-16 13:41     ` Eduardo Habkost
2016-12-16 15:59       ` Marcelo Tosatti

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20161210172324.482367805@redhat.com \
    --to=mtosatti@redhat.com \
    --cc=dgilbert@redhat.com \
    --cc=ehabkost@redhat.com \
    --cc=kvm@vger.kernel.org \
    --cc=pbonzini@redhat.com \
    --cc=qemu-devel@nongnu.org \
    --cc=quintela@redhat.com \
    --cc=rkrcmar@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).