qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [Qemu-devel] [qemu patch V3 0/2] improve kvmclock difference on migration
@ 2016-11-21 10:50 Marcelo Tosatti
  2016-11-21 10:50 ` [Qemu-devel] [qemu patch V3 1/2] kvm: sync linux headers Marcelo Tosatti
                   ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Marcelo Tosatti @ 2016-11-21 10:50 UTC (permalink / raw)
  To: kvm
  Cc: qemu-devel, Dr David Alan Gilbert, Paolo Bonzini, Juan Quintela,
	Radim Krcmar, Eduardo Habkost

See individual patches for details. This patchset depends on kernels
"[PATCH] kvm: kvmclock: let KVM_GET_CLOCK return whether the master
clock is in use" from Paolo.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [Qemu-devel] [qemu patch V3 1/2] kvm: sync linux headers
  2016-11-21 10:50 [Qemu-devel] [qemu patch V3 0/2] improve kvmclock difference on migration Marcelo Tosatti
@ 2016-11-21 10:50 ` Marcelo Tosatti
  2016-11-21 10:50 ` [Qemu-devel] [qemu patch V3 2/2] kvmclock: reduce kvmclock difference on migration Marcelo Tosatti
  2016-11-28 12:16 ` [Qemu-devel] [qemu patch V3 0/2] improve " Marcelo Tosatti
  2 siblings, 0 replies; 18+ messages in thread
From: Marcelo Tosatti @ 2016-11-21 10:50 UTC (permalink / raw)
  To: kvm
  Cc: qemu-devel, Dr David Alan Gilbert, Paolo Bonzini, Juan Quintela,
	Radim Krcmar, Eduardo Habkost, Marcelo Tosatti

[-- Attachment #1: sync-linux-headers.patch --]
[-- Type: text/plain, Size: 5447 bytes --]

Import KVM_CLOCK_TSC_STABLE.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/include/standard-headers/linux/input.h b/include/standard-headers/linux/input.h
index 7361a16..b472b85 100644
--- a/include/standard-headers/linux/input.h
+++ b/include/standard-headers/linux/input.h
@@ -245,6 +245,7 @@ struct input_mask {
 #define BUS_SPI			0x1C
 #define BUS_RMI			0x1D
 #define BUS_CEC			0x1E
+#define BUS_INTEL_ISHTP		0x1F
 
 /*
  * MT_TOOL types
diff --git a/include/standard-headers/linux/pci_regs.h b/include/standard-headers/linux/pci_regs.h
index 4040951..e5a2e68 100644
--- a/include/standard-headers/linux/pci_regs.h
+++ b/include/standard-headers/linux/pci_regs.h
@@ -612,6 +612,8 @@
  */
 #define PCI_EXP_DEVCAP2		36	/* Device Capabilities 2 */
 #define  PCI_EXP_DEVCAP2_ARI		0x00000020 /* Alternative Routing-ID */
+#define  PCI_EXP_DEVCAP2_ATOMIC_ROUTE	0x00000040 /* Atomic Op routing */
+#define PCI_EXP_DEVCAP2_ATOMIC_COMP64	0x00000100 /* Atomic 64-bit compare */
 #define  PCI_EXP_DEVCAP2_LTR		0x00000800 /* Latency tolerance reporting */
 #define  PCI_EXP_DEVCAP2_OBFF_MASK	0x000c0000 /* OBFF support mechanism */
 #define  PCI_EXP_DEVCAP2_OBFF_MSG	0x00040000 /* New message signaling */
@@ -619,6 +621,7 @@
 #define PCI_EXP_DEVCTL2		40	/* Device Control 2 */
 #define  PCI_EXP_DEVCTL2_COMP_TIMEOUT	0x000f	/* Completion Timeout Value */
 #define  PCI_EXP_DEVCTL2_ARI		0x0020	/* Alternative Routing-ID */
+#define PCI_EXP_DEVCTL2_ATOMIC_REQ	0x0040	/* Set Atomic requests */
 #define  PCI_EXP_DEVCTL2_IDO_REQ_EN	0x0100	/* Allow IDO for requests */
 #define  PCI_EXP_DEVCTL2_IDO_CMP_EN	0x0200	/* Allow IDO for completions */
 #define  PCI_EXP_DEVCTL2_LTR_EN		0x0400	/* Enable LTR mechanism */
@@ -671,7 +674,8 @@
 #define PCI_EXT_CAP_ID_PMUX	0x1A	/* Protocol Multiplexing */
 #define PCI_EXT_CAP_ID_PASID	0x1B	/* Process Address Space ID */
 #define PCI_EXT_CAP_ID_DPC	0x1D	/* Downstream Port Containment */
-#define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_DPC
+#define PCI_EXT_CAP_ID_PTM	0x1F	/* Precision Time Measurement */
+#define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_PTM
 
 #define PCI_EXT_CAP_DSN_SIZEOF	12
 #define PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF 40
@@ -964,4 +968,13 @@
 
 #define PCI_EXP_DPC_SOURCE_ID		10	/* DPC Source Identifier */
 
+/* Precision Time Measurement */
+#define PCI_PTM_CAP			0x04	    /* PTM Capability */
+#define  PCI_PTM_CAP_REQ		0x00000001  /* Requester capable */
+#define  PCI_PTM_CAP_ROOT		0x00000004  /* Root capable */
+#define  PCI_PTM_GRANULARITY_MASK	0x0000FF00  /* Clock granularity */
+#define PCI_PTM_CTRL			0x08	    /* PTM Control */
+#define  PCI_PTM_CTRL_ENABLE		0x00000001  /* PTM enable */
+#define  PCI_PTM_CTRL_ROOT		0x00000002  /* Root select */
+
 #endif /* LINUX_PCI_REGS_H */
diff --git a/linux-headers/asm-arm/kvm.h b/linux-headers/asm-arm/kvm.h
index 541268c..2fb7859 100644
--- a/linux-headers/asm-arm/kvm.h
+++ b/linux-headers/asm-arm/kvm.h
@@ -84,6 +84,13 @@ struct kvm_regs {
 #define KVM_VGIC_V2_DIST_SIZE		0x1000
 #define KVM_VGIC_V2_CPU_SIZE		0x2000
 
+/* Supported VGICv3 address types  */
+#define KVM_VGIC_V3_ADDR_TYPE_DIST	2
+#define KVM_VGIC_V3_ADDR_TYPE_REDIST	3
+
+#define KVM_VGIC_V3_DIST_SIZE		SZ_64K
+#define KVM_VGIC_V3_REDIST_SIZE		(2 * SZ_64K)
+
 #define KVM_ARM_VCPU_POWER_OFF		0 /* CPU is started in OFF state */
 #define KVM_ARM_VCPU_PSCI_0_2		1 /* CPU uses PSCI v0.2 */
 
diff --git a/linux-headers/asm-x86/unistd_32.h b/linux-headers/asm-x86/unistd_32.h
index abeaf40..d45ea28 100644
--- a/linux-headers/asm-x86/unistd_32.h
+++ b/linux-headers/asm-x86/unistd_32.h
@@ -377,5 +377,8 @@
 #define __NR_copy_file_range 377
 #define __NR_preadv2 378
 #define __NR_pwritev2 379
+#define __NR_pkey_mprotect 380
+#define __NR_pkey_alloc 381
+#define __NR_pkey_free 382
 
 #endif /* _ASM_X86_UNISTD_32_H */
diff --git a/linux-headers/asm-x86/unistd_64.h b/linux-headers/asm-x86/unistd_64.h
index 73c3d1f..e22db91 100644
--- a/linux-headers/asm-x86/unistd_64.h
+++ b/linux-headers/asm-x86/unistd_64.h
@@ -330,5 +330,8 @@
 #define __NR_copy_file_range 326
 #define __NR_preadv2 327
 #define __NR_pwritev2 328
+#define __NR_pkey_mprotect 329
+#define __NR_pkey_alloc 330
+#define __NR_pkey_free 331
 
 #endif /* _ASM_X86_UNISTD_64_H */
diff --git a/linux-headers/asm-x86/unistd_x32.h b/linux-headers/asm-x86/unistd_x32.h
index e5aea76..84e58b2 100644
--- a/linux-headers/asm-x86/unistd_x32.h
+++ b/linux-headers/asm-x86/unistd_x32.h
@@ -283,6 +283,9 @@
 #define __NR_membarrier (__X32_SYSCALL_BIT + 324)
 #define __NR_mlock2 (__X32_SYSCALL_BIT + 325)
 #define __NR_copy_file_range (__X32_SYSCALL_BIT + 326)
+#define __NR_pkey_mprotect (__X32_SYSCALL_BIT + 329)
+#define __NR_pkey_alloc (__X32_SYSCALL_BIT + 330)
+#define __NR_pkey_free (__X32_SYSCALL_BIT + 331)
 #define __NR_rt_sigaction (__X32_SYSCALL_BIT + 512)
 #define __NR_rt_sigreturn (__X32_SYSCALL_BIT + 513)
 #define __NR_ioctl (__X32_SYSCALL_BIT + 514)
diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
index 4806e06..bb0ed71 100644
--- a/linux-headers/linux/kvm.h
+++ b/linux-headers/linux/kvm.h
@@ -972,12 +972,19 @@ struct kvm_irqfd {
 	__u8  pad[16];
 };
 
+/* For KVM_CAP_ADJUST_CLOCK */
+
+/* Do not use 1, KVM_CHECK_EXTENSION returned it before we had flags.  */
+#define KVM_CLOCK_TSC_STABLE		2
+
 struct kvm_clock_data {
 	__u64 clock;
 	__u32 flags;
 	__u32 pad[9];
 };
 
+/* For KVM_CAP_SW_TLB */
+
 #define KVM_MMU_FSL_BOOKE_NOHV		0
 #define KVM_MMU_FSL_BOOKE_HV		1
 

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [Qemu-devel] [qemu patch V3 2/2] kvmclock: reduce kvmclock difference on migration
  2016-11-21 10:50 [Qemu-devel] [qemu patch V3 0/2] improve kvmclock difference on migration Marcelo Tosatti
  2016-11-21 10:50 ` [Qemu-devel] [qemu patch V3 1/2] kvm: sync linux headers Marcelo Tosatti
@ 2016-11-21 10:50 ` Marcelo Tosatti
  2016-11-28 14:13   ` Eduardo Habkost
  2016-11-28 12:16 ` [Qemu-devel] [qemu patch V3 0/2] improve " Marcelo Tosatti
  2 siblings, 1 reply; 18+ messages in thread
From: Marcelo Tosatti @ 2016-11-21 10:50 UTC (permalink / raw)
  To: kvm
  Cc: qemu-devel, Dr David Alan Gilbert, Paolo Bonzini, Juan Quintela,
	Radim Krcmar, Eduardo Habkost, Marcelo Tosatti

[-- Attachment #1: kvmclock-advance-clock.patch --]
[-- Type: text/plain, Size: 7864 bytes --]

Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
indicates that KVM_GET_CLOCK returns a value as seen by the guest at
that moment.

For new machine types, use this value rather than reading 
from guest memory.

This reduces kvmclock difference on migration from 5s to 0.1s
(when max_downtime == 5s).

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

---
 hw/i386/kvm/clock.c    |  107 ++++++++++++++++++++++++++++++++++++++++++-------
 include/hw/i386/pc.h   |    5 ++
 target-i386/kvm.c      |    7 +++
 target-i386/kvm_i386.h |    1 
 4 files changed, 106 insertions(+), 14 deletions(-)

v2: 
- improve variable names (Juan)
- consolidate code on kvm_get_clock function (Paolo)
- return mach_use_reliable_get_clock from needed function (Paolo)
v3: 
- simplify check for src_use_reliable_get_clock (Eduardo)
- move src_use_reliable_get_clock initialization to realize (Eduardo)


Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
===================================================================
--- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:07:11.220632761 -0200
+++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-17 15:11:51.372048640 -0200
@@ -36,6 +36,12 @@
 
     uint64_t clock;
     bool clock_valid;
+
+    /* whether machine supports reliable KVM_GET_CLOCK */
+    bool mach_use_reliable_get_clock;
+
+    /* whether source host supported reliable KVM_GET_CLOCK */
+    bool src_use_reliable_get_clock;
 } KVMClockState;
 
 struct pvclock_vcpu_time_info {
@@ -81,6 +87,19 @@
     return nsec + time.system_time;
 }
 
+static uint64_t kvm_get_clock(void)
+{
+    struct kvm_clock_data data;
+    int ret;
+
+    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
+    if (ret < 0) {
+        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
+                abort();
+    }
+    return data.clock;
+}
+
 static void kvmclock_vm_state_change(void *opaque, int running,
                                      RunState state)
 {
@@ -91,15 +110,36 @@
 
     if (running) {
         struct kvm_clock_data data = {};
-        uint64_t time_at_migration = kvmclock_current_nsec(s);
+        uint64_t pvclock_via_mem = 0;
 
-        s->clock_valid = false;
+        /* local (running VM) restore */
+        if (s->clock_valid) {
+            /*
+             * if host does not support reliable KVM_GET_CLOCK,
+             * read kvmclock value from memory
+             */
+            if (!kvm_has_adjust_clock_stable()) {
+                pvclock_via_mem = kvmclock_current_nsec(s);
+            }
+        /* migration/savevm/init restore */
+        } else {
+            /*
+             * use s->clock in case machine uses reliable
+             * get clock and source host supported
+             * reliable get clock
+             */
+            if (!s->src_use_reliable_get_clock) {
+                pvclock_via_mem = kvmclock_current_nsec(s);
+            }
+        }
 
-        /* We can't rely on the migrated clock value, just discard it */
-        if (time_at_migration) {
-            s->clock = time_at_migration;
+        /* We can't rely on the saved clock value, just discard it */
+        if (pvclock_via_mem) {
+            s->clock = pvclock_via_mem;
         }
 
+        s->clock_valid = false;
+
         data.clock = s->clock;
         ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
         if (ret < 0) {
@@ -120,8 +160,6 @@
             }
         }
     } else {
-        struct kvm_clock_data data;
-        int ret;
 
         if (s->clock_valid) {
             return;
@@ -129,13 +167,7 @@
 
         kvm_synchronize_all_tsc();
 
-        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
-        if (ret < 0) {
-            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
-            abort();
-        }
-        s->clock = data.clock;
-
+        s->clock = kvm_get_clock();
         /*
          * If the VM is stopped, declare the clock state valid to
          * avoid re-reading it on next vmsave (which would return
@@ -149,25 +181,72 @@
 {
     KVMClockState *s = KVM_CLOCK(dev);
 
+    /*
+     * On machine types that support reliable KVM_GET_CLOCK,
+     * if host kernel does provide reliable KVM_GET_CLOCK,
+     * set src_use_reliable_get_clock=true so that destination
+     * avoids reading kvmclock from memory.
+     */
+    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
+        s->src_use_reliable_get_clock = true;
+    }
+
     qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
 }
 
+static bool kvmclock_src_use_reliable_get_clock(void *opaque)
+{
+    KVMClockState *s = opaque;
+
+    return s->mach_use_reliable_get_clock;
+}
+
+static const VMStateDescription kvmclock_reliable_get_clock = {
+    .name = "kvmclock/src_use_reliable_get_clock",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = kvmclock_src_use_reliable_get_clock,
+    .fields = (VMStateField[]) {
+        VMSTATE_BOOL(src_use_reliable_get_clock, KVMClockState),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static void kvmclock_pre_save(void *opaque)
+{
+    KVMClockState *s = opaque;
+
+    s->clock = kvm_get_clock();
+}
+
 static const VMStateDescription kvmclock_vmsd = {
     .name = "kvmclock",
     .version_id = 1,
     .minimum_version_id = 1,
+    .pre_save = kvmclock_pre_save,
     .fields = (VMStateField[]) {
         VMSTATE_UINT64(clock, KVMClockState),
         VMSTATE_END_OF_LIST()
+    },
+    .subsections = (const VMStateDescription * []) {
+        &kvmclock_reliable_get_clock,
+        NULL
     }
 };
 
+static Property kvmclock_properties[] = {
+    DEFINE_PROP_BOOL("x-mach-use-reliable-get-clock", KVMClockState,
+                      mach_use_reliable_get_clock, true),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
 static void kvmclock_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
 
     dc->realize = kvmclock_realize;
     dc->vmsd = &kvmclock_vmsd;
+    dc->props = kvmclock_properties;
 }
 
 static const TypeInfo kvmclock_info = {
Index: qemu-mig-advance-clock/include/hw/i386/pc.h
===================================================================
--- qemu-mig-advance-clock.orig/include/hw/i386/pc.h	2016-11-17 15:07:11.220632761 -0200
+++ qemu-mig-advance-clock/include/hw/i386/pc.h	2016-11-17 15:08:58.096791416 -0200
@@ -370,6 +370,11 @@
 #define PC_COMPAT_2_7 \
     HW_COMPAT_2_7 \
     {\
+        .driver   = "kvmclock",\
+        .property = "x-mach-use-reliable-get-clock",\
+        .value    = "off",\
+    },\
+    {\
         .driver   = TYPE_X86_CPU,\
         .property = "l3-cache",\
         .value    = "off",\
Index: qemu-mig-advance-clock/target-i386/kvm.c
===================================================================
--- qemu-mig-advance-clock.orig/target-i386/kvm.c	2016-11-17 15:07:11.221632762 -0200
+++ qemu-mig-advance-clock/target-i386/kvm.c	2016-11-17 15:07:15.867639659 -0200
@@ -117,6 +117,13 @@
     return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
 }
 
+bool kvm_has_adjust_clock_stable(void)
+{
+    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
+
+    return (ret == KVM_CLOCK_TSC_STABLE);
+}
+
 bool kvm_allows_irq0_override(void)
 {
     return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
Index: qemu-mig-advance-clock/target-i386/kvm_i386.h
===================================================================
--- qemu-mig-advance-clock.orig/target-i386/kvm_i386.h	2016-11-17 15:07:11.222632764 -0200
+++ qemu-mig-advance-clock/target-i386/kvm_i386.h	2016-11-17 15:07:15.867639659 -0200
@@ -17,6 +17,7 @@
 
 bool kvm_allows_irq0_override(void);
 bool kvm_has_smm(void);
+bool kvm_has_adjust_clock_stable(void);
 void kvm_synchronize_all_tsc(void);
 void kvm_arch_reset_vcpu(X86CPU *cs);
 void kvm_arch_do_init_vcpu(X86CPU *cs);

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [qemu patch V3 0/2] improve kvmclock difference on migration
  2016-11-21 10:50 [Qemu-devel] [qemu patch V3 0/2] improve kvmclock difference on migration Marcelo Tosatti
  2016-11-21 10:50 ` [Qemu-devel] [qemu patch V3 1/2] kvm: sync linux headers Marcelo Tosatti
  2016-11-21 10:50 ` [Qemu-devel] [qemu patch V3 2/2] kvmclock: reduce kvmclock difference on migration Marcelo Tosatti
@ 2016-11-28 12:16 ` Marcelo Tosatti
  2 siblings, 0 replies; 18+ messages in thread
From: Marcelo Tosatti @ 2016-11-28 12:16 UTC (permalink / raw)
  To: kvm
  Cc: qemu-devel, Dr David Alan Gilbert, Paolo Bonzini, Juan Quintela,
	Radim Krcmar, Eduardo Habkost

On Mon, Nov 21, 2016 at 08:50:02AM -0200, Marcelo Tosatti wrote:
> See individual patches for details. This patchset depends on kernels
> "[PATCH] kvm: kvmclock: let KVM_GET_CLOCK return whether the master
> clock is in use" from Paolo.

Ping?

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [qemu patch V3 2/2] kvmclock: reduce kvmclock difference on migration
  2016-11-21 10:50 ` [Qemu-devel] [qemu patch V3 2/2] kvmclock: reduce kvmclock difference on migration Marcelo Tosatti
@ 2016-11-28 14:13   ` Eduardo Habkost
  2016-11-28 16:45     ` Marcelo Tosatti
  0 siblings, 1 reply; 18+ messages in thread
From: Eduardo Habkost @ 2016-11-28 14:13 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm, qemu-devel, Dr David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Radim Krcmar

Sorry for not noticing the following issues on the previous
reviews. I was only paying attention to the vmstate and machine
code:

On Mon, Nov 21, 2016 at 08:50:04AM -0200, Marcelo Tosatti wrote:
> Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
> indicates that KVM_GET_CLOCK returns a value as seen by the guest at
> that moment.
> 
> For new machine types, use this value rather than reading 
> from guest memory.
> 
> This reduces kvmclock difference on migration from 5s to 0.1s
> (when max_downtime == 5s).
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> ---
>  hw/i386/kvm/clock.c    |  107 ++++++++++++++++++++++++++++++++++++++++++-------
>  include/hw/i386/pc.h   |    5 ++
>  target-i386/kvm.c      |    7 +++
>  target-i386/kvm_i386.h |    1 
>  4 files changed, 106 insertions(+), 14 deletions(-)
> 
> v2: 
> - improve variable names (Juan)
> - consolidate code on kvm_get_clock function (Paolo)
> - return mach_use_reliable_get_clock from needed function (Paolo)
> v3: 
> - simplify check for src_use_reliable_get_clock (Eduardo)
> - move src_use_reliable_get_clock initialization to realize (Eduardo)
> 
> 
> Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> ===================================================================
> --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:07:11.220632761 -0200
> +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-17 15:11:51.372048640 -0200
> @@ -36,6 +36,12 @@
>  
>      uint64_t clock;
>      bool clock_valid;
> +
> +    /* whether machine supports reliable KVM_GET_CLOCK */
> +    bool mach_use_reliable_get_clock;
> +
> +    /* whether source host supported reliable KVM_GET_CLOCK */
> +    bool src_use_reliable_get_clock;
>  } KVMClockState;
>  
>  struct pvclock_vcpu_time_info {
> @@ -81,6 +87,19 @@
>      return nsec + time.system_time;
>  }
>  
> +static uint64_t kvm_get_clock(void)
> +{
> +    struct kvm_clock_data data;
> +    int ret;
> +
> +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> +    if (ret < 0) {
> +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> +                abort();
> +    }
> +    return data.clock;
> +}
> +
>  static void kvmclock_vm_state_change(void *opaque, int running,
>                                       RunState state)
>  {
> @@ -91,15 +110,36 @@

Can you please use "diff -p" on your patches?

>  
>      if (running) {
>          struct kvm_clock_data data = {};
> -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> +        uint64_t pvclock_via_mem = 0;
>  
> -        s->clock_valid = false;
> +        /* local (running VM) restore */
> +        if (s->clock_valid) {
> +            /*
> +             * if host does not support reliable KVM_GET_CLOCK,
> +             * read kvmclock value from memory
> +             */
> +            if (!kvm_has_adjust_clock_stable()) {

Why not use src_use_reliable_get_clock here, too? (Maybe rename
it to simply clock_is_reliable?)

You can change kvm_get_clock() to kvm_get_clock(KVMClockState*),
and make it update both s->clock and s->clock_is_reliable. With
this, s->clock and s->clock_is_reliable will be always in sync,
and have a single code path for both local restore and migration:

    /*
     * If host that set s->clock did not support reliable
     * KVM_GET_CLOCK, read kvmclock value from memory
     */
    if (!s->clock_is_reliable) {
        pvclock_via_mem = kvmclock_current_nsec(s);
    }

Updating the table from my previous message, the result would be:

--------+------------+---------------+-----------+---------+
        | kvm_has_adj_clock_stable() | clock_is_reliable   |
machine | src        | dst           | src       | dst (*) |
--------+------------+---------------+-----------+---------+
pc-2.8  | false      | false         | false     | false   |
pc-2.8  | false      | true          | false     | false   |
pc-2.8  | true       | false         | true      | true    |
pc-2.8  | true       | true          | true      | true    |
--------+------------+---------------+-----------+---------+
pc-2.7  | false      | (any)         | false     | false   |
pc-2.7  | true       | (any)         | true (**) | false   |
--------+------------+---------------+-----------+---------+

(*) More precisely: in the first kvmclock_vm_state_change() call
    just after migration. It will get updated the next time
    s->clock is set.

(**) On purpose, so the code I suggest above would work for local
     restore, too.


> +                pvclock_via_mem = kvmclock_current_nsec(s);
> +            }
> +        /* migration/savevm/init restore */
> +        } else {
> +            /*
> +             * use s->clock in case machine uses reliable
> +             * get clock and source host supported
> +             * reliable get clock
> +             */
> +            if (!s->src_use_reliable_get_clock) {
> +                pvclock_via_mem = kvmclock_current_nsec(s);
> +            }
> +        }
>  
> -        /* We can't rely on the migrated clock value, just discard it */
> -        if (time_at_migration) {
> -            s->clock = time_at_migration;
> +        /* We can't rely on the saved clock value, just discard it */
> +        if (pvclock_via_mem) {
> +            s->clock = pvclock_via_mem;
>          }
>  
> +        s->clock_valid = false;
> +
>          data.clock = s->clock;
>          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
>          if (ret < 0) {
> @@ -120,8 +160,6 @@
>              }
>          }
>      } else {
> -        struct kvm_clock_data data;
> -        int ret;
>  
>          if (s->clock_valid) {
>              return;
> @@ -129,13 +167,7 @@
>  
>          kvm_synchronize_all_tsc();
>  
> -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> -        if (ret < 0) {
> -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> -            abort();
> -        }
> -        s->clock = data.clock;
> -
> +        s->clock = kvm_get_clock();
>          /*
>           * If the VM is stopped, declare the clock state valid to
>           * avoid re-reading it on next vmsave (which would return
> @@ -149,25 +181,72 @@
>  {
>      KVMClockState *s = KVM_CLOCK(dev);
>  
> +    /*
> +     * On machine types that support reliable KVM_GET_CLOCK,
> +     * if host kernel does provide reliable KVM_GET_CLOCK,
> +     * set src_use_reliable_get_clock=true so that destination
> +     * avoids reading kvmclock from memory.
> +     */
> +    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> +        s->src_use_reliable_get_clock = true;
> +    }
> +
>      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
>  }
>  
> +static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> +{
> +    KVMClockState *s = opaque;
> +
> +    return s->mach_use_reliable_get_clock;
> +}
> +
> +static const VMStateDescription kvmclock_reliable_get_clock = {
> +    .name = "kvmclock/src_use_reliable_get_clock",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = kvmclock_src_use_reliable_get_clock,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_BOOL(src_use_reliable_get_clock, KVMClockState),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
> +static void kvmclock_pre_save(void *opaque)
> +{
> +    KVMClockState *s = opaque;
> +
> +    s->clock = kvm_get_clock();

I can't tell if s->clock_valid here is a bug, or a feature. In
either case, we seem to be contradicting the comment at
kvmclock_vm_state_change():

    /*
     * If the VM is stopped, declare the clock state valid to
     * avoid re-reading it on next vmsave (which would return
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
     * a different value). Will be reset when the VM is continued.
     */
    s->clock_valid = true;

> +}
> +
>  static const VMStateDescription kvmclock_vmsd = {
>      .name = "kvmclock",
>      .version_id = 1,
>      .minimum_version_id = 1,
> +    .pre_save = kvmclock_pre_save,
>      .fields = (VMStateField[]) {
>          VMSTATE_UINT64(clock, KVMClockState),
>          VMSTATE_END_OF_LIST()
> +    },
> +    .subsections = (const VMStateDescription * []) {
> +        &kvmclock_reliable_get_clock,
> +        NULL
>      }
>  };
>  
> +static Property kvmclock_properties[] = {
> +    DEFINE_PROP_BOOL("x-mach-use-reliable-get-clock", KVMClockState,
> +                      mach_use_reliable_get_clock, true),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
>  static void kvmclock_class_init(ObjectClass *klass, void *data)
>  {
>      DeviceClass *dc = DEVICE_CLASS(klass);
>  
>      dc->realize = kvmclock_realize;
>      dc->vmsd = &kvmclock_vmsd;
> +    dc->props = kvmclock_properties;
>  }
>  
>  static const TypeInfo kvmclock_info = {
> Index: qemu-mig-advance-clock/include/hw/i386/pc.h
> ===================================================================
> --- qemu-mig-advance-clock.orig/include/hw/i386/pc.h	2016-11-17 15:07:11.220632761 -0200
> +++ qemu-mig-advance-clock/include/hw/i386/pc.h	2016-11-17 15:08:58.096791416 -0200
> @@ -370,6 +370,11 @@
>  #define PC_COMPAT_2_7 \
>      HW_COMPAT_2_7 \
>      {\
> +        .driver   = "kvmclock",\
> +        .property = "x-mach-use-reliable-get-clock",\
> +        .value    = "off",\
> +    },\
> +    {\
>          .driver   = TYPE_X86_CPU,\
>          .property = "l3-cache",\
>          .value    = "off",\
> Index: qemu-mig-advance-clock/target-i386/kvm.c
> ===================================================================
> --- qemu-mig-advance-clock.orig/target-i386/kvm.c	2016-11-17 15:07:11.221632762 -0200
> +++ qemu-mig-advance-clock/target-i386/kvm.c	2016-11-17 15:07:15.867639659 -0200
> @@ -117,6 +117,13 @@
>      return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
>  }
>  
> +bool kvm_has_adjust_clock_stable(void)
> +{
> +    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
> +
> +    return (ret == KVM_CLOCK_TSC_STABLE);
> +}
> +
>  bool kvm_allows_irq0_override(void)
>  {
>      return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
> Index: qemu-mig-advance-clock/target-i386/kvm_i386.h
> ===================================================================
> --- qemu-mig-advance-clock.orig/target-i386/kvm_i386.h	2016-11-17 15:07:11.222632764 -0200
> +++ qemu-mig-advance-clock/target-i386/kvm_i386.h	2016-11-17 15:07:15.867639659 -0200
> @@ -17,6 +17,7 @@
>  
>  bool kvm_allows_irq0_override(void);
>  bool kvm_has_smm(void);
> +bool kvm_has_adjust_clock_stable(void);
>  void kvm_synchronize_all_tsc(void);
>  void kvm_arch_reset_vcpu(X86CPU *cs);
>  void kvm_arch_do_init_vcpu(X86CPU *cs);
> 
> 

-- 
Eduardo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [qemu patch V3 2/2] kvmclock: reduce kvmclock difference on migration
  2016-11-28 14:13   ` Eduardo Habkost
@ 2016-11-28 16:45     ` Marcelo Tosatti
  2016-11-28 17:12       ` Eduardo Habkost
  0 siblings, 1 reply; 18+ messages in thread
From: Marcelo Tosatti @ 2016-11-28 16:45 UTC (permalink / raw)
  To: Eduardo Habkost
  Cc: kvm, qemu-devel, Dr David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Radim Krcmar

On Mon, Nov 28, 2016 at 12:13:22PM -0200, Eduardo Habkost wrote:
> Sorry for not noticing the following issues on the previous
> reviews. I was only paying attention to the vmstate and machine
> code:
> 
> On Mon, Nov 21, 2016 at 08:50:04AM -0200, Marcelo Tosatti wrote:
> > Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
> > indicates that KVM_GET_CLOCK returns a value as seen by the guest at
> > that moment.
> > 
> > For new machine types, use this value rather than reading 
> > from guest memory.
> > 
> > This reduces kvmclock difference on migration from 5s to 0.1s
> > (when max_downtime == 5s).
> > 
> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > 
> > ---
> >  hw/i386/kvm/clock.c    |  107 ++++++++++++++++++++++++++++++++++++++++++-------
> >  include/hw/i386/pc.h   |    5 ++
> >  target-i386/kvm.c      |    7 +++
> >  target-i386/kvm_i386.h |    1 
> >  4 files changed, 106 insertions(+), 14 deletions(-)
> > 
> > v2: 
> > - improve variable names (Juan)
> > - consolidate code on kvm_get_clock function (Paolo)
> > - return mach_use_reliable_get_clock from needed function (Paolo)
> > v3: 
> > - simplify check for src_use_reliable_get_clock (Eduardo)
> > - move src_use_reliable_get_clock initialization to realize (Eduardo)
> > 
> > 
> > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > ===================================================================
> > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:07:11.220632761 -0200
> > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-17 15:11:51.372048640 -0200
> > @@ -36,6 +36,12 @@
> >  
> >      uint64_t clock;
> >      bool clock_valid;
> > +
> > +    /* whether machine supports reliable KVM_GET_CLOCK */
> > +    bool mach_use_reliable_get_clock;
> > +
> > +    /* whether source host supported reliable KVM_GET_CLOCK */
> > +    bool src_use_reliable_get_clock;
> >  } KVMClockState;
> >  
> >  struct pvclock_vcpu_time_info {
> > @@ -81,6 +87,19 @@
> >      return nsec + time.system_time;
> >  }
> >  
> > +static uint64_t kvm_get_clock(void)
> > +{
> > +    struct kvm_clock_data data;
> > +    int ret;
> > +
> > +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > +    if (ret < 0) {
> > +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > +                abort();
> > +    }
> > +    return data.clock;
> > +}
> > +
> >  static void kvmclock_vm_state_change(void *opaque, int running,
> >                                       RunState state)
> >  {
> > @@ -91,15 +110,36 @@
> 
> Can you please use "diff -p" on your patches?
> 
> >  
> >      if (running) {
> >          struct kvm_clock_data data = {};
> > -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> > +        uint64_t pvclock_via_mem = 0;
> >  
> > -        s->clock_valid = false;
> > +        /* local (running VM) restore */
> > +        if (s->clock_valid) {
> > +            /*
> > +             * if host does not support reliable KVM_GET_CLOCK,
> > +             * read kvmclock value from memory
> > +             */
> > +            if (!kvm_has_adjust_clock_stable()) {
> 
> Why not use src_use_reliable_get_clock here, too? (Maybe rename
> it to simply clock_is_reliable?)

Because you end up mixing two mental ideas (one: whether the source
has KVM_GET_CLOCK, second: whether the destination has KVM_GET_CLOCK 
into one variable). I find it more confusing than not.
Maybe its just my limited brain but i find it
confusing.

> You can change kvm_get_clock() to kvm_get_clock(KVMClockState*),
> and make it update both s->clock and s->clock_is_reliable. With
> this, s->clock and s->clock_is_reliable will be always in sync,
> and have a single code path for both local restore and migration:

There should not be a single path for migration/runtime cases: 
on migration, we care whether the source used reliable KVM_GET_CLOCK.

On runtime, we care whether the destination uses reliable KVM_GET_CLOCK.

>     /*
>      * If host that set s->clock did not support reliable
>      * KVM_GET_CLOCK, read kvmclock value from memory
>      */
>     if (!s->clock_is_reliable) {
>         pvclock_via_mem = kvmclock_current_nsec(s);
>     }
> 
> Updating the table from my previous message, the result would be:
> 
> --------+------------+---------------+-----------+---------+
>         | kvm_has_adj_clock_stable() | clock_is_reliable   |
> machine | src        | dst           | src       | dst (*) |
> --------+------------+---------------+-----------+---------+
> pc-2.8  | false      | false         | false     | false   |
> pc-2.8  | false      | true          | false     | false   |
> pc-2.8  | true       | false         | true      | true    |
> pc-2.8  | true       | true          | true      | true    |
> --------+------------+---------------+-----------+---------+
> pc-2.7  | false      | (any)         | false     | false   |
> pc-2.7  | true       | (any)         | true (**) | false   |
> --------+------------+---------------+-----------+---------+
> 
> (*) More precisely: in the first kvmclock_vm_state_change() call
>     just after migration. It will get updated the next time
>     s->clock is set.
> 
> (**) On purpose, so the code I suggest above would work for local
>      restore, too.
> 
> 
> > +                pvclock_via_mem = kvmclock_current_nsec(s);
> > +            }
> > +        /* migration/savevm/init restore */
> > +        } else {
> > +            /*
> > +             * use s->clock in case machine uses reliable
> > +             * get clock and source host supported
> > +             * reliable get clock
> > +             */
> > +            if (!s->src_use_reliable_get_clock) {
> > +                pvclock_via_mem = kvmclock_current_nsec(s);
> > +            }
> > +        }
> >  
> > -        /* We can't rely on the migrated clock value, just discard it */
> > -        if (time_at_migration) {
> > -            s->clock = time_at_migration;
> > +        /* We can't rely on the saved clock value, just discard it */
> > +        if (pvclock_via_mem) {
> > +            s->clock = pvclock_via_mem;
> >          }
> >  
> > +        s->clock_valid = false;
> > +
> >          data.clock = s->clock;
> >          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
> >          if (ret < 0) {
> > @@ -120,8 +160,6 @@
> >              }
> >          }
> >      } else {
> > -        struct kvm_clock_data data;
> > -        int ret;
> >  
> >          if (s->clock_valid) {
> >              return;
> > @@ -129,13 +167,7 @@
> >  
> >          kvm_synchronize_all_tsc();
> >  
> > -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > -        if (ret < 0) {
> > -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > -            abort();
> > -        }
> > -        s->clock = data.clock;
> > -
> > +        s->clock = kvm_get_clock();
> >          /*
> >           * If the VM is stopped, declare the clock state valid to
> >           * avoid re-reading it on next vmsave (which would return
> > @@ -149,25 +181,72 @@
> >  {
> >      KVMClockState *s = KVM_CLOCK(dev);
> >  
> > +    /*
> > +     * On machine types that support reliable KVM_GET_CLOCK,
> > +     * if host kernel does provide reliable KVM_GET_CLOCK,
> > +     * set src_use_reliable_get_clock=true so that destination
> > +     * avoids reading kvmclock from memory.
> > +     */
> > +    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> > +        s->src_use_reliable_get_clock = true;
> > +    }
> > +
> >      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> >  }
> >  
> > +static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> > +{
> > +    KVMClockState *s = opaque;
> > +
> > +    return s->mach_use_reliable_get_clock;
> > +}
> > +
> > +static const VMStateDescription kvmclock_reliable_get_clock = {
> > +    .name = "kvmclock/src_use_reliable_get_clock",
> > +    .version_id = 1,
> > +    .minimum_version_id = 1,
> > +    .needed = kvmclock_src_use_reliable_get_clock,
> > +    .fields = (VMStateField[]) {
> > +        VMSTATE_BOOL(src_use_reliable_get_clock, KVMClockState),
> > +        VMSTATE_END_OF_LIST()
> > +    }
> > +};
> > +
> > +static void kvmclock_pre_save(void *opaque)
> > +{
> > +    KVMClockState *s = opaque;
> > +
> > +    s->clock = kvm_get_clock();
> 
> I can't tell if s->clock_valid here is a bug, or a feature. In
> either case, we seem to be contradicting the comment at
> kvmclock_vm_state_change():
> 
>     /*
>      * If the VM is stopped, declare the clock state valid to
>      * avoid re-reading it on next vmsave (which would return
>        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>      * a different value). Will be reset when the VM is continued.
>      */
>     s->clock_valid = true;

Sorry, what is the problem??

> 
> > +}
> > +
> >  static const VMStateDescription kvmclock_vmsd = {
> >      .name = "kvmclock",
> >      .version_id = 1,
> >      .minimum_version_id = 1,
> > +    .pre_save = kvmclock_pre_save,
> >      .fields = (VMStateField[]) {
> >          VMSTATE_UINT64(clock, KVMClockState),
> >          VMSTATE_END_OF_LIST()
> > +    },
> > +    .subsections = (const VMStateDescription * []) {
> > +        &kvmclock_reliable_get_clock,
> > +        NULL
> >      }
> >  };
> >  
> > +static Property kvmclock_properties[] = {
> > +    DEFINE_PROP_BOOL("x-mach-use-reliable-get-clock", KVMClockState,
> > +                      mach_use_reliable_get_clock, true),
> > +    DEFINE_PROP_END_OF_LIST(),
> > +};
> > +
> >  static void kvmclock_class_init(ObjectClass *klass, void *data)
> >  {
> >      DeviceClass *dc = DEVICE_CLASS(klass);
> >  
> >      dc->realize = kvmclock_realize;
> >      dc->vmsd = &kvmclock_vmsd;
> > +    dc->props = kvmclock_properties;
> >  }
> >  
> >  static const TypeInfo kvmclock_info = {
> > Index: qemu-mig-advance-clock/include/hw/i386/pc.h
> > ===================================================================
> > --- qemu-mig-advance-clock.orig/include/hw/i386/pc.h	2016-11-17 15:07:11.220632761 -0200
> > +++ qemu-mig-advance-clock/include/hw/i386/pc.h	2016-11-17 15:08:58.096791416 -0200
> > @@ -370,6 +370,11 @@
> >  #define PC_COMPAT_2_7 \
> >      HW_COMPAT_2_7 \
> >      {\
> > +        .driver   = "kvmclock",\
> > +        .property = "x-mach-use-reliable-get-clock",\
> > +        .value    = "off",\
> > +    },\
> > +    {\
> >          .driver   = TYPE_X86_CPU,\
> >          .property = "l3-cache",\
> >          .value    = "off",\
> > Index: qemu-mig-advance-clock/target-i386/kvm.c
> > ===================================================================
> > --- qemu-mig-advance-clock.orig/target-i386/kvm.c	2016-11-17 15:07:11.221632762 -0200
> > +++ qemu-mig-advance-clock/target-i386/kvm.c	2016-11-17 15:07:15.867639659 -0200
> > @@ -117,6 +117,13 @@
> >      return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
> >  }
> >  
> > +bool kvm_has_adjust_clock_stable(void)
> > +{
> > +    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
> > +
> > +    return (ret == KVM_CLOCK_TSC_STABLE);
> > +}
> > +
> >  bool kvm_allows_irq0_override(void)
> >  {
> >      return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
> > Index: qemu-mig-advance-clock/target-i386/kvm_i386.h
> > ===================================================================
> > --- qemu-mig-advance-clock.orig/target-i386/kvm_i386.h	2016-11-17 15:07:11.222632764 -0200
> > +++ qemu-mig-advance-clock/target-i386/kvm_i386.h	2016-11-17 15:07:15.867639659 -0200
> > @@ -17,6 +17,7 @@
> >  
> >  bool kvm_allows_irq0_override(void);
> >  bool kvm_has_smm(void);
> > +bool kvm_has_adjust_clock_stable(void);
> >  void kvm_synchronize_all_tsc(void);
> >  void kvm_arch_reset_vcpu(X86CPU *cs);
> >  void kvm_arch_do_init_vcpu(X86CPU *cs);
> > 
> > 
> 
> -- 
> Eduardo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [qemu patch V3 2/2] kvmclock: reduce kvmclock difference on migration
  2016-11-28 16:45     ` Marcelo Tosatti
@ 2016-11-28 17:12       ` Eduardo Habkost
  2016-11-28 17:31         ` Paolo Bonzini
  2016-11-29 10:56         ` Marcelo Tosatti
  0 siblings, 2 replies; 18+ messages in thread
From: Eduardo Habkost @ 2016-11-28 17:12 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm, qemu-devel, Dr David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Radim Krcmar

On Mon, Nov 28, 2016 at 02:45:24PM -0200, Marcelo Tosatti wrote:
> On Mon, Nov 28, 2016 at 12:13:22PM -0200, Eduardo Habkost wrote:
> > Sorry for not noticing the following issues on the previous
> > reviews. I was only paying attention to the vmstate and machine
> > code:
> > 
> > On Mon, Nov 21, 2016 at 08:50:04AM -0200, Marcelo Tosatti wrote:
> > > Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
> > > indicates that KVM_GET_CLOCK returns a value as seen by the guest at
> > > that moment.
> > > 
> > > For new machine types, use this value rather than reading 
> > > from guest memory.
> > > 
> > > This reduces kvmclock difference on migration from 5s to 0.1s
> > > (when max_downtime == 5s).
> > > 
> > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > 
> > > ---
> > >  hw/i386/kvm/clock.c    |  107 ++++++++++++++++++++++++++++++++++++++++++-------
> > >  include/hw/i386/pc.h   |    5 ++
> > >  target-i386/kvm.c      |    7 +++
> > >  target-i386/kvm_i386.h |    1 
> > >  4 files changed, 106 insertions(+), 14 deletions(-)
> > > 
> > > v2: 
> > > - improve variable names (Juan)
> > > - consolidate code on kvm_get_clock function (Paolo)
> > > - return mach_use_reliable_get_clock from needed function (Paolo)
> > > v3: 
> > > - simplify check for src_use_reliable_get_clock (Eduardo)
> > > - move src_use_reliable_get_clock initialization to realize (Eduardo)
> > > 
> > > 
> > > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > > ===================================================================
> > > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:07:11.220632761 -0200
> > > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-17 15:11:51.372048640 -0200
> > > @@ -36,6 +36,12 @@
> > >  
> > >      uint64_t clock;
> > >      bool clock_valid;
> > > +
> > > +    /* whether machine supports reliable KVM_GET_CLOCK */
> > > +    bool mach_use_reliable_get_clock;
> > > +
> > > +    /* whether source host supported reliable KVM_GET_CLOCK */
> > > +    bool src_use_reliable_get_clock;
> > >  } KVMClockState;
> > >  
> > >  struct pvclock_vcpu_time_info {
> > > @@ -81,6 +87,19 @@
> > >      return nsec + time.system_time;
> > >  }
> > >  
> > > +static uint64_t kvm_get_clock(void)
> > > +{
> > > +    struct kvm_clock_data data;
> > > +    int ret;
> > > +
> > > +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > > +    if (ret < 0) {
> > > +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > > +                abort();
> > > +    }
> > > +    return data.clock;
> > > +}
> > > +
> > >  static void kvmclock_vm_state_change(void *opaque, int running,
> > >                                       RunState state)
> > >  {
> > > @@ -91,15 +110,36 @@
> > 
> > Can you please use "diff -p" on your patches?
> > 
> > >  
> > >      if (running) {
> > >          struct kvm_clock_data data = {};
> > > -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> > > +        uint64_t pvclock_via_mem = 0;
> > >  
> > > -        s->clock_valid = false;
> > > +        /* local (running VM) restore */
> > > +        if (s->clock_valid) {
> > > +            /*
> > > +             * if host does not support reliable KVM_GET_CLOCK,
> > > +             * read kvmclock value from memory
> > > +             */
> > > +            if (!kvm_has_adjust_clock_stable()) {
> > 
> > Why not use src_use_reliable_get_clock here, too? (Maybe rename
> > it to simply clock_is_reliable?)
> 
> Because you end up mixing two mental ideas (one: whether the source
> has KVM_GET_CLOCK, second: whether the destination has KVM_GET_CLOCK 
> into one variable). I find it more confusing than not.
> Maybe its just my limited brain but i find it
> confusing.

I find it simpler and easier to reason about.

> 
> > You can change kvm_get_clock() to kvm_get_clock(KVMClockState*),
> > and make it update both s->clock and s->clock_is_reliable. With
> > this, s->clock and s->clock_is_reliable will be always in sync,
> > and have a single code path for both local restore and migration:
> 
> There should not be a single path for migration/runtime cases: 
> on migration, we care whether the source used reliable KVM_GET_CLOCK.
> 
> On runtime, we care whether the destination uses reliable KVM_GET_CLOCK.

The conditions may be different, but both can be translated to a
"should we trust the value in s->clock" flag, to simplify the
code (just like Paolo's suggestion on his reply to v1 today).

> 
> >     /*
> >      * If host that set s->clock did not support reliable
> >      * KVM_GET_CLOCK, read kvmclock value from memory
> >      */
> >     if (!s->clock_is_reliable) {
> >         pvclock_via_mem = kvmclock_current_nsec(s);
> >     }
> > 
> > Updating the table from my previous message, the result would be:
> > 
> > --------+------------+---------------+-----------+---------+
> >         | kvm_has_adj_clock_stable() | clock_is_reliable   |
> > machine | src        | dst           | src       | dst (*) |
> > --------+------------+---------------+-----------+---------+
> > pc-2.8  | false      | false         | false     | false   |
> > pc-2.8  | false      | true          | false     | false   |
> > pc-2.8  | true       | false         | true      | true    |
> > pc-2.8  | true       | true          | true      | true    |
> > --------+------------+---------------+-----------+---------+
> > pc-2.7  | false      | (any)         | false     | false   |
> > pc-2.7  | true       | (any)         | true (**) | false   |
> > --------+------------+---------------+-----------+---------+
> > 
> > (*) More precisely: in the first kvmclock_vm_state_change() call
> >     just after migration. It will get updated the next time
> >     s->clock is set.
> > 
> > (**) On purpose, so the code I suggest above would work for local
> >      restore, too.
> > 
> > 
> > > +                pvclock_via_mem = kvmclock_current_nsec(s);
> > > +            }
> > > +        /* migration/savevm/init restore */
> > > +        } else {
> > > +            /*
> > > +             * use s->clock in case machine uses reliable
> > > +             * get clock and source host supported
> > > +             * reliable get clock
> > > +             */
> > > +            if (!s->src_use_reliable_get_clock) {
> > > +                pvclock_via_mem = kvmclock_current_nsec(s);
> > > +            }
> > > +        }
> > >  
> > > -        /* We can't rely on the migrated clock value, just discard it */
> > > -        if (time_at_migration) {
> > > -            s->clock = time_at_migration;
> > > +        /* We can't rely on the saved clock value, just discard it */
> > > +        if (pvclock_via_mem) {
> > > +            s->clock = pvclock_via_mem;
> > >          }
> > >  
> > > +        s->clock_valid = false;
> > > +
> > >          data.clock = s->clock;
> > >          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
> > >          if (ret < 0) {
> > > @@ -120,8 +160,6 @@
> > >              }
> > >          }
> > >      } else {
> > > -        struct kvm_clock_data data;
> > > -        int ret;
> > >  
> > >          if (s->clock_valid) {
> > >              return;
> > > @@ -129,13 +167,7 @@
> > >  
> > >          kvm_synchronize_all_tsc();
> > >  
> > > -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > > -        if (ret < 0) {
> > > -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > > -            abort();
> > > -        }
> > > -        s->clock = data.clock;
> > > -
> > > +        s->clock = kvm_get_clock();
> > >          /*
> > >           * If the VM is stopped, declare the clock state valid to
> > >           * avoid re-reading it on next vmsave (which would return
> > > @@ -149,25 +181,72 @@
> > >  {
> > >      KVMClockState *s = KVM_CLOCK(dev);
> > >  
> > > +    /*
> > > +     * On machine types that support reliable KVM_GET_CLOCK,
> > > +     * if host kernel does provide reliable KVM_GET_CLOCK,
> > > +     * set src_use_reliable_get_clock=true so that destination
> > > +     * avoids reading kvmclock from memory.
> > > +     */
> > > +    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> > > +        s->src_use_reliable_get_clock = true;
> > > +    }
> > > +
> > >      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> > >  }
> > >  
> > > +static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> > > +{
> > > +    KVMClockState *s = opaque;
> > > +
> > > +    return s->mach_use_reliable_get_clock;
> > > +}
> > > +
> > > +static const VMStateDescription kvmclock_reliable_get_clock = {
> > > +    .name = "kvmclock/src_use_reliable_get_clock",
> > > +    .version_id = 1,
> > > +    .minimum_version_id = 1,
> > > +    .needed = kvmclock_src_use_reliable_get_clock,
> > > +    .fields = (VMStateField[]) {
> > > +        VMSTATE_BOOL(src_use_reliable_get_clock, KVMClockState),
> > > +        VMSTATE_END_OF_LIST()
> > > +    }
> > > +};
> > > +
> > > +static void kvmclock_pre_save(void *opaque)
> > > +{
> > > +    KVMClockState *s = opaque;
> > > +
> > > +    s->clock = kvm_get_clock();
> > 
> > I can't tell if s->clock_valid here is a bug, or a feature. In
> > either case, we seem to be contradicting the comment at
> > kvmclock_vm_state_change():
> > 
> >     /*
> >      * If the VM is stopped, declare the clock state valid to
> >      * avoid re-reading it on next vmsave (which would return
> >        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> >      * a different value). Will be reset when the VM is continued.
> >      */
> >     s->clock_valid = true;
> 
> Sorry, what is the problem??

The comment says clock_valid will avoid re-reading the clock, but
kvmclock_pre_save() ignores clock_valid and will re-read it
unconditionally. Either the comment is incorrect, or
kvmclock_pre_save() is incorrect (I don't know which is the
case).

-- 
Eduardo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [qemu patch V3 2/2] kvmclock: reduce kvmclock difference on migration
  2016-11-28 17:12       ` Eduardo Habkost
@ 2016-11-28 17:31         ` Paolo Bonzini
  2016-11-29 10:56         ` Marcelo Tosatti
  1 sibling, 0 replies; 18+ messages in thread
From: Paolo Bonzini @ 2016-11-28 17:31 UTC (permalink / raw)
  To: Eduardo Habkost, Marcelo Tosatti
  Cc: kvm, qemu-devel, Dr David Alan Gilbert, Juan Quintela,
	Radim Krcmar



On 28/11/2016 18:12, Eduardo Habkost wrote:
>>> > > 
>>> > > Why not use src_use_reliable_get_clock here, too? (Maybe rename
>>> > > it to simply clock_is_reliable?)
>> > 
>> > Because you end up mixing two mental ideas (one: whether the source
>> > has KVM_GET_CLOCK, second: whether the destination has KVM_GET_CLOCK 
>> > into one variable). I find it more confusing than not.
>> > Maybe its just my limited brain but i find it
>> > confusing.
> I find it simpler and easier to reason about.

Me too---note that it's not "whether X had reliable KVM_GET_CLOCK" but
rather "whether the last read came from a reliable KVM_GET_CLOCK".
However...

>>> > > You can change kvm_get_clock() to kvm_get_clock(KVMClockState*),
>>> > > and make it update both s->clock and s->clock_is_reliable. With
>>> > > this, s->clock and s->clock_is_reliable will be always in sync,
>>> > > and have a single code path for both local restore and migration:
>> > 
>> > There should not be a single path for migration/runtime cases: 
>> > on migration, we care whether the source used reliable KVM_GET_CLOCK.
>> > 
>> > On runtime, we care whether the destination uses reliable KVM_GET_CLOCK.
> The conditions may be different, but both can be translated to a
> "should we trust the value in s->clock" flag, to simplify the
> code (just like Paolo's suggestion on his reply to v1 today).

... I think I understand now: Marcelo is approaching the problem
differently.  For him it's not an issue of "should we trust the value"
but rather it's "is the value going to mess up the guest with backwards
time".  Which is fine, it just requires some more comments because it's
subtle and it looks a lot like the kernel API is being misused (while
it's fine).

Paolo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [qemu patch V3 2/2] kvmclock: reduce kvmclock difference on migration
  2016-11-28 17:12       ` Eduardo Habkost
  2016-11-28 17:31         ` Paolo Bonzini
@ 2016-11-29 10:56         ` Marcelo Tosatti
  2016-11-29 12:34           ` Eduardo Habkost
  1 sibling, 1 reply; 18+ messages in thread
From: Marcelo Tosatti @ 2016-11-29 10:56 UTC (permalink / raw)
  To: Eduardo Habkost
  Cc: kvm, qemu-devel, Dr David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Radim Krcmar

On Mon, Nov 28, 2016 at 03:12:01PM -0200, Eduardo Habkost wrote:
> On Mon, Nov 28, 2016 at 02:45:24PM -0200, Marcelo Tosatti wrote:
> > On Mon, Nov 28, 2016 at 12:13:22PM -0200, Eduardo Habkost wrote:
> > > Sorry for not noticing the following issues on the previous
> > > reviews. I was only paying attention to the vmstate and machine
> > > code:
> > > 
> > > On Mon, Nov 21, 2016 at 08:50:04AM -0200, Marcelo Tosatti wrote:
> > > > Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
> > > > indicates that KVM_GET_CLOCK returns a value as seen by the guest at
> > > > that moment.
> > > > 
> > > > For new machine types, use this value rather than reading 
> > > > from guest memory.
> > > > 
> > > > This reduces kvmclock difference on migration from 5s to 0.1s
> > > > (when max_downtime == 5s).
> > > > 
> > > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > > 
> > > > ---
> > > >  hw/i386/kvm/clock.c    |  107 ++++++++++++++++++++++++++++++++++++++++++-------
> > > >  include/hw/i386/pc.h   |    5 ++
> > > >  target-i386/kvm.c      |    7 +++
> > > >  target-i386/kvm_i386.h |    1 
> > > >  4 files changed, 106 insertions(+), 14 deletions(-)
> > > > 
> > > > v2: 
> > > > - improve variable names (Juan)
> > > > - consolidate code on kvm_get_clock function (Paolo)
> > > > - return mach_use_reliable_get_clock from needed function (Paolo)
> > > > v3: 
> > > > - simplify check for src_use_reliable_get_clock (Eduardo)
> > > > - move src_use_reliable_get_clock initialization to realize (Eduardo)
> > > > 
> > > > 
> > > > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > > > ===================================================================
> > > > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:07:11.220632761 -0200
> > > > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-17 15:11:51.372048640 -0200
> > > > @@ -36,6 +36,12 @@
> > > >  
> > > >      uint64_t clock;
> > > >      bool clock_valid;
> > > > +
> > > > +    /* whether machine supports reliable KVM_GET_CLOCK */
> > > > +    bool mach_use_reliable_get_clock;
> > > > +
> > > > +    /* whether source host supported reliable KVM_GET_CLOCK */
> > > > +    bool src_use_reliable_get_clock;
> > > >  } KVMClockState;
> > > >  
> > > >  struct pvclock_vcpu_time_info {
> > > > @@ -81,6 +87,19 @@
> > > >      return nsec + time.system_time;
> > > >  }
> > > >  
> > > > +static uint64_t kvm_get_clock(void)
> > > > +{
> > > > +    struct kvm_clock_data data;
> > > > +    int ret;
> > > > +
> > > > +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > > > +    if (ret < 0) {
> > > > +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > > > +                abort();
> > > > +    }
> > > > +    return data.clock;
> > > > +}
> > > > +
> > > >  static void kvmclock_vm_state_change(void *opaque, int running,
> > > >                                       RunState state)
> > > >  {
> > > > @@ -91,15 +110,36 @@
> > > 
> > > Can you please use "diff -p" on your patches?
> > > 
> > > >  
> > > >      if (running) {
> > > >          struct kvm_clock_data data = {};
> > > > -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> > > > +        uint64_t pvclock_via_mem = 0;
> > > >  
> > > > -        s->clock_valid = false;
> > > > +        /* local (running VM) restore */
> > > > +        if (s->clock_valid) {
> > > > +            /*
> > > > +             * if host does not support reliable KVM_GET_CLOCK,
> > > > +             * read kvmclock value from memory
> > > > +             */
> > > > +            if (!kvm_has_adjust_clock_stable()) {
> > > 
> > > Why not use src_use_reliable_get_clock here, too? (Maybe rename
> > > it to simply clock_is_reliable?)
> > 
> > Because you end up mixing two mental ideas (one: whether the source
> > has KVM_GET_CLOCK, second: whether the destination has KVM_GET_CLOCK 
> > into one variable). I find it more confusing than not.
> > Maybe its just my limited brain but i find it
> > confusing.
> 
> I find it simpler and easier to reason about.

I disagree, but i don't mind. Is this what you refer to:

Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
===================================================================
--- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:11:51.372048640 -0200
+++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-29 08:53:20.609817026 -0200
@@ -37,11 +37,11 @@
     uint64_t clock;
     bool clock_valid;
 
-    /* whether machine supports reliable KVM_GET_CLOCK */
+    /* whether machine type supports reliable get clock */
     bool mach_use_reliable_get_clock;
 
-    /* whether source host supported reliable KVM_GET_CLOCK */
-    bool src_use_reliable_get_clock;
+    /* whether clock is reliable */
+    bool clock_is_reliable;
 } KVMClockState;
 
 struct pvclock_vcpu_time_info {
@@ -112,25 +112,17 @@
         struct kvm_clock_data data = {};
         uint64_t pvclock_via_mem = 0;
 
-        /* local (running VM) restore */
-        if (s->clock_valid) {
-            /*
-             * if host does not support reliable KVM_GET_CLOCK,
-             * read kvmclock value from memory
-             */
-            if (!kvm_has_adjust_clock_stable()) {
-                pvclock_via_mem = kvmclock_current_nsec(s);
-            }
-        /* migration/savevm/init restore */
-        } else {
-            /*
-             * use s->clock in case machine uses reliable
-             * get clock and source host supported
-             * reliable get clock
-             */
-            if (!s->src_use_reliable_get_clock) {
-                pvclock_via_mem = kvmclock_current_nsec(s);
-            }
+
+        if (!s->clock_is_reliable) {
+            pvclock_via_mem = kvmclock_current_nsec(s);
+        }
+
+        /* migration/savevm/init restore
+         * update clock_is_reliable to match local
+         * host capabilities.
+         */
+        if (s->clock_valid == false) {
+            s->clock_is_reliable = kvm_has_adjust_clock_stable();
         }
 
         /* We can't rely on the saved clock value, just discard it */
@@ -181,20 +173,14 @@
 {
     KVMClockState *s = KVM_CLOCK(dev);
 
-    /*
-     * On machine types that support reliable KVM_GET_CLOCK,
-     * if host kernel does provide reliable KVM_GET_CLOCK,
-     * set src_use_reliable_get_clock=true so that destination
-     * avoids reading kvmclock from memory.
-     */
-    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
-        s->src_use_reliable_get_clock = true;
+    if (kvm_has_adjust_clock_stable()) {
+        s->clock_is_reliable = true;
     }
 
     qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
 }
 
-static bool kvmclock_src_use_reliable_get_clock(void *opaque)
+static bool kvmclock_clock_is_reliable_needed(void *opaque)
 {
     KVMClockState *s = opaque;
 
@@ -202,12 +188,12 @@
 }
 
 static const VMStateDescription kvmclock_reliable_get_clock = {
-    .name = "kvmclock/src_use_reliable_get_clock",
+    .name = "kvmclock/clock_is_reliable",
     .version_id = 1,
     .minimum_version_id = 1,
-    .needed = kvmclock_src_use_reliable_get_clock,
+    .needed = kvmclock_clock_is_reliable_needed,
     .fields = (VMStateField[]) {
-        VMSTATE_BOOL(src_use_reliable_get_clock, KVMClockState),
+        VMSTATE_BOOL(clock_is_reliable, KVMClockState),
         VMSTATE_END_OF_LIST()
     }
 };

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [qemu patch V3 2/2] kvmclock: reduce kvmclock difference on migration
  2016-11-29 10:56         ` Marcelo Tosatti
@ 2016-11-29 12:34           ` Eduardo Habkost
  2016-11-29 23:54             ` Marcelo Tosatti
  0 siblings, 1 reply; 18+ messages in thread
From: Eduardo Habkost @ 2016-11-29 12:34 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm, qemu-devel, Dr David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Radim Krcmar

On Tue, Nov 29, 2016 at 08:56:00AM -0200, Marcelo Tosatti wrote:
> On Mon, Nov 28, 2016 at 03:12:01PM -0200, Eduardo Habkost wrote:
> > On Mon, Nov 28, 2016 at 02:45:24PM -0200, Marcelo Tosatti wrote:
> > > On Mon, Nov 28, 2016 at 12:13:22PM -0200, Eduardo Habkost wrote:
> > > > Sorry for not noticing the following issues on the previous
> > > > reviews. I was only paying attention to the vmstate and machine
> > > > code:
> > > > 
> > > > On Mon, Nov 21, 2016 at 08:50:04AM -0200, Marcelo Tosatti wrote:
> > > > > Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
> > > > > indicates that KVM_GET_CLOCK returns a value as seen by the guest at
> > > > > that moment.
> > > > > 
> > > > > For new machine types, use this value rather than reading 
> > > > > from guest memory.
> > > > > 
> > > > > This reduces kvmclock difference on migration from 5s to 0.1s
> > > > > (when max_downtime == 5s).
> > > > > 
> > > > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > > > 
> > > > > ---
> > > > >  hw/i386/kvm/clock.c    |  107 ++++++++++++++++++++++++++++++++++++++++++-------
> > > > >  include/hw/i386/pc.h   |    5 ++
> > > > >  target-i386/kvm.c      |    7 +++
> > > > >  target-i386/kvm_i386.h |    1 
> > > > >  4 files changed, 106 insertions(+), 14 deletions(-)
> > > > > 
> > > > > v2: 
> > > > > - improve variable names (Juan)
> > > > > - consolidate code on kvm_get_clock function (Paolo)
> > > > > - return mach_use_reliable_get_clock from needed function (Paolo)
> > > > > v3: 
> > > > > - simplify check for src_use_reliable_get_clock (Eduardo)
> > > > > - move src_use_reliable_get_clock initialization to realize (Eduardo)
> > > > > 
> > > > > 
> > > > > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > > > > ===================================================================
> > > > > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:07:11.220632761 -0200
> > > > > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-17 15:11:51.372048640 -0200
> > > > > @@ -36,6 +36,12 @@
> > > > >  
> > > > >      uint64_t clock;
> > > > >      bool clock_valid;
> > > > > +
> > > > > +    /* whether machine supports reliable KVM_GET_CLOCK */
> > > > > +    bool mach_use_reliable_get_clock;
> > > > > +
> > > > > +    /* whether source host supported reliable KVM_GET_CLOCK */
> > > > > +    bool src_use_reliable_get_clock;
> > > > >  } KVMClockState;
> > > > >  
> > > > >  struct pvclock_vcpu_time_info {
> > > > > @@ -81,6 +87,19 @@
> > > > >      return nsec + time.system_time;
> > > > >  }
> > > > >  
> > > > > +static uint64_t kvm_get_clock(void)
> > > > > +{
> > > > > +    struct kvm_clock_data data;
> > > > > +    int ret;
> > > > > +
> > > > > +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > > > > +    if (ret < 0) {
> > > > > +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > > > > +                abort();
> > > > > +    }
> > > > > +    return data.clock;
> > > > > +}
> > > > > +
> > > > >  static void kvmclock_vm_state_change(void *opaque, int running,
> > > > >                                       RunState state)
> > > > >  {
> > > > > @@ -91,15 +110,36 @@
> > > > 
> > > > Can you please use "diff -p" on your patches?
> > > > 
> > > > >  
> > > > >      if (running) {
> > > > >          struct kvm_clock_data data = {};
> > > > > -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> > > > > +        uint64_t pvclock_via_mem = 0;
> > > > >  
> > > > > -        s->clock_valid = false;
> > > > > +        /* local (running VM) restore */
> > > > > +        if (s->clock_valid) {
> > > > > +            /*
> > > > > +             * if host does not support reliable KVM_GET_CLOCK,
> > > > > +             * read kvmclock value from memory
> > > > > +             */
> > > > > +            if (!kvm_has_adjust_clock_stable()) {
> > > > 
> > > > Why not use src_use_reliable_get_clock here, too? (Maybe rename
> > > > it to simply clock_is_reliable?)
> > > 
> > > Because you end up mixing two mental ideas (one: whether the source
> > > has KVM_GET_CLOCK, second: whether the destination has KVM_GET_CLOCK 
> > > into one variable). I find it more confusing than not.
> > > Maybe its just my limited brain but i find it
> > > confusing.
> > 
> > I find it simpler and easier to reason about.
> 
> I disagree, but i don't mind. Is this what you refer to:
> 
> Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> ===================================================================
> --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:11:51.372048640 -0200
> +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-29 08:53:20.609817026 -0200
> @@ -37,11 +37,11 @@
>      uint64_t clock;
>      bool clock_valid;
>  
> -    /* whether machine supports reliable KVM_GET_CLOCK */
> +    /* whether machine type supports reliable get clock */
>      bool mach_use_reliable_get_clock;
>  
> -    /* whether source host supported reliable KVM_GET_CLOCK */
> -    bool src_use_reliable_get_clock;
> +    /* whether clock is reliable */

I would make the comment more explicit about its meaning:
  /* whether the 'clock' value was obtained in a host with
   * reliable KVM_GET_CLOCK */

> +    bool clock_is_reliable;
>  } KVMClockState;
>  
>  struct pvclock_vcpu_time_info {
> @@ -112,25 +112,17 @@
>          struct kvm_clock_data data = {};
>          uint64_t pvclock_via_mem = 0;
>  
> -        /* local (running VM) restore */
> -        if (s->clock_valid) {
> -            /*
> -             * if host does not support reliable KVM_GET_CLOCK,
> -             * read kvmclock value from memory
> -             */
> -            if (!kvm_has_adjust_clock_stable()) {
> -                pvclock_via_mem = kvmclock_current_nsec(s);
> -            }
> -        /* migration/savevm/init restore */
> -        } else {
> -            /*
> -             * use s->clock in case machine uses reliable
> -             * get clock and source host supported
> -             * reliable get clock
> -             */
> -            if (!s->src_use_reliable_get_clock) {
> -                pvclock_via_mem = kvmclock_current_nsec(s);
> -            }
> +

A comment here would be welcome. Something like: "If the host
where s->clock was read did not support reliable KVM_GET_CLOCK,
read kvmclock value from memory".

> +        if (!s->clock_is_reliable) {
> +            pvclock_via_mem = kvmclock_current_nsec(s);
> +        }
> +
> +        /* migration/savevm/init restore
> +         * update clock_is_reliable to match local
> +         * host capabilities.
> +         */
> +        if (s->clock_valid == false) {

Why the s->clock_valid check? It seems unnecessary.

(If this is an optimization to avoid extra ioctl() calls, you can
cache it inside kvm_has_adjust_clock_stable()).

> +            s->clock_is_reliable = kvm_has_adjust_clock_stable();
>          }

This should go to kvm_get_clock(), so s->clock and
s->clock_is_reliable are guaranteed to be always in sync.

>  
>          /* We can't rely on the saved clock value, just discard it */
> @@ -181,20 +173,14 @@
>  {
>      KVMClockState *s = KVM_CLOCK(dev);
>  
> -    /*
> -     * On machine types that support reliable KVM_GET_CLOCK,
> -     * if host kernel does provide reliable KVM_GET_CLOCK,
> -     * set src_use_reliable_get_clock=true so that destination
> -     * avoids reading kvmclock from memory.
> -     */
> -    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> -        s->src_use_reliable_get_clock = true;
> +    if (kvm_has_adjust_clock_stable()) {
> +        s->clock_is_reliable = true;
>      }
>  
>      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
>  }
>  
> -static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> +static bool kvmclock_clock_is_reliable_needed(void *opaque)
>  {
>      KVMClockState *s = opaque;
>  
> @@ -202,12 +188,12 @@
>  }
>  
>  static const VMStateDescription kvmclock_reliable_get_clock = {
> -    .name = "kvmclock/src_use_reliable_get_clock",
> +    .name = "kvmclock/clock_is_reliable",
>      .version_id = 1,
>      .minimum_version_id = 1,
> -    .needed = kvmclock_src_use_reliable_get_clock,
> +    .needed = kvmclock_clock_is_reliable_needed,
>      .fields = (VMStateField[]) {
> -        VMSTATE_BOOL(src_use_reliable_get_clock, KVMClockState),
> +        VMSTATE_BOOL(clock_is_reliable, KVMClockState),
>          VMSTATE_END_OF_LIST()
>      }
>  };

-- 
Eduardo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [qemu patch V3 2/2] kvmclock: reduce kvmclock difference on migration
  2016-11-29 12:34           ` Eduardo Habkost
@ 2016-11-29 23:54             ` Marcelo Tosatti
  2016-11-30 13:09               ` Eduardo Habkost
  0 siblings, 1 reply; 18+ messages in thread
From: Marcelo Tosatti @ 2016-11-29 23:54 UTC (permalink / raw)
  To: Eduardo Habkost
  Cc: kvm, qemu-devel, Dr David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Radim Krcmar

On Tue, Nov 29, 2016 at 10:34:05AM -0200, Eduardo Habkost wrote:
> On Tue, Nov 29, 2016 at 08:56:00AM -0200, Marcelo Tosatti wrote:
> > On Mon, Nov 28, 2016 at 03:12:01PM -0200, Eduardo Habkost wrote:
> > > On Mon, Nov 28, 2016 at 02:45:24PM -0200, Marcelo Tosatti wrote:
> > > > On Mon, Nov 28, 2016 at 12:13:22PM -0200, Eduardo Habkost wrote:
> > > > > Sorry for not noticing the following issues on the previous
> > > > > reviews. I was only paying attention to the vmstate and machine
> > > > > code:
> > > > > 
> > > > > On Mon, Nov 21, 2016 at 08:50:04AM -0200, Marcelo Tosatti wrote:
> > > > > > Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
> > > > > > indicates that KVM_GET_CLOCK returns a value as seen by the guest at
> > > > > > that moment.
> > > > > > 
> > > > > > For new machine types, use this value rather than reading 
> > > > > > from guest memory.
> > > > > > 
> > > > > > This reduces kvmclock difference on migration from 5s to 0.1s
> > > > > > (when max_downtime == 5s).
> > > > > > 
> > > > > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > > > > 
> > > > > > ---
> > > > > >  hw/i386/kvm/clock.c    |  107 ++++++++++++++++++++++++++++++++++++++++++-------
> > > > > >  include/hw/i386/pc.h   |    5 ++
> > > > > >  target-i386/kvm.c      |    7 +++
> > > > > >  target-i386/kvm_i386.h |    1 
> > > > > >  4 files changed, 106 insertions(+), 14 deletions(-)
> > > > > > 
> > > > > > v2: 
> > > > > > - improve variable names (Juan)
> > > > > > - consolidate code on kvm_get_clock function (Paolo)
> > > > > > - return mach_use_reliable_get_clock from needed function (Paolo)
> > > > > > v3: 
> > > > > > - simplify check for src_use_reliable_get_clock (Eduardo)
> > > > > > - move src_use_reliable_get_clock initialization to realize (Eduardo)
> > > > > > 
> > > > > > 
> > > > > > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > > > > > ===================================================================
> > > > > > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:07:11.220632761 -0200
> > > > > > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-17 15:11:51.372048640 -0200
> > > > > > @@ -36,6 +36,12 @@
> > > > > >  
> > > > > >      uint64_t clock;
> > > > > >      bool clock_valid;
> > > > > > +
> > > > > > +    /* whether machine supports reliable KVM_GET_CLOCK */
> > > > > > +    bool mach_use_reliable_get_clock;
> > > > > > +
> > > > > > +    /* whether source host supported reliable KVM_GET_CLOCK */
> > > > > > +    bool src_use_reliable_get_clock;
> > > > > >  } KVMClockState;
> > > > > >  
> > > > > >  struct pvclock_vcpu_time_info {
> > > > > > @@ -81,6 +87,19 @@
> > > > > >      return nsec + time.system_time;
> > > > > >  }
> > > > > >  
> > > > > > +static uint64_t kvm_get_clock(void)
> > > > > > +{
> > > > > > +    struct kvm_clock_data data;
> > > > > > +    int ret;
> > > > > > +
> > > > > > +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > > > > > +    if (ret < 0) {
> > > > > > +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > > > > > +                abort();
> > > > > > +    }
> > > > > > +    return data.clock;
> > > > > > +}
> > > > > > +
> > > > > >  static void kvmclock_vm_state_change(void *opaque, int running,
> > > > > >                                       RunState state)
> > > > > >  {
> > > > > > @@ -91,15 +110,36 @@
> > > > > 
> > > > > Can you please use "diff -p" on your patches?
> > > > > 
> > > > > >  
> > > > > >      if (running) {
> > > > > >          struct kvm_clock_data data = {};
> > > > > > -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> > > > > > +        uint64_t pvclock_via_mem = 0;
> > > > > >  
> > > > > > -        s->clock_valid = false;
> > > > > > +        /* local (running VM) restore */
> > > > > > +        if (s->clock_valid) {
> > > > > > +            /*
> > > > > > +             * if host does not support reliable KVM_GET_CLOCK,
> > > > > > +             * read kvmclock value from memory
> > > > > > +             */
> > > > > > +            if (!kvm_has_adjust_clock_stable()) {
> > > > > 
> > > > > Why not use src_use_reliable_get_clock here, too? (Maybe rename
> > > > > it to simply clock_is_reliable?)
> > > > 
> > > > Because you end up mixing two mental ideas (one: whether the source
> > > > has KVM_GET_CLOCK, second: whether the destination has KVM_GET_CLOCK 
> > > > into one variable). I find it more confusing than not.
> > > > Maybe its just my limited brain but i find it
> > > > confusing.
> > > 
> > > I find it simpler and easier to reason about.
> > 
> > I disagree, but i don't mind. Is this what you refer to:
> > 
> > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > ===================================================================
> > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:11:51.372048640 -0200
> > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-29 08:53:20.609817026 -0200
> > @@ -37,11 +37,11 @@
> >      uint64_t clock;
> >      bool clock_valid;
> >  
> > -    /* whether machine supports reliable KVM_GET_CLOCK */
> > +    /* whether machine type supports reliable get clock */
> >      bool mach_use_reliable_get_clock;
> >  
> > -    /* whether source host supported reliable KVM_GET_CLOCK */
> > -    bool src_use_reliable_get_clock;
> > +    /* whether clock is reliable */
> 
> I would make the comment more explicit about its meaning:
>   /* whether the 'clock' value was obtained in a host with
>    * reliable KVM_GET_CLOCK */

Done.

> > +    bool clock_is_reliable;
>
>  } KVMClockState;
> >  
> >  struct pvclock_vcpu_time_info {
> > @@ -112,25 +112,17 @@
> >          struct kvm_clock_data data = {};
> >          uint64_t pvclock_via_mem = 0;
> >  
> > -        /* local (running VM) restore */
> > -        if (s->clock_valid) {
> > -            /*
> > -             * if host does not support reliable KVM_GET_CLOCK,
> > -             * read kvmclock value from memory
> > -             */
> > -            if (!kvm_has_adjust_clock_stable()) {
> > -                pvclock_via_mem = kvmclock_current_nsec(s);
> > -            }
> > -        /* migration/savevm/init restore */
> > -        } else {
> > -            /*
> > -             * use s->clock in case machine uses reliable
> > -             * get clock and source host supported
> > -             * reliable get clock
> > -             */
> > -            if (!s->src_use_reliable_get_clock) {
> > -                pvclock_via_mem = kvmclock_current_nsec(s);
> > -            }
> > +
> 
> A comment here would be welcome. Something like: "If the host
> where s->clock was read did not support reliable KVM_GET_CLOCK,
> read kvmclock value from memory".

Done.

> > +        if (!s->clock_is_reliable) {
> > +            pvclock_via_mem = kvmclock_current_nsec(s);
> > +        }
> > +
> > +        /* migration/savevm/init restore
> > +         * update clock_is_reliable to match local
> > +         * host capabilities.
> > +         */
> > +        if (s->clock_valid == false) {
> 
> Why the s->clock_valid check? It seems unnecessary.

Because if s->clock_valid == false (from migration),
kvm_has_adjust_clock_stable() from the local host is irrelevant.

If s->clock_valid == true, kvm_has_adjust_clock_stable() is
what decides whether to use KVM_GET_CLOCK or not.

> (If this is an optimization to avoid extra ioctl() calls, you can
> cache it inside kvm_has_adjust_clock_stable()).

Its not.

> > +            s->clock_is_reliable = kvm_has_adjust_clock_stable();
> >          }
> 
> This should go to kvm_get_clock(), so s->clock and
> s->clock_is_reliable are guaranteed to be always in sync.

No: the point to change clock_is_reliable is the following: whether 
its the first read of the clock (migration), or subsequent 
reads (normal machine operation).

Now kvm_get_clock() just reads the clock, so i don't see it as
the right place to change the variable.

Agreed?

> >          /* We can't rely on the saved clock value, just discard it */
> > @@ -181,20 +173,14 @@
> >  {
> >      KVMClockState *s = KVM_CLOCK(dev);
> >  
> > -    /*
> > -     * On machine types that support reliable KVM_GET_CLOCK,
> > -     * if host kernel does provide reliable KVM_GET_CLOCK,
> > -     * set src_use_reliable_get_clock=true so that destination
> > -     * avoids reading kvmclock from memory.
> > -     */
> > -    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> > -        s->src_use_reliable_get_clock = true;
> > +    if (kvm_has_adjust_clock_stable()) {
> > +        s->clock_is_reliable = true;
> >      }
> >  
> >      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> >  }
> >  
> > -static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> > +static bool kvmclock_clock_is_reliable_needed(void *opaque)
> >  {
> >      KVMClockState *s = opaque;
> >  
> > @@ -202,12 +188,12 @@
> >  }
> >  
> >  static const VMStateDescription kvmclock_reliable_get_clock = {
> > -    .name = "kvmclock/src_use_reliable_get_clock",
> > +    .name = "kvmclock/clock_is_reliable",
> >      .version_id = 1,
> >      .minimum_version_id = 1,
> > -    .needed = kvmclock_src_use_reliable_get_clock,
> > +    .needed = kvmclock_clock_is_reliable_needed,
> >      .fields = (VMStateField[]) {
> > -        VMSTATE_BOOL(src_use_reliable_get_clock, KVMClockState),
> > +        VMSTATE_BOOL(clock_is_reliable, KVMClockState),
> >          VMSTATE_END_OF_LIST()
> >      }
> >  };
> 
> -- 
> Eduardo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [qemu patch V3 2/2] kvmclock: reduce kvmclock difference on migration
  2016-11-29 23:54             ` Marcelo Tosatti
@ 2016-11-30 13:09               ` Eduardo Habkost
  2016-12-01 14:39                 ` Marcelo Tosatti
  0 siblings, 1 reply; 18+ messages in thread
From: Eduardo Habkost @ 2016-11-30 13:09 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm, qemu-devel, Dr David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Radim Krcmar

On Tue, Nov 29, 2016 at 09:54:29PM -0200, Marcelo Tosatti wrote:
> On Tue, Nov 29, 2016 at 10:34:05AM -0200, Eduardo Habkost wrote:
> > On Tue, Nov 29, 2016 at 08:56:00AM -0200, Marcelo Tosatti wrote:
> > > On Mon, Nov 28, 2016 at 03:12:01PM -0200, Eduardo Habkost wrote:
> > > > On Mon, Nov 28, 2016 at 02:45:24PM -0200, Marcelo Tosatti wrote:
> > > > > On Mon, Nov 28, 2016 at 12:13:22PM -0200, Eduardo Habkost wrote:
> > > > > > Sorry for not noticing the following issues on the previous
> > > > > > reviews. I was only paying attention to the vmstate and machine
> > > > > > code:
> > > > > > 
> > > > > > On Mon, Nov 21, 2016 at 08:50:04AM -0200, Marcelo Tosatti wrote:
> > > > > > > Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
> > > > > > > indicates that KVM_GET_CLOCK returns a value as seen by the guest at
> > > > > > > that moment.
> > > > > > > 
> > > > > > > For new machine types, use this value rather than reading 
> > > > > > > from guest memory.
> > > > > > > 
> > > > > > > This reduces kvmclock difference on migration from 5s to 0.1s
> > > > > > > (when max_downtime == 5s).
> > > > > > > 
> > > > > > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > > > > > 
> > > > > > > ---
> > > > > > >  hw/i386/kvm/clock.c    |  107 ++++++++++++++++++++++++++++++++++++++++++-------
> > > > > > >  include/hw/i386/pc.h   |    5 ++
> > > > > > >  target-i386/kvm.c      |    7 +++
> > > > > > >  target-i386/kvm_i386.h |    1 
> > > > > > >  4 files changed, 106 insertions(+), 14 deletions(-)
> > > > > > > 
> > > > > > > v2: 
> > > > > > > - improve variable names (Juan)
> > > > > > > - consolidate code on kvm_get_clock function (Paolo)
> > > > > > > - return mach_use_reliable_get_clock from needed function (Paolo)
> > > > > > > v3: 
> > > > > > > - simplify check for src_use_reliable_get_clock (Eduardo)
> > > > > > > - move src_use_reliable_get_clock initialization to realize (Eduardo)
> > > > > > > 
> > > > > > > 
> > > > > > > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > > > > > > ===================================================================
> > > > > > > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:07:11.220632761 -0200
> > > > > > > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-17 15:11:51.372048640 -0200
> > > > > > > @@ -36,6 +36,12 @@
> > > > > > >  
> > > > > > >      uint64_t clock;
> > > > > > >      bool clock_valid;
> > > > > > > +
> > > > > > > +    /* whether machine supports reliable KVM_GET_CLOCK */
> > > > > > > +    bool mach_use_reliable_get_clock;
> > > > > > > +
> > > > > > > +    /* whether source host supported reliable KVM_GET_CLOCK */
> > > > > > > +    bool src_use_reliable_get_clock;
> > > > > > >  } KVMClockState;
> > > > > > >  
> > > > > > >  struct pvclock_vcpu_time_info {
> > > > > > > @@ -81,6 +87,19 @@
> > > > > > >      return nsec + time.system_time;
> > > > > > >  }
> > > > > > >  
> > > > > > > +static uint64_t kvm_get_clock(void)
> > > > > > > +{
> > > > > > > +    struct kvm_clock_data data;
> > > > > > > +    int ret;
> > > > > > > +
> > > > > > > +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > > > > > > +    if (ret < 0) {
> > > > > > > +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > > > > > > +                abort();
> > > > > > > +    }
> > > > > > > +    return data.clock;
> > > > > > > +}
> > > > > > > +
> > > > > > >  static void kvmclock_vm_state_change(void *opaque, int running,
> > > > > > >                                       RunState state)
> > > > > > >  {
> > > > > > > @@ -91,15 +110,36 @@
> > > > > > 
> > > > > > Can you please use "diff -p" on your patches?
> > > > > > 
> > > > > > >  
> > > > > > >      if (running) {
> > > > > > >          struct kvm_clock_data data = {};
> > > > > > > -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> > > > > > > +        uint64_t pvclock_via_mem = 0;
> > > > > > >  
> > > > > > > -        s->clock_valid = false;
> > > > > > > +        /* local (running VM) restore */
> > > > > > > +        if (s->clock_valid) {
> > > > > > > +            /*
> > > > > > > +             * if host does not support reliable KVM_GET_CLOCK,
> > > > > > > +             * read kvmclock value from memory
> > > > > > > +             */
> > > > > > > +            if (!kvm_has_adjust_clock_stable()) {
> > > > > > 
> > > > > > Why not use src_use_reliable_get_clock here, too? (Maybe rename
> > > > > > it to simply clock_is_reliable?)
> > > > > 
> > > > > Because you end up mixing two mental ideas (one: whether the source
> > > > > has KVM_GET_CLOCK, second: whether the destination has KVM_GET_CLOCK 
> > > > > into one variable). I find it more confusing than not.
> > > > > Maybe its just my limited brain but i find it
> > > > > confusing.
> > > > 
> > > > I find it simpler and easier to reason about.
> > > 
> > > I disagree, but i don't mind. Is this what you refer to:
> > > 
> > > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > > ===================================================================
> > > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:11:51.372048640 -0200
> > > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-29 08:53:20.609817026 -0200
> > > @@ -37,11 +37,11 @@
> > >      uint64_t clock;
> > >      bool clock_valid;
> > >  
> > > -    /* whether machine supports reliable KVM_GET_CLOCK */
> > > +    /* whether machine type supports reliable get clock */
> > >      bool mach_use_reliable_get_clock;
> > >  
> > > -    /* whether source host supported reliable KVM_GET_CLOCK */
> > > -    bool src_use_reliable_get_clock;
> > > +    /* whether clock is reliable */
> > 
> > I would make the comment more explicit about its meaning:
> >   /* whether the 'clock' value was obtained in a host with
> >    * reliable KVM_GET_CLOCK */
> 
> Done.
> 
> > > +    bool clock_is_reliable;
> >
> >  } KVMClockState;
> > >  
> > >  struct pvclock_vcpu_time_info {
> > > @@ -112,25 +112,17 @@
> > >          struct kvm_clock_data data = {};
> > >          uint64_t pvclock_via_mem = 0;
> > >  
> > > -        /* local (running VM) restore */
> > > -        if (s->clock_valid) {
> > > -            /*
> > > -             * if host does not support reliable KVM_GET_CLOCK,
> > > -             * read kvmclock value from memory
> > > -             */
> > > -            if (!kvm_has_adjust_clock_stable()) {
> > > -                pvclock_via_mem = kvmclock_current_nsec(s);
> > > -            }
> > > -        /* migration/savevm/init restore */
> > > -        } else {
> > > -            /*
> > > -             * use s->clock in case machine uses reliable
> > > -             * get clock and source host supported
> > > -             * reliable get clock
> > > -             */
> > > -            if (!s->src_use_reliable_get_clock) {
> > > -                pvclock_via_mem = kvmclock_current_nsec(s);
> > > -            }
> > > +
> > 
> > A comment here would be welcome. Something like: "If the host
> > where s->clock was read did not support reliable KVM_GET_CLOCK,
> > read kvmclock value from memory".
> 
> Done.
> 
> > > +        if (!s->clock_is_reliable) {
> > > +            pvclock_via_mem = kvmclock_current_nsec(s);
> > > +        }
> > > +
> > > +        /* migration/savevm/init restore
> > > +         * update clock_is_reliable to match local
> > > +         * host capabilities.
> > > +         */
> > > +        if (s->clock_valid == false) {
> > 
> > Why the s->clock_valid check? It seems unnecessary.
> 
> Because if s->clock_valid == false (from migration),
> kvm_has_adjust_clock_stable() from the local host is irrelevant.

That's true, and that's why updating it on kvm_get_clock() looks
like a more logical place.

> 
> If s->clock_valid == true, kvm_has_adjust_clock_stable() is
> what decides whether to use KVM_GET_CLOCK or not.
> 
> > (If this is an optimization to avoid extra ioctl() calls, you can
> > cache it inside kvm_has_adjust_clock_stable()).
> 
> Its not.
> 
> > > +            s->clock_is_reliable = kvm_has_adjust_clock_stable();
> > >          }
> > 
> > This should go to kvm_get_clock(), so s->clock and
> > s->clock_is_reliable are guaranteed to be always in sync.
> 
> No: the point to change clock_is_reliable is the following: whether 
> its the first read of the clock (migration), or subsequent 
> reads (normal machine operation).
> 
> Now kvm_get_clock() just reads the clock, so i don't see it as
> the right place to change the variable.
> 
> Agreed?

I don't agree. clock_is_reliable is information about the current
value in s->clock, I don't see why we need to change s->clock and
s->clock_is_reliable at different places in the code.

Your code might work too, but I can't tell without reviewing very
carefully the state transitions KVMClockState could go through. I
don't see the point of making the state transitions in this code
even more complex, if we can simply stick to a simple "s->clock
and s->clock_is_reliable are always in sync" rule.

> 
> > >          /* We can't rely on the saved clock value, just discard it */
> > > @@ -181,20 +173,14 @@
> > >  {
> > >      KVMClockState *s = KVM_CLOCK(dev);
> > >  
> > > -    /*
> > > -     * On machine types that support reliable KVM_GET_CLOCK,
> > > -     * if host kernel does provide reliable KVM_GET_CLOCK,
> > > -     * set src_use_reliable_get_clock=true so that destination
> > > -     * avoids reading kvmclock from memory.
> > > -     */
> > > -    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> > > -        s->src_use_reliable_get_clock = true;
> > > +    if (kvm_has_adjust_clock_stable()) {
> > > +        s->clock_is_reliable = true;
> > >      }
> > >  
> > >      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> > >  }
> > >  
> > > -static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> > > +static bool kvmclock_clock_is_reliable_needed(void *opaque)
> > >  {
> > >      KVMClockState *s = opaque;
> > >  
> > > @@ -202,12 +188,12 @@
> > >  }
> > >  
> > >  static const VMStateDescription kvmclock_reliable_get_clock = {
> > > -    .name = "kvmclock/src_use_reliable_get_clock",
> > > +    .name = "kvmclock/clock_is_reliable",
> > >      .version_id = 1,
> > >      .minimum_version_id = 1,
> > > -    .needed = kvmclock_src_use_reliable_get_clock,
> > > +    .needed = kvmclock_clock_is_reliable_needed,
> > >      .fields = (VMStateField[]) {
> > > -        VMSTATE_BOOL(src_use_reliable_get_clock, KVMClockState),
> > > +        VMSTATE_BOOL(clock_is_reliable, KVMClockState),
> > >          VMSTATE_END_OF_LIST()
> > >      }
> > >  };
> > 
> > -- 
> > Eduardo

-- 
Eduardo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [qemu patch V3 2/2] kvmclock: reduce kvmclock difference on migration
  2016-11-30 13:09               ` Eduardo Habkost
@ 2016-12-01 14:39                 ` Marcelo Tosatti
  2016-12-02 13:56                   ` Eduardo Habkost
  0 siblings, 1 reply; 18+ messages in thread
From: Marcelo Tosatti @ 2016-12-01 14:39 UTC (permalink / raw)
  To: Eduardo Habkost
  Cc: kvm, qemu-devel, Dr David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Radim Krcmar

On Wed, Nov 30, 2016 at 11:09:28AM -0200, Eduardo Habkost wrote:
> On Tue, Nov 29, 2016 at 09:54:29PM -0200, Marcelo Tosatti wrote:
> > On Tue, Nov 29, 2016 at 10:34:05AM -0200, Eduardo Habkost wrote:
> > > On Tue, Nov 29, 2016 at 08:56:00AM -0200, Marcelo Tosatti wrote:
> > > > On Mon, Nov 28, 2016 at 03:12:01PM -0200, Eduardo Habkost wrote:
> > > > > On Mon, Nov 28, 2016 at 02:45:24PM -0200, Marcelo Tosatti wrote:
> > > > > > On Mon, Nov 28, 2016 at 12:13:22PM -0200, Eduardo Habkost wrote:
> > > > > > > Sorry for not noticing the following issues on the previous
> > > > > > > reviews. I was only paying attention to the vmstate and machine
> > > > > > > code:
> > > > > > > 
> > > > > > > On Mon, Nov 21, 2016 at 08:50:04AM -0200, Marcelo Tosatti wrote:
> > > > > > > > Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
> > > > > > > > indicates that KVM_GET_CLOCK returns a value as seen by the guest at
> > > > > > > > that moment.
> > > > > > > > 
> > > > > > > > For new machine types, use this value rather than reading 
> > > > > > > > from guest memory.
> > > > > > > > 
> > > > > > > > This reduces kvmclock difference on migration from 5s to 0.1s
> > > > > > > > (when max_downtime == 5s).
> > > > > > > > 
> > > > > > > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > > > > > > 
> > > > > > > > ---
> > > > > > > >  hw/i386/kvm/clock.c    |  107 ++++++++++++++++++++++++++++++++++++++++++-------
> > > > > > > >  include/hw/i386/pc.h   |    5 ++
> > > > > > > >  target-i386/kvm.c      |    7 +++
> > > > > > > >  target-i386/kvm_i386.h |    1 
> > > > > > > >  4 files changed, 106 insertions(+), 14 deletions(-)
> > > > > > > > 
> > > > > > > > v2: 
> > > > > > > > - improve variable names (Juan)
> > > > > > > > - consolidate code on kvm_get_clock function (Paolo)
> > > > > > > > - return mach_use_reliable_get_clock from needed function (Paolo)
> > > > > > > > v3: 
> > > > > > > > - simplify check for src_use_reliable_get_clock (Eduardo)
> > > > > > > > - move src_use_reliable_get_clock initialization to realize (Eduardo)
> > > > > > > > 
> > > > > > > > 
> > > > > > > > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > > > > > > > ===================================================================
> > > > > > > > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:07:11.220632761 -0200
> > > > > > > > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-17 15:11:51.372048640 -0200
> > > > > > > > @@ -36,6 +36,12 @@
> > > > > > > >  
> > > > > > > >      uint64_t clock;
> > > > > > > >      bool clock_valid;
> > > > > > > > +
> > > > > > > > +    /* whether machine supports reliable KVM_GET_CLOCK */
> > > > > > > > +    bool mach_use_reliable_get_clock;
> > > > > > > > +
> > > > > > > > +    /* whether source host supported reliable KVM_GET_CLOCK */
> > > > > > > > +    bool src_use_reliable_get_clock;
> > > > > > > >  } KVMClockState;
> > > > > > > >  
> > > > > > > >  struct pvclock_vcpu_time_info {
> > > > > > > > @@ -81,6 +87,19 @@
> > > > > > > >      return nsec + time.system_time;
> > > > > > > >  }
> > > > > > > >  
> > > > > > > > +static uint64_t kvm_get_clock(void)
> > > > > > > > +{
> > > > > > > > +    struct kvm_clock_data data;
> > > > > > > > +    int ret;
> > > > > > > > +
> > > > > > > > +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > > > > > > > +    if (ret < 0) {
> > > > > > > > +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > > > > > > > +                abort();
> > > > > > > > +    }
> > > > > > > > +    return data.clock;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >  static void kvmclock_vm_state_change(void *opaque, int running,
> > > > > > > >                                       RunState state)
> > > > > > > >  {
> > > > > > > > @@ -91,15 +110,36 @@
> > > > > > > 
> > > > > > > Can you please use "diff -p" on your patches?
> > > > > > > 
> > > > > > > >  
> > > > > > > >      if (running) {
> > > > > > > >          struct kvm_clock_data data = {};
> > > > > > > > -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> > > > > > > > +        uint64_t pvclock_via_mem = 0;
> > > > > > > >  
> > > > > > > > -        s->clock_valid = false;
> > > > > > > > +        /* local (running VM) restore */
> > > > > > > > +        if (s->clock_valid) {
> > > > > > > > +            /*
> > > > > > > > +             * if host does not support reliable KVM_GET_CLOCK,
> > > > > > > > +             * read kvmclock value from memory
> > > > > > > > +             */
> > > > > > > > +            if (!kvm_has_adjust_clock_stable()) {
> > > > > > > 
> > > > > > > Why not use src_use_reliable_get_clock here, too? (Maybe rename
> > > > > > > it to simply clock_is_reliable?)
> > > > > > 
> > > > > > Because you end up mixing two mental ideas (one: whether the source
> > > > > > has KVM_GET_CLOCK, second: whether the destination has KVM_GET_CLOCK 
> > > > > > into one variable). I find it more confusing than not.
> > > > > > Maybe its just my limited brain but i find it
> > > > > > confusing.
> > > > > 
> > > > > I find it simpler and easier to reason about.
> > > > 
> > > > I disagree, but i don't mind. Is this what you refer to:
> > > > 
> > > > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > > > ===================================================================
> > > > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:11:51.372048640 -0200
> > > > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-29 08:53:20.609817026 -0200
> > > > @@ -37,11 +37,11 @@
> > > >      uint64_t clock;
> > > >      bool clock_valid;
> > > >  
> > > > -    /* whether machine supports reliable KVM_GET_CLOCK */
> > > > +    /* whether machine type supports reliable get clock */
> > > >      bool mach_use_reliable_get_clock;
> > > >  
> > > > -    /* whether source host supported reliable KVM_GET_CLOCK */
> > > > -    bool src_use_reliable_get_clock;
> > > > +    /* whether clock is reliable */
> > > 
> > > I would make the comment more explicit about its meaning:
> > >   /* whether the 'clock' value was obtained in a host with
> > >    * reliable KVM_GET_CLOCK */
> > 
> > Done.
> > 
> > > > +    bool clock_is_reliable;
> > >
> > >  } KVMClockState;
> > > >  
> > > >  struct pvclock_vcpu_time_info {
> > > > @@ -112,25 +112,17 @@
> > > >          struct kvm_clock_data data = {};
> > > >          uint64_t pvclock_via_mem = 0;
> > > >  
> > > > -        /* local (running VM) restore */
> > > > -        if (s->clock_valid) {
> > > > -            /*
> > > > -             * if host does not support reliable KVM_GET_CLOCK,
> > > > -             * read kvmclock value from memory
> > > > -             */
> > > > -            if (!kvm_has_adjust_clock_stable()) {
> > > > -                pvclock_via_mem = kvmclock_current_nsec(s);
> > > > -            }
> > > > -        /* migration/savevm/init restore */
> > > > -        } else {
> > > > -            /*
> > > > -             * use s->clock in case machine uses reliable
> > > > -             * get clock and source host supported
> > > > -             * reliable get clock
> > > > -             */
> > > > -            if (!s->src_use_reliable_get_clock) {
> > > > -                pvclock_via_mem = kvmclock_current_nsec(s);
> > > > -            }
> > > > +
> > > 
> > > A comment here would be welcome. Something like: "If the host
> > > where s->clock was read did not support reliable KVM_GET_CLOCK,
> > > read kvmclock value from memory".
> > 
> > Done.
> > 
> > > > +        if (!s->clock_is_reliable) {
> > > > +            pvclock_via_mem = kvmclock_current_nsec(s);
> > > > +        }
> > > > +
> > > > +        /* migration/savevm/init restore
> > > > +         * update clock_is_reliable to match local
> > > > +         * host capabilities.
> > > > +         */
> > > > +        if (s->clock_valid == false) {
> > > 
> > > Why the s->clock_valid check? It seems unnecessary.
> > 
> > Because if s->clock_valid == false (from migration),
> > kvm_has_adjust_clock_stable() from the local host is irrelevant.
> 
> That's true, and that's why updating it on kvm_get_clock() looks
> like a more logical place.

But the current patch updates
s->clock_is_reliable at the "if (running)"
case of kvmclock_vm_state_change().

There, both s->clock_is_reliable and s->clock are
updated in sync:

        if (s->clock_valid == false) {
            s->clock_is_reliable = kvm_has_adjust_clock_stable();
        }

        /* We can't rely on the saved clock value, just discard it */
        if (pvclock_via_mem) {
            s->clock = pvclock_via_mem;
        }

However, there is no need to call kvm_get_clock() there.

kvm_get_clock() is called at the "} else {" case.

Are you suggesting to

* move s->clock_is_reliable assignment to kvm_get_clock()
* move s->clock assignment to kvm_get_clock()
* call kvm_get_clock() from the "if (running)" because
the assignments are in that function now.

But you don't need KVM_GET_CLOCK to be called there.


> 
> > 
> > If s->clock_valid == true, kvm_has_adjust_clock_stable() is
> > what decides whether to use KVM_GET_CLOCK or not.
> > 
> > > (If this is an optimization to avoid extra ioctl() calls, you can
> > > cache it inside kvm_has_adjust_clock_stable()).
> > 
> > Its not.
> > 
> > > > +            s->clock_is_reliable = kvm_has_adjust_clock_stable();
> > > >          }
> > > 
> > > This should go to kvm_get_clock(), so s->clock and
> > > s->clock_is_reliable are guaranteed to be always in sync.
> > 
> > No: the point to change clock_is_reliable is the following: whether 
> > its the first read of the clock (migration), or subsequent 
> > reads (normal machine operation).
> > 
> > Now kvm_get_clock() just reads the clock, so i don't see it as
> > the right place to change the variable.
> > 
> > Agreed?
> 
> I don't agree. clock_is_reliable is information about the current
> value in s->clock, I don't see why we need to change s->clock and
> s->clock_is_reliable at different places in the code.
> 
> Your code might work too, but I can't tell without reviewing very
> carefully the state transitions KVMClockState could go through. I
> don't see the point of making the state transitions in this code
> even more complex, if we can simply stick to a simple "s->clock
> and s->clock_is_reliable are always in sync" rule.

It is updating from the same place:

        /* migration/savevm/init restore
         * update clock_is_reliable to match local
         * host capabilities.
         */
        if (s->clock_valid == false) {
            s->clock_is_reliable = kvm_has_adjust_clock_stable();
        }

        /* We can't rely on the saved clock value, just discard it */
        if (pvclock_via_mem) {
            s->clock = pvclock_via_mem;
        }


> 
> > 
> > > >          /* We can't rely on the saved clock value, just discard it */
> > > > @@ -181,20 +173,14 @@
> > > >  {
> > > >      KVMClockState *s = KVM_CLOCK(dev);
> > > >  
> > > > -    /*
> > > > -     * On machine types that support reliable KVM_GET_CLOCK,
> > > > -     * if host kernel does provide reliable KVM_GET_CLOCK,
> > > > -     * set src_use_reliable_get_clock=true so that destination
> > > > -     * avoids reading kvmclock from memory.
> > > > -     */
> > > > -    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> > > > -        s->src_use_reliable_get_clock = true;
> > > > +    if (kvm_has_adjust_clock_stable()) {
> > > > +        s->clock_is_reliable = true;
> > > >      }
> > > >  
> > > >      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> > > >  }
> > > >  
> > > > -static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> > > > +static bool kvmclock_clock_is_reliable_needed(void *opaque)
> > > >  {
> > > >      KVMClockState *s = opaque;
> > > >  
> > > > @@ -202,12 +188,12 @@
> > > >  }
> > > >  
> > > >  static const VMStateDescription kvmclock_reliable_get_clock = {
> > > > -    .name = "kvmclock/src_use_reliable_get_clock",
> > > > +    .name = "kvmclock/clock_is_reliable",
> > > >      .version_id = 1,
> > > >      .minimum_version_id = 1,
> > > > -    .needed = kvmclock_src_use_reliable_get_clock,
> > > > +    .needed = kvmclock_clock_is_reliable_needed,
> > > >      .fields = (VMStateField[]) {
> > > > -        VMSTATE_BOOL(src_use_reliable_get_clock, KVMClockState),
> > > > +        VMSTATE_BOOL(clock_is_reliable, KVMClockState),
> > > >          VMSTATE_END_OF_LIST()
> > > >      }
> > > >  };
> > > 
> > > -- 
> > > Eduardo
> 
> -- 
> Eduardo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [qemu patch V3 2/2] kvmclock: reduce kvmclock difference on migration
  2016-12-01 14:39                 ` Marcelo Tosatti
@ 2016-12-02 13:56                   ` Eduardo Habkost
  2016-12-02 22:30                     ` Marcelo Tosatti
  0 siblings, 1 reply; 18+ messages in thread
From: Eduardo Habkost @ 2016-12-02 13:56 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm, qemu-devel, Dr David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Radim Krcmar

On Thu, Dec 01, 2016 at 12:39:03PM -0200, Marcelo Tosatti wrote:
> On Wed, Nov 30, 2016 at 11:09:28AM -0200, Eduardo Habkost wrote:
> > On Tue, Nov 29, 2016 at 09:54:29PM -0200, Marcelo Tosatti wrote:
> > > On Tue, Nov 29, 2016 at 10:34:05AM -0200, Eduardo Habkost wrote:
> > > > On Tue, Nov 29, 2016 at 08:56:00AM -0200, Marcelo Tosatti wrote:
> > > > > On Mon, Nov 28, 2016 at 03:12:01PM -0200, Eduardo Habkost wrote:
> > > > > > On Mon, Nov 28, 2016 at 02:45:24PM -0200, Marcelo Tosatti wrote:
> > > > > > > On Mon, Nov 28, 2016 at 12:13:22PM -0200, Eduardo Habkost wrote:
> > > > > > > > Sorry for not noticing the following issues on the previous
> > > > > > > > reviews. I was only paying attention to the vmstate and machine
> > > > > > > > code:
> > > > > > > > 
> > > > > > > > On Mon, Nov 21, 2016 at 08:50:04AM -0200, Marcelo Tosatti wrote:
> > > > > > > > > Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
> > > > > > > > > indicates that KVM_GET_CLOCK returns a value as seen by the guest at
> > > > > > > > > that moment.
> > > > > > > > > 
> > > > > > > > > For new machine types, use this value rather than reading 
> > > > > > > > > from guest memory.
> > > > > > > > > 
> > > > > > > > > This reduces kvmclock difference on migration from 5s to 0.1s
> > > > > > > > > (when max_downtime == 5s).
> > > > > > > > > 
> > > > > > > > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > > > > > > > 
> > > > > > > > > ---
> > > > > > > > >  hw/i386/kvm/clock.c    |  107 ++++++++++++++++++++++++++++++++++++++++++-------
> > > > > > > > >  include/hw/i386/pc.h   |    5 ++
> > > > > > > > >  target-i386/kvm.c      |    7 +++
> > > > > > > > >  target-i386/kvm_i386.h |    1 
> > > > > > > > >  4 files changed, 106 insertions(+), 14 deletions(-)
> > > > > > > > > 
> > > > > > > > > v2: 
> > > > > > > > > - improve variable names (Juan)
> > > > > > > > > - consolidate code on kvm_get_clock function (Paolo)
> > > > > > > > > - return mach_use_reliable_get_clock from needed function (Paolo)
> > > > > > > > > v3: 
> > > > > > > > > - simplify check for src_use_reliable_get_clock (Eduardo)
> > > > > > > > > - move src_use_reliable_get_clock initialization to realize (Eduardo)
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > > > > > > > > ===================================================================
> > > > > > > > > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:07:11.220632761 -0200
> > > > > > > > > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-17 15:11:51.372048640 -0200
> > > > > > > > > @@ -36,6 +36,12 @@
> > > > > > > > >  
> > > > > > > > >      uint64_t clock;
> > > > > > > > >      bool clock_valid;
> > > > > > > > > +
> > > > > > > > > +    /* whether machine supports reliable KVM_GET_CLOCK */
> > > > > > > > > +    bool mach_use_reliable_get_clock;
> > > > > > > > > +
> > > > > > > > > +    /* whether source host supported reliable KVM_GET_CLOCK */
> > > > > > > > > +    bool src_use_reliable_get_clock;
> > > > > > > > >  } KVMClockState;
> > > > > > > > >  
> > > > > > > > >  struct pvclock_vcpu_time_info {
> > > > > > > > > @@ -81,6 +87,19 @@
> > > > > > > > >      return nsec + time.system_time;
> > > > > > > > >  }
> > > > > > > > >  
> > > > > > > > > +static uint64_t kvm_get_clock(void)
> > > > > > > > > +{
> > > > > > > > > +    struct kvm_clock_data data;
> > > > > > > > > +    int ret;
> > > > > > > > > +
> > > > > > > > > +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > > > > > > > > +    if (ret < 0) {
> > > > > > > > > +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > > > > > > > > +                abort();
> > > > > > > > > +    }
> > > > > > > > > +    return data.clock;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > >  static void kvmclock_vm_state_change(void *opaque, int running,
> > > > > > > > >                                       RunState state)
> > > > > > > > >  {
> > > > > > > > > @@ -91,15 +110,36 @@
> > > > > > > > 
> > > > > > > > Can you please use "diff -p" on your patches?
> > > > > > > > 
> > > > > > > > >  
> > > > > > > > >      if (running) {
> > > > > > > > >          struct kvm_clock_data data = {};
> > > > > > > > > -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> > > > > > > > > +        uint64_t pvclock_via_mem = 0;
> > > > > > > > >  
> > > > > > > > > -        s->clock_valid = false;
> > > > > > > > > +        /* local (running VM) restore */
> > > > > > > > > +        if (s->clock_valid) {
> > > > > > > > > +            /*
> > > > > > > > > +             * if host does not support reliable KVM_GET_CLOCK,
> > > > > > > > > +             * read kvmclock value from memory
> > > > > > > > > +             */
> > > > > > > > > +            if (!kvm_has_adjust_clock_stable()) {
> > > > > > > > 
> > > > > > > > Why not use src_use_reliable_get_clock here, too? (Maybe rename
> > > > > > > > it to simply clock_is_reliable?)
> > > > > > > 
> > > > > > > Because you end up mixing two mental ideas (one: whether the source
> > > > > > > has KVM_GET_CLOCK, second: whether the destination has KVM_GET_CLOCK 
> > > > > > > into one variable). I find it more confusing than not.
> > > > > > > Maybe its just my limited brain but i find it
> > > > > > > confusing.
> > > > > > 
> > > > > > I find it simpler and easier to reason about.
> > > > > 
> > > > > I disagree, but i don't mind. Is this what you refer to:
> > > > > 
> > > > > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > > > > ===================================================================
> > > > > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:11:51.372048640 -0200
> > > > > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-29 08:53:20.609817026 -0200
> > > > > @@ -37,11 +37,11 @@
> > > > >      uint64_t clock;
> > > > >      bool clock_valid;
> > > > >  
> > > > > -    /* whether machine supports reliable KVM_GET_CLOCK */
> > > > > +    /* whether machine type supports reliable get clock */
> > > > >      bool mach_use_reliable_get_clock;
> > > > >  
> > > > > -    /* whether source host supported reliable KVM_GET_CLOCK */
> > > > > -    bool src_use_reliable_get_clock;
> > > > > +    /* whether clock is reliable */
> > > > 
> > > > I would make the comment more explicit about its meaning:
> > > >   /* whether the 'clock' value was obtained in a host with
> > > >    * reliable KVM_GET_CLOCK */
> > > 
> > > Done.
> > > 
> > > > > +    bool clock_is_reliable;
> > > >
> > > >  } KVMClockState;
> > > > >  
> > > > >  struct pvclock_vcpu_time_info {
> > > > > @@ -112,25 +112,17 @@
> > > > >          struct kvm_clock_data data = {};
> > > > >          uint64_t pvclock_via_mem = 0;
> > > > >  
> > > > > -        /* local (running VM) restore */
> > > > > -        if (s->clock_valid) {
> > > > > -            /*
> > > > > -             * if host does not support reliable KVM_GET_CLOCK,
> > > > > -             * read kvmclock value from memory
> > > > > -             */
> > > > > -            if (!kvm_has_adjust_clock_stable()) {
> > > > > -                pvclock_via_mem = kvmclock_current_nsec(s);
> > > > > -            }
> > > > > -        /* migration/savevm/init restore */
> > > > > -        } else {
> > > > > -            /*
> > > > > -             * use s->clock in case machine uses reliable
> > > > > -             * get clock and source host supported
> > > > > -             * reliable get clock
> > > > > -             */
> > > > > -            if (!s->src_use_reliable_get_clock) {
> > > > > -                pvclock_via_mem = kvmclock_current_nsec(s);
> > > > > -            }
> > > > > +
> > > > 
> > > > A comment here would be welcome. Something like: "If the host
> > > > where s->clock was read did not support reliable KVM_GET_CLOCK,
> > > > read kvmclock value from memory".
> > > 
> > > Done.
> > > 
> > > > > +        if (!s->clock_is_reliable) {
> > > > > +            pvclock_via_mem = kvmclock_current_nsec(s);
> > > > > +        }
> > > > > +
> > > > > +        /* migration/savevm/init restore
> > > > > +         * update clock_is_reliable to match local
> > > > > +         * host capabilities.
> > > > > +         */
> > > > > +        if (s->clock_valid == false) {
> > > > 
> > > > Why the s->clock_valid check? It seems unnecessary.
> > > 
> > > Because if s->clock_valid == false (from migration),
> > > kvm_has_adjust_clock_stable() from the local host is irrelevant.
> > 
> > That's true, and that's why updating it on kvm_get_clock() looks
> > like a more logical place.
> 
> But the current patch updates
> s->clock_is_reliable at the "if (running)"
> case of kvmclock_vm_state_change().
> 
> There, both s->clock_is_reliable and s->clock are
> updated in sync:
> 
>         if (s->clock_valid == false) {
>             s->clock_is_reliable = kvm_has_adjust_clock_stable();
>         }
> 
>         /* We can't rely on the saved clock value, just discard it */
>         if (pvclock_via_mem) {
>             s->clock = pvclock_via_mem;
>         }
> 

Here s->clock is updated only to be used to set data.clock in the
next line (and never used again). I'm talking about every other
s->clock assignment in the code.

> However, there is no need to call kvm_get_clock() there.

I'm talking about the existing kvm_get_clock() calls.

> 
> kvm_get_clock() is called at the "} else {" case.
> 
> Are you suggesting to
> 
> * move s->clock_is_reliable assignment to kvm_get_clock()
> * move s->clock assignment to kvm_get_clock()
> * call kvm_get_clock() from the "if (running)" because
> the assignments are in that function now.

No.

> 
> But you don't need KVM_GET_CLOCK to be called there.

This is what I'm suggesting:

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
[ehabkost: simplified migration logic]
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
---
 hw/i386/kvm/clock.c    | 95 ++++++++++++++++++++++++++++++++++++++++++--------
 include/hw/i386/pc.h   |  5 +++
 target-i386/kvm.c      |  7 ++++
 target-i386/kvm_i386.h |  1 +
 4 files changed, 94 insertions(+), 14 deletions(-)

diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
index 0f75dd3..df07ec7 100644
--- a/hw/i386/kvm/clock.c
+++ b/hw/i386/kvm/clock.c
@@ -35,6 +35,13 @@ typedef struct KVMClockState {
     /*< public >*/
 
     uint64_t clock;
+    /* whether the value in 'clock' is reliable (i.e. was set by a host
+     * with reliable KVM_GET_CLOCK) */
+    bool clock_is_reliable;
+    /* Enable migration of clock_is_reliable. Some machine-types
+     * will disable it for compatibility with older QEMU versions.
+     */
+    bool migrate_reliable_flag;
     bool clock_valid;
 } KVMClockState;
 
@@ -81,6 +88,20 @@ static uint64_t kvmclock_current_nsec(KVMClockState *s)
     return nsec + time.system_time;
 }
 
+static void kvm_get_clock(KVMClockState *s)
+{
+    struct kvm_clock_data data;
+    int ret;
+
+    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
+    if (ret < 0) {
+        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
+        abort();
+    }
+    s->clock = data.clock;
+    s->clock_is_reliable = kvm_has_adjust_clock_stable();
+}
+
 static void kvmclock_vm_state_change(void *opaque, int running,
                                      RunState state)
 {
@@ -91,16 +112,18 @@ static void kvmclock_vm_state_change(void *opaque, int running,
 
     if (running) {
         struct kvm_clock_data data = {};
-        uint64_t time_at_migration = kvmclock_current_nsec(s);
 
         s->clock_valid = false;
 
-        /* We can't rely on the migrated clock value, just discard it */
-        if (time_at_migration) {
-            s->clock = time_at_migration;
+        /* if host that set s->clock did not support reliable KVM_GET_CLOCK,
+         * read kvmclock value from memory
+         */
+        if (!s->clock_is_reliable) {
+            data.clock = kvmclock_current_nsec(s);
+        } else {
+            data.clock = s->clock;
         }
 
-        data.clock = s->clock;
         ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
         if (ret < 0) {
             fprintf(stderr, "KVM_SET_CLOCK failed: %s\n", strerror(ret));
@@ -120,22 +143,23 @@ static void kvmclock_vm_state_change(void *opaque, int running,
             }
         }
     } else {
-        struct kvm_clock_data data;
-        int ret;
 
+        /*FIXME: is this really possible to happen?
+         * Can kvmclock_vm_state_change(s, 0) be called twice without
+         * a kvmclock_vm_state_change(s, 1) call in between?
+         */
         if (s->clock_valid) {
             return;
         }
 
         kvm_synchronize_all_tsc();
 
-        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
-        if (ret < 0) {
-            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
-            abort();
-        }
-        s->clock = data.clock;
-
+        kvm_get_clock(s);
+        /* FIXME: the comment below doesn't match what is done at
+         * kvmclock_pre_save(). The comment must be updated, or
+         * kvmclock_pre_save() must be changed. I don't know what's
+         * the right thing to do.
+         */
         /*
          * If the VM is stopped, declare the clock state valid to
          * avoid re-reading it on next vmsave (which would return
@@ -152,22 +176,65 @@ static void kvmclock_realize(DeviceState *dev, Error **errp)
     qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
 }
 
+static bool kvmclock_src_use_reliable_get_clock(void *opaque)
+{
+    KVMClockState *s = opaque;
+
+    return s->migrate_reliable_flag;
+}
+
+static const VMStateDescription kvmclock_reliable_get_clock = {
+    .name = "kvmclock/clock_is_reliable",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = kvmclock_src_use_reliable_get_clock,
+    .fields = (VMStateField[]) {
+        VMSTATE_BOOL(clock_is_reliable, KVMClockState),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static void kvmclock_pre_save(void *opaque)
+{
+    KVMClockState *s = opaque;
+    /*TODO: add comment explaining why this is here:
+     * "re-read clock value here because [...]"" */
+
+    /*XXX: is it correct to call kvm_get_clock() here
+     * if s->clock_is_valid is true? (meaning the kvmclock was
+     * already stopped)
+     */
+    kvm_get_clock(s);
+}
+
 static const VMStateDescription kvmclock_vmsd = {
     .name = "kvmclock",
     .version_id = 1,
     .minimum_version_id = 1,
+    .pre_save = kvmclock_pre_save,
     .fields = (VMStateField[]) {
         VMSTATE_UINT64(clock, KVMClockState),
         VMSTATE_END_OF_LIST()
+    },
+    .subsections = (const VMStateDescription * []) {
+        &kvmclock_reliable_get_clock,
+        NULL
     }
 };
 
+static Property kvmclock_properties[] = {
+    DEFINE_PROP_BOOL("x-migrate-reliable-flag", KVMClockState,
+                      migrate_reliable_flag, true),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
 static void kvmclock_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
 
     dc->realize = kvmclock_realize;
     dc->vmsd = &kvmclock_vmsd;
+    dc->props = kvmclock_properties;
 }
 
 static const TypeInfo kvmclock_info = {
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 4b74130..fb80d69 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -372,6 +372,11 @@ bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *);
 #define PC_COMPAT_2_7 \
     HW_COMPAT_2_7 \
     {\
+        .driver   = "kvmclock",\
+        .property = "x-migrate-reliable-flag",\
+        .value    = "off",\
+    },\
+    {\
         .driver   = TYPE_X86_CPU,\
         .property = "l3-cache",\
         .value    = "off",\
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index f62264a..10a9cd8 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -117,6 +117,13 @@ bool kvm_has_smm(void)
     return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
 }
 
+bool kvm_has_adjust_clock_stable(void)
+{
+    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
+
+    return (ret == KVM_CLOCK_TSC_STABLE);
+}
+
 bool kvm_allows_irq0_override(void)
 {
     return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
diff --git a/target-i386/kvm_i386.h b/target-i386/kvm_i386.h
index 7607929..bfce427 100644
--- a/target-i386/kvm_i386.h
+++ b/target-i386/kvm_i386.h
@@ -17,6 +17,7 @@
 
 bool kvm_allows_irq0_override(void);
 bool kvm_has_smm(void);
+bool kvm_has_adjust_clock_stable(void);
 void kvm_synchronize_all_tsc(void);
 void kvm_arch_reset_vcpu(X86CPU *cs);
 void kvm_arch_do_init_vcpu(X86CPU *cs);
-- 
2.7.4


> 
> 
> > 
> > > 
> > > If s->clock_valid == true, kvm_has_adjust_clock_stable() is
> > > what decides whether to use KVM_GET_CLOCK or not.
> > > 
> > > > (If this is an optimization to avoid extra ioctl() calls, you can
> > > > cache it inside kvm_has_adjust_clock_stable()).
> > > 
> > > Its not.
> > > 
> > > > > +            s->clock_is_reliable = kvm_has_adjust_clock_stable();
> > > > >          }
> > > > 
> > > > This should go to kvm_get_clock(), so s->clock and
> > > > s->clock_is_reliable are guaranteed to be always in sync.
> > > 
> > > No: the point to change clock_is_reliable is the following: whether 
> > > its the first read of the clock (migration), or subsequent 
> > > reads (normal machine operation).
> > > 
> > > Now kvm_get_clock() just reads the clock, so i don't see it as
> > > the right place to change the variable.
> > > 
> > > Agreed?
> > 
> > I don't agree. clock_is_reliable is information about the current
> > value in s->clock, I don't see why we need to change s->clock and
> > s->clock_is_reliable at different places in the code.
> > 
> > Your code might work too, but I can't tell without reviewing very
> > carefully the state transitions KVMClockState could go through. I
> > don't see the point of making the state transitions in this code
> > even more complex, if we can simply stick to a simple "s->clock
> > and s->clock_is_reliable are always in sync" rule.
> 
> It is updating from the same place:
> 
>         /* migration/savevm/init restore
>          * update clock_is_reliable to match local
>          * host capabilities.
>          */
>         if (s->clock_valid == false) {
>             s->clock_is_reliable = kvm_has_adjust_clock_stable();
>         }
> 
>         /* We can't rely on the saved clock value, just discard it */
>         if (pvclock_via_mem) {
>             s->clock = pvclock_via_mem;
>         }

I'm talking about the other s->clock assignments. This one
doesn't even seem to be necessary.

> 
> 
> > 
> > > 
> > > > >          /* We can't rely on the saved clock value, just discard it */
> > > > > @@ -181,20 +173,14 @@
> > > > >  {
> > > > >      KVMClockState *s = KVM_CLOCK(dev);
> > > > >  
> > > > > -    /*
> > > > > -     * On machine types that support reliable KVM_GET_CLOCK,
> > > > > -     * if host kernel does provide reliable KVM_GET_CLOCK,
> > > > > -     * set src_use_reliable_get_clock=true so that destination
> > > > > -     * avoids reading kvmclock from memory.
> > > > > -     */
> > > > > -    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> > > > > -        s->src_use_reliable_get_clock = true;
> > > > > +    if (kvm_has_adjust_clock_stable()) {
> > > > > +        s->clock_is_reliable = true;
> > > > >      }
> > > > >  
> > > > >      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> > > > >  }
> > > > >  
> > > > > -static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> > > > > +static bool kvmclock_clock_is_reliable_needed(void *opaque)
> > > > >  {
> > > > >      KVMClockState *s = opaque;
> > > > >  
> > > > > @@ -202,12 +188,12 @@
> > > > >  }
> > > > >  
> > > > >  static const VMStateDescription kvmclock_reliable_get_clock = {
> > > > > -    .name = "kvmclock/src_use_reliable_get_clock",
> > > > > +    .name = "kvmclock/clock_is_reliable",
> > > > >      .version_id = 1,
> > > > >      .minimum_version_id = 1,
> > > > > -    .needed = kvmclock_src_use_reliable_get_clock,
> > > > > +    .needed = kvmclock_clock_is_reliable_needed,
> > > > >      .fields = (VMStateField[]) {
> > > > > -        VMSTATE_BOOL(src_use_reliable_get_clock, KVMClockState),
> > > > > +        VMSTATE_BOOL(clock_is_reliable, KVMClockState),
> > > > >          VMSTATE_END_OF_LIST()
> > > > >      }
> > > > >  };
> > > > 
> > > > -- 
> > > > Eduardo
> > 
> > -- 
> > Eduardo

-- 
Eduardo

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [qemu patch V3 2/2] kvmclock: reduce kvmclock difference on migration
  2016-12-02 13:56                   ` Eduardo Habkost
@ 2016-12-02 22:30                     ` Marcelo Tosatti
  2016-12-05 15:17                       ` Eduardo Habkost
  0 siblings, 1 reply; 18+ messages in thread
From: Marcelo Tosatti @ 2016-12-02 22:30 UTC (permalink / raw)
  To: Eduardo Habkost
  Cc: kvm, qemu-devel, Dr David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Radim Krcmar

On Fri, Dec 02, 2016 at 11:56:09AM -0200, Eduardo Habkost wrote:
> On Thu, Dec 01, 2016 at 12:39:03PM -0200, Marcelo Tosatti wrote:
> > On Wed, Nov 30, 2016 at 11:09:28AM -0200, Eduardo Habkost wrote:
> > > On Tue, Nov 29, 2016 at 09:54:29PM -0200, Marcelo Tosatti wrote:
> > > > On Tue, Nov 29, 2016 at 10:34:05AM -0200, Eduardo Habkost wrote:
> > > > > On Tue, Nov 29, 2016 at 08:56:00AM -0200, Marcelo Tosatti wrote:
> > > > > > On Mon, Nov 28, 2016 at 03:12:01PM -0200, Eduardo Habkost wrote:
> > > > > > > On Mon, Nov 28, 2016 at 02:45:24PM -0200, Marcelo Tosatti wrote:
> > > > > > > > On Mon, Nov 28, 2016 at 12:13:22PM -0200, Eduardo Habkost wrote:
> > > > > > > > > Sorry for not noticing the following issues on the previous
> > > > > > > > > reviews. I was only paying attention to the vmstate and machine
> > > > > > > > > code:
> > > > > > > > > 
> > > > > > > > > On Mon, Nov 21, 2016 at 08:50:04AM -0200, Marcelo Tosatti wrote:
> > > > > > > > > > Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
> > > > > > > > > > indicates that KVM_GET_CLOCK returns a value as seen by the guest at
> > > > > > > > > > that moment.
> > > > > > > > > > 
> > > > > > > > > > For new machine types, use this value rather than reading 
> > > > > > > > > > from guest memory.
> > > > > > > > > > 
> > > > > > > > > > This reduces kvmclock difference on migration from 5s to 0.1s
> > > > > > > > > > (when max_downtime == 5s).
> > > > > > > > > > 
> > > > > > > > > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > > > > > > > > 
> > > > > > > > > > ---
> > > > > > > > > >  hw/i386/kvm/clock.c    |  107 ++++++++++++++++++++++++++++++++++++++++++-------
> > > > > > > > > >  include/hw/i386/pc.h   |    5 ++
> > > > > > > > > >  target-i386/kvm.c      |    7 +++
> > > > > > > > > >  target-i386/kvm_i386.h |    1 
> > > > > > > > > >  4 files changed, 106 insertions(+), 14 deletions(-)
> > > > > > > > > > 
> > > > > > > > > > v2: 
> > > > > > > > > > - improve variable names (Juan)
> > > > > > > > > > - consolidate code on kvm_get_clock function (Paolo)
> > > > > > > > > > - return mach_use_reliable_get_clock from needed function (Paolo)
> > > > > > > > > > v3: 
> > > > > > > > > > - simplify check for src_use_reliable_get_clock (Eduardo)
> > > > > > > > > > - move src_use_reliable_get_clock initialization to realize (Eduardo)
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > > > > > > > > > ===================================================================
> > > > > > > > > > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:07:11.220632761 -0200
> > > > > > > > > > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-17 15:11:51.372048640 -0200
> > > > > > > > > > @@ -36,6 +36,12 @@
> > > > > > > > > >  
> > > > > > > > > >      uint64_t clock;
> > > > > > > > > >      bool clock_valid;
> > > > > > > > > > +
> > > > > > > > > > +    /* whether machine supports reliable KVM_GET_CLOCK */
> > > > > > > > > > +    bool mach_use_reliable_get_clock;
> > > > > > > > > > +
> > > > > > > > > > +    /* whether source host supported reliable KVM_GET_CLOCK */
> > > > > > > > > > +    bool src_use_reliable_get_clock;
> > > > > > > > > >  } KVMClockState;
> > > > > > > > > >  
> > > > > > > > > >  struct pvclock_vcpu_time_info {
> > > > > > > > > > @@ -81,6 +87,19 @@
> > > > > > > > > >      return nsec + time.system_time;
> > > > > > > > > >  }
> > > > > > > > > >  
> > > > > > > > > > +static uint64_t kvm_get_clock(void)
> > > > > > > > > > +{
> > > > > > > > > > +    struct kvm_clock_data data;
> > > > > > > > > > +    int ret;
> > > > > > > > > > +
> > > > > > > > > > +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > > > > > > > > > +    if (ret < 0) {
> > > > > > > > > > +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > > > > > > > > > +                abort();
> > > > > > > > > > +    }
> > > > > > > > > > +    return data.clock;
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > >  static void kvmclock_vm_state_change(void *opaque, int running,
> > > > > > > > > >                                       RunState state)
> > > > > > > > > >  {
> > > > > > > > > > @@ -91,15 +110,36 @@
> > > > > > > > > 
> > > > > > > > > Can you please use "diff -p" on your patches?
> > > > > > > > > 
> > > > > > > > > >  
> > > > > > > > > >      if (running) {
> > > > > > > > > >          struct kvm_clock_data data = {};
> > > > > > > > > > -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> > > > > > > > > > +        uint64_t pvclock_via_mem = 0;
> > > > > > > > > >  
> > > > > > > > > > -        s->clock_valid = false;
> > > > > > > > > > +        /* local (running VM) restore */
> > > > > > > > > > +        if (s->clock_valid) {
> > > > > > > > > > +            /*
> > > > > > > > > > +             * if host does not support reliable KVM_GET_CLOCK,
> > > > > > > > > > +             * read kvmclock value from memory
> > > > > > > > > > +             */
> > > > > > > > > > +            if (!kvm_has_adjust_clock_stable()) {
> > > > > > > > > 
> > > > > > > > > Why not use src_use_reliable_get_clock here, too? (Maybe rename
> > > > > > > > > it to simply clock_is_reliable?)
> > > > > > > > 
> > > > > > > > Because you end up mixing two mental ideas (one: whether the source
> > > > > > > > has KVM_GET_CLOCK, second: whether the destination has KVM_GET_CLOCK 
> > > > > > > > into one variable). I find it more confusing than not.
> > > > > > > > Maybe its just my limited brain but i find it
> > > > > > > > confusing.
> > > > > > > 
> > > > > > > I find it simpler and easier to reason about.
> > > > > > 
> > > > > > I disagree, but i don't mind. Is this what you refer to:
> > > > > > 
> > > > > > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > > > > > ===================================================================
> > > > > > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:11:51.372048640 -0200
> > > > > > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-29 08:53:20.609817026 -0200
> > > > > > @@ -37,11 +37,11 @@
> > > > > >      uint64_t clock;
> > > > > >      bool clock_valid;
> > > > > >  
> > > > > > -    /* whether machine supports reliable KVM_GET_CLOCK */
> > > > > > +    /* whether machine type supports reliable get clock */
> > > > > >      bool mach_use_reliable_get_clock;
> > > > > >  
> > > > > > -    /* whether source host supported reliable KVM_GET_CLOCK */
> > > > > > -    bool src_use_reliable_get_clock;
> > > > > > +    /* whether clock is reliable */
> > > > > 
> > > > > I would make the comment more explicit about its meaning:
> > > > >   /* whether the 'clock' value was obtained in a host with
> > > > >    * reliable KVM_GET_CLOCK */
> > > > 
> > > > Done.
> > > > 
> > > > > > +    bool clock_is_reliable;
> > > > >
> > > > >  } KVMClockState;
> > > > > >  
> > > > > >  struct pvclock_vcpu_time_info {
> > > > > > @@ -112,25 +112,17 @@
> > > > > >          struct kvm_clock_data data = {};
> > > > > >          uint64_t pvclock_via_mem = 0;
> > > > > >  
> > > > > > -        /* local (running VM) restore */
> > > > > > -        if (s->clock_valid) {
> > > > > > -            /*
> > > > > > -             * if host does not support reliable KVM_GET_CLOCK,
> > > > > > -             * read kvmclock value from memory
> > > > > > -             */
> > > > > > -            if (!kvm_has_adjust_clock_stable()) {
> > > > > > -                pvclock_via_mem = kvmclock_current_nsec(s);
> > > > > > -            }
> > > > > > -        /* migration/savevm/init restore */
> > > > > > -        } else {
> > > > > > -            /*
> > > > > > -             * use s->clock in case machine uses reliable
> > > > > > -             * get clock and source host supported
> > > > > > -             * reliable get clock
> > > > > > -             */
> > > > > > -            if (!s->src_use_reliable_get_clock) {
> > > > > > -                pvclock_via_mem = kvmclock_current_nsec(s);
> > > > > > -            }
> > > > > > +
> > > > > 
> > > > > A comment here would be welcome. Something like: "If the host
> > > > > where s->clock was read did not support reliable KVM_GET_CLOCK,
> > > > > read kvmclock value from memory".
> > > > 
> > > > Done.
> > > > 
> > > > > > +        if (!s->clock_is_reliable) {
> > > > > > +            pvclock_via_mem = kvmclock_current_nsec(s);
> > > > > > +        }
> > > > > > +
> > > > > > +        /* migration/savevm/init restore
> > > > > > +         * update clock_is_reliable to match local
> > > > > > +         * host capabilities.
> > > > > > +         */
> > > > > > +        if (s->clock_valid == false) {
> > > > > 
> > > > > Why the s->clock_valid check? It seems unnecessary.
> > > > 
> > > > Because if s->clock_valid == false (from migration),
> > > > kvm_has_adjust_clock_stable() from the local host is irrelevant.
> > > 
> > > That's true, and that's why updating it on kvm_get_clock() looks
> > > like a more logical place.
> > 
> > But the current patch updates
> > s->clock_is_reliable at the "if (running)"
> > case of kvmclock_vm_state_change().
> > 
> > There, both s->clock_is_reliable and s->clock are
> > updated in sync:
> > 
> >         if (s->clock_valid == false) {
> >             s->clock_is_reliable = kvm_has_adjust_clock_stable();
> >         }
> > 
> >         /* We can't rely on the saved clock value, just discard it */
> >         if (pvclock_via_mem) {
> >             s->clock = pvclock_via_mem;
> >         }
> > 
> 
> Here s->clock is updated only to be used to set data.clock in the
> next line (and never used again). I'm talking about every other
> s->clock assignment in the code.
> 
> > However, there is no need to call kvm_get_clock() there.
> 
> I'm talking about the existing kvm_get_clock() calls.
> 
> > 
> > kvm_get_clock() is called at the "} else {" case.
> > 
> > Are you suggesting to
> > 
> > * move s->clock_is_reliable assignment to kvm_get_clock()
> > * move s->clock assignment to kvm_get_clock()
> > * call kvm_get_clock() from the "if (running)" because
> > the assignments are in that function now.
> 
> No.
> 
> > 
> > But you don't need KVM_GET_CLOCK to be called there.
> 
> This is what I'm suggesting:
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> [ehabkost: simplified migration logic]
> Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
> ---
>  hw/i386/kvm/clock.c    | 95 ++++++++++++++++++++++++++++++++++++++++++--------
>  include/hw/i386/pc.h   |  5 +++
>  target-i386/kvm.c      |  7 ++++
>  target-i386/kvm_i386.h |  1 +
>  4 files changed, 94 insertions(+), 14 deletions(-)
> 
> diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
> index 0f75dd3..df07ec7 100644
> --- a/hw/i386/kvm/clock.c
> +++ b/hw/i386/kvm/clock.c
> @@ -35,6 +35,13 @@ typedef struct KVMClockState {
>      /*< public >*/
>  
>      uint64_t clock;
> +    /* whether the value in 'clock' is reliable (i.e. was set by a host
> +     * with reliable KVM_GET_CLOCK) */
> +    bool clock_is_reliable;
> +    /* Enable migration of clock_is_reliable. Some machine-types
> +     * will disable it for compatibility with older QEMU versions.
> +     */
> +    bool migrate_reliable_flag;
>      bool clock_valid;
>  } KVMClockState;
>  
> @@ -81,6 +88,20 @@ static uint64_t kvmclock_current_nsec(KVMClockState *s)
>      return nsec + time.system_time;
>  }
>  
> +static void kvm_get_clock(KVMClockState *s)
> +{
> +    struct kvm_clock_data data;
> +    int ret;
> +
> +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> +    if (ret < 0) {
> +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> +        abort();
> +    }
> +    s->clock = data.clock;
> +    s->clock_is_reliable = kvm_has_adjust_clock_stable();
> +}
> +

s->clock_is_reliable has to updated at migration, after 
the first time KVM_SET_CLOCK is called. There is no other
place where it should be updated. This is the behaviour 
desired, according to this table:

index:
mig->c means use s->clock value from migration
mig->m means read pvclock value from memory 
(both at migration, only the first time kvmclock_change_vmstate
is called).

runt->c means use KVM_GET_CLOCK/KVM_SET_CLOCK at runtime
runt->m means read pvclock value from memory
(both at runtime, after the first time kvmclock_change_vmstate
is called).

SOURCE                DESTINATION             BEHAVIOUR
get_clock_reliable    get_clock_reliable      mig->c,runt->c  1
get_clock_reliable    ~get_clock_reliable     mig->c,runt->m  2
~get_clock_reliable   get_clock_reliable      mig->m,runt->c  3
~get_clock_reliable   ~get_clock_reliable     mig->m,runt->m  4

So looking at your patch below, and thinking of 
kvm_get_clock() updating s->clock_is_reliable from 
!running case (that is RUNNING -> ~RUNNING transition)
makes no sense at all. That should not happen.

s->clock_is_reliable should be updated after the first
time ~RUNNING -> RUNNING migration is done, which 
is my patch does.

I think your s->clock,s->clock_is_reliable should be in sync
suggestion comes from a misunderstanding of how they are used.

s->clock is not read if s->clock_is_reliable is set.

Look at my attached patch below (still untested, will test and then
resubmit) and check whether it matches the behaviour 
described in the table above.

>  static void kvmclock_vm_state_change(void *opaque, int running,
>                                       RunState state)
>  {
> @@ -91,16 +112,18 @@ static void kvmclock_vm_state_change(void *opaque, int running,
>  
>      if (running) {
>          struct kvm_clock_data data = {};
> -        uint64_t time_at_migration = kvmclock_current_nsec(s);
>  
>          s->clock_valid = false;
>  
> -        /* We can't rely on the migrated clock value, just discard it */
> -        if (time_at_migration) {
> -            s->clock = time_at_migration;
> +        /* if host that set s->clock did not support reliable KVM_GET_CLOCK,
> +         * read kvmclock value from memory
> +         */
> +        if (!s->clock_is_reliable) {
> +            data.clock = kvmclock_current_nsec(s);
> +        } else {
> +            data.clock = s->clock;
>          }
>  
> -        data.clock = s->clock;
>          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
>          if (ret < 0) {
>              fprintf(stderr, "KVM_SET_CLOCK failed: %s\n", strerror(ret));
> @@ -120,22 +143,23 @@ static void kvmclock_vm_state_change(void *opaque, int running,

>              }
>          }
>      } else {
> -        struct kvm_clock_data data;
> -        int ret;
>  
> +        /*FIXME: is this really possible to happen?
> +         * Can kvmclock_vm_state_change(s, 0) be called twice without
> +         * a kvmclock_vm_state_change(s, 1) call in between?
> +         */

It can't: check the code where notifiers are called.

>          if (s->clock_valid) {
>              return;
>          }
>  
>          kvm_synchronize_all_tsc();
>  
> -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> -        if (ret < 0) {
> -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> -            abort();
> -        }
> -        s->clock = data.clock;
> -
> +        kvm_get_clock(s);
> +        /* FIXME: the comment below doesn't match what is done at
> +         * kvmclock_pre_save(). The comment must be updated, or
> +         * kvmclock_pre_save() must be changed. I don't know what's
> +         * the right thing to do.
> +         */

This fails for all cases (1,2,3,4) in the table above.

>          /*
>           * If the VM is stopped, declare the clock state valid to
>           * avoid re-reading it on next vmsave (which would return
> @@ -152,22 +176,65 @@ static void kvmclock_realize(DeviceState *dev, Error **errp)
>      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
>  }
>  
> +static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> +{
> +    KVMClockState *s = opaque;
> +
> +    return s->migrate_reliable_flag;
> +}
> +
> +static const VMStateDescription kvmclock_reliable_get_clock = {
> +    .name = "kvmclock/clock_is_reliable",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = kvmclock_src_use_reliable_get_clock,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_BOOL(clock_is_reliable, KVMClockState),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
> +static void kvmclock_pre_save(void *opaque)
> +{
> +    KVMClockState *s = opaque;
> +    /*TODO: add comment explaining why this is here:
> +     * "re-read clock value here because [...]"" */
> +
> +    /*XXX: is it correct to call kvm_get_clock() here
> +     * if s->clock_is_valid is true? (meaning the kvmclock was
> +     * already stopped)
> +     */
> +    kvm_get_clock(s);
> +}
> +
>  static const VMStateDescription kvmclock_vmsd = {
>      .name = "kvmclock",
>      .version_id = 1,
>      .minimum_version_id = 1,
> +    .pre_save = kvmclock_pre_save,
>      .fields = (VMStateField[]) {
>          VMSTATE_UINT64(clock, KVMClockState),
>          VMSTATE_END_OF_LIST()
> +    },
> +    .subsections = (const VMStateDescription * []) {
> +        &kvmclock_reliable_get_clock,
> +        NULL
>      }
>  };
>  
> +static Property kvmclock_properties[] = {
> +    DEFINE_PROP_BOOL("x-migrate-reliable-flag", KVMClockState,
> +                      migrate_reliable_flag, true),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
>  static void kvmclock_class_init(ObjectClass *klass, void *data)
>  {
>      DeviceClass *dc = DEVICE_CLASS(klass);
>  
>      dc->realize = kvmclock_realize;
>      dc->vmsd = &kvmclock_vmsd;
> +    dc->props = kvmclock_properties;
>  }
>  
>  static const TypeInfo kvmclock_info = {
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index 4b74130..fb80d69 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -372,6 +372,11 @@ bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *);
>  #define PC_COMPAT_2_7 \
>      HW_COMPAT_2_7 \
>      {\
> +        .driver   = "kvmclock",\
> +        .property = "x-migrate-reliable-flag",\
> +        .value    = "off",\
> +    },\
> +    {\
>          .driver   = TYPE_X86_CPU,\
>          .property = "l3-cache",\
>          .value    = "off",\
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index f62264a..10a9cd8 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -117,6 +117,13 @@ bool kvm_has_smm(void)
>      return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
>  }
>  
> +bool kvm_has_adjust_clock_stable(void)
> +{
> +    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
> +
> +    return (ret == KVM_CLOCK_TSC_STABLE);
> +}
> +
>  bool kvm_allows_irq0_override(void)
>  {
>      return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
> diff --git a/target-i386/kvm_i386.h b/target-i386/kvm_i386.h
> index 7607929..bfce427 100644
> --- a/target-i386/kvm_i386.h
> +++ b/target-i386/kvm_i386.h
> @@ -17,6 +17,7 @@
>  
>  bool kvm_allows_irq0_override(void);
>  bool kvm_has_smm(void);
> +bool kvm_has_adjust_clock_stable(void);
>  void kvm_synchronize_all_tsc(void);
>  void kvm_arch_reset_vcpu(X86CPU *cs);
>  void kvm_arch_do_init_vcpu(X86CPU *cs);
> -- 
> 2.7.4
> 
> 
> > 
> > 
> > > 
> > > > 
> > > > If s->clock_valid == true, kvm_has_adjust_clock_stable() is
> > > > what decides whether to use KVM_GET_CLOCK or not.
> > > > 
> > > > > (If this is an optimization to avoid extra ioctl() calls, you can
> > > > > cache it inside kvm_has_adjust_clock_stable()).
> > > > 
> > > > Its not.
> > > > 
> > > > > > +            s->clock_is_reliable = kvm_has_adjust_clock_stable();
> > > > > >          }
> > > > > 
> > > > > This should go to kvm_get_clock(), so s->clock and
> > > > > s->clock_is_reliable are guaranteed to be always in sync.
> > > > 
> > > > No: the point to change clock_is_reliable is the following: whether 
> > > > its the first read of the clock (migration), or subsequent 
> > > > reads (normal machine operation).
> > > > 
> > > > Now kvm_get_clock() just reads the clock, so i don't see it as
> > > > the right place to change the variable.
> > > > 
> > > > Agreed?
> > > 
> > > I don't agree. clock_is_reliable is information about the current
> > > value in s->clock, I don't see why we need to change s->clock and
> > > s->clock_is_reliable at different places in the code.
> > > 
> > > Your code might work too, but I can't tell without reviewing very
> > > carefully the state transitions KVMClockState could go through. I
> > > don't see the point of making the state transitions in this code
> > > even more complex, if we can simply stick to a simple "s->clock
> > > and s->clock_is_reliable are always in sync" rule.
> > 
> > It is updating from the same place:
> > 
> >         /* migration/savevm/init restore
> >          * update clock_is_reliable to match local
> >          * host capabilities.
> >          */
> >         if (s->clock_valid == false) {
> >             s->clock_is_reliable = kvm_has_adjust_clock_stable();
> >         }
> > 
> >         /* We can't rely on the saved clock value, just discard it */
> >         if (pvclock_via_mem) {
> >             s->clock = pvclock_via_mem;
> >         }
> 
> I'm talking about the other s->clock assignments. This one
> doesn't even seem to be necessary.

It is necessary: this is how you read pvclock from memory.

> > 
> > 
> > > 
> > > > 
> > > > > >          /* We can't rely on the saved clock value, just discard it */
> > > > > > @@ -181,20 +173,14 @@
> > > > > >  {
> > > > > >      KVMClockState *s = KVM_CLOCK(dev);
> > > > > >  
> > > > > > -    /*
> > > > > > -     * On machine types that support reliable KVM_GET_CLOCK,
> > > > > > -     * if host kernel does provide reliable KVM_GET_CLOCK,
> > > > > > -     * set src_use_reliable_get_clock=true so that destination
> > > > > > -     * avoids reading kvmclock from memory.
> > > > > > -     */
> > > > > > -    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> > > > > > -        s->src_use_reliable_get_clock = true;
> > > > > > +    if (kvm_has_adjust_clock_stable()) {
> > > > > > +        s->clock_is_reliable = true;
> > > > > >      }
> > > > > >  
> > > > > >      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> > > > > >  }
> > > > > >  
> > > > > > -static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> > > > > > +static bool kvmclock_clock_is_reliable_needed(void *opaque)
> > > > > >  {
> > > > > >      KVMClockState *s = opaque;
> > > > > >  
> > > > > > @@ -202,12 +188,12 @@
> > > > > >  }
> > > > > >  
> > > > > >  static const VMStateDescription kvmclock_reliable_get_clock = {
> > > > > > -    .name = "kvmclock/src_use_reliable_get_clock",
> > > > > > +    .name = "kvmclock/clock_is_reliable",
> > > > > >      .version_id = 1,
> > > > > >      .minimum_version_id = 1,
> > > > > > -    .needed = kvmclock_src_use_reliable_get_clock,
> > > > > > +    .needed = kvmclock_clock_is_reliable_needed,
> > > > > >      .fields = (VMStateField[]) {
> > > > > > -        VMSTATE_BOOL(src_use_reliable_get_clock, KVMClockState),
> > > > > > +        VMSTATE_BOOL(clock_is_reliable, KVMClockState),
> > > > > >          VMSTATE_END_OF_LIST()
> > > > > >      }
> > > > > >  };
> > > > > 
> > > > > -- 
> > > > > Eduardo
> > > 
> > > -- 
> > > Eduardo
> 
> -- 
> Eduardo


kvmclock: reduce kvmclock difference on migration

Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
indicates that KVM_GET_CLOCK returns a value as seen by the guest at
that moment.

For new machine types, use this value rather than reading 
from guest memory.

This reduces kvmclock difference on migration from 5s to 0.1s
(when max_downtime == 5s).

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

---
 hw/i386/kvm/clock.c    |   97 +++++++++++++++++++++++++++++++++++++++++--------
 include/hw/i386/pc.h   |    5 ++
 target-i386/kvm.c      |    7 +++
 target-i386/kvm_i386.h |    1 
 4 files changed, 96 insertions(+), 14 deletions(-)

v2: 
- improve variable names (Juan)
- consolidate code on kvm_get_clock function (Paolo)
- return mach_use_reliable_get_clock from needed function (Paolo)
v3: 
- simplify check for src_use_reliable_get_clock (Eduardo)
- move src_use_reliable_get_clock initialization to realize (Eduardo)


Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
===================================================================
--- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:07:11.220632761 -0200
+++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-12-01 12:43:18.606294830 -0200
@@ -36,6 +36,13 @@
 
     uint64_t clock;
     bool clock_valid;
+
+    /* whether machine type supports reliable get clock */
+    bool mach_use_reliable_get_clock;
+
+    /* whether the 'clock' value was obtained in a host with
+     * reliable KVM_GET_CLOCK */
+    bool clock_is_reliable;
 } KVMClockState;
 
 struct pvclock_vcpu_time_info {
@@ -81,6 +88,19 @@
     return nsec + time.system_time;
 }
 
+static uint64_t kvm_get_clock(void)
+{
+    struct kvm_clock_data data;
+    int ret;
+
+    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
+    if (ret < 0) {
+        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
+                abort();
+    }
+    return data.clock;
+}
+
 static void kvmclock_vm_state_change(void *opaque, int running,
                                      RunState state)
 {
@@ -91,15 +111,31 @@
 
     if (running) {
         struct kvm_clock_data data = {};
-        uint64_t time_at_migration = kvmclock_current_nsec(s);
+        uint64_t pvclock_via_mem = 0;
 
-        s->clock_valid = false;
+        /*
+         * If the host where s->clock was read did not support reliable
+         * KVM_GET_CLOCK, read kvmclock value from memory.
+         */
+        if (!s->clock_is_reliable) {
+            pvclock_via_mem = kvmclock_current_nsec(s);
+        }
 
-        /* We can't rely on the migrated clock value, just discard it */
-        if (time_at_migration) {
-            s->clock = time_at_migration;
+        /* migration/savevm/init restore
+         * update clock_is_reliable to match local
+         * host capabilities.
+         */
+        if (s->clock_valid == false) {
+            s->clock_is_reliable = kvm_has_adjust_clock_stable();
         }
 
+        /* We can't rely on the saved clock value, just discard it */
+        if (pvclock_via_mem) {
+            s->clock = pvclock_via_mem;
+        }
+
+        s->clock_valid = false;
+
         data.clock = s->clock;
         ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
         if (ret < 0) {
@@ -120,8 +156,6 @@
             }
         }
     } else {
-        struct kvm_clock_data data;
-        int ret;
 
         if (s->clock_valid) {
             return;
@@ -129,13 +163,7 @@
 
         kvm_synchronize_all_tsc();
 
-        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
-        if (ret < 0) {
-            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
-            abort();
-        }
-        s->clock = data.clock;
-
+        s->clock = kvm_get_clock();
         /*
          * If the VM is stopped, declare the clock state valid to
          * avoid re-reading it on next vmsave (which would return
@@ -149,25 +177,66 @@
 {
     KVMClockState *s = KVM_CLOCK(dev);
 
+    if (kvm_has_adjust_clock_stable()) {
+        s->clock_is_reliable = true;
+    }
+
     qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
 }
 
+static bool kvmclock_clock_is_reliable_needed(void *opaque)
+{
+    KVMClockState *s = opaque;
+
+    return s->mach_use_reliable_get_clock;
+}
+
+static const VMStateDescription kvmclock_reliable_get_clock = {
+    .name = "kvmclock/clock_is_reliable",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = kvmclock_clock_is_reliable_needed,
+    .fields = (VMStateField[]) {
+        VMSTATE_BOOL(clock_is_reliable, KVMClockState),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static void kvmclock_pre_save(void *opaque)
+{
+    KVMClockState *s = opaque;
+
+    s->clock = kvm_get_clock();
+}
+
 static const VMStateDescription kvmclock_vmsd = {
     .name = "kvmclock",
     .version_id = 1,
     .minimum_version_id = 1,
+    .pre_save = kvmclock_pre_save,
     .fields = (VMStateField[]) {
         VMSTATE_UINT64(clock, KVMClockState),
         VMSTATE_END_OF_LIST()
+    },
+    .subsections = (const VMStateDescription * []) {
+        &kvmclock_reliable_get_clock,
+        NULL
     }
 };
 
+static Property kvmclock_properties[] = {
+    DEFINE_PROP_BOOL("x-mach-use-reliable-get-clock", KVMClockState,
+                      mach_use_reliable_get_clock, true),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
 static void kvmclock_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
 
     dc->realize = kvmclock_realize;
     dc->vmsd = &kvmclock_vmsd;
+    dc->props = kvmclock_properties;
 }
 
 static const TypeInfo kvmclock_info = {
Index: qemu-mig-advance-clock/include/hw/i386/pc.h
===================================================================
--- qemu-mig-advance-clock.orig/include/hw/i386/pc.h	2016-11-17 15:07:11.220632761 -0200
+++ qemu-mig-advance-clock/include/hw/i386/pc.h	2016-11-17 15:08:58.096791416 -0200
@@ -370,6 +370,11 @@
 #define PC_COMPAT_2_7 \
     HW_COMPAT_2_7 \
     {\
+        .driver   = "kvmclock",\
+        .property = "x-mach-use-reliable-get-clock",\
+        .value    = "off",\
+    },\
+    {\
         .driver   = TYPE_X86_CPU,\
         .property = "l3-cache",\
         .value    = "off",\
Index: qemu-mig-advance-clock/target-i386/kvm.c
===================================================================
--- qemu-mig-advance-clock.orig/target-i386/kvm.c	2016-11-17 15:07:11.221632762 -0200
+++ qemu-mig-advance-clock/target-i386/kvm.c	2016-11-17 15:07:15.867639659 -0200
@@ -117,6 +117,13 @@
     return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
 }
 
+bool kvm_has_adjust_clock_stable(void)
+{
+    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
+
+    return (ret == KVM_CLOCK_TSC_STABLE);
+}
+
 bool kvm_allows_irq0_override(void)
 {
     return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
Index: qemu-mig-advance-clock/target-i386/kvm_i386.h
===================================================================
--- qemu-mig-advance-clock.orig/target-i386/kvm_i386.h	2016-11-17 15:07:11.222632764 -0200
+++ qemu-mig-advance-clock/target-i386/kvm_i386.h	2016-11-17 15:07:15.867639659 -0200
@@ -17,6 +17,7 @@
 
 bool kvm_allows_irq0_override(void);
 bool kvm_has_smm(void);
+bool kvm_has_adjust_clock_stable(void);
 void kvm_synchronize_all_tsc(void);
 void kvm_arch_reset_vcpu(X86CPU *cs);
 void kvm_arch_do_init_vcpu(X86CPU *cs);

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [qemu patch V3 2/2] kvmclock: reduce kvmclock difference on migration
  2016-12-02 22:30                     ` Marcelo Tosatti
@ 2016-12-05 15:17                       ` Eduardo Habkost
  2016-12-05 17:20                         ` Marcelo Tosatti
  0 siblings, 1 reply; 18+ messages in thread
From: Eduardo Habkost @ 2016-12-05 15:17 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm, qemu-devel, Dr David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Radim Krcmar

On Fri, Dec 02, 2016 at 08:30:35PM -0200, Marcelo Tosatti wrote:
> On Fri, Dec 02, 2016 at 11:56:09AM -0200, Eduardo Habkost wrote:
> > On Thu, Dec 01, 2016 at 12:39:03PM -0200, Marcelo Tosatti wrote:
> > > On Wed, Nov 30, 2016 at 11:09:28AM -0200, Eduardo Habkost wrote:
> > > > On Tue, Nov 29, 2016 at 09:54:29PM -0200, Marcelo Tosatti wrote:
> > > > > On Tue, Nov 29, 2016 at 10:34:05AM -0200, Eduardo Habkost wrote:
> > > > > > On Tue, Nov 29, 2016 at 08:56:00AM -0200, Marcelo Tosatti wrote:
> > > > > > > On Mon, Nov 28, 2016 at 03:12:01PM -0200, Eduardo Habkost wrote:
> > > > > > > > On Mon, Nov 28, 2016 at 02:45:24PM -0200, Marcelo Tosatti wrote:
> > > > > > > > > On Mon, Nov 28, 2016 at 12:13:22PM -0200, Eduardo Habkost wrote:
> > > > > > > > > > Sorry for not noticing the following issues on the previous
> > > > > > > > > > reviews. I was only paying attention to the vmstate and machine
> > > > > > > > > > code:
> > > > > > > > > > 
> > > > > > > > > > On Mon, Nov 21, 2016 at 08:50:04AM -0200, Marcelo Tosatti wrote:
> > > > > > > > > > > Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
> > > > > > > > > > > indicates that KVM_GET_CLOCK returns a value as seen by the guest at
> > > > > > > > > > > that moment.
> > > > > > > > > > > 
> > > > > > > > > > > For new machine types, use this value rather than reading 
> > > > > > > > > > > from guest memory.
> > > > > > > > > > > 
> > > > > > > > > > > This reduces kvmclock difference on migration from 5s to 0.1s
> > > > > > > > > > > (when max_downtime == 5s).
> > > > > > > > > > > 
> > > > > > > > > > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > > > > > > > > > 
> > > > > > > > > > > ---
> > > > > > > > > > >  hw/i386/kvm/clock.c    |  107 ++++++++++++++++++++++++++++++++++++++++++-------
> > > > > > > > > > >  include/hw/i386/pc.h   |    5 ++
> > > > > > > > > > >  target-i386/kvm.c      |    7 +++
> > > > > > > > > > >  target-i386/kvm_i386.h |    1 
> > > > > > > > > > >  4 files changed, 106 insertions(+), 14 deletions(-)
> > > > > > > > > > > 
> > > > > > > > > > > v2: 
> > > > > > > > > > > - improve variable names (Juan)
> > > > > > > > > > > - consolidate code on kvm_get_clock function (Paolo)
> > > > > > > > > > > - return mach_use_reliable_get_clock from needed function (Paolo)
> > > > > > > > > > > v3: 
> > > > > > > > > > > - simplify check for src_use_reliable_get_clock (Eduardo)
> > > > > > > > > > > - move src_use_reliable_get_clock initialization to realize (Eduardo)
> > > > > > > > > > > 
> > > > > > > > > > > 
> > > > > > > > > > > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > > > > > > > > > > ===================================================================
> > > > > > > > > > > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:07:11.220632761 -0200
> > > > > > > > > > > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-17 15:11:51.372048640 -0200
> > > > > > > > > > > @@ -36,6 +36,12 @@
> > > > > > > > > > >  
> > > > > > > > > > >      uint64_t clock;
> > > > > > > > > > >      bool clock_valid;
> > > > > > > > > > > +
> > > > > > > > > > > +    /* whether machine supports reliable KVM_GET_CLOCK */
> > > > > > > > > > > +    bool mach_use_reliable_get_clock;
> > > > > > > > > > > +
> > > > > > > > > > > +    /* whether source host supported reliable KVM_GET_CLOCK */
> > > > > > > > > > > +    bool src_use_reliable_get_clock;
> > > > > > > > > > >  } KVMClockState;
> > > > > > > > > > >  
> > > > > > > > > > >  struct pvclock_vcpu_time_info {
> > > > > > > > > > > @@ -81,6 +87,19 @@
> > > > > > > > > > >      return nsec + time.system_time;
> > > > > > > > > > >  }
> > > > > > > > > > >  
> > > > > > > > > > > +static uint64_t kvm_get_clock(void)
> > > > > > > > > > > +{
> > > > > > > > > > > +    struct kvm_clock_data data;
> > > > > > > > > > > +    int ret;
> > > > > > > > > > > +
> > > > > > > > > > > +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > > > > > > > > > > +    if (ret < 0) {
> > > > > > > > > > > +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > > > > > > > > > > +                abort();
> > > > > > > > > > > +    }
> > > > > > > > > > > +    return data.clock;
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > >  static void kvmclock_vm_state_change(void *opaque, int running,
> > > > > > > > > > >                                       RunState state)
> > > > > > > > > > >  {
> > > > > > > > > > > @@ -91,15 +110,36 @@
> > > > > > > > > > 
> > > > > > > > > > Can you please use "diff -p" on your patches?
> > > > > > > > > > 
> > > > > > > > > > >  
> > > > > > > > > > >      if (running) {
> > > > > > > > > > >          struct kvm_clock_data data = {};
> > > > > > > > > > > -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> > > > > > > > > > > +        uint64_t pvclock_via_mem = 0;
> > > > > > > > > > >  
> > > > > > > > > > > -        s->clock_valid = false;
> > > > > > > > > > > +        /* local (running VM) restore */
> > > > > > > > > > > +        if (s->clock_valid) {
> > > > > > > > > > > +            /*
> > > > > > > > > > > +             * if host does not support reliable KVM_GET_CLOCK,
> > > > > > > > > > > +             * read kvmclock value from memory
> > > > > > > > > > > +             */
> > > > > > > > > > > +            if (!kvm_has_adjust_clock_stable()) {
> > > > > > > > > > 
> > > > > > > > > > Why not use src_use_reliable_get_clock here, too? (Maybe rename
> > > > > > > > > > it to simply clock_is_reliable?)
> > > > > > > > > 
> > > > > > > > > Because you end up mixing two mental ideas (one: whether the source
> > > > > > > > > has KVM_GET_CLOCK, second: whether the destination has KVM_GET_CLOCK 
> > > > > > > > > into one variable). I find it more confusing than not.
> > > > > > > > > Maybe its just my limited brain but i find it
> > > > > > > > > confusing.
> > > > > > > > 
> > > > > > > > I find it simpler and easier to reason about.
> > > > > > > 
> > > > > > > I disagree, but i don't mind. Is this what you refer to:
> > > > > > > 
> > > > > > > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > > > > > > ===================================================================
> > > > > > > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:11:51.372048640 -0200
> > > > > > > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-29 08:53:20.609817026 -0200
> > > > > > > @@ -37,11 +37,11 @@
> > > > > > >      uint64_t clock;
> > > > > > >      bool clock_valid;
> > > > > > >  
> > > > > > > -    /* whether machine supports reliable KVM_GET_CLOCK */
> > > > > > > +    /* whether machine type supports reliable get clock */
> > > > > > >      bool mach_use_reliable_get_clock;
> > > > > > >  
> > > > > > > -    /* whether source host supported reliable KVM_GET_CLOCK */
> > > > > > > -    bool src_use_reliable_get_clock;
> > > > > > > +    /* whether clock is reliable */
> > > > > > 
> > > > > > I would make the comment more explicit about its meaning:
> > > > > >   /* whether the 'clock' value was obtained in a host with
> > > > > >    * reliable KVM_GET_CLOCK */
> > > > > 
> > > > > Done.
> > > > > 
> > > > > > > +    bool clock_is_reliable;
> > > > > >
> > > > > >  } KVMClockState;
> > > > > > >  
> > > > > > >  struct pvclock_vcpu_time_info {
> > > > > > > @@ -112,25 +112,17 @@
> > > > > > >          struct kvm_clock_data data = {};
> > > > > > >          uint64_t pvclock_via_mem = 0;
> > > > > > >  
> > > > > > > -        /* local (running VM) restore */
> > > > > > > -        if (s->clock_valid) {
> > > > > > > -            /*
> > > > > > > -             * if host does not support reliable KVM_GET_CLOCK,
> > > > > > > -             * read kvmclock value from memory
> > > > > > > -             */
> > > > > > > -            if (!kvm_has_adjust_clock_stable()) {
> > > > > > > -                pvclock_via_mem = kvmclock_current_nsec(s);
> > > > > > > -            }
> > > > > > > -        /* migration/savevm/init restore */
> > > > > > > -        } else {
> > > > > > > -            /*
> > > > > > > -             * use s->clock in case machine uses reliable
> > > > > > > -             * get clock and source host supported
> > > > > > > -             * reliable get clock
> > > > > > > -             */
> > > > > > > -            if (!s->src_use_reliable_get_clock) {
> > > > > > > -                pvclock_via_mem = kvmclock_current_nsec(s);
> > > > > > > -            }
> > > > > > > +
> > > > > > 
> > > > > > A comment here would be welcome. Something like: "If the host
> > > > > > where s->clock was read did not support reliable KVM_GET_CLOCK,
> > > > > > read kvmclock value from memory".
> > > > > 
> > > > > Done.
> > > > > 
> > > > > > > +        if (!s->clock_is_reliable) {
> > > > > > > +            pvclock_via_mem = kvmclock_current_nsec(s);
> > > > > > > +        }
> > > > > > > +
> > > > > > > +        /* migration/savevm/init restore
> > > > > > > +         * update clock_is_reliable to match local
> > > > > > > +         * host capabilities.
> > > > > > > +         */
> > > > > > > +        if (s->clock_valid == false) {
> > > > > > 
> > > > > > Why the s->clock_valid check? It seems unnecessary.
> > > > > 
> > > > > Because if s->clock_valid == false (from migration),
> > > > > kvm_has_adjust_clock_stable() from the local host is irrelevant.
> > > > 
> > > > That's true, and that's why updating it on kvm_get_clock() looks
> > > > like a more logical place.
> > > 
> > > But the current patch updates
> > > s->clock_is_reliable at the "if (running)"
> > > case of kvmclock_vm_state_change().
> > > 
> > > There, both s->clock_is_reliable and s->clock are
> > > updated in sync:
> > > 
> > >         if (s->clock_valid == false) {
> > >             s->clock_is_reliable = kvm_has_adjust_clock_stable();
> > >         }
> > > 
> > >         /* We can't rely on the saved clock value, just discard it */
> > >         if (pvclock_via_mem) {
> > >             s->clock = pvclock_via_mem;
> > >         }
> > > 
> > 
> > Here s->clock is updated only to be used to set data.clock in the
> > next line (and never used again). I'm talking about every other
> > s->clock assignment in the code.
> > 
> > > However, there is no need to call kvm_get_clock() there.
> > 
> > I'm talking about the existing kvm_get_clock() calls.
> > 
> > > 
> > > kvm_get_clock() is called at the "} else {" case.
> > > 
> > > Are you suggesting to
> > > 
> > > * move s->clock_is_reliable assignment to kvm_get_clock()
> > > * move s->clock assignment to kvm_get_clock()
> > > * call kvm_get_clock() from the "if (running)" because
> > > the assignments are in that function now.
> > 
> > No.
> > 
> > > 
> > > But you don't need KVM_GET_CLOCK to be called there.
> > 
> > This is what I'm suggesting:
> > 
> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > [ehabkost: simplified migration logic]
> > Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
> > ---
> >  hw/i386/kvm/clock.c    | 95 ++++++++++++++++++++++++++++++++++++++++++--------
> >  include/hw/i386/pc.h   |  5 +++
> >  target-i386/kvm.c      |  7 ++++
> >  target-i386/kvm_i386.h |  1 +
> >  4 files changed, 94 insertions(+), 14 deletions(-)
> > 
> > diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
> > index 0f75dd3..df07ec7 100644
> > --- a/hw/i386/kvm/clock.c
> > +++ b/hw/i386/kvm/clock.c
> > @@ -35,6 +35,13 @@ typedef struct KVMClockState {
> >      /*< public >*/
> >  
> >      uint64_t clock;
> > +    /* whether the value in 'clock' is reliable (i.e. was set by a host
> > +     * with reliable KVM_GET_CLOCK) */
> > +    bool clock_is_reliable;
> > +    /* Enable migration of clock_is_reliable. Some machine-types
> > +     * will disable it for compatibility with older QEMU versions.
> > +     */
> > +    bool migrate_reliable_flag;
> >      bool clock_valid;
> >  } KVMClockState;
> >  
> > @@ -81,6 +88,20 @@ static uint64_t kvmclock_current_nsec(KVMClockState *s)
> >      return nsec + time.system_time;
> >  }
> >  
> > +static void kvm_get_clock(KVMClockState *s)
> > +{
> > +    struct kvm_clock_data data;
> > +    int ret;
> > +
> > +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > +    if (ret < 0) {
> > +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > +        abort();
> > +    }
> > +    s->clock = data.clock;
> > +    s->clock_is_reliable = kvm_has_adjust_clock_stable();
> > +}
> > +
> 
> s->clock_is_reliable has to updated at migration, after 
> the first time KVM_SET_CLOCK is called. There is no other
> place where it should be updated.

I don't see why you claim that. We can simply update
s->clock_is_reliable the next time s->clock is updated. The only
code that reads s->clock checks s->clock_is_reliable first.
s->clock_is_reliable is a flag _about_ the value in s->clock. I
don't see why you want to update them at different times, if it
means more complex logic.

> This is the behaviour 
> desired, according to this table:
> 
> index:
> mig->c means use s->clock value from migration
> mig->m means read pvclock value from memory 
> (both at migration, only the first time kvmclock_change_vmstate
> is called).
> 
> runt->c means use KVM_GET_CLOCK/KVM_SET_CLOCK at runtime
> runt->m means read pvclock value from memory
> (both at runtime, after the first time kvmclock_change_vmstate
> is called).
> 
> SOURCE                DESTINATION             BEHAVIOUR
> get_clock_reliable    get_clock_reliable      mig->c,runt->c  1
> get_clock_reliable    ~get_clock_reliable     mig->c,runt->m  2
> ~get_clock_reliable   get_clock_reliable      mig->m,runt->c  3
> ~get_clock_reliable   ~get_clock_reliable     mig->m,runt->m  4
> 
> So looking at your patch below, and thinking of 
> kvm_get_clock() updating s->clock_is_reliable from 
> !running case (that is RUNNING -> ~RUNNING transition)
> makes no sense at all. That should not happen.

Why not?

> 
> s->clock_is_reliable should be updated after the first
> time ~RUNNING -> RUNNING migration is done, which 
> is my patch does.

I don't see why. All we need is that s->clock_is_reliable get
updated before any code try to read s->clock again.

> 
> I think your s->clock,s->clock_is_reliable should be in sync
> suggestion comes from a misunderstanding of how they are used.
> 
> s->clock is not read if s->clock_is_reliable is set.

Did you mean "s->clock is not read if s->clock_is_reliable is
false"? You are right (except when kvmclock_current_nsec()==0?).
But this also means that the only code that reads s->clock also
checks s->clock_is_reliable first.

> 
> Look at my attached patch below (still untested, will test and then
> resubmit) and check whether it matches the behaviour 
> described in the table above.

It seems to match. But I don't see why my patch doesn't match the
table above.

> 
> >  static void kvmclock_vm_state_change(void *opaque, int running,
> >                                       RunState state)
> >  {
> > @@ -91,16 +112,18 @@ static void kvmclock_vm_state_change(void *opaque, int running,
> >  
> >      if (running) {
> >          struct kvm_clock_data data = {};
> > -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> >  
> >          s->clock_valid = false;
> >  
> > -        /* We can't rely on the migrated clock value, just discard it */
> > -        if (time_at_migration) {
> > -            s->clock = time_at_migration;
> > +        /* if host that set s->clock did not support reliable KVM_GET_CLOCK,
> > +         * read kvmclock value from memory
> > +         */
> > +        if (!s->clock_is_reliable) {
> > +            data.clock = kvmclock_current_nsec(s);
> > +        } else {
> > +            data.clock = s->clock;
> >          }
> >  
> > -        data.clock = s->clock;
> >          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
> >          if (ret < 0) {
> >              fprintf(stderr, "KVM_SET_CLOCK failed: %s\n", strerror(ret));
> > @@ -120,22 +143,23 @@ static void kvmclock_vm_state_change(void *opaque, int running,
> 
> >              }
> >          }
> >      } else {
> > -        struct kvm_clock_data data;
> > -        int ret;
> >  
> > +        /*FIXME: is this really possible to happen?
> > +         * Can kvmclock_vm_state_change(s, 0) be called twice without
> > +         * a kvmclock_vm_state_change(s, 1) call in between?
> > +         */
> 
> It can't: check the code where notifiers are called.

Yet another reason to simplify the code: if we don't need
different code paths for local stop/resume vs migration, we can
remove s->clock_valid completely.

> 
> >          if (s->clock_valid) {
> >              return;
> >          }
> >  
> >          kvm_synchronize_all_tsc();
> >  
> > -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > -        if (ret < 0) {
> > -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > -            abort();
> > -        }
> > -        s->clock = data.clock;
> > -
> > +        kvm_get_clock(s);
> > +        /* FIXME: the comment below doesn't match what is done at
> > +         * kvmclock_pre_save(). The comment must be updated, or
> > +         * kvmclock_pre_save() must be changed. I don't know what's
> > +         * the right thing to do.
> > +         */

We still need to address this comment. Your patch keeps the
comment that contradicts kvmclock_pre_save().

> 
> This fails for all cases (1,2,3,4) in the table above.

I can't see why it fails. The ordering of events from migration
followed by vm stop/resume should be:

    | Event                       | s->clock         | s->clock_is_reliable
src | kvmclock_vm_state_change(0) | -                | -
src |   kvm_get_clock()           | TIME-AT-VM-STOP  | CAP_ADJUST_CLOCK-on-SRC
src | kvmclock_pre_save()         | TIME-AT-VM-STOP  | CAP_ADJUST_CLOCK-on-SRC
src |   kvm_get_clock()           | TIME-AT-PRE-SAVE | CAP_ADJUST_CLOCK-on-SRC
--- | MIGRATION                   | TIME-AT-PRE-SAVE | CAP_ADJUST_CLOCK-on-SRC
dst | kvmclock_vm_state_change(1) | TIME-AT-PRE-SAVE | CAP_ADJUST_CLOCK-on-SRC
dst |   KVM_SET_CLOCK             | TIME-AT-PRE-SAVE | CAP_ADJUST_CLOCK-on-SRC
dst | kvmclock_vm_state_change(0) | TIME-AT-PRE-SAVE | CAP_ADJUST_CLOCK-on-SRC
dst |   kvm_get_clock()           | TIME-AT-VM-STOP  | CAP_ADJUST_CLOCK-on-DEST
dst | kvmclock_vm_state_change(1) | TIME-AT-VM-STOP  | CAP_ADJUST_CLOCK-on-DEST
dst |   KVM_SET_CLOCK             | TIME-AT-VM-STOP  | CAP_ADJUST_CLOCK-on-DEST

Specific cases from your table are below. See the KVM_SET_CLOCK
rows.

Case 1:

    | Event                       | s->clock         | s->clock_is_reliable
src | kvmclock_vm_state_change(0) | -                | -
src |   kvm_get_clock()           | TIME-AT-VM-STOP  | 1
src | kvmclock_pre_save()         | TIME-AT-VM-STOP  | 1
src |   kvm_get_clock()           | TIME-AT-PRE-SAVE | 1
--- | MIGRATION                   | TIME-AT-PRE-SAVE | 1
dst | kvmclock_vm_state_change(1) | TIME-AT-PRE-SAVE | 1
dst |   KVM_SET_CLOCK(s->clock)   | TIME-AT-PRE-SAVE | 1
dst | kvmclock_vm_state_change(0) | TIME-AT-PRE-SAVE | 1
dst |   kvm_get_clock()           | TIME-AT-VM-STOP  | 1
dst | kvmclock_vm_state_change(1) | TIME-AT-VM-STOP  | 1
dst |   KVM_SET_CLOCK(s->clock)   | TIME-AT-VM-STOP  | 1

Case 2:

    | Event                       | s->clock         | s->clock_is_reliable
src | kvmclock_vm_state_change(0) | -                | -
src |   kvm_get_clock()           | TIME-AT-VM-STOP  | 1
src | kvmclock_pre_save()         | TIME-AT-VM-STOP  | 1
src |   kvm_get_clock()           | TIME-AT-PRE-SAVE | 1
--- | MIGRATION                   | TIME-AT-PRE-SAVE | 1
dst | kvmclock_vm_state_change(1) | TIME-AT-PRE-SAVE | 1
dst |   KVM_SET_CLOCK(s->clock)   | TIME-AT-PRE-SAVE | 1
dst | kvmclock_vm_state_change(0) | TIME-AT-PRE-SAVE | 1
dst |   kvm_get_clock()           | TIME-AT-VM-STOP  | 0
dst | kvmclock_vm_state_change(1) | TIME-AT-VM-STOP  | 0
dst |   KVM_SET_CLOCK(from_mem)   | TIME-AT-VM-STOP  | 0

Case 3:

    | Event                       | s->clock         | s->clock_is_reliable
src | kvmclock_vm_state_change(0) | -                | -
src |   kvm_get_clock()           | TIME-AT-VM-STOP  | 0
src | kvmclock_pre_save()         | TIME-AT-VM-STOP  | 0
src |   kvm_get_clock()           | TIME-AT-PRE-SAVE | 0
--- | MIGRATION                   | TIME-AT-PRE-SAVE | 0
dst | kvmclock_vm_state_change(1) | TIME-AT-PRE-SAVE | 0
dst |   KVM_SET_CLOCK(from_mem)   | TIME-AT-PRE-SAVE | 0
dst | kvmclock_vm_state_change(0) | TIME-AT-PRE-SAVE | 0
dst |   kvm_get_clock()           | TIME-AT-VM-STOP  | 1
dst | kvmclock_vm_state_change(1) | TIME-AT-VM-STOP  | 1
dst |   KVM_SET_CLOCK(s->clock)   | TIME-AT-VM-STOP  | 1

Case 4:

    | Event                       | s->clock         | s->clock_is_reliable
src | kvmclock_vm_state_change(0) | -                | -
src |   kvm_get_clock()           | TIME-AT-VM-STOP  | 0
src | kvmclock_pre_save()         | TIME-AT-VM-STOP  | 0
src |   kvm_get_clock()           | TIME-AT-PRE-SAVE | 0
--- | MIGRATION                   | TIME-AT-PRE-SAVE | 0
dst | kvmclock_vm_state_change(1) | TIME-AT-PRE-SAVE | 0
dst |   KVM_SET_CLOCK(from_mem)   | TIME-AT-PRE-SAVE | 0
dst | kvmclock_vm_state_change(0) | TIME-AT-PRE-SAVE | 0
dst |   kvm_get_clock()           | TIME-AT-VM-STOP  | 0
dst | kvmclock_vm_state_change(1) | TIME-AT-VM-STOP  | 0
dst |   KVM_SET_CLOCK(from_mem)   | TIME-AT-VM-STOP  | 0



> 
> >          /*
> >           * If the VM is stopped, declare the clock state valid to
> >           * avoid re-reading it on next vmsave (which would return
> > @@ -152,22 +176,65 @@ static void kvmclock_realize(DeviceState *dev, Error **errp)
> >      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> >  }
> >  
> > +static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> > +{
> > +    KVMClockState *s = opaque;
> > +
> > +    return s->migrate_reliable_flag;
> > +}
> > +
> > +static const VMStateDescription kvmclock_reliable_get_clock = {
> > +    .name = "kvmclock/clock_is_reliable",
> > +    .version_id = 1,
> > +    .minimum_version_id = 1,
> > +    .needed = kvmclock_src_use_reliable_get_clock,
> > +    .fields = (VMStateField[]) {
> > +        VMSTATE_BOOL(clock_is_reliable, KVMClockState),
> > +        VMSTATE_END_OF_LIST()
> > +    }
> > +};
> > +
> > +static void kvmclock_pre_save(void *opaque)
> > +{
> > +    KVMClockState *s = opaque;
> > +    /*TODO: add comment explaining why this is here:
> > +     * "re-read clock value here because [...]"" */
> > +
> > +    /*XXX: is it correct to call kvm_get_clock() here
> > +     * if s->clock_is_valid is true? (meaning the kvmclock was
> > +     * already stopped)
> > +     */
> > +    kvm_get_clock(s);
> > +}
> > +
> >  static const VMStateDescription kvmclock_vmsd = {
> >      .name = "kvmclock",
> >      .version_id = 1,
> >      .minimum_version_id = 1,
> > +    .pre_save = kvmclock_pre_save,
> >      .fields = (VMStateField[]) {
> >          VMSTATE_UINT64(clock, KVMClockState),
> >          VMSTATE_END_OF_LIST()
> > +    },
> > +    .subsections = (const VMStateDescription * []) {
> > +        &kvmclock_reliable_get_clock,
> > +        NULL
> >      }
> >  };
> >  
> > +static Property kvmclock_properties[] = {
> > +    DEFINE_PROP_BOOL("x-migrate-reliable-flag", KVMClockState,
> > +                      migrate_reliable_flag, true),
> > +    DEFINE_PROP_END_OF_LIST(),
> > +};
> > +
> >  static void kvmclock_class_init(ObjectClass *klass, void *data)
> >  {
> >      DeviceClass *dc = DEVICE_CLASS(klass);
> >  
> >      dc->realize = kvmclock_realize;
> >      dc->vmsd = &kvmclock_vmsd;
> > +    dc->props = kvmclock_properties;
> >  }
> >  
> >  static const TypeInfo kvmclock_info = {
> > diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> > index 4b74130..fb80d69 100644
> > --- a/include/hw/i386/pc.h
> > +++ b/include/hw/i386/pc.h
> > @@ -372,6 +372,11 @@ bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *);
> >  #define PC_COMPAT_2_7 \
> >      HW_COMPAT_2_7 \
> >      {\
> > +        .driver   = "kvmclock",\
> > +        .property = "x-migrate-reliable-flag",\
> > +        .value    = "off",\
> > +    },\
> > +    {\
> >          .driver   = TYPE_X86_CPU,\
> >          .property = "l3-cache",\
> >          .value    = "off",\
> > diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> > index f62264a..10a9cd8 100644
> > --- a/target-i386/kvm.c
> > +++ b/target-i386/kvm.c
> > @@ -117,6 +117,13 @@ bool kvm_has_smm(void)
> >      return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
> >  }
> >  
> > +bool kvm_has_adjust_clock_stable(void)
> > +{
> > +    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
> > +
> > +    return (ret == KVM_CLOCK_TSC_STABLE);
> > +}
> > +
> >  bool kvm_allows_irq0_override(void)
> >  {
> >      return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
> > diff --git a/target-i386/kvm_i386.h b/target-i386/kvm_i386.h
> > index 7607929..bfce427 100644
> > --- a/target-i386/kvm_i386.h
> > +++ b/target-i386/kvm_i386.h
> > @@ -17,6 +17,7 @@
> >  
> >  bool kvm_allows_irq0_override(void);
> >  bool kvm_has_smm(void);
> > +bool kvm_has_adjust_clock_stable(void);
> >  void kvm_synchronize_all_tsc(void);
> >  void kvm_arch_reset_vcpu(X86CPU *cs);
> >  void kvm_arch_do_init_vcpu(X86CPU *cs);
> > -- 
> > 2.7.4
> > 
> > 
> > > 
> > > 
> > > > 
> > > > > 
> > > > > If s->clock_valid == true, kvm_has_adjust_clock_stable() is
> > > > > what decides whether to use KVM_GET_CLOCK or not.
> > > > > 
> > > > > > (If this is an optimization to avoid extra ioctl() calls, you can
> > > > > > cache it inside kvm_has_adjust_clock_stable()).
> > > > > 
> > > > > Its not.
> > > > > 
> > > > > > > +            s->clock_is_reliable = kvm_has_adjust_clock_stable();
> > > > > > >          }
> > > > > > 
> > > > > > This should go to kvm_get_clock(), so s->clock and
> > > > > > s->clock_is_reliable are guaranteed to be always in sync.
> > > > > 
> > > > > No: the point to change clock_is_reliable is the following: whether 
> > > > > its the first read of the clock (migration), or subsequent 
> > > > > reads (normal machine operation).
> > > > > 
> > > > > Now kvm_get_clock() just reads the clock, so i don't see it as
> > > > > the right place to change the variable.
> > > > > 
> > > > > Agreed?
> > > > 
> > > > I don't agree. clock_is_reliable is information about the current
> > > > value in s->clock, I don't see why we need to change s->clock and
> > > > s->clock_is_reliable at different places in the code.
> > > > 
> > > > Your code might work too, but I can't tell without reviewing very
> > > > carefully the state transitions KVMClockState could go through. I
> > > > don't see the point of making the state transitions in this code
> > > > even more complex, if we can simply stick to a simple "s->clock
> > > > and s->clock_is_reliable are always in sync" rule.
> > > 
> > > It is updating from the same place:
> > > 
> > >         /* migration/savevm/init restore
> > >          * update clock_is_reliable to match local
> > >          * host capabilities.
> > >          */
> > >         if (s->clock_valid == false) {
> > >             s->clock_is_reliable = kvm_has_adjust_clock_stable();
> > >         }
> > > 
> > >         /* We can't rely on the saved clock value, just discard it */
> > >         if (pvclock_via_mem) {
> > >             s->clock = pvclock_via_mem;
> > >         }
> > 
> > I'm talking about the other s->clock assignments. This one
> > doesn't even seem to be necessary.
> 
> It is necessary: this is how you read pvclock from memory.

It is not necessary if you set data.clock directly.

But this doesn't matter for this discussion. My new patch below
keeps that line, just to make the diff smaller.

> 
> > > 
> > > 
> > > > 
> > > > > 
> > > > > > >          /* We can't rely on the saved clock value, just discard it */
> > > > > > > @@ -181,20 +173,14 @@
> > > > > > >  {
> > > > > > >      KVMClockState *s = KVM_CLOCK(dev);
> > > > > > >  
> > > > > > > -    /*
> > > > > > > -     * On machine types that support reliable KVM_GET_CLOCK,
> > > > > > > -     * if host kernel does provide reliable KVM_GET_CLOCK,
> > > > > > > -     * set src_use_reliable_get_clock=true so that destination
> > > > > > > -     * avoids reading kvmclock from memory.
> > > > > > > -     */
> > > > > > > -    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> > > > > > > -        s->src_use_reliable_get_clock = true;
> > > > > > > +    if (kvm_has_adjust_clock_stable()) {
> > > > > > > +        s->clock_is_reliable = true;
> > > > > > >      }
> > > > > > >  
> > > > > > >      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> > > > > > >  }
> > > > > > >  
> > > > > > > -static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> > > > > > > +static bool kvmclock_clock_is_reliable_needed(void *opaque)
> > > > > > >  {
> > > > > > >      KVMClockState *s = opaque;
> > > > > > >  
> > > > > > > @@ -202,12 +188,12 @@
> > > > > > >  }
> > > > > > >  
> > > > > > >  static const VMStateDescription kvmclock_reliable_get_clock = {
> > > > > > > -    .name = "kvmclock/src_use_reliable_get_clock",
> > > > > > > +    .name = "kvmclock/clock_is_reliable",
> > > > > > >      .version_id = 1,
> > > > > > >      .minimum_version_id = 1,
> > > > > > > -    .needed = kvmclock_src_use_reliable_get_clock,
> > > > > > > +    .needed = kvmclock_clock_is_reliable_needed,
> > > > > > >      .fields = (VMStateField[]) {
> > > > > > > -        VMSTATE_BOOL(src_use_reliable_get_clock, KVMClockState),
> > > > > > > +        VMSTATE_BOOL(clock_is_reliable, KVMClockState),
> > > > > > >          VMSTATE_END_OF_LIST()
> > > > > > >      }
> > > > > > >  };
> > > > > > 
> > > > > > -- 
> > > > > > Eduardo
> > > > 
> > > > -- 
> > > > Eduardo
> > 
> > -- 
> > Eduardo
> 
> 
> kvmclock: reduce kvmclock difference on migration
> 
> Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
> indicates that KVM_GET_CLOCK returns a value as seen by the guest at
> that moment.
> 
> For new machine types, use this value rather than reading 
> from guest memory.
> 
> This reduces kvmclock difference on migration from 5s to 0.1s
> (when max_downtime == 5s).
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> ---
>  hw/i386/kvm/clock.c    |   97 +++++++++++++++++++++++++++++++++++++++++--------
>  include/hw/i386/pc.h   |    5 ++
>  target-i386/kvm.c      |    7 +++
>  target-i386/kvm_i386.h |    1 
>  4 files changed, 96 insertions(+), 14 deletions(-)
> 
> v2: 
> - improve variable names (Juan)
> - consolidate code on kvm_get_clock function (Paolo)
> - return mach_use_reliable_get_clock from needed function (Paolo)
> v3: 
> - simplify check for src_use_reliable_get_clock (Eduardo)
> - move src_use_reliable_get_clock initialization to realize (Eduardo)
> 
> 
> Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> ===================================================================
> --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:07:11.220632761 -0200
> +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-12-01 12:43:18.606294830 -0200
> @@ -36,6 +36,13 @@
>  
>      uint64_t clock;
>      bool clock_valid;
> +
> +    /* whether machine type supports reliable get clock */
> +    bool mach_use_reliable_get_clock;
> +
> +    /* whether the 'clock' value was obtained in a host with
> +     * reliable KVM_GET_CLOCK */
> +    bool clock_is_reliable;
>  } KVMClockState;
>  
>  struct pvclock_vcpu_time_info {
> @@ -81,6 +88,19 @@
>      return nsec + time.system_time;
>  }
>  
> +static uint64_t kvm_get_clock(void)
> +{
> +    struct kvm_clock_data data;
> +    int ret;
> +
> +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> +    if (ret < 0) {
> +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> +                abort();
> +    }
> +    return data.clock;
> +}
> +
>  static void kvmclock_vm_state_change(void *opaque, int running,
>                                       RunState state)
>  {
> @@ -91,15 +111,31 @@
>  
>      if (running) {
>          struct kvm_clock_data data = {};
> -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> +        uint64_t pvclock_via_mem = 0;
>  
> -        s->clock_valid = false;
> +        /*
> +         * If the host where s->clock was read did not support reliable
> +         * KVM_GET_CLOCK, read kvmclock value from memory.
> +         */
> +        if (!s->clock_is_reliable) {
> +            pvclock_via_mem = kvmclock_current_nsec(s);
> +        }
>  
> -        /* We can't rely on the migrated clock value, just discard it */
> -        if (time_at_migration) {
> -            s->clock = time_at_migration;
> +        /* migration/savevm/init restore
> +         * update clock_is_reliable to match local
> +         * host capabilities.
> +         */
> +        if (s->clock_valid == false) {
> +            s->clock_is_reliable = kvm_has_adjust_clock_stable();
>          }
>  

If we are reusing clock_valid for a different purpose, I suggest
we rename it to something clearer, like "clock_is_local" or
"clock_is_stopped".

But I still don't see the need for this convoluted logic, if we
can simply set s->clock_is_reliable and s->clock at the same
time. This even allow us to remove s->clock_valid completely.

> +        /* We can't rely on the saved clock value, just discard it */
> +        if (pvclock_via_mem) {
> +            s->clock = pvclock_via_mem;
> +        }
> +
> +        s->clock_valid = false;
> +

I suggest the following on top of your patch. It removes 27
unnecessary lines from the code.

But, really, I'm tired of this discussion. If you want to keep
the complex logic you suggest and others are happy with it, go
ahead. You just won't get my Reviewed-by.

But in addition to this discussion about migration_is_safe, I
have a question below about kvmclock_pre_save(), which seems to
be a real problem. See below.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
---
 hw/i386/kvm/clock.c | 37 +++++--------------------------------
 1 file changed, 5 insertions(+), 32 deletions(-)

diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
index e179862..359abce 100644
--- a/hw/i386/kvm/clock.c
+++ b/hw/i386/kvm/clock.c
@@ -35,8 +35,6 @@ typedef struct KVMClockState {
     /*< public >*/
 
     uint64_t clock;
-    bool clock_valid;
-
     /* whether machine type supports reliable get clock */
     bool mach_use_reliable_get_clock;
 
@@ -88,7 +86,7 @@ static uint64_t kvmclock_current_nsec(KVMClockState *s)
     return nsec + time.system_time;
 }
 
-static uint64_t kvm_get_clock(void)
+static void kvm_get_clock(KVMClockState *s)
 {
     struct kvm_clock_data data;
     int ret;
@@ -98,7 +96,8 @@ static uint64_t kvm_get_clock(void)
         fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
                 abort();
     }
-    return data.clock;
+    s->clock = data.clock;
+    s->clock_is_reliable = kvm_has_adjust_clock_stable();
 }
 
 static void kvmclock_vm_state_change(void *opaque, int running,
@@ -121,21 +120,11 @@ static void kvmclock_vm_state_change(void *opaque, int running,
             pvclock_via_mem = kvmclock_current_nsec(s);
         }
 
-        /* migration/savevm/init restore
-         * update clock_is_reliable to match local
-         * host capabilities.
-         */
-        if (s->clock_valid == false) {
-            s->clock_is_reliable = kvm_has_adjust_clock_stable();
-        }
-
         /* We can't rely on the saved clock value, just discard it */
         if (pvclock_via_mem) {
             s->clock = pvclock_via_mem;
         }
 
-        s->clock_valid = false;
-
         data.clock = s->clock;
         ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
         if (ret < 0) {
@@ -156,20 +145,8 @@ static void kvmclock_vm_state_change(void *opaque, int running,
             }
         }
     } else {
-
-        if (s->clock_valid) {
-            return;
-        }
-
         kvm_synchronize_all_tsc();
-
-        s->clock = kvm_get_clock();
-        /*
-         * If the VM is stopped, declare the clock state valid to
-         * avoid re-reading it on next vmsave (which would return
-         * a different value). Will be reset when the VM is continued.
-         */
-        s->clock_valid = true;
+        kvm_get_clock(s);
     }
 }
 
@@ -177,10 +154,6 @@ static void kvmclock_realize(DeviceState *dev, Error **errp)
 {
     KVMClockState *s = KVM_CLOCK(dev);
 
-    if (kvm_has_adjust_clock_stable()) {
-        s->clock_is_reliable = true;
-    }
-
     qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
 }
 
@@ -206,7 +179,7 @@ static void kvmclock_pre_save(void *opaque)
 {
     KVMClockState *s = opaque;
 
-    s->clock = kvm_get_clock();
+    kvm_get_clock(s);
 }
 
 static const VMStateDescription kvmclock_vmsd = {


> +
>          data.clock = s->clock;
>          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
>          if (ret < 0) {
> @@ -120,8 +156,6 @@
>              }
>          }
>      } else {
> -        struct kvm_clock_data data;
> -        int ret;
>  
>          if (s->clock_valid) {
>              return;
> @@ -129,13 +163,7 @@
>  
>          kvm_synchronize_all_tsc();
>  
> -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> -        if (ret < 0) {
> -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> -            abort();
> -        }
> -        s->clock = data.clock;
> -
> +        s->clock = kvm_get_clock();
>          /*
>           * If the VM is stopped, declare the clock state valid to
>           * avoid re-reading it on next vmsave (which would return
> @@ -149,25 +177,66 @@
>  {
>      KVMClockState *s = KVM_CLOCK(dev);
>  
> +    if (kvm_has_adjust_clock_stable()) {
> +        s->clock_is_reliable = true;
> +    }
> +
>      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
>  }
>  
> +static bool kvmclock_clock_is_reliable_needed(void *opaque)
> +{
> +    KVMClockState *s = opaque;
> +
> +    return s->mach_use_reliable_get_clock;
> +}
> +
> +static const VMStateDescription kvmclock_reliable_get_clock = {
> +    .name = "kvmclock/clock_is_reliable",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = kvmclock_clock_is_reliable_needed,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_BOOL(clock_is_reliable, KVMClockState),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
> +static void kvmclock_pre_save(void *opaque)
> +{
> +    KVMClockState *s = opaque;
> +

We still need a comment here explaining why we need to re-read
the clock on pre_save if s->clock was already set on
kvmclock_vm_state_change().

Also, what if we are migrating a VM that was already paused 10
minutes ago? Should we migrate the s->clock value from
10 minutes ago, or the one read at pre_save?

> +    s->clock = kvm_get_clock();
> +}
> +
>  static const VMStateDescription kvmclock_vmsd = {
>      .name = "kvmclock",
>      .version_id = 1,
>      .minimum_version_id = 1,
> +    .pre_save = kvmclock_pre_save,
>      .fields = (VMStateField[]) {
>          VMSTATE_UINT64(clock, KVMClockState),
>          VMSTATE_END_OF_LIST()
> +    },
> +    .subsections = (const VMStateDescription * []) {
> +        &kvmclock_reliable_get_clock,
> +        NULL
>      }
>  };
>  
> +static Property kvmclock_properties[] = {
> +    DEFINE_PROP_BOOL("x-mach-use-reliable-get-clock", KVMClockState,
> +                      mach_use_reliable_get_clock, true),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
>  static void kvmclock_class_init(ObjectClass *klass, void *data)
>  {
>      DeviceClass *dc = DEVICE_CLASS(klass);
>  
>      dc->realize = kvmclock_realize;
>      dc->vmsd = &kvmclock_vmsd;
> +    dc->props = kvmclock_properties;
>  }
>  
>  static const TypeInfo kvmclock_info = {
> Index: qemu-mig-advance-clock/include/hw/i386/pc.h
> ===================================================================
> --- qemu-mig-advance-clock.orig/include/hw/i386/pc.h	2016-11-17 15:07:11.220632761 -0200
> +++ qemu-mig-advance-clock/include/hw/i386/pc.h	2016-11-17 15:08:58.096791416 -0200
> @@ -370,6 +370,11 @@
>  #define PC_COMPAT_2_7 \
>      HW_COMPAT_2_7 \
>      {\
> +        .driver   = "kvmclock",\
> +        .property = "x-mach-use-reliable-get-clock",\
> +        .value    = "off",\
> +    },\
> +    {\
>          .driver   = TYPE_X86_CPU,\
>          .property = "l3-cache",\
>          .value    = "off",\
> Index: qemu-mig-advance-clock/target-i386/kvm.c
> ===================================================================
> --- qemu-mig-advance-clock.orig/target-i386/kvm.c	2016-11-17 15:07:11.221632762 -0200
> +++ qemu-mig-advance-clock/target-i386/kvm.c	2016-11-17 15:07:15.867639659 -0200
> @@ -117,6 +117,13 @@
>      return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
>  }
>  
> +bool kvm_has_adjust_clock_stable(void)
> +{
> +    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
> +
> +    return (ret == KVM_CLOCK_TSC_STABLE);
> +}
> +
>  bool kvm_allows_irq0_override(void)
>  {
>      return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
> Index: qemu-mig-advance-clock/target-i386/kvm_i386.h
> ===================================================================
> --- qemu-mig-advance-clock.orig/target-i386/kvm_i386.h	2016-11-17 15:07:11.222632764 -0200
> +++ qemu-mig-advance-clock/target-i386/kvm_i386.h	2016-11-17 15:07:15.867639659 -0200
> @@ -17,6 +17,7 @@
>  
>  bool kvm_allows_irq0_override(void);
>  bool kvm_has_smm(void);
> +bool kvm_has_adjust_clock_stable(void);
>  void kvm_synchronize_all_tsc(void);
>  void kvm_arch_reset_vcpu(X86CPU *cs);
>  void kvm_arch_do_init_vcpu(X86CPU *cs);

-- 
Eduardo

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [qemu patch V3 2/2] kvmclock: reduce kvmclock difference on migration
  2016-12-05 15:17                       ` Eduardo Habkost
@ 2016-12-05 17:20                         ` Marcelo Tosatti
  2016-12-06  0:47                           ` Eduardo Habkost
  0 siblings, 1 reply; 18+ messages in thread
From: Marcelo Tosatti @ 2016-12-05 17:20 UTC (permalink / raw)
  To: Eduardo Habkost
  Cc: kvm, qemu-devel, Dr David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Radim Krcmar

On Mon, Dec 05, 2016 at 01:17:42PM -0200, Eduardo Habkost wrote:
> On Fri, Dec 02, 2016 at 08:30:35PM -0200, Marcelo Tosatti wrote:
> > On Fri, Dec 02, 2016 at 11:56:09AM -0200, Eduardo Habkost wrote:
> > > On Thu, Dec 01, 2016 at 12:39:03PM -0200, Marcelo Tosatti wrote:
> > > > On Wed, Nov 30, 2016 at 11:09:28AM -0200, Eduardo Habkost wrote:
> > > > > On Tue, Nov 29, 2016 at 09:54:29PM -0200, Marcelo Tosatti wrote:
> > > > > > On Tue, Nov 29, 2016 at 10:34:05AM -0200, Eduardo Habkost wrote:
> > > > > > > On Tue, Nov 29, 2016 at 08:56:00AM -0200, Marcelo Tosatti wrote:
> > > > > > > > On Mon, Nov 28, 2016 at 03:12:01PM -0200, Eduardo Habkost wrote:
> > > > > > > > > On Mon, Nov 28, 2016 at 02:45:24PM -0200, Marcelo Tosatti wrote:
> > > > > > > > > > On Mon, Nov 28, 2016 at 12:13:22PM -0200, Eduardo Habkost wrote:
> > > > > > > > > > > Sorry for not noticing the following issues on the previous
> > > > > > > > > > > reviews. I was only paying attention to the vmstate and machine
> > > > > > > > > > > code:
> > > > > > > > > > > 
> > > > > > > > > > > On Mon, Nov 21, 2016 at 08:50:04AM -0200, Marcelo Tosatti wrote:
> > > > > > > > > > > > Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
> > > > > > > > > > > > indicates that KVM_GET_CLOCK returns a value as seen by the guest at
> > > > > > > > > > > > that moment.
> > > > > > > > > > > > 
> > > > > > > > > > > > For new machine types, use this value rather than reading 
> > > > > > > > > > > > from guest memory.
> > > > > > > > > > > > 
> > > > > > > > > > > > This reduces kvmclock difference on migration from 5s to 0.1s
> > > > > > > > > > > > (when max_downtime == 5s).
> > > > > > > > > > > > 
> > > > > > > > > > > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > > > > > > > > > > 
> > > > > > > > > > > > ---
> > > > > > > > > > > >  hw/i386/kvm/clock.c    |  107 ++++++++++++++++++++++++++++++++++++++++++-------
> > > > > > > > > > > >  include/hw/i386/pc.h   |    5 ++
> > > > > > > > > > > >  target-i386/kvm.c      |    7 +++
> > > > > > > > > > > >  target-i386/kvm_i386.h |    1 
> > > > > > > > > > > >  4 files changed, 106 insertions(+), 14 deletions(-)
> > > > > > > > > > > > 
> > > > > > > > > > > > v2: 
> > > > > > > > > > > > - improve variable names (Juan)
> > > > > > > > > > > > - consolidate code on kvm_get_clock function (Paolo)
> > > > > > > > > > > > - return mach_use_reliable_get_clock from needed function (Paolo)
> > > > > > > > > > > > v3: 
> > > > > > > > > > > > - simplify check for src_use_reliable_get_clock (Eduardo)
> > > > > > > > > > > > - move src_use_reliable_get_clock initialization to realize (Eduardo)
> > > > > > > > > > > > 
> > > > > > > > > > > > 
> > > > > > > > > > > > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > > > > > > > > > > > ===================================================================
> > > > > > > > > > > > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:07:11.220632761 -0200
> > > > > > > > > > > > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-17 15:11:51.372048640 -0200
> > > > > > > > > > > > @@ -36,6 +36,12 @@
> > > > > > > > > > > >  
> > > > > > > > > > > >      uint64_t clock;
> > > > > > > > > > > >      bool clock_valid;
> > > > > > > > > > > > +
> > > > > > > > > > > > +    /* whether machine supports reliable KVM_GET_CLOCK */
> > > > > > > > > > > > +    bool mach_use_reliable_get_clock;
> > > > > > > > > > > > +
> > > > > > > > > > > > +    /* whether source host supported reliable KVM_GET_CLOCK */
> > > > > > > > > > > > +    bool src_use_reliable_get_clock;
> > > > > > > > > > > >  } KVMClockState;
> > > > > > > > > > > >  
> > > > > > > > > > > >  struct pvclock_vcpu_time_info {
> > > > > > > > > > > > @@ -81,6 +87,19 @@
> > > > > > > > > > > >      return nsec + time.system_time;
> > > > > > > > > > > >  }
> > > > > > > > > > > >  
> > > > > > > > > > > > +static uint64_t kvm_get_clock(void)
> > > > > > > > > > > > +{
> > > > > > > > > > > > +    struct kvm_clock_data data;
> > > > > > > > > > > > +    int ret;
> > > > > > > > > > > > +
> > > > > > > > > > > > +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > > > > > > > > > > > +    if (ret < 0) {
> > > > > > > > > > > > +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > > > > > > > > > > > +                abort();
> > > > > > > > > > > > +    }
> > > > > > > > > > > > +    return data.clock;
> > > > > > > > > > > > +}
> > > > > > > > > > > > +
> > > > > > > > > > > >  static void kvmclock_vm_state_change(void *opaque, int running,
> > > > > > > > > > > >                                       RunState state)
> > > > > > > > > > > >  {
> > > > > > > > > > > > @@ -91,15 +110,36 @@
> > > > > > > > > > > 
> > > > > > > > > > > Can you please use "diff -p" on your patches?
> > > > > > > > > > > 
> > > > > > > > > > > >  
> > > > > > > > > > > >      if (running) {
> > > > > > > > > > > >          struct kvm_clock_data data = {};
> > > > > > > > > > > > -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> > > > > > > > > > > > +        uint64_t pvclock_via_mem = 0;
> > > > > > > > > > > >  
> > > > > > > > > > > > -        s->clock_valid = false;
> > > > > > > > > > > > +        /* local (running VM) restore */
> > > > > > > > > > > > +        if (s->clock_valid) {
> > > > > > > > > > > > +            /*
> > > > > > > > > > > > +             * if host does not support reliable KVM_GET_CLOCK,
> > > > > > > > > > > > +             * read kvmclock value from memory
> > > > > > > > > > > > +             */
> > > > > > > > > > > > +            if (!kvm_has_adjust_clock_stable()) {
> > > > > > > > > > > 
> > > > > > > > > > > Why not use src_use_reliable_get_clock here, too? (Maybe rename
> > > > > > > > > > > it to simply clock_is_reliable?)
> > > > > > > > > > 
> > > > > > > > > > Because you end up mixing two mental ideas (one: whether the source
> > > > > > > > > > has KVM_GET_CLOCK, second: whether the destination has KVM_GET_CLOCK 
> > > > > > > > > > into one variable). I find it more confusing than not.
> > > > > > > > > > Maybe its just my limited brain but i find it
> > > > > > > > > > confusing.
> > > > > > > > > 
> > > > > > > > > I find it simpler and easier to reason about.
> > > > > > > > 
> > > > > > > > I disagree, but i don't mind. Is this what you refer to:
> > > > > > > > 
> > > > > > > > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > > > > > > > ===================================================================
> > > > > > > > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:11:51.372048640 -0200
> > > > > > > > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-29 08:53:20.609817026 -0200
> > > > > > > > @@ -37,11 +37,11 @@
> > > > > > > >      uint64_t clock;
> > > > > > > >      bool clock_valid;
> > > > > > > >  
> > > > > > > > -    /* whether machine supports reliable KVM_GET_CLOCK */
> > > > > > > > +    /* whether machine type supports reliable get clock */
> > > > > > > >      bool mach_use_reliable_get_clock;
> > > > > > > >  
> > > > > > > > -    /* whether source host supported reliable KVM_GET_CLOCK */
> > > > > > > > -    bool src_use_reliable_get_clock;
> > > > > > > > +    /* whether clock is reliable */
> > > > > > > 
> > > > > > > I would make the comment more explicit about its meaning:
> > > > > > >   /* whether the 'clock' value was obtained in a host with
> > > > > > >    * reliable KVM_GET_CLOCK */
> > > > > > 
> > > > > > Done.
> > > > > > 
> > > > > > > > +    bool clock_is_reliable;
> > > > > > >
> > > > > > >  } KVMClockState;
> > > > > > > >  
> > > > > > > >  struct pvclock_vcpu_time_info {
> > > > > > > > @@ -112,25 +112,17 @@
> > > > > > > >          struct kvm_clock_data data = {};
> > > > > > > >          uint64_t pvclock_via_mem = 0;
> > > > > > > >  
> > > > > > > > -        /* local (running VM) restore */
> > > > > > > > -        if (s->clock_valid) {
> > > > > > > > -            /*
> > > > > > > > -             * if host does not support reliable KVM_GET_CLOCK,
> > > > > > > > -             * read kvmclock value from memory
> > > > > > > > -             */
> > > > > > > > -            if (!kvm_has_adjust_clock_stable()) {
> > > > > > > > -                pvclock_via_mem = kvmclock_current_nsec(s);
> > > > > > > > -            }
> > > > > > > > -        /* migration/savevm/init restore */
> > > > > > > > -        } else {
> > > > > > > > -            /*
> > > > > > > > -             * use s->clock in case machine uses reliable
> > > > > > > > -             * get clock and source host supported
> > > > > > > > -             * reliable get clock
> > > > > > > > -             */
> > > > > > > > -            if (!s->src_use_reliable_get_clock) {
> > > > > > > > -                pvclock_via_mem = kvmclock_current_nsec(s);
> > > > > > > > -            }
> > > > > > > > +
> > > > > > > 
> > > > > > > A comment here would be welcome. Something like: "If the host
> > > > > > > where s->clock was read did not support reliable KVM_GET_CLOCK,
> > > > > > > read kvmclock value from memory".
> > > > > > 
> > > > > > Done.
> > > > > > 
> > > > > > > > +        if (!s->clock_is_reliable) {
> > > > > > > > +            pvclock_via_mem = kvmclock_current_nsec(s);
> > > > > > > > +        }
> > > > > > > > +
> > > > > > > > +        /* migration/savevm/init restore
> > > > > > > > +         * update clock_is_reliable to match local
> > > > > > > > +         * host capabilities.
> > > > > > > > +         */
> > > > > > > > +        if (s->clock_valid == false) {
> > > > > > > 
> > > > > > > Why the s->clock_valid check? It seems unnecessary.
> > > > > > 
> > > > > > Because if s->clock_valid == false (from migration),
> > > > > > kvm_has_adjust_clock_stable() from the local host is irrelevant.
> > > > > 
> > > > > That's true, and that's why updating it on kvm_get_clock() looks
> > > > > like a more logical place.
> > > > 
> > > > But the current patch updates
> > > > s->clock_is_reliable at the "if (running)"
> > > > case of kvmclock_vm_state_change().
> > > > 
> > > > There, both s->clock_is_reliable and s->clock are
> > > > updated in sync:
> > > > 
> > > >         if (s->clock_valid == false) {
> > > >             s->clock_is_reliable = kvm_has_adjust_clock_stable();
> > > >         }
> > > > 
> > > >         /* We can't rely on the saved clock value, just discard it */
> > > >         if (pvclock_via_mem) {
> > > >             s->clock = pvclock_via_mem;
> > > >         }
> > > > 
> > > 
> > > Here s->clock is updated only to be used to set data.clock in the
> > > next line (and never used again). I'm talking about every other
> > > s->clock assignment in the code.
> > > 
> > > > However, there is no need to call kvm_get_clock() there.
> > > 
> > > I'm talking about the existing kvm_get_clock() calls.
> > > 
> > > > 
> > > > kvm_get_clock() is called at the "} else {" case.
> > > > 
> > > > Are you suggesting to
> > > > 
> > > > * move s->clock_is_reliable assignment to kvm_get_clock()
> > > > * move s->clock assignment to kvm_get_clock()
> > > > * call kvm_get_clock() from the "if (running)" because
> > > > the assignments are in that function now.
> > > 
> > > No.
> > > 
> > > > 
> > > > But you don't need KVM_GET_CLOCK to be called there.
> > > 
> > > This is what I'm suggesting:
> > > 
> > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > [ehabkost: simplified migration logic]
> > > Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
> > > ---
> > >  hw/i386/kvm/clock.c    | 95 ++++++++++++++++++++++++++++++++++++++++++--------
> > >  include/hw/i386/pc.h   |  5 +++
> > >  target-i386/kvm.c      |  7 ++++
> > >  target-i386/kvm_i386.h |  1 +
> > >  4 files changed, 94 insertions(+), 14 deletions(-)
> > > 
> > > diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
> > > index 0f75dd3..df07ec7 100644
> > > --- a/hw/i386/kvm/clock.c
> > > +++ b/hw/i386/kvm/clock.c
> > > @@ -35,6 +35,13 @@ typedef struct KVMClockState {
> > >      /*< public >*/
> > >  
> > >      uint64_t clock;
> > > +    /* whether the value in 'clock' is reliable (i.e. was set by a host
> > > +     * with reliable KVM_GET_CLOCK) */
> > > +    bool clock_is_reliable;
> > > +    /* Enable migration of clock_is_reliable. Some machine-types
> > > +     * will disable it for compatibility with older QEMU versions.
> > > +     */
> > > +    bool migrate_reliable_flag;
> > >      bool clock_valid;
> > >  } KVMClockState;
> > >  
> > > @@ -81,6 +88,20 @@ static uint64_t kvmclock_current_nsec(KVMClockState *s)
> > >      return nsec + time.system_time;
> > >  }
> > >  
> > > +static void kvm_get_clock(KVMClockState *s)
> > > +{
> > > +    struct kvm_clock_data data;
> > > +    int ret;
> > > +
> > > +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > > +    if (ret < 0) {
> > > +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > > +        abort();
> > > +    }
> > > +    s->clock = data.clock;
> > > +    s->clock_is_reliable = kvm_has_adjust_clock_stable();
> > > +}
> > > +
> > 
> > s->clock_is_reliable has to updated at migration, after 
> > the first time KVM_SET_CLOCK is called. There is no other
> > place where it should be updated.
> 
> I don't see why you claim that. We can simply update
> s->clock_is_reliable the next time s->clock is updated. The only
> code that reads s->clock checks s->clock_is_reliable first.
> s->clock_is_reliable is a flag _about_ the value in s->clock. I
> don't see why you want to update them at different times, if it
> means more complex logic.

Because the table of (wanted) behaviour dictates that.

> > This is the behaviour 
> > desired, according to this table:
> > 
> > index:
> > mig->c means use s->clock value from migration
> > mig->m means read pvclock value from memory 
> > (both at migration, only the first time kvmclock_change_vmstate
> > is called).
> > 
> > runt->c means use KVM_GET_CLOCK/KVM_SET_CLOCK at runtime
> > runt->m means read pvclock value from memory
> > (both at runtime, after the first time kvmclock_change_vmstate
> > is called).
> > 
> > SOURCE                DESTINATION             BEHAVIOUR
> > get_clock_reliable    get_clock_reliable      mig->c,runt->c  1
> > get_clock_reliable    ~get_clock_reliable     mig->c,runt->m  2
> > ~get_clock_reliable   get_clock_reliable      mig->m,runt->c  3
> > ~get_clock_reliable   ~get_clock_reliable     mig->m,runt->m  4
> > 
> > So looking at your patch below, and thinking of 
> > kvm_get_clock() updating s->clock_is_reliable from 
> > !running case (that is RUNNING -> ~RUNNING transition)
> > makes no sense at all. That should not happen.
> 
> Why not?

Because we want the behaviour of the table above as follows:

First execution of ~RUNNING -> RUNNING transition:

* If source host supports reliable KVM_GET_CLOCK, then 
its not necessary to read from guest memory (because there
is no danger of KVM_GET_CLOCK returning a value which will cause 
the guest to crash).

* If source host does NOT support reliable KVM_GET_CLOCK, then 
it is necessary to read from guest memory (because there
is danger of KVM_GET_CLOCK returning a value which will cause 
the guest to crash).

Second and subsequent executions of ~RUNNING -> RUNNING transition:
(say if you pause the guest and continue it):

* If host supports reliable KVM_GET_CLOCK, then 
its not necessary to read from guest memory (because there
is no danger of KVM_GET_CLOCK returning a value which will cause 
the guest to crash).

* If  host does NOT support reliable KVM_GET_CLOCK, then 
it is necessary to read from guest memory (because there
is danger of KVM_GET_CLOCK returning a value which will cause 
the guest to crash).

Does that make sense?

> > s->clock_is_reliable should be updated after the first
> > time ~RUNNING -> RUNNING migration is done, which 
> > is my patch does.
> 
> I don't see why. All we need is that s->clock_is_reliable get
> updated before any code try to read s->clock again.

> > I think your s->clock,s->clock_is_reliable should be in sync
> > suggestion comes from a misunderstanding of how they are used.
> > 
> > s->clock is not read if s->clock_is_reliable is set.
> 
> Did you mean "s->clock is not read if s->clock_is_reliable is
> false"? You are right (except when kvmclock_current_nsec()==0?).
> But this also means that the only code that reads s->clock also
> checks s->clock_is_reliable first.
> 
> > 
> > Look at my attached patch below (still untested, will test and then
> > resubmit) and check whether it matches the behaviour 
> > described in the table above.
> 
> It seems to match. But I don't see why my patch doesn't match the
> table above.

Its just fragile Eduardo: if anyone comes and changes the
code as follows.

Source: Does not support reliable KVM_GET_CLOCK.
Destination: Does support reliable KVM_GET_CLOCK.

* Migration from remote host that does not support
  reliable KVM_GET_CLOCK, to a host that supports it.

* Some modification is done so that:

    1) Incoming migration sets s->clock_is_reliable=false from source.
    2) _Before_ vm_start(), new code decides to issue kvm_get_clock().

Your code sets s->clock_reliable = true (because local host supports
it), and failure. Do you see the problem?

(*)

> > >  static void kvmclock_vm_state_change(void *opaque, int running,
> > >                                       RunState state)
> > >  {
> > > @@ -91,16 +112,18 @@ static void kvmclock_vm_state_change(void *opaque, int running,
> > >  
> > >      if (running) {
> > >          struct kvm_clock_data data = {};
> > > -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> > >  
> > >          s->clock_valid = false;
> > >  
> > > -        /* We can't rely on the migrated clock value, just discard it */
> > > -        if (time_at_migration) {
> > > -            s->clock = time_at_migration;
> > > +        /* if host that set s->clock did not support reliable KVM_GET_CLOCK,
> > > +         * read kvmclock value from memory
> > > +         */
> > > +        if (!s->clock_is_reliable) {
> > > +            data.clock = kvmclock_current_nsec(s);
> > > +        } else {
> > > +            data.clock = s->clock;
> > >          }
> > >  
> > > -        data.clock = s->clock;
> > >          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
> > >          if (ret < 0) {
> > >              fprintf(stderr, "KVM_SET_CLOCK failed: %s\n", strerror(ret));
> > > @@ -120,22 +143,23 @@ static void kvmclock_vm_state_change(void *opaque, int running,
> > 
> > >              }
> > >          }
> > >      } else {
> > > -        struct kvm_clock_data data;
> > > -        int ret;
> > >  
> > > +        /*FIXME: is this really possible to happen?
> > > +         * Can kvmclock_vm_state_change(s, 0) be called twice without
> > > +         * a kvmclock_vm_state_change(s, 1) call in between?
> > > +         */
> > 
> > It can't: check the code where notifiers are called.
> 
> Yet another reason to simplify the code: if we don't need
> different code paths for local stop/resume vs migration, we can
> remove s->clock_valid completely.

Yes it can be removed... But i'm just trying to fix a bug, not rewrite
code while at it.

> > >          if (s->clock_valid) {
> > >              return;
> > >          }
> > >  
> > >          kvm_synchronize_all_tsc();
> > >  
> > > -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > > -        if (ret < 0) {
> > > -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > > -            abort();
> > > -        }
> > > -        s->clock = data.clock;
> > > -
> > > +        kvm_get_clock(s);
> > > +        /* FIXME: the comment below doesn't match what is done at
> > > +         * kvmclock_pre_save(). The comment must be updated, or
> > > +         * kvmclock_pre_save() must be changed. I don't know what's
> > > +         * the right thing to do.
> > > +         */
> 
> We still need to address this comment. Your patch keeps the
> comment that contradicts kvmclock_pre_save().

The comment refers to the fact that you should not do:

    Event stop_vm()
       saved_clock1 = KVM_GET_CLOCK.

    Then again with the VM stopped, overwrite saved_clock:
        saved_clock2 = KVM_GET_CLOCK.

    You should always use saved_clock1. Thats what its about.

Updated it.

> 
> > 
> > This fails for all cases (1,2,3,4) in the table above.
> 
> I can't see why it fails. The ordering of events from migration
> followed by vm stop/resume should be:
> 
>     | Event                       | s->clock         | s->clock_is_reliable
> src | kvmclock_vm_state_change(0) | -                | -
> src |   kvm_get_clock()           | TIME-AT-VM-STOP  | CAP_ADJUST_CLOCK-on-SRC
> src | kvmclock_pre_save()         | TIME-AT-VM-STOP  | CAP_ADJUST_CLOCK-on-SRC
> src |   kvm_get_clock()           | TIME-AT-PRE-SAVE | CAP_ADJUST_CLOCK-on-SRC
> --- | MIGRATION                   | TIME-AT-PRE-SAVE | CAP_ADJUST_CLOCK-on-SRC
> dst | kvmclock_vm_state_change(1) | TIME-AT-PRE-SAVE | CAP_ADJUST_CLOCK-on-SRC
> dst |   KVM_SET_CLOCK             | TIME-AT-PRE-SAVE | CAP_ADJUST_CLOCK-on-SRC
> dst | kvmclock_vm_state_change(0) | TIME-AT-PRE-SAVE | CAP_ADJUST_CLOCK-on-SRC
> dst |   kvm_get_clock()           | TIME-AT-VM-STOP  | CAP_ADJUST_CLOCK-on-DEST
> dst | kvmclock_vm_state_change(1) | TIME-AT-VM-STOP  | CAP_ADJUST_CLOCK-on-DEST
> dst |   KVM_SET_CLOCK             | TIME-AT-VM-STOP  | CAP_ADJUST_CLOCK-on-DEST
> 
> Specific cases from your table are below. See the KVM_SET_CLOCK
> rows.
> 
> Case 1:
> 
>     | Event                       | s->clock         | s->clock_is_reliable
> src | kvmclock_vm_state_change(0) | -                | -
> src |   kvm_get_clock()           | TIME-AT-VM-STOP  | 1
> src | kvmclock_pre_save()         | TIME-AT-VM-STOP  | 1
> src |   kvm_get_clock()           | TIME-AT-PRE-SAVE | 1
> --- | MIGRATION                   | TIME-AT-PRE-SAVE | 1
> dst | kvmclock_vm_state_change(1) | TIME-AT-PRE-SAVE | 1
> dst |   KVM_SET_CLOCK(s->clock)   | TIME-AT-PRE-SAVE | 1
> dst | kvmclock_vm_state_change(0) | TIME-AT-PRE-SAVE | 1
> dst |   kvm_get_clock()           | TIME-AT-VM-STOP  | 1
> dst | kvmclock_vm_state_change(1) | TIME-AT-VM-STOP  | 1
> dst |   KVM_SET_CLOCK(s->clock)   | TIME-AT-VM-STOP  | 1
> 
> Case 2:
> 
>     | Event                       | s->clock         | s->clock_is_reliable
> src | kvmclock_vm_state_change(0) | -                | -
> src |   kvm_get_clock()           | TIME-AT-VM-STOP  | 1
> src | kvmclock_pre_save()         | TIME-AT-VM-STOP  | 1
> src |   kvm_get_clock()           | TIME-AT-PRE-SAVE | 1
> --- | MIGRATION                   | TIME-AT-PRE-SAVE | 1
> dst | kvmclock_vm_state_change(1) | TIME-AT-PRE-SAVE | 1
> dst |   KVM_SET_CLOCK(s->clock)   | TIME-AT-PRE-SAVE | 1
> dst | kvmclock_vm_state_change(0) | TIME-AT-PRE-SAVE | 1
> dst |   kvm_get_clock()           | TIME-AT-VM-STOP  | 0
> dst | kvmclock_vm_state_change(1) | TIME-AT-VM-STOP  | 0
> dst |   KVM_SET_CLOCK(from_mem)   | TIME-AT-VM-STOP  | 0
> 
> Case 3:
> 
>     | Event                       | s->clock         | s->clock_is_reliable
> src | kvmclock_vm_state_change(0) | -                | -
> src |   kvm_get_clock()           | TIME-AT-VM-STOP  | 0
> src | kvmclock_pre_save()         | TIME-AT-VM-STOP  | 0
> src |   kvm_get_clock()           | TIME-AT-PRE-SAVE | 0
> --- | MIGRATION                   | TIME-AT-PRE-SAVE | 0
> dst | kvmclock_vm_state_change(1) | TIME-AT-PRE-SAVE | 0
> dst |   KVM_SET_CLOCK(from_mem)   | TIME-AT-PRE-SAVE | 0
> dst | kvmclock_vm_state_change(0) | TIME-AT-PRE-SAVE | 0
> dst |   kvm_get_clock()           | TIME-AT-VM-STOP  | 1
> dst | kvmclock_vm_state_change(1) | TIME-AT-VM-STOP  | 1
> dst |   KVM_SET_CLOCK(s->clock)   | TIME-AT-VM-STOP  | 1
> 
> Case 4:
> 
>     | Event                       | s->clock         | s->clock_is_reliable
> src | kvmclock_vm_state_change(0) | -                | -
> src |   kvm_get_clock()           | TIME-AT-VM-STOP  | 0
> src | kvmclock_pre_save()         | TIME-AT-VM-STOP  | 0
> src |   kvm_get_clock()           | TIME-AT-PRE-SAVE | 0
> --- | MIGRATION                   | TIME-AT-PRE-SAVE | 0
> dst | kvmclock_vm_state_change(1) | TIME-AT-PRE-SAVE | 0
> dst |   KVM_SET_CLOCK(from_mem)   | TIME-AT-PRE-SAVE | 0
> dst | kvmclock_vm_state_change(0) | TIME-AT-PRE-SAVE | 0
> dst |   kvm_get_clock()           | TIME-AT-VM-STOP  | 0
> dst | kvmclock_vm_state_change(1) | TIME-AT-VM-STOP  | 0
> dst |   KVM_SET_CLOCK(from_mem)   | TIME-AT-VM-STOP  | 0
> 
> 
> 
> > 
> > >          /*
> > >           * If the VM is stopped, declare the clock state valid to
> > >           * avoid re-reading it on next vmsave (which would return
> > > @@ -152,22 +176,65 @@ static void kvmclock_realize(DeviceState *dev, Error **errp)
> > >      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> > >  }
> > >  
> > > +static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> > > +{
> > > +    KVMClockState *s = opaque;
> > > +
> > > +    return s->migrate_reliable_flag;
> > > +}
> > > +
> > > +static const VMStateDescription kvmclock_reliable_get_clock = {
> > > +    .name = "kvmclock/clock_is_reliable",
> > > +    .version_id = 1,
> > > +    .minimum_version_id = 1,
> > > +    .needed = kvmclock_src_use_reliable_get_clock,
> > > +    .fields = (VMStateField[]) {
> > > +        VMSTATE_BOOL(clock_is_reliable, KVMClockState),
> > > +        VMSTATE_END_OF_LIST()
> > > +    }
> > > +};
> > > +
> > > +static void kvmclock_pre_save(void *opaque)
> > > +{
> > > +    KVMClockState *s = opaque;
> > > +    /*TODO: add comment explaining why this is here:
> > > +     * "re-read clock value here because [...]"" */
> > > +
> > > +    /*XXX: is it correct to call kvm_get_clock() here
> > > +     * if s->clock_is_valid is true? (meaning the kvmclock was
> > > +     * already stopped)
> > > +     */
> > > +    kvm_get_clock(s);
> > > +}
> > > +
> > >  static const VMStateDescription kvmclock_vmsd = {
> > >      .name = "kvmclock",
> > >      .version_id = 1,
> > >      .minimum_version_id = 1,
> > > +    .pre_save = kvmclock_pre_save,
> > >      .fields = (VMStateField[]) {
> > >          VMSTATE_UINT64(clock, KVMClockState),
> > >          VMSTATE_END_OF_LIST()
> > > +    },
> > > +    .subsections = (const VMStateDescription * []) {
> > > +        &kvmclock_reliable_get_clock,
> > > +        NULL
> > >      }
> > >  };
> > >  
> > > +static Property kvmclock_properties[] = {
> > > +    DEFINE_PROP_BOOL("x-migrate-reliable-flag", KVMClockState,
> > > +                      migrate_reliable_flag, true),
> > > +    DEFINE_PROP_END_OF_LIST(),
> > > +};
> > > +
> > >  static void kvmclock_class_init(ObjectClass *klass, void *data)
> > >  {
> > >      DeviceClass *dc = DEVICE_CLASS(klass);
> > >  
> > >      dc->realize = kvmclock_realize;
> > >      dc->vmsd = &kvmclock_vmsd;
> > > +    dc->props = kvmclock_properties;
> > >  }
> > >  
> > >  static const TypeInfo kvmclock_info = {
> > > diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> > > index 4b74130..fb80d69 100644
> > > --- a/include/hw/i386/pc.h
> > > +++ b/include/hw/i386/pc.h
> > > @@ -372,6 +372,11 @@ bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *);
> > >  #define PC_COMPAT_2_7 \
> > >      HW_COMPAT_2_7 \
> > >      {\
> > > +        .driver   = "kvmclock",\
> > > +        .property = "x-migrate-reliable-flag",\
> > > +        .value    = "off",\
> > > +    },\
> > > +    {\
> > >          .driver   = TYPE_X86_CPU,\
> > >          .property = "l3-cache",\
> > >          .value    = "off",\
> > > diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> > > index f62264a..10a9cd8 100644
> > > --- a/target-i386/kvm.c
> > > +++ b/target-i386/kvm.c
> > > @@ -117,6 +117,13 @@ bool kvm_has_smm(void)
> > >      return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
> > >  }
> > >  
> > > +bool kvm_has_adjust_clock_stable(void)
> > > +{
> > > +    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
> > > +
> > > +    return (ret == KVM_CLOCK_TSC_STABLE);
> > > +}
> > > +
> > >  bool kvm_allows_irq0_override(void)
> > >  {
> > >      return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
> > > diff --git a/target-i386/kvm_i386.h b/target-i386/kvm_i386.h
> > > index 7607929..bfce427 100644
> > > --- a/target-i386/kvm_i386.h
> > > +++ b/target-i386/kvm_i386.h
> > > @@ -17,6 +17,7 @@
> > >  
> > >  bool kvm_allows_irq0_override(void);
> > >  bool kvm_has_smm(void);
> > > +bool kvm_has_adjust_clock_stable(void);
> > >  void kvm_synchronize_all_tsc(void);
> > >  void kvm_arch_reset_vcpu(X86CPU *cs);
> > >  void kvm_arch_do_init_vcpu(X86CPU *cs);
> > > -- 
> > > 2.7.4
> > > 
> > > 
> > > > 
> > > > 
> > > > > 
> > > > > > 
> > > > > > If s->clock_valid == true, kvm_has_adjust_clock_stable() is
> > > > > > what decides whether to use KVM_GET_CLOCK or not.
> > > > > > 
> > > > > > > (If this is an optimization to avoid extra ioctl() calls, you can
> > > > > > > cache it inside kvm_has_adjust_clock_stable()).
> > > > > > 
> > > > > > Its not.
> > > > > > 
> > > > > > > > +            s->clock_is_reliable = kvm_has_adjust_clock_stable();
> > > > > > > >          }
> > > > > > > 
> > > > > > > This should go to kvm_get_clock(), so s->clock and
> > > > > > > s->clock_is_reliable are guaranteed to be always in sync.
> > > > > > 
> > > > > > No: the point to change clock_is_reliable is the following: whether 
> > > > > > its the first read of the clock (migration), or subsequent 
> > > > > > reads (normal machine operation).
> > > > > > 
> > > > > > Now kvm_get_clock() just reads the clock, so i don't see it as
> > > > > > the right place to change the variable.
> > > > > > 
> > > > > > Agreed?
> > > > > 
> > > > > I don't agree. clock_is_reliable is information about the current
> > > > > value in s->clock, I don't see why we need to change s->clock and
> > > > > s->clock_is_reliable at different places in the code.
> > > > > 
> > > > > Your code might work too, but I can't tell without reviewing very
> > > > > carefully the state transitions KVMClockState could go through. I
> > > > > don't see the point of making the state transitions in this code
> > > > > even more complex, if we can simply stick to a simple "s->clock
> > > > > and s->clock_is_reliable are always in sync" rule.
> > > > 
> > > > It is updating from the same place:
> > > > 
> > > >         /* migration/savevm/init restore
> > > >          * update clock_is_reliable to match local
> > > >          * host capabilities.
> > > >          */
> > > >         if (s->clock_valid == false) {
> > > >             s->clock_is_reliable = kvm_has_adjust_clock_stable();
> > > >         }
> > > > 
> > > >         /* We can't rely on the saved clock value, just discard it */
> > > >         if (pvclock_via_mem) {
> > > >             s->clock = pvclock_via_mem;
> > > >         }
> > > 
> > > I'm talking about the other s->clock assignments. This one
> > > doesn't even seem to be necessary.
> > 
> > It is necessary: this is how you read pvclock from memory.
> 
> It is not necessary if you set data.clock directly.
> 
> But this doesn't matter for this discussion. My new patch below
> keeps that line, just to make the diff smaller.
> 
> > 
> > > > 
> > > > 
> > > > > 
> > > > > > 
> > > > > > > >          /* We can't rely on the saved clock value, just discard it */
> > > > > > > > @@ -181,20 +173,14 @@
> > > > > > > >  {
> > > > > > > >      KVMClockState *s = KVM_CLOCK(dev);
> > > > > > > >  
> > > > > > > > -    /*
> > > > > > > > -     * On machine types that support reliable KVM_GET_CLOCK,
> > > > > > > > -     * if host kernel does provide reliable KVM_GET_CLOCK,
> > > > > > > > -     * set src_use_reliable_get_clock=true so that destination
> > > > > > > > -     * avoids reading kvmclock from memory.
> > > > > > > > -     */
> > > > > > > > -    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> > > > > > > > -        s->src_use_reliable_get_clock = true;
> > > > > > > > +    if (kvm_has_adjust_clock_stable()) {
> > > > > > > > +        s->clock_is_reliable = true;
> > > > > > > >      }
> > > > > > > >  
> > > > > > > >      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> > > > > > > >  }
> > > > > > > >  
> > > > > > > > -static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> > > > > > > > +static bool kvmclock_clock_is_reliable_needed(void *opaque)
> > > > > > > >  {
> > > > > > > >      KVMClockState *s = opaque;
> > > > > > > >  
> > > > > > > > @@ -202,12 +188,12 @@
> > > > > > > >  }
> > > > > > > >  
> > > > > > > >  static const VMStateDescription kvmclock_reliable_get_clock = {
> > > > > > > > -    .name = "kvmclock/src_use_reliable_get_clock",
> > > > > > > > +    .name = "kvmclock/clock_is_reliable",
> > > > > > > >      .version_id = 1,
> > > > > > > >      .minimum_version_id = 1,
> > > > > > > > -    .needed = kvmclock_src_use_reliable_get_clock,
> > > > > > > > +    .needed = kvmclock_clock_is_reliable_needed,
> > > > > > > >      .fields = (VMStateField[]) {
> > > > > > > > -        VMSTATE_BOOL(src_use_reliable_get_clock, KVMClockState),
> > > > > > > > +        VMSTATE_BOOL(clock_is_reliable, KVMClockState),
> > > > > > > >          VMSTATE_END_OF_LIST()
> > > > > > > >      }
> > > > > > > >  };
> > > > > > > 
> > > > > > > -- 
> > > > > > > Eduardo
> > > > > 
> > > > > -- 
> > > > > Eduardo
> > > 
> > > -- 
> > > Eduardo
> > 
> > 
> > kvmclock: reduce kvmclock difference on migration
> > 
> > Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
> > indicates that KVM_GET_CLOCK returns a value as seen by the guest at
> > that moment.
> > 
> > For new machine types, use this value rather than reading 
> > from guest memory.
> > 
> > This reduces kvmclock difference on migration from 5s to 0.1s
> > (when max_downtime == 5s).
> > 
> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > 
> > ---
> >  hw/i386/kvm/clock.c    |   97 +++++++++++++++++++++++++++++++++++++++++--------
> >  include/hw/i386/pc.h   |    5 ++
> >  target-i386/kvm.c      |    7 +++
> >  target-i386/kvm_i386.h |    1 
> >  4 files changed, 96 insertions(+), 14 deletions(-)
> > 
> > v2: 
> > - improve variable names (Juan)
> > - consolidate code on kvm_get_clock function (Paolo)
> > - return mach_use_reliable_get_clock from needed function (Paolo)
> > v3: 
> > - simplify check for src_use_reliable_get_clock (Eduardo)
> > - move src_use_reliable_get_clock initialization to realize (Eduardo)
> > 
> > 
> > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > ===================================================================
> > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:07:11.220632761 -0200
> > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-12-01 12:43:18.606294830 -0200
> > @@ -36,6 +36,13 @@
> >  
> >      uint64_t clock;
> >      bool clock_valid;
> > +
> > +    /* whether machine type supports reliable get clock */
> > +    bool mach_use_reliable_get_clock;
> > +
> > +    /* whether the 'clock' value was obtained in a host with
> > +     * reliable KVM_GET_CLOCK */
> > +    bool clock_is_reliable;
> >  } KVMClockState;
> >  
> >  struct pvclock_vcpu_time_info {
> > @@ -81,6 +88,19 @@
> >      return nsec + time.system_time;
> >  }
> >  
> > +static uint64_t kvm_get_clock(void)
> > +{
> > +    struct kvm_clock_data data;
> > +    int ret;
> > +
> > +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > +    if (ret < 0) {
> > +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > +                abort();
> > +    }
> > +    return data.clock;
> > +}
> > +
> >  static void kvmclock_vm_state_change(void *opaque, int running,
> >                                       RunState state)
> >  {
> > @@ -91,15 +111,31 @@
> >  
> >      if (running) {
> >          struct kvm_clock_data data = {};
> > -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> > +        uint64_t pvclock_via_mem = 0;
> >  
> > -        s->clock_valid = false;
> > +        /*
> > +         * If the host where s->clock was read did not support reliable
> > +         * KVM_GET_CLOCK, read kvmclock value from memory.
> > +         */
> > +        if (!s->clock_is_reliable) {
> > +            pvclock_via_mem = kvmclock_current_nsec(s);
> > +        }
> >  
> > -        /* We can't rely on the migrated clock value, just discard it */
> > -        if (time_at_migration) {
> > -            s->clock = time_at_migration;
> > +        /* migration/savevm/init restore
> > +         * update clock_is_reliable to match local
> > +         * host capabilities.
> > +         */
> > +        if (s->clock_valid == false) {
> > +            s->clock_is_reliable = kvm_has_adjust_clock_stable();
> >          }
> >  
> 
> If we are reusing clock_valid for a different purpose, I suggest
> we rename it to something clearer, like "clock_is_local" or
> "clock_is_stopped".
> 
> But I still don't see the need for this convoluted logic, if we
> can simply set s->clock_is_reliable and s->clock at the same
> time. This even allow us to remove s->clock_valid completely.
> 
> > +        /* We can't rely on the saved clock value, just discard it */
> > +        if (pvclock_via_mem) {
> > +            s->clock = pvclock_via_mem;
> > +        }
> > +
> > +        s->clock_valid = false;
> > +
> 
> I suggest the following on top of your patch. It removes 27
> unnecessary lines from the code.
> 
> But, really, I'm tired of this discussion. If you want to keep
> the complex logic you suggest and others are happy with it, go
> ahead. You just won't get my Reviewed-by.

I have been addressing all the comments you made so far Eduardo, 
and have a point at (*) which seems valid to me.
This is the reason the patch is the way it is now.

But sure, i will include your next patch in the series, test 
it, and if someone does kvm_get_clock() in between those two
points in the code, then it'll break. I'll add a
comment to that effect to warn users.

> But in addition to this discussion about migration_is_safe, I
> have a question below about kvmclock_pre_save(), which seems to
> be a real problem. See below.
> 
> Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
> ---
>  hw/i386/kvm/clock.c | 37 +++++--------------------------------
>  1 file changed, 5 insertions(+), 32 deletions(-)
> 
> diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
> index e179862..359abce 100644
> --- a/hw/i386/kvm/clock.c
> +++ b/hw/i386/kvm/clock.c
> @@ -35,8 +35,6 @@ typedef struct KVMClockState {
>      /*< public >*/
>  
>      uint64_t clock;
> -    bool clock_valid;
> -
>      /* whether machine type supports reliable get clock */
>      bool mach_use_reliable_get_clock;
>  
> @@ -88,7 +86,7 @@ static uint64_t kvmclock_current_nsec(KVMClockState *s)
>      return nsec + time.system_time;
>  }
>  
> -static uint64_t kvm_get_clock(void)
> +static void kvm_get_clock(KVMClockState *s)
>  {
>      struct kvm_clock_data data;
>      int ret;
> @@ -98,7 +96,8 @@ static uint64_t kvm_get_clock(void)
>          fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
>                  abort();
>      }
> -    return data.clock;
> +    s->clock = data.clock;
> +    s->clock_is_reliable = kvm_has_adjust_clock_stable();
>  }
>  
>  static void kvmclock_vm_state_change(void *opaque, int running,
> @@ -121,21 +120,11 @@ static void kvmclock_vm_state_change(void *opaque, int running,
>              pvclock_via_mem = kvmclock_current_nsec(s);
>          }
>  
> -        /* migration/savevm/init restore
> -         * update clock_is_reliable to match local
> -         * host capabilities.
> -         */
> -        if (s->clock_valid == false) {
> -            s->clock_is_reliable = kvm_has_adjust_clock_stable();
> -        }
>          /* We can't rely on the saved clock value, just discard it */
>          if (pvclock_via_mem) {
>              s->clock = pvclock_via_mem;
>          }
>  
> -        s->clock_valid = false;
> -
>          data.clock = s->clock;
>          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
>          if (ret < 0) {
> @@ -156,20 +145,8 @@ static void kvmclock_vm_state_change(void *opaque, int running,
>              }
>          }
>      } else {
> -
> -        if (s->clock_valid) {
> -            return;
> -        }
> -
>          kvm_synchronize_all_tsc();
> -
> -        s->clock = kvm_get_clock();
> -        /*
> -         * If the VM is stopped, declare the clock state valid to
> -         * avoid re-reading it on next vmsave (which would return
> -         * a different value). Will be reset when the VM is continued.
> -         */
> -        s->clock_valid = true;
> +        kvm_get_clock(s);
>      }
>  }
>  
> @@ -177,10 +154,6 @@ static void kvmclock_realize(DeviceState *dev, Error **errp)
>  {
>      KVMClockState *s = KVM_CLOCK(dev);
>  
> -    if (kvm_has_adjust_clock_stable()) {
> -        s->clock_is_reliable = true;
> -    }
> -
>      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
>  }
>  
> @@ -206,7 +179,7 @@ static void kvmclock_pre_save(void *opaque)
>  {
>      KVMClockState *s = opaque;
>  
> -    s->clock = kvm_get_clock();
> +    kvm_get_clock(s);
>  }
>  
>  static const VMStateDescription kvmclock_vmsd = {
> 
> 
> > +
> >          data.clock = s->clock;
> >          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
> >          if (ret < 0) {
> > @@ -120,8 +156,6 @@
> >              }
> >          }
> >      } else {
> > -        struct kvm_clock_data data;
> > -        int ret;
> >  
> >          if (s->clock_valid) {
> >              return;
> > @@ -129,13 +163,7 @@
> >  
> >          kvm_synchronize_all_tsc();
> >  
> > -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > -        if (ret < 0) {
> > -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > -            abort();
> > -        }
> > -        s->clock = data.clock;
> > -
> > +        s->clock = kvm_get_clock();
> >          /*
> >           * If the VM is stopped, declare the clock state valid to
> >           * avoid re-reading it on next vmsave (which would return
> > @@ -149,25 +177,66 @@
> >  {
> >      KVMClockState *s = KVM_CLOCK(dev);
> >  
> > +    if (kvm_has_adjust_clock_stable()) {
> > +        s->clock_is_reliable = true;
> > +    }
> > +
> >      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> >  }
> >  
> > +static bool kvmclock_clock_is_reliable_needed(void *opaque)
> > +{
> > +    KVMClockState *s = opaque;
> > +
> > +    return s->mach_use_reliable_get_clock;
> > +}
> > +
> > +static const VMStateDescription kvmclock_reliable_get_clock = {
> > +    .name = "kvmclock/clock_is_reliable",
> > +    .version_id = 1,
> > +    .minimum_version_id = 1,
> > +    .needed = kvmclock_clock_is_reliable_needed,
> > +    .fields = (VMStateField[]) {
> > +        VMSTATE_BOOL(clock_is_reliable, KVMClockState),
> > +        VMSTATE_END_OF_LIST()
> > +    }
> > +};
> > +
> > +static void kvmclock_pre_save(void *opaque)
> > +{
> > +    KVMClockState *s = opaque;
> > +
> 
> We still need a comment here explaining why we need to re-read
> the clock on pre_save if s->clock was already set on
> kvmclock_vm_state_change().

OK.

> Also, what if we are migrating a VM that was already paused 10
> minutes ago? Should we migrate the s->clock value from
> 10 minutes ago, or the one read at pre_save?

The one read at pre_save. I'll add a comment.

> > +    s->clock = kvm_get_clock();
> > +}
> > +
> >  static const VMStateDescription kvmclock_vmsd = {
> >      .name = "kvmclock",
> >      .version_id = 1,
> >      .minimum_version_id = 1,
> > +    .pre_save = kvmclock_pre_save,
> >      .fields = (VMStateField[]) {
> >          VMSTATE_UINT64(clock, KVMClockState),
> >          VMSTATE_END_OF_LIST()
> > +    },
> > +    .subsections = (const VMStateDescription * []) {
> > +        &kvmclock_reliable_get_clock,
> > +        NULL
> >      }
> >  };
> >  
> > +static Property kvmclock_properties[] = {
> > +    DEFINE_PROP_BOOL("x-mach-use-reliable-get-clock", KVMClockState,
> > +                      mach_use_reliable_get_clock, true),
> > +    DEFINE_PROP_END_OF_LIST(),
> > +};
> > +
> >  static void kvmclock_class_init(ObjectClass *klass, void *data)
> >  {
> >      DeviceClass *dc = DEVICE_CLASS(klass);
> >  
> >      dc->realize = kvmclock_realize;
> >      dc->vmsd = &kvmclock_vmsd;
> > +    dc->props = kvmclock_properties;
> >  }
> >  
> >  static const TypeInfo kvmclock_info = {
> > Index: qemu-mig-advance-clock/include/hw/i386/pc.h
> > ===================================================================
> > --- qemu-mig-advance-clock.orig/include/hw/i386/pc.h	2016-11-17 15:07:11.220632761 -0200
> > +++ qemu-mig-advance-clock/include/hw/i386/pc.h	2016-11-17 15:08:58.096791416 -0200
> > @@ -370,6 +370,11 @@
> >  #define PC_COMPAT_2_7 \
> >      HW_COMPAT_2_7 \
> >      {\
> > +        .driver   = "kvmclock",\
> > +        .property = "x-mach-use-reliable-get-clock",\
> > +        .value    = "off",\
> > +    },\
> > +    {\
> >          .driver   = TYPE_X86_CPU,\
> >          .property = "l3-cache",\
> >          .value    = "off",\
> > Index: qemu-mig-advance-clock/target-i386/kvm.c
> > ===================================================================
> > --- qemu-mig-advance-clock.orig/target-i386/kvm.c	2016-11-17 15:07:11.221632762 -0200
> > +++ qemu-mig-advance-clock/target-i386/kvm.c	2016-11-17 15:07:15.867639659 -0200
> > @@ -117,6 +117,13 @@
> >      return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
> >  }
> >  
> > +bool kvm_has_adjust_clock_stable(void)
> > +{
> > +    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
> > +
> > +    return (ret == KVM_CLOCK_TSC_STABLE);
> > +}
> > +
> >  bool kvm_allows_irq0_override(void)
> >  {
> >      return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
> > Index: qemu-mig-advance-clock/target-i386/kvm_i386.h
> > ===================================================================
> > --- qemu-mig-advance-clock.orig/target-i386/kvm_i386.h	2016-11-17 15:07:11.222632764 -0200
> > +++ qemu-mig-advance-clock/target-i386/kvm_i386.h	2016-11-17 15:07:15.867639659 -0200
> > @@ -17,6 +17,7 @@
> >  
> >  bool kvm_allows_irq0_override(void);
> >  bool kvm_has_smm(void);
> > +bool kvm_has_adjust_clock_stable(void);
> >  void kvm_synchronize_all_tsc(void);
> >  void kvm_arch_reset_vcpu(X86CPU *cs);
> >  void kvm_arch_do_init_vcpu(X86CPU *cs);
> 
> -- 
> Eduardo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [qemu patch V3 2/2] kvmclock: reduce kvmclock difference on migration
  2016-12-05 17:20                         ` Marcelo Tosatti
@ 2016-12-06  0:47                           ` Eduardo Habkost
  0 siblings, 0 replies; 18+ messages in thread
From: Eduardo Habkost @ 2016-12-06  0:47 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm, qemu-devel, Dr David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Radim Krcmar

On Mon, Dec 05, 2016 at 03:20:16PM -0200, Marcelo Tosatti wrote:
> On Mon, Dec 05, 2016 at 01:17:42PM -0200, Eduardo Habkost wrote:
[...]
> > > >  
> > > > +static void kvm_get_clock(KVMClockState *s)
> > > > +{
> > > > +    struct kvm_clock_data data;
> > > > +    int ret;
> > > > +
> > > > +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > > > +    if (ret < 0) {
> > > > +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > > > +        abort();
> > > > +    }
> > > > +    s->clock = data.clock;
> > > > +    s->clock_is_reliable = kvm_has_adjust_clock_stable();
> > > > +}
> > > > +
> > > 
> > > s->clock_is_reliable has to updated at migration, after 
> > > the first time KVM_SET_CLOCK is called. There is no other
> > > place where it should be updated.
> > 
> > I don't see why you claim that. We can simply update
> > s->clock_is_reliable the next time s->clock is updated. The only
> > code that reads s->clock checks s->clock_is_reliable first.
> > s->clock_is_reliable is a flag _about_ the value in s->clock. I
> > don't see why you want to update them at different times, if it
> > means more complex logic.
> 
> Because the table of (wanted) behaviour dictates that.

My suggestion also follows the table strictly.

> 
> > > This is the behaviour 
> > > desired, according to this table:
> > > 
> > > index:
> > > mig->c means use s->clock value from migration
> > > mig->m means read pvclock value from memory 
> > > (both at migration, only the first time kvmclock_change_vmstate
> > > is called).
> > > 
> > > runt->c means use KVM_GET_CLOCK/KVM_SET_CLOCK at runtime
> > > runt->m means read pvclock value from memory
> > > (both at runtime, after the first time kvmclock_change_vmstate
> > > is called).
> > > 
> > > SOURCE                DESTINATION             BEHAVIOUR
> > > get_clock_reliable    get_clock_reliable      mig->c,runt->c  1
> > > get_clock_reliable    ~get_clock_reliable     mig->c,runt->m  2
> > > ~get_clock_reliable   get_clock_reliable      mig->m,runt->c  3
> > > ~get_clock_reliable   ~get_clock_reliable     mig->m,runt->m  4
> > > 
> > > So looking at your patch below, and thinking of 
> > > kvm_get_clock() updating s->clock_is_reliable from 
> > > !running case (that is RUNNING -> ~RUNNING transition)
> > > makes no sense at all. That should not happen.
> > 
> > Why not?
> 
> Because we want the behaviour of the table above as follows:
> 
> First execution of ~RUNNING -> RUNNING transition:
> 
> * If source host supports reliable KVM_GET_CLOCK, then 
> its not necessary to read from guest memory (because there
> is no danger of KVM_GET_CLOCK returning a value which will cause 
> the guest to crash).

Correct. And this can be generalized to:

"If the host that set s->clock support reliable KVM_GET_CLOCK,
then its not necessary to read from guest memory [...]".

> 
> * If source host does NOT support reliable KVM_GET_CLOCK, then 
> it is necessary to read from guest memory (because there
> is danger of KVM_GET_CLOCK returning a value which will cause 
> the guest to crash).

Correct. And this can be generalized to:

"If host that set s->clock do NOT support reliable KVM_GET_CLOCK,
then it is necessary to read from guest memory [...]"


> 
> Second and subsequent executions of ~RUNNING -> RUNNING transition:
> (say if you pause the guest and continue it):
> 
> * If host supports reliable KVM_GET_CLOCK, then 
> its not necessary to read from guest memory (because there
> is no danger of KVM_GET_CLOCK returning a value which will cause 
> the guest to crash).
> 

Correct. And this can be generalized to:

"If the host that set s->clock support reliable KVM_GET_CLOCK,
then its not necessary to [...]".

> * If  host does NOT support reliable KVM_GET_CLOCK, then 
> it is necessary to read from guest memory (because there
> is danger of KVM_GET_CLOCK returning a value which will cause 
> the guest to crash).
> 
Correct. And this can be generalized to:

"If host that set s->clock do NOT support reliable KVM_GET_CLOCK,
then it is necessary to read from guest memory [...]"

> Does that make sense?

Yes. And settting s->clock_is_reliable and s->clock at the same
time guarantees that.

> 
> > > s->clock_is_reliable should be updated after the first
> > > time ~RUNNING -> RUNNING migration is done, which 
> > > is my patch does.
> > 
> > I don't see why. All we need is that s->clock_is_reliable get
> > updated before any code try to read s->clock again.
> 
> > > I think your s->clock,s->clock_is_reliable should be in sync
> > > suggestion comes from a misunderstanding of how they are used.
> > > 
> > > s->clock is not read if s->clock_is_reliable is set.
> > 
> > Did you mean "s->clock is not read if s->clock_is_reliable is
> > false"? You are right (except when kvmclock_current_nsec()==0?).
> > But this also means that the only code that reads s->clock also
> > checks s->clock_is_reliable first.
> > 
> > > 
> > > Look at my attached patch below (still untested, will test and then
> > > resubmit) and check whether it matches the behaviour 
> > > described in the table above.
> > 
> > It seems to match. But I don't see why my patch doesn't match the
> > table above.
> 
> Its just fragile Eduardo: if anyone comes and changes the
> code as follows.
> 
> Source: Does not support reliable KVM_GET_CLOCK.
> Destination: Does support reliable KVM_GET_CLOCK.
> 
> * Migration from remote host that does not support
>   reliable KVM_GET_CLOCK, to a host that supports it.
> 
> * Some modification is done so that:
> 
>     1) Incoming migration sets s->clock_is_reliable=false from source.
>     2) _Before_ vm_start(), new code decides to issue kvm_get_clock().
> 
> Your code sets s->clock_reliable = true (because local host supports
> it), and failure. Do you see the problem?

The whole point of (my version of) kvm_get_clock() is to set
s->clock. If any code wants to change s->clock before vm_start(),
we are already screwed. I really don't think this is a realistic
scenario.

If you also want a function that returns KVM_GET_CLOCK but do not
change s->clock or s->clock_is_reliable, you can write one and I
won't complain.

> 
> (*)
> 
> > > >  static void kvmclock_vm_state_change(void *opaque, int running,
> > > >                                       RunState state)
> > > >  {
> > > > @@ -91,16 +112,18 @@ static void kvmclock_vm_state_change(void *opaque, int running,
> > > >  
> > > >      if (running) {
> > > >          struct kvm_clock_data data = {};
> > > > -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> > > >  
> > > >          s->clock_valid = false;
> > > >  
> > > > -        /* We can't rely on the migrated clock value, just discard it */
> > > > -        if (time_at_migration) {
> > > > -            s->clock = time_at_migration;
> > > > +        /* if host that set s->clock did not support reliable KVM_GET_CLOCK,
> > > > +         * read kvmclock value from memory
> > > > +         */
> > > > +        if (!s->clock_is_reliable) {
> > > > +            data.clock = kvmclock_current_nsec(s);
> > > > +        } else {
> > > > +            data.clock = s->clock;
> > > >          }
> > > >  
> > > > -        data.clock = s->clock;
> > > >          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
> > > >          if (ret < 0) {
> > > >              fprintf(stderr, "KVM_SET_CLOCK failed: %s\n", strerror(ret));
> > > > @@ -120,22 +143,23 @@ static void kvmclock_vm_state_change(void *opaque, int running,
> > > 
> > > >              }
> > > >          }
> > > >      } else {
> > > > -        struct kvm_clock_data data;
> > > > -        int ret;
> > > >  
> > > > +        /*FIXME: is this really possible to happen?
> > > > +         * Can kvmclock_vm_state_change(s, 0) be called twice without
> > > > +         * a kvmclock_vm_state_change(s, 1) call in between?
> > > > +         */
> > > 
> > > It can't: check the code where notifiers are called.
> > 
> > Yet another reason to simplify the code: if we don't need
> > different code paths for local stop/resume vs migration, we can
> > remove s->clock_valid completely.
> 
> Yes it can be removed... But i'm just trying to fix a bug, not rewrite
> code while at it.

No need to remove it right now. But if the logic can make the
extra field unnecessary, that's another reason to simplify it. I
normally don't care about 2 or 3 extra lines of code, but in this
case I really think the complexity you are adding is unnecessary.

> 
> > > >          if (s->clock_valid) {
> > > >              return;
> > > >          }
> > > >  
> > > >          kvm_synchronize_all_tsc();
> > > >  
> > > > -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > > > -        if (ret < 0) {
> > > > -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > > > -            abort();
> > > > -        }
> > > > -        s->clock = data.clock;
> > > > -
> > > > +        kvm_get_clock(s);
> > > > +        /* FIXME: the comment below doesn't match what is done at
> > > > +         * kvmclock_pre_save(). The comment must be updated, or
> > > > +         * kvmclock_pre_save() must be changed. I don't know what's
> > > > +         * the right thing to do.
> > > > +         */
> > 
> > We still need to address this comment. Your patch keeps the
> > comment that contradicts kvmclock_pre_save().
> 
> The comment refers to the fact that you should not do:
> 
>     Event stop_vm()
>        saved_clock1 = KVM_GET_CLOCK.
> 
>     Then again with the VM stopped, overwrite saved_clock:
>         saved_clock2 = KVM_GET_CLOCK.
> 
>     You should always use saved_clock1. Thats what its about.

I am not sure I follow. pre_save runs after stop_vm() and
overwrites saved_clock1 (s->clock) with a new value, doesn't it?
So is it incorrect?

> 
> Updated it.
> 
> > 
> > > 
> > > This fails for all cases (1,2,3,4) in the table above.
> > 
> > I can't see why it fails. The ordering of events from migration
> > followed by vm stop/resume should be:
> > 
> >     | Event                       | s->clock         | s->clock_is_reliable
> > src | kvmclock_vm_state_change(0) | -                | -
> > src |   kvm_get_clock()           | TIME-AT-VM-STOP  | CAP_ADJUST_CLOCK-on-SRC
> > src | kvmclock_pre_save()         | TIME-AT-VM-STOP  | CAP_ADJUST_CLOCK-on-SRC
> > src |   kvm_get_clock()           | TIME-AT-PRE-SAVE | CAP_ADJUST_CLOCK-on-SRC
> > --- | MIGRATION                   | TIME-AT-PRE-SAVE | CAP_ADJUST_CLOCK-on-SRC
> > dst | kvmclock_vm_state_change(1) | TIME-AT-PRE-SAVE | CAP_ADJUST_CLOCK-on-SRC
> > dst |   KVM_SET_CLOCK             | TIME-AT-PRE-SAVE | CAP_ADJUST_CLOCK-on-SRC
> > dst | kvmclock_vm_state_change(0) | TIME-AT-PRE-SAVE | CAP_ADJUST_CLOCK-on-SRC
> > dst |   kvm_get_clock()           | TIME-AT-VM-STOP  | CAP_ADJUST_CLOCK-on-DEST
> > dst | kvmclock_vm_state_change(1) | TIME-AT-VM-STOP  | CAP_ADJUST_CLOCK-on-DEST
> > dst |   KVM_SET_CLOCK             | TIME-AT-VM-STOP  | CAP_ADJUST_CLOCK-on-DEST
> > 
> > Specific cases from your table are below. See the KVM_SET_CLOCK
> > rows.
> > 
> > Case 1:
> > 
> >     | Event                       | s->clock         | s->clock_is_reliable
> > src | kvmclock_vm_state_change(0) | -                | -
> > src |   kvm_get_clock()           | TIME-AT-VM-STOP  | 1
> > src | kvmclock_pre_save()         | TIME-AT-VM-STOP  | 1
> > src |   kvm_get_clock()           | TIME-AT-PRE-SAVE | 1
> > --- | MIGRATION                   | TIME-AT-PRE-SAVE | 1
> > dst | kvmclock_vm_state_change(1) | TIME-AT-PRE-SAVE | 1
> > dst |   KVM_SET_CLOCK(s->clock)   | TIME-AT-PRE-SAVE | 1
> > dst | kvmclock_vm_state_change(0) | TIME-AT-PRE-SAVE | 1
> > dst |   kvm_get_clock()           | TIME-AT-VM-STOP  | 1
> > dst | kvmclock_vm_state_change(1) | TIME-AT-VM-STOP  | 1
> > dst |   KVM_SET_CLOCK(s->clock)   | TIME-AT-VM-STOP  | 1
> > 
> > Case 2:
> > 
> >     | Event                       | s->clock         | s->clock_is_reliable
> > src | kvmclock_vm_state_change(0) | -                | -
> > src |   kvm_get_clock()           | TIME-AT-VM-STOP  | 1
> > src | kvmclock_pre_save()         | TIME-AT-VM-STOP  | 1
> > src |   kvm_get_clock()           | TIME-AT-PRE-SAVE | 1
> > --- | MIGRATION                   | TIME-AT-PRE-SAVE | 1
> > dst | kvmclock_vm_state_change(1) | TIME-AT-PRE-SAVE | 1
> > dst |   KVM_SET_CLOCK(s->clock)   | TIME-AT-PRE-SAVE | 1
> > dst | kvmclock_vm_state_change(0) | TIME-AT-PRE-SAVE | 1
> > dst |   kvm_get_clock()           | TIME-AT-VM-STOP  | 0
> > dst | kvmclock_vm_state_change(1) | TIME-AT-VM-STOP  | 0
> > dst |   KVM_SET_CLOCK(from_mem)   | TIME-AT-VM-STOP  | 0
> > 
> > Case 3:
> > 
> >     | Event                       | s->clock         | s->clock_is_reliable
> > src | kvmclock_vm_state_change(0) | -                | -
> > src |   kvm_get_clock()           | TIME-AT-VM-STOP  | 0
> > src | kvmclock_pre_save()         | TIME-AT-VM-STOP  | 0
> > src |   kvm_get_clock()           | TIME-AT-PRE-SAVE | 0
> > --- | MIGRATION                   | TIME-AT-PRE-SAVE | 0
> > dst | kvmclock_vm_state_change(1) | TIME-AT-PRE-SAVE | 0
> > dst |   KVM_SET_CLOCK(from_mem)   | TIME-AT-PRE-SAVE | 0
> > dst | kvmclock_vm_state_change(0) | TIME-AT-PRE-SAVE | 0
> > dst |   kvm_get_clock()           | TIME-AT-VM-STOP  | 1
> > dst | kvmclock_vm_state_change(1) | TIME-AT-VM-STOP  | 1
> > dst |   KVM_SET_CLOCK(s->clock)   | TIME-AT-VM-STOP  | 1
> > 
> > Case 4:
> > 
> >     | Event                       | s->clock         | s->clock_is_reliable
> > src | kvmclock_vm_state_change(0) | -                | -
> > src |   kvm_get_clock()           | TIME-AT-VM-STOP  | 0
> > src | kvmclock_pre_save()         | TIME-AT-VM-STOP  | 0
> > src |   kvm_get_clock()           | TIME-AT-PRE-SAVE | 0
> > --- | MIGRATION                   | TIME-AT-PRE-SAVE | 0
> > dst | kvmclock_vm_state_change(1) | TIME-AT-PRE-SAVE | 0
> > dst |   KVM_SET_CLOCK(from_mem)   | TIME-AT-PRE-SAVE | 0
> > dst | kvmclock_vm_state_change(0) | TIME-AT-PRE-SAVE | 0
> > dst |   kvm_get_clock()           | TIME-AT-VM-STOP  | 0
> > dst | kvmclock_vm_state_change(1) | TIME-AT-VM-STOP  | 0
> > dst |   KVM_SET_CLOCK(from_mem)   | TIME-AT-VM-STOP  | 0
> > 
> > 
> > 
[...]
> > >  static void kvmclock_vm_state_change(void *opaque, int running,
> > >                                       RunState state)
> > >  {
> > > @@ -91,15 +111,31 @@
> > >  
> > >      if (running) {
> > >          struct kvm_clock_data data = {};
> > > -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> > > +        uint64_t pvclock_via_mem = 0;
> > >  
> > > -        s->clock_valid = false;
> > > +        /*
> > > +         * If the host where s->clock was read did not support reliable
> > > +         * KVM_GET_CLOCK, read kvmclock value from memory.
> > > +         */
> > > +        if (!s->clock_is_reliable) {
> > > +            pvclock_via_mem = kvmclock_current_nsec(s);
> > > +        }
> > >  
> > > -        /* We can't rely on the migrated clock value, just discard it */
> > > -        if (time_at_migration) {
> > > -            s->clock = time_at_migration;
> > > +        /* migration/savevm/init restore
> > > +         * update clock_is_reliable to match local
> > > +         * host capabilities.
> > > +         */
> > > +        if (s->clock_valid == false) {
> > > +            s->clock_is_reliable = kvm_has_adjust_clock_stable();
> > >          }
> > >  
> > 
> > If we are reusing clock_valid for a different purpose, I suggest
> > we rename it to something clearer, like "clock_is_local" or
> > "clock_is_stopped".
> > 
> > But I still don't see the need for this convoluted logic, if we
> > can simply set s->clock_is_reliable and s->clock at the same
> > time. This even allow us to remove s->clock_valid completely.
> > 
> > > +        /* We can't rely on the saved clock value, just discard it */
> > > +        if (pvclock_via_mem) {
> > > +            s->clock = pvclock_via_mem;
> > > +        }
> > > +
> > > +        s->clock_valid = false;
> > > +
> > 
> > I suggest the following on top of your patch. It removes 27
> > unnecessary lines from the code.
> > 
> > But, really, I'm tired of this discussion. If you want to keep
> > the complex logic you suggest and others are happy with it, go
> > ahead. You just won't get my Reviewed-by.
> 
> I have been addressing all the comments you made so far Eduardo, 
> and have a point at (*) which seems valid to me.
> This is the reason the patch is the way it is now.
> 
> But sure, i will include your next patch in the series, test 
> it, and if someone does kvm_get_clock() in between those two
> points in the code, then it'll break. I'll add a
> comment to that effect to warn users.

kvm_get_clock() changes s->clock too. If you set s->clock at the
wrong moment you'll have bigger problems than clock_is_reliable
being incorrect. But if your worry is kvm_get_clock(), then we
can just do the following. Would that be OK?

Then I can send a patch to remove clock_is_valid later, as a
follow-up.

(I have one additional comment about pre_save(), below)

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
---

 hw/i386/kvm/clock.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
index e179862..e2256a6 100644
--- a/hw/i386/kvm/clock.c
+++ b/hw/i386/kvm/clock.c
@@ -121,14 +121,6 @@ static void kvmclock_vm_state_change(void *opaque, int running,
             pvclock_via_mem = kvmclock_current_nsec(s);
         }
 
-        /* migration/savevm/init restore
-         * update clock_is_reliable to match local
-         * host capabilities.
-         */
-        if (s->clock_valid == false) {
-            s->clock_is_reliable = kvm_has_adjust_clock_stable();
-        }
-
         /* We can't rely on the saved clock value, just discard it */
         if (pvclock_via_mem) {
             s->clock = pvclock_via_mem;
@@ -164,6 +156,10 @@ static void kvmclock_vm_state_change(void *opaque, int running,
         kvm_synchronize_all_tsc();
 
         s->clock = kvm_get_clock();
+        /* any code that sets s->clock needs to ensure clock_is_reliable
+         * is correctly set.
+         */
+        s->clock_is_reliable = kvm_has_adjust_clock_stable();
         /*
          * If the VM is stopped, declare the clock state valid to
          * avoid re-reading it on next vmsave (which would return
@@ -177,10 +173,6 @@ static void kvmclock_realize(DeviceState *dev, Error **errp)
 {
     KVMClockState *s = KVM_CLOCK(dev);
 
-    if (kvm_has_adjust_clock_stable()) {
-        s->clock_is_reliable = true;
-    }
-
     qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
 }
 
 
> 
[...]
> > > +
> > > +static void kvmclock_pre_save(void *opaque)
> > > +{
> > > +    KVMClockState *s = opaque;
> > > +
> > 
> > We still need a comment here explaining why we need to re-read
> > the clock on pre_save if s->clock was already set on
> > kvmclock_vm_state_change().
> 
> OK.
> 
> > Also, what if we are migrating a VM that was already paused 10
> > minutes ago? Should we migrate the s->clock value from
> > 10 minutes ago, or the one read at pre_save?
> 
> The one read at pre_save. I'll add a comment.

Is it really valid to make the clock move on an already-paused
VM, only because it was migrated?

> 
> > > +    s->clock = kvm_get_clock();
> > > +}
> > > +

-- 
Eduardo

^ permalink raw reply related	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2016-12-06  0:47 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2016-11-21 10:50 [Qemu-devel] [qemu patch V3 0/2] improve kvmclock difference on migration Marcelo Tosatti
2016-11-21 10:50 ` [Qemu-devel] [qemu patch V3 1/2] kvm: sync linux headers Marcelo Tosatti
2016-11-21 10:50 ` [Qemu-devel] [qemu patch V3 2/2] kvmclock: reduce kvmclock difference on migration Marcelo Tosatti
2016-11-28 14:13   ` Eduardo Habkost
2016-11-28 16:45     ` Marcelo Tosatti
2016-11-28 17:12       ` Eduardo Habkost
2016-11-28 17:31         ` Paolo Bonzini
2016-11-29 10:56         ` Marcelo Tosatti
2016-11-29 12:34           ` Eduardo Habkost
2016-11-29 23:54             ` Marcelo Tosatti
2016-11-30 13:09               ` Eduardo Habkost
2016-12-01 14:39                 ` Marcelo Tosatti
2016-12-02 13:56                   ` Eduardo Habkost
2016-12-02 22:30                     ` Marcelo Tosatti
2016-12-05 15:17                       ` Eduardo Habkost
2016-12-05 17:20                         ` Marcelo Tosatti
2016-12-06  0:47                           ` Eduardo Habkost
2016-11-28 12:16 ` [Qemu-devel] [qemu patch V3 0/2] improve " Marcelo Tosatti

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).