From: Greg KH <gregkh@suse.de>
To: linux-kernel@vger.kernel.org, stable@kernel.org,
Alexander Graf <agraf@suse.de>, Greg KH <gregkh@suse.de>
Cc: stable-review@kernel.org, torvalds@linux-foundation.org,
akpm@linux-foundation.org, alan@lxorguk.ukuu.org.uk,
Avi Kivity <avi@redhat.com>, KVM list <kvm@vger.kernel.org>,
Glauber Costa <glommer@redhat.com>,
Marcelo Tosatti <mtosatti@redhat.com>
Subject: [49/74] KVM: allow userspace to adjust kvmclock offset
Date: Thu, 04 Feb 2010 09:12:20 -0800 [thread overview]
Message-ID: <20100204171515.880676617@linux.site> (raw)
In-Reply-To: <20100204171850.GA16539@kroah.com>
2.6.32-stable review patch. If anyone has any objections, please let us know.
------------------
From: Glauber Costa <glommer@redhat.com>
(cherry picked from afbcf7ab8d1bc8c2d04792f6d9e786e0adeb328d)
When we migrate a kvm guest that uses pvclock between two hosts, we may
suffer a large skew. This is because there can be significant differences
between the monotonic clock of the hosts involved. When a new host with
a much larger monotonic time starts running the guest, the view of time
will be significantly impacted.
The situation is much worse when we do the opposite and migrate to a host
with a smaller monotonic clock.
The proposed ioctl allows userspace to tell us the monotonic clock value
of the source host, so that we can keep the time skew small and, more
importantly, ensure that the guest clock never goes backwards. Userspace may
also need to retrieve the current value, since from the first migration
onwards it is no longer reflected by a simple call to clock_gettime().
[marcelo: future-proof abi with a flags field]
[jan: fix KVM_GET_CLOCK by clearing flags field instead of checking it]
Signed-off-by: Glauber Costa <glommer@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
Documentation/kvm/api.txt | 36 ++++++++++++++++++++++++++++++++++
arch/x86/include/asm/kvm_host.h | 1
arch/x86/kvm/x86.c | 42 +++++++++++++++++++++++++++++++++++++++-
include/linux/kvm.h | 9 ++++++++
4 files changed, 87 insertions(+), 1 deletion(-)
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -412,6 +412,7 @@ struct kvm_arch{
unsigned long irq_sources_bitmap;
unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
u64 vm_init_tsc;
+ s64 kvmclock_offset;
};
struct kvm_vm_stat {
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -680,7 +680,8 @@ static void kvm_write_guest_time(struct
/* With all the info we got, fill in the values */
vcpu->hv_clock.system_time = ts.tv_nsec +
- (NSEC_PER_SEC * (u64)ts.tv_sec);
+ (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
+
/*
* The interface expects us to write an even number signaling that the
* update is finished. Since the guest won't see the intermediate
@@ -1227,6 +1228,7 @@ int kvm_dev_ioctl_check_extension(long e
case KVM_CAP_PIT2:
case KVM_CAP_PIT_STATE2:
case KVM_CAP_SET_IDENTITY_MAP_ADDR:
+ case KVM_CAP_ADJUST_CLOCK:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -2424,6 +2426,44 @@ long kvm_arch_vm_ioctl(struct file *filp
r = 0;
break;
}
+ case KVM_SET_CLOCK: {
+ struct timespec now;
+ struct kvm_clock_data user_ns;
+ u64 now_ns;
+ s64 delta;
+
+ r = -EFAULT;
+ if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
+ goto out;
+
+ r = -EINVAL;
+ if (user_ns.flags)
+ goto out;
+
+ r = 0;
+ ktime_get_ts(&now);
+ now_ns = timespec_to_ns(&now);
+ delta = user_ns.clock - now_ns;
+ kvm->arch.kvmclock_offset = delta;
+ break;
+ }
+ case KVM_GET_CLOCK: {
+ struct timespec now;
+ struct kvm_clock_data user_ns;
+ u64 now_ns;
+
+ ktime_get_ts(&now);
+ now_ns = timespec_to_ns(&now);
+ user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
+ user_ns.flags = 0;
+
+ r = -EFAULT;
+ if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
+ goto out;
+ r = 0;
+ break;
+ }
+
default:
;
}
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -593,6 +593,42 @@ struct kvm_irqchip {
} chip;
};
+4.27 KVM_GET_CLOCK
+
+Capability: KVM_CAP_ADJUST_CLOCK
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_clock_data (out)
+Returns: 0 on success, -1 on error
+
+Gets the current timestamp of kvmclock as seen by the current guest. In
+conjunction with KVM_SET_CLOCK, it is used to ensure monotonicity in scenarios
+such as migration.
+
+struct kvm_clock_data {
+ __u64 clock; /* kvmclock current value */
+ __u32 flags;
+ __u32 pad[9];
+};
+
+4.28 KVM_SET_CLOCK
+
+Capability: KVM_CAP_ADJUST_CLOCK
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_clock_data (in)
+Returns: 0 on success, -1 on error
+
+Sets the current timestamp of kvmclock to the value specified in its parameter.
+In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity in scenarios
+such as migration.
+
+struct kvm_clock_data {
+ __u64 clock; /* kvmclock current value */
+ __u32 flags;
+ __u32 pad[9];
+};
+
5. The kvm_run structure
Application code obtains a pointer to the kvm_run structure by
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -439,6 +439,7 @@ struct kvm_ioeventfd {
#endif
#define KVM_CAP_IOEVENTFD 36
#define KVM_CAP_SET_IDENTITY_MAP_ADDR 37
+#define KVM_CAP_ADJUST_CLOCK 39
#ifdef KVM_CAP_IRQ_ROUTING
@@ -501,6 +502,12 @@ struct kvm_irqfd {
__u8 pad[20];
};
+struct kvm_clock_data {
+ __u64 clock;
+ __u32 flags;
+ __u32 pad[9];
+};
+
/*
* ioctls for VM fds
*/
@@ -550,6 +557,8 @@ struct kvm_irqfd {
#define KVM_CREATE_PIT2 _IOW(KVMIO, 0x77, struct kvm_pit_config)
#define KVM_SET_BOOT_CPU_ID _IO(KVMIO, 0x78)
#define KVM_IOEVENTFD _IOW(KVMIO, 0x79, struct kvm_ioeventfd)
+#define KVM_SET_CLOCK _IOW(KVMIO, 0x7b, struct kvm_clock_data)
+#define KVM_GET_CLOCK _IOR(KVMIO, 0x7c, struct kvm_clock_data)
/*
* ioctls for vcpu fds
next prev parent reply other threads:[~2010-02-04 17:22 UTC|newest]
Thread overview: 82+ messages / expand[flat|nested] mbox.gz Atom feed top
2010-02-04 17:18 [00/74] 2.6.32.8-stable review Greg KH
2010-02-04 17:11 ` [01/74] [SCSI] scsi_lib: Fix bug in completion of bidi commands Greg KH
2010-02-04 17:11 ` [02/74] [SCSI] mptsas: Fix issue with chain pools allocation on katmai Greg KH
2010-02-04 17:11 ` [03/74] mm: add new read_cache_page_gfp() helper function Greg KH
2010-02-04 17:11 ` [04/74] drm/i915: Selectively enable self-reclaim Greg KH
2010-02-04 17:11 ` [05/74] firewire: ohci: fix crashes with TSB43AB23 on 64bit systems Greg KH
2010-02-04 17:11 ` [06/74] S390: fix single stepped svcs with TRACE_IRQFLAGS=y Greg KH
2010-02-04 17:11 ` [07/74] x86: Set hotpluggable nodes in nodes_possible_map Greg KH
2010-02-04 17:11 ` [08/74] x86: Remove "x86 CPU features in debugfs" (CONFIG_X86_CPU_DEBUG) Greg KH
2010-02-04 17:11 ` [09/74] libata: retry FS IOs even if it has failed with AC_ERR_INVALID Greg KH
2010-02-04 17:11 ` [10/74] [S390] zcrypt: Do not remove coprocessor for error 8/72 Greg KH
2010-02-04 17:11 ` [11/74] [S390] dasd: fix possible NULL pointer errors Greg KH
2010-02-11 23:15 ` Bastian Blank
2010-02-11 23:38 ` Greg KH
2010-02-04 17:11 ` [12/74] ACPI: Add a generic API for _OSC -v2 Greg KH
2010-02-04 17:11 ` [13/74] ACPI: Add platform-wide _OSC support Greg KH
2010-02-04 17:11 ` [14/74] ACPI: fix OSC regression that caused aer and pciehp not to load Greg KH
2010-02-04 17:11 ` [15/74] ACPI: Advertise to BIOS in _OSC: _OST on _PPC changes Greg KH
2010-02-04 17:11 ` [16/74] UBI: fix volume creation input checking Greg KH
2010-02-04 17:11 ` [17/74] e1000: enhance frame fragment detection Greg KH
2010-02-04 17:11 ` [18/74] e1000e: " Greg KH
2010-02-04 17:11 ` [19/74] e1000/e1000e: dont use small hardware rx buffers Greg KH
2010-02-04 17:11 ` [20/74] drm/i915: Reload hangcheck timer too for Ironlake Greg KH
2010-02-04 17:11 ` [21/74] Fix a leak in affs_fill_super() Greg KH
2010-02-04 17:11 ` [22/74] Fix failure exits in bfs_fill_super() Greg KH
2010-02-04 17:11 ` [23/74] fix oops in fs/9p late mount failure Greg KH
2010-02-04 17:11 ` [24/74] fix leak in romfs_fill_super() Greg KH
2010-02-04 17:11 ` [25/74] Fix remount races with symlink handling in affs Greg KH
2010-02-04 17:11 ` [26/74] fix affs parse_options() Greg KH
2010-02-04 17:11 ` [27/74] Fix failure exit in ipathfs Greg KH
2010-02-04 17:11 ` [28/74] mm: fix migratetype bug which slowed swapping Greg KH
2010-02-04 17:12 ` [29/74] FDPIC: Respect PT_GNU_STACK exec protection markings when creating NOMMU stack Greg KH
2010-02-04 17:12 ` [30/74] Split flush_old_exec into two functions Greg KH
2010-02-04 17:12 ` [31/74] sparc: TIF_ABI_PENDING bit removal Greg KH
2010-02-04 17:12 ` [32/74] x86: get rid of the insane TIF_ABI_PENDING bit Greg KH
2010-02-04 17:12 ` [33/74] Input: winbond-cir - remove dmesg spam Greg KH
2010-02-04 17:12 ` [34/74] x86: Disable HPET MSI on ATI SB700/SB800 Greg KH
2010-02-04 17:12 ` [35/74] iwlwifi: set default aggregation frame count limit to 31 Greg KH
2010-02-04 17:12 ` [36/74] drm/i915: only enable hotplug for detected outputs Greg KH
2010-02-04 17:12 ` [37/74] firewire: core: add_descriptor size check Greg KH
2010-02-04 17:12 ` [38/74] SECURITY: selinux, fix update_rlimit_cpu parameter Greg KH
2010-02-04 17:12 ` [39/74] regulator: Specify REGULATOR_CHANGE_STATUS for WM835x LED constraints Greg KH
2010-02-04 17:12 ` [40/74] x86: Add Dell OptiPlex 760 reboot quirk Greg KH
2010-02-04 17:12 ` [41/74] x86: Add quirk for Intel DG45FC board to avoid low memory corruption Greg KH
2010-02-04 17:12 ` [42/74] x86/amd-iommu: Fix possible integer overflow Greg KH
2010-02-04 17:12 ` [43/74] clocksource: fix compilation if no GENERIC_TIME Greg KH
2010-02-04 17:12 ` [44/74] tcp: update the netstamp_needed counter when cloning sockets Greg KH
2010-02-04 17:12 ` [45/74] sky2: Fix oops in sky2_xmit_frame() after TX timeout Greg KH
2010-02-04 17:12 ` [46/74] net: restore ip source validation Greg KH
2010-02-05 10:16 ` Sven Joachim
2010-02-04 17:12 ` [47/74] af_packet: Dont use skb after dev_queue_xmit() Greg KH
2010-02-04 17:12 ` [48/74] ax25: netrom: rose: Fix timer oopses Greg KH
2010-02-04 17:12 ` Greg KH [this message]
2010-02-04 17:12 ` [50/74] oprofile/x86: add Xeon 7500 series support Greg KH
2010-02-04 17:12 ` [51/74] oprofile/x86: fix crash when profiling more than 28 events Greg KH
2010-02-04 17:12 ` [52/74] libata: retry link resume if necessary Greg KH
2010-02-04 17:12 ` [53/74] mm: percpu-vmap fix RCU list walking Greg KH
2010-02-04 17:12 ` [54/74] mm: purge fragmented percpu vmap blocks Greg KH
2010-02-04 17:12 ` [55/74] block: fix bio_add_page for non trivial merge_bvec_fn case Greg KH
2010-02-04 17:12 ` [56/74] Fix flush_old_exec()/setup_new_exec() split Greg KH
2010-02-04 17:12 ` [57/74] random: drop weird m_time/a_time manipulation Greg KH
2010-02-04 17:12 ` [58/74] random: Remove unused inode variable Greg KH
2010-02-04 17:12 ` [59/74] block: fix bugs in bio-integrity mempool usage Greg KH
2010-02-04 17:12 ` [60/74] usb: r8a66597-hdc disable interrupts fix Greg KH
2010-02-04 17:12 ` [61/74] connector: Delete buggy notification code Greg KH
2010-02-04 17:12 ` [62/74] be2net: Bug fix to support newer generation of BE ASIC Greg KH
2010-02-04 17:12 ` [63/74] be2net: Fix memset() arg ordering Greg KH
2010-02-04 17:12 ` [64/74] mm: flush dcache before writing into page to avoid alias Greg KH
2010-02-04 17:12 ` [65/74] mac80211: fix NULL pointer dereference when ftrace is enabled Greg KH
2010-02-04 17:12 ` [66/74] imxfb: correct location of callbacks in suspend and resume Greg KH
2010-02-04 17:12 ` [67/74] mx3fb: some debug and initialisation fixes Greg KH
2010-02-04 17:12 ` [68/74] starfire: clean up properly if firmware loading fails Greg KH
2010-02-04 17:12 ` [69/74] kernel/cred.c: use kmem_cache_free Greg KH
2010-02-04 17:12 ` [70/74] uartlite: fix crash when using as console Greg KH
2010-02-04 17:12 ` [71/74] pktcdvd: removing device does not remove its sysfs dir Greg KH
2010-02-04 17:12 ` [72/74] ath9k: fix eeprom INI values override for 2GHz-only cards Greg KH
2010-02-04 17:12 ` [73/74] ath9k: fix beacon slot/buffer leak Greg KH
2010-02-04 17:12 ` [74/74] powerpc: TIF_ABI_PENDING bit removal Greg KH
2010-02-05 7:36 ` [Stable-review] [00/74] 2.6.32.8-stable review Nikola Ciprich
2010-02-05 17:12 ` Greg KH
2010-02-07 10:26 ` Nikola Ciprich
2010-02-05 16:53 ` Greg KH
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20100204171515.880676617@linux.site \
--to=gregkh@suse.de \
--cc=agraf@suse.de \
--cc=akpm@linux-foundation.org \
--cc=alan@lxorguk.ukuu.org.uk \
--cc=avi@redhat.com \
--cc=glommer@redhat.com \
--cc=kvm@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mtosatti@redhat.com \
--cc=stable-review@kernel.org \
--cc=stable@kernel.org \
--cc=torvalds@linux-foundation.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).