From: David Hildenbrand <david@redhat.com>
To: qemu-devel@nongnu.org
Cc: Eduardo Habkost <ehabkost@redhat.com>,
"Michael S . Tsirkin" <mst@redhat.com>,
David Hildenbrand <david@redhat.com>,
"Dr . David Alan Gilbert" <dgilbert@redhat.com>,
Igor Mammedov <imammedo@redhat.com>,
Paolo Bonzini <pbonzini@redhat.com>,
Richard Henderson <rth@twiddle.net>
Subject: [PATCH v2 16/16] kvm: Implement region_resize() for atomic memory section resizes
Date: Wed, 12 Feb 2020 14:36:01 +0100 [thread overview]
Message-ID: <20200212133601.10555-17-david@redhat.com> (raw)
In-Reply-To: <20200212133601.10555-1-david@redhat.com>
virtio-mem wants to resize (esp. grow) memory regions while the guest is
already aware of them and makes use of them. Resizing a KVM slot can
currently only be done by removing it and re-adding it. While the KVM slot
is temporarily removed, VCPUs that try to read from it will fault.
Let's inhibit KVM_RUN while performing the resize. Keep it lightweight by
remembering, using one bool per VCPU, whether the VCPU is executing in the
kernel.
Note1: Instead of implementing region_resize(), we could also inhibit in
begin() and let the VCPUs continue to run in commit(). This would also
handle atomic splitting of memory regions. (I remember a BUG report but
cannot dig up the mail). However, using the region_resize() callback we
can later wire up an ioctl that can perform the resize atomically, and
make the inhibit conditional. Also, this way we inhibit KVM only when
resizing - not on any address space changes. This will not affect existing
RT workloads (resizes currently only happen during reboot or at the
start of an incoming migration).
Note2: We cannot use pause_all_vcpus()/resume_all_vcpus(), as they will
temporarily drop the BQL, which is something most callers cannot deal
with when trying to resize a memory region.
Signed-off-by: David Hildenbrand <david@redhat.com>
---
accel/kvm/kvm-all.c | 87 +++++++++++++++++++++++++++++++++++++++++++
include/hw/core/cpu.h | 3 ++
2 files changed, 90 insertions(+)
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index c111312dfd..e24805771c 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -148,6 +148,10 @@ bool kvm_ioeventfd_any_length_allowed;
bool kvm_msi_use_devid;
static bool kvm_immediate_exit;
static hwaddr kvm_max_slot_size = ~0;
+static QemuMutex kvm_run_mutex; /* held around waits/broadcasts on the two condition variables below */
+static QemuCond kvm_run_cond; /* VCPUs wait here until KVM_RUN is no longer inhibited */
+static QemuCond kvm_run_inhibit_cond; /* the inhibitor waits here for VCPUs to leave KVM_RUN */
+static int kvm_run_inhibited; /* number of active inhibit requests; modified via atomic_inc()/atomic_dec() */
static const KVMCapabilityInfo kvm_required_capabilites[] = {
KVM_CAP_INFO(USER_MEMORY),
@@ -1121,6 +1125,57 @@ static void kvm_region_del(MemoryListener *listener,
memory_region_unref(section->mr);
}
+/*
+ * Certain updates (e.g., resizing memory regions) require temporarily removing
+ * kvm memory slots. Keep VCPUs from faulting on such a slot by making sure
+ * that all VCPUs have left KVM_RUN and won't enter it again until
+ * kvm_run_inhibit_end() is called.
+ *
+ * The in_kernel flags are checked with kvm_run_mutex held. A VCPU that leaves
+ * KVM_RUN broadcasts kvm_run_inhibit_cond under that mutex, so the broadcast
+ * cannot slip in between our check and qemu_cond_wait() and get lost.
+ */
+static void kvm_run_inhibit_begin(void)
+{
+    CPUState *cpu;
+
+    /* From now on, VCPUs block before entering KVM_RUN. */
+    atomic_inc(&kvm_run_inhibited);
+
+    qemu_mutex_lock(&kvm_run_mutex);
+    while (true) {
+        bool any_in_kernel = false;
+
+        CPU_FOREACH(cpu) {
+            if (atomic_read(&cpu->in_kernel)) {
+                any_in_kernel = true;
+                /* Kick the VCPU so that it leaves KVM_RUN. */
+                qemu_cpu_kick(cpu);
+            }
+        }
+        if (!any_in_kernel) {
+            break;
+        }
+        /* Drops the mutex while sleeping; re-scan all VCPUs after wakeup. */
+        qemu_cond_wait(&kvm_run_inhibit_cond, &kvm_run_mutex);
+    }
+    qemu_mutex_unlock(&kvm_run_mutex);
+}
+
+static void kvm_run_inhibit_end(void)
+{ /* Counterpart of kvm_run_inhibit_begin(): allow VCPUs to enter KVM_RUN again. */
+ atomic_dec(&kvm_run_inhibited); /* drop the request taken by atomic_inc() in kvm_run_inhibit_begin() */
+ qemu_mutex_lock(&kvm_run_mutex);
+ qemu_cond_broadcast(&kvm_run_cond); /* wake VCPUs waiting in kvm_set_cpu_in_kernel(); they re-check the counter */
+ qemu_mutex_unlock(&kvm_run_mutex);
+}
+
+/*
+ * MemoryListener region_resize callback: replace the kvm slot(s) backing
+ * @section by slot(s) for the same section resized to @new.
+ */
+static void kvm_region_resize(MemoryListener *listener,
+                              MemoryRegionSection *section, Int128 new)
+{
+    MemoryRegionSection resized = *section;
+    KVMMemoryListener *kml;
+
+    kml = container_of(listener, KVMMemoryListener, listener);
+    resized.size = new;
+
+    /* No VCPU may sit in KVM_RUN while the old slots are temporarily gone. */
+    kvm_run_inhibit_begin();
+    kvm_set_phys_mem(kml, section, false);
+    kvm_set_phys_mem(kml, &resized, true);
+    kvm_run_inhibit_end();
+}
+
static void kvm_log_sync(MemoryListener *listener,
MemoryRegionSection *section)
{
@@ -1239,6 +1294,7 @@ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
kml->listener.region_add = kvm_region_add;
kml->listener.region_del = kvm_region_del;
+ kml->listener.region_resize = kvm_region_resize;
kml->listener.log_start = kvm_log_start;
kml->listener.log_stop = kvm_log_stop;
kml->listener.log_sync = kvm_log_sync;
@@ -1884,6 +1940,9 @@ static int kvm_init(MachineState *ms)
assert(TARGET_PAGE_SIZE <= qemu_real_host_page_size);
s->sigmask_len = 8;
+ qemu_mutex_init(&kvm_run_mutex);
+ qemu_cond_init(&kvm_run_cond);
+ qemu_cond_init(&kvm_run_inhibit_cond);
#ifdef KVM_CAP_SET_GUEST_DEBUG
QTAILQ_INIT(&s->kvm_sw_breakpoints);
@@ -2294,6 +2353,29 @@ static void kvm_eat_signals(CPUState *cpu)
} while (sigismember(&chkset, SIG_IPI));
}
+/*
+ * Record whether @cpu is about to enter (@in_kernel == true) or has just left
+ * (@in_kernel == false) KVM_RUN, cooperating with kvm_run_inhibit_begin() and
+ * kvm_run_inhibit_end().
+ *
+ * Before entering, block for as long as KVM_RUN is inhibited. After leaving,
+ * wake up anybody waiting for all VCPUs to have left KVM_RUN.
+ */
+static void kvm_set_cpu_in_kernel(CPUState *cpu, bool in_kernel)
+{
+    atomic_set(&cpu->in_kernel, in_kernel);
+    /*
+     * Order the store to in_kernel before the load of kvm_run_inhibited.
+     * Pairs with the atomic_inc() in kvm_run_inhibit_begin(), which is
+     * followed by loads of in_kernel: at least one side observes the other.
+     */
+    smp_mb();
+    if (likely(!atomic_read(&kvm_run_inhibited))) {
+        return;
+    }
+
+    qemu_mutex_lock(&kvm_run_mutex);
+    if (in_kernel) {
+        /*
+         * Wait until KVM_RUN is no longer inhibited. The counter is re-checked
+         * with the mutex held: kvm_run_inhibit_end() broadcasts kvm_run_cond
+         * under the mutex, so its wakeup cannot be missed.
+         */
+        while (atomic_read(&kvm_run_inhibited)) {
+            atomic_set(&cpu->in_kernel, false);
+            qemu_cond_broadcast(&kvm_run_inhibit_cond);
+            qemu_cond_wait(&kvm_run_cond, &kvm_run_mutex);
+            atomic_set(&cpu->in_kernel, true);
+            /* Same store/load ordering as above before re-checking. */
+            smp_mb();
+        }
+    } else {
+        /* wake up somebody wanting to inhibit KVM_RUN */
+        qemu_cond_broadcast(&kvm_run_inhibit_cond);
+    }
+    qemu_mutex_unlock(&kvm_run_mutex);
+}
+
int kvm_cpu_exec(CPUState *cpu)
{
struct kvm_run *run = cpu->kvm_run;
@@ -2318,6 +2400,9 @@ int kvm_cpu_exec(CPUState *cpu)
}
kvm_arch_pre_run(cpu, run);
+
+ kvm_set_cpu_in_kernel(cpu, true);
+
if (atomic_read(&cpu->exit_request)) {
DPRINTF("interrupt exit requested\n");
/*
@@ -2335,6 +2420,8 @@ int kvm_cpu_exec(CPUState *cpu)
run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);
+ kvm_set_cpu_in_kernel(cpu, false);
+
attrs = kvm_arch_post_run(cpu, run);
#ifdef KVM_HAVE_MCE_INJECTION
diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index 73e9a869a4..83614e537b 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -431,6 +431,9 @@ struct CPUState {
/* shared by kvm, hax and hvf */
bool vcpu_dirty;
+ /* kvm only for now: VCPU is executing in the kernel (KVM_RUN) */
+ bool in_kernel;
+
/* Used to keep track of an outstanding cpu throttle thread for migration
* autoconverge
*/
--
2.24.1
next prev parent reply other threads:[~2020-02-12 13:42 UTC|newest]
Thread overview: 21+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-02-12 13:35 [PATCH v2 00/16] Ram blocks with resizable anonymous allocations under POSIX David Hildenbrand
2020-02-12 13:35 ` [PATCH v2 01/16] virtio-mem: Prototype David Hildenbrand
2020-02-12 14:15 ` Eric Blake
2020-02-12 14:20 ` David Hildenbrand
2020-02-12 13:35 ` [PATCH v2 02/16] virtio-pci: Proxy for virtio-mem David Hildenbrand
2020-02-12 13:35 ` [PATCH v2 03/16] hmp: Handle virtio-mem when printing memory device infos David Hildenbrand
2020-02-12 13:35 ` [PATCH v2 04/16] numa: Handle virtio-mem in NUMA stats David Hildenbrand
2020-02-12 13:35 ` [PATCH v2 05/16] pc: Support for virtio-mem-pci David Hildenbrand
2020-02-12 13:35 ` [PATCH v2 06/16] exec: Provide owner when resizing memory region David Hildenbrand
2020-02-12 13:35 ` [PATCH v2 07/16] memory: Add memory_region_max_size() and memory_region_is_resizable() David Hildenbrand
2020-02-12 13:35 ` [PATCH v2 08/16] memory: Disallow resizing to 0 David Hildenbrand
2020-02-12 13:35 ` [PATCH v2 09/16] memory-device: properly deal with resizable memory regions David Hildenbrand
2020-02-12 13:35 ` [PATCH v2 10/16] hostmem: Factor out applying settings David Hildenbrand
2020-02-12 13:35 ` [PATCH v2 11/16] hostmem: Factor out common checks into host_memory_backend_validate() David Hildenbrand
2020-02-12 13:35 ` [PATCH v2 12/16] hostmem: Introduce "managed-size" for memory-backend-ram David Hildenbrand
2020-02-12 13:35 ` [PATCH v2 13/16] qmp/hmp: Expose "managed-size" for memory backends David Hildenbrand
2020-02-12 14:17 ` Eric Blake
2020-02-12 13:35 ` [PATCH v2 14/16] virtio-mem: Support for resizable memory regions David Hildenbrand
2020-02-12 13:36 ` [PATCH v2 15/16] memory: Add region_resize() callback to memory notifier David Hildenbrand
2020-02-12 13:36 ` David Hildenbrand [this message]
2020-02-12 13:40 ` [PATCH v2 00/16] Ram blocks with resizable anonymous allocations under POSIX David Hildenbrand
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200212133601.10555-17-david@redhat.com \
--to=david@redhat.com \
--cc=dgilbert@redhat.com \
--cc=ehabkost@redhat.com \
--cc=imammedo@redhat.com \
--cc=mst@redhat.com \
--cc=pbonzini@redhat.com \
--cc=qemu-devel@nongnu.org \
--cc=rth@twiddle.net \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.