qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: David Hildenbrand <david@redhat.com>
To: qemu-devel@nongnu.org
Cc: Eduardo Habkost <ehabkost@redhat.com>,
	"Michael S . Tsirkin" <mst@redhat.com>,
	David Hildenbrand <david@redhat.com>,
	"Dr . David Alan Gilbert" <dgilbert@redhat.com>,
	Igor Mammedov <imammedo@redhat.com>,
	Paolo Bonzini <pbonzini@redhat.com>,
	Richard Henderson <rth@twiddle.net>
Subject: [PATCH v2 16/16] kvm: Implement region_resize() for atomic memory section resizes
Date: Wed, 12 Feb 2020 14:36:01 +0100	[thread overview]
Message-ID: <20200212133601.10555-17-david@redhat.com> (raw)
In-Reply-To: <20200212133601.10555-1-david@redhat.com>

virtio-mem wants to resize (esp. grow) memory regions while the guest is
already aware of them and makes use of them. Resizing a KVM slot can
currently only be done by removing it and re-adding it. While the kvm slot
is temporarily removed, VCPUs that try to read from these slots will fault.

Let's inhibit KVM_RUN while performing the resize. Keep it lightweight by
tracking, with one bool per VCPU, whether the VCPU is executing in the
kernel.

Note1: Instead of implementing region_resize(), we could also inhibit in
begin() and let the VCPUs continue to run in commit(). This would also
handle atomic splitting of memory regions. (I remember a BUG report but
cannot dig up the mail). However, using the region_resize() callback we
can later wire up an ioctl that can perform the resize atomically, and
make the inhibit conditional. Also, this way we inhibit KVM only when
resizing - not on any address space changes. This will not affect existing
RT workloads (resizes currently only happen during reboot or at the
start of an incoming migration).

Note2: We cannot use pause_all_vcpus()/resume_all_vcpus(), as it will
temporarily drop the BQL, which is something most callers cannot deal
with when trying to resize a memory region.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 accel/kvm/kvm-all.c   | 87 +++++++++++++++++++++++++++++++++++++++++++
 include/hw/core/cpu.h |  3 ++
 2 files changed, 90 insertions(+)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index c111312dfd..e24805771c 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -148,6 +148,10 @@ bool kvm_ioeventfd_any_length_allowed;
 bool kvm_msi_use_devid;
 static bool kvm_immediate_exit;
 static hwaddr kvm_max_slot_size = ~0;
+static QemuMutex kvm_run_mutex;
+static QemuCond kvm_run_cond;
+static QemuCond kvm_run_inhibit_cond;
+static int kvm_run_inhibited;
 
 static const KVMCapabilityInfo kvm_required_capabilites[] = {
     KVM_CAP_INFO(USER_MEMORY),
@@ -1121,6 +1125,57 @@ static void kvm_region_del(MemoryListener *listener,
     memory_region_unref(section->mr);
 }
 
+/*
+ * Certain updates (e.g., resizing memory regions) require temporarily removing
+ * kvm memory slots. Keep VCPUs from faulting on them: kick every VCPU out of
+ * KVM_RUN and return once none is left; they cannot re-enter until unblocked.
+ */
+static void kvm_run_inhibit_begin(void)
+{
+    CPUState *cpu;
+    bool any_in_kernel;
+
+    atomic_inc(&kvm_run_inhibited);
+    /* Check and wait under the mutex, or a VCPU's wakeup may be missed. */
+    qemu_mutex_lock(&kvm_run_mutex);
+    do {
+        any_in_kernel = false;
+        CPU_FOREACH(cpu) {
+            if (atomic_read(&cpu->in_kernel)) {
+                any_in_kernel = true;
+                qemu_cpu_kick(cpu);
+            }
+        }
+        if (any_in_kernel) {
+            qemu_cond_wait(&kvm_run_inhibit_cond, &kvm_run_mutex);
+        }
+    } while (any_in_kernel);
+    qemu_mutex_unlock(&kvm_run_mutex);
+}
+
+static void kvm_run_inhibit_end(void)
+{
+    atomic_dec(&kvm_run_inhibited); /* drop our inhibit request first */
+    qemu_mutex_lock(&kvm_run_mutex);
+    qemu_cond_broadcast(&kvm_run_cond); /* wake VCPUs waiting for KVM_RUN */
+    qemu_mutex_unlock(&kvm_run_mutex);
+}
+
+static void kvm_region_resize(MemoryListener *listener,
+                              MemoryRegionSection *section, Int128 new)
+{
+    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
+    MemoryRegionSection new_section = *section;
+
+    new_section.size = new; /* copy of @section, only the size differs */
+
+    /* Keep all VCPUs out of KVM_RUN while we temporarily remove slots. */
+    kvm_run_inhibit_begin();
+    kvm_set_phys_mem(kml, section, false); /* section with the old size */
+    kvm_set_phys_mem(kml, &new_section, true); /* section with the new size */
+    kvm_run_inhibit_end();
+}
+
 static void kvm_log_sync(MemoryListener *listener,
                          MemoryRegionSection *section)
 {
@@ -1239,6 +1294,7 @@ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
 
     kml->listener.region_add = kvm_region_add;
     kml->listener.region_del = kvm_region_del;
+    kml->listener.region_resize = kvm_region_resize;
     kml->listener.log_start = kvm_log_start;
     kml->listener.log_stop = kvm_log_stop;
     kml->listener.log_sync = kvm_log_sync;
@@ -1884,6 +1940,9 @@ static int kvm_init(MachineState *ms)
     assert(TARGET_PAGE_SIZE <= qemu_real_host_page_size);
 
     s->sigmask_len = 8;
+    qemu_mutex_init(&kvm_run_mutex);
+    qemu_cond_init(&kvm_run_cond);
+    qemu_cond_init(&kvm_run_inhibit_cond);
 
 #ifdef KVM_CAP_SET_GUEST_DEBUG
     QTAILQ_INIT(&s->kvm_sw_breakpoints);
@@ -2294,6 +2353,29 @@ static void kvm_eat_signals(CPUState *cpu)
     } while (sigismember(&chkset, SIG_IPI));
 }
 
+/* Record whether @cpu is inside KVM_RUN; entering blocks while inhibited. */
+static void kvm_set_cpu_in_kernel(CPUState *cpu, bool in_kernel)
+{
+    /* Full barrier: publish in_kernel before reading kvm_run_inhibited. */
+    atomic_mb_set(&cpu->in_kernel, in_kernel);
+    while (unlikely(atomic_read(&kvm_run_inhibited))) {
+        qemu_mutex_lock(&kvm_run_mutex);
+        atomic_set(&cpu->in_kernel, false);
+        /* wake up somebody wanting to inhibit KVM_RUN */
+        qemu_cond_broadcast(&kvm_run_inhibit_cond);
+        /* Recheck under the mutex so the final wakeup cannot be missed. */
+        while (in_kernel && atomic_read(&kvm_run_inhibited)) {
+            qemu_cond_wait(&kvm_run_cond, &kvm_run_mutex);
+        }
+        qemu_mutex_unlock(&kvm_run_mutex);
+        if (!in_kernel) {
+            break;
+        }
+        /* Retry: a new inhibit request may have started meanwhile. */
+        atomic_mb_set(&cpu->in_kernel, true);
+    }
+}
+
 int kvm_cpu_exec(CPUState *cpu)
 {
     struct kvm_run *run = cpu->kvm_run;
@@ -2318,6 +2400,9 @@ int kvm_cpu_exec(CPUState *cpu)
         }
 
         kvm_arch_pre_run(cpu, run);
+
+        kvm_set_cpu_in_kernel(cpu, true);
+
         if (atomic_read(&cpu->exit_request)) {
             DPRINTF("interrupt exit requested\n");
             /*
@@ -2335,6 +2420,8 @@ int kvm_cpu_exec(CPUState *cpu)
 
         run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);
 
+        kvm_set_cpu_in_kernel(cpu, false);
+
         attrs = kvm_arch_post_run(cpu, run);
 
 #ifdef KVM_HAVE_MCE_INJECTION
diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index 73e9a869a4..83614e537b 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -431,6 +431,9 @@ struct CPUState {
     /* shared by kvm, hax and hvf */
     bool vcpu_dirty;
 
+    /* kvm only for now: VCPU is executing in the kernel (KVM_RUN) */
+    bool in_kernel;
+
     /* Used to keep track of an outstanding cpu throttle thread for migration
      * autoconverge
      */
-- 
2.24.1



  parent reply	other threads:[~2020-02-12 13:42 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-02-12 13:35 [PATCH v2 00/16] Ram blocks with resizable anonymous allocations under POSIX David Hildenbrand
2020-02-12 13:35 ` [PATCH v2 01/16] virtio-mem: Prototype David Hildenbrand
2020-02-12 14:15   ` Eric Blake
2020-02-12 14:20     ` David Hildenbrand
2020-02-12 13:35 ` [PATCH v2 02/16] virtio-pci: Proxy for virtio-mem David Hildenbrand
2020-02-12 13:35 ` [PATCH v2 03/16] hmp: Handle virtio-mem when printing memory device infos David Hildenbrand
2020-02-12 13:35 ` [PATCH v2 04/16] numa: Handle virtio-mem in NUMA stats David Hildenbrand
2020-02-12 13:35 ` [PATCH v2 05/16] pc: Support for virtio-mem-pci David Hildenbrand
2020-02-12 13:35 ` [PATCH v2 06/16] exec: Provide owner when resizing memory region David Hildenbrand
2020-02-12 13:35 ` [PATCH v2 07/16] memory: Add memory_region_max_size() and memory_region_is_resizable() David Hildenbrand
2020-02-12 13:35 ` [PATCH v2 08/16] memory: Disallow resizing to 0 David Hildenbrand
2020-02-12 13:35 ` [PATCH v2 09/16] memory-device: properly deal with resizable memory regions David Hildenbrand
2020-02-12 13:35 ` [PATCH v2 10/16] hostmem: Factor out applying settings David Hildenbrand
2020-02-12 13:35 ` [PATCH v2 11/16] hostmem: Factor out common checks into host_memory_backend_validate() David Hildenbrand
2020-02-12 13:35 ` [PATCH v2 12/16] hostmem: Introduce "managed-size" for memory-backend-ram David Hildenbrand
2020-02-12 13:35 ` [PATCH v2 13/16] qmp/hmp: Expose "managed-size" for memory backends David Hildenbrand
2020-02-12 14:17   ` Eric Blake
2020-02-12 13:35 ` [PATCH v2 14/16] virtio-mem: Support for resizable memory regions David Hildenbrand
2020-02-12 13:36 ` [PATCH v2 15/16] memory: Add region_resize() callback to memory notifier David Hildenbrand
2020-02-12 13:36 ` David Hildenbrand [this message]
2020-02-12 13:40 ` [PATCH v2 00/16] Ram blocks with resizable anonymous allocations under POSIX David Hildenbrand

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200212133601.10555-17-david@redhat.com \
    --to=david@redhat.com \
    --cc=dgilbert@redhat.com \
    --cc=ehabkost@redhat.com \
    --cc=imammedo@redhat.com \
    --cc=mst@redhat.com \
    --cc=pbonzini@redhat.com \
    --cc=qemu-devel@nongnu.org \
    --cc=rth@twiddle.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).