From: Sean Christopherson <seanjc@google.com>
To: Keith Busch <kbusch@kernel.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>,
kvm@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: Re: [PATCH] KVM: x86/mmu: Ensure NX huge page recovery thread is alive before waking
Date: Mon, 27 Jan 2025 08:48:03 -0800 [thread overview]
Message-ID: <Z5e4w7IlEEk2cpH-@google.com> (raw)
In-Reply-To: <Z5RkcB_wf5Y74BUM@kbusch-mbp>
On Fri, Jan 24, 2025, Keith Busch wrote:
> On Fri, Jan 24, 2025 at 03:46:23PM -0800, Sean Christopherson wrote:
> > +static void kvm_wake_nx_recovery_thread(struct kvm *kvm)
> > +{
> > + /*
> > + * The NX recovery thread is spawned on-demand at the first KVM_RUN and
> > + * may not be valid even though the VM is globally visible. Do nothing,
> > + * as such a VM can't have any possible NX huge pages.
> > + */
> > + struct vhost_task *nx_thread = READ_ONCE(kvm->arch.nx_huge_page_recovery_thread);
> > +
> > + if (nx_thread)
> > + vhost_task_wake(nx_thread);
> > +}
...
> > + nx_thread = vhost_task_create(kvm_nx_huge_page_recovery_worker,
> > + kvm_nx_huge_page_recovery_worker_kill,
> > + kvm, "kvm-nx-lpage-recovery");
> >
> > - if (kvm->arch.nx_huge_page_recovery_thread)
> > - vhost_task_start(kvm->arch.nx_huge_page_recovery_thread);
> > + if (!nx_thread)
> > + return;
> > +
> > + vhost_task_start(nx_thread);
> > +
> > + /* Make the task visible only once it is fully started. */
> > + WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread);
>
> I believe the WRITE_ONCE needs to happen before the vhost_task_start to
> ensure the parameter update callback can see it before it's started.
It's not clear to me that calling vhost_task_wake() before vhost_task_start() is
allowed, which is why I deliberately waited until the task was started to make it
visible. Though FWIW, doing "vhost_task_wake(nx_thread)" before vhost_task_start()
doesn't explode.
Ha! There is another bug here, but we can smack 'em both with a bit of trickery
and do an optimized serialization in the process.
If vhost_task_create() fails, then the call_once() will "succeed" and mark the
structure as ONCE_COMPLETED. The first KVM_RUN will fail with -ENOMEM, but any
subsequent calls will succeed, including in-flight KVM_RUNs on other threads.
Odds are good userspace will terminate the VM on -ENOMEM, but that't not guaranteed,
e.g. if userspace has logic to retry a few times before giving up.
If call_once() and its callback are modified to return errors, then we can abuse
call_once() to serialize against kvm_mmu_start_lpage_recovery() when waking the
recovery thread. If the recovery thread is fully created, call_once() is a lockless
happy path, otherwise the wakup path will serialize against the creation path
via the once's mutex.
Over two patches...
---
arch/x86/kvm/mmu/mmu.c | 46 ++++++++++++++++++++++++++++-----------
include/linux/call_once.h | 16 ++++++++++----
2 files changed, 45 insertions(+), 17 deletions(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index a45ae60e84ab..f3ad33cd68b3 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7120,6 +7120,26 @@ static void mmu_destroy_caches(void)
kmem_cache_destroy(mmu_page_header_cache);
}
+static int kvm_nx_recovery_thread_not_ready(struct once *once)
+{
+ return -ENOENT;
+}
+
+static void kvm_wake_nx_recovery_thread(struct kvm *kvm)
+{
+ /*
+ * The NX recovery thread is spawned on-demand at the first KVM_RUN and
+ * may not be started even though the VM is globally visible. Abuse
+ * call_once() to serialize against starting the recovery thread; if
+ * this task's callback is invoked, then the thread hasn't been created
+ * and the thread is guaranteed to see up-to-date parameters.
+ */
+ if (call_once(&kvm->arch.nx_once, kvm_nx_recovery_thread_not_ready))
+ return;
+
+ vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
+}
+
static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp)
{
if (nx_hugepage_mitigation_hard_disabled)
@@ -7180,7 +7200,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
kvm_mmu_zap_all_fast(kvm);
mutex_unlock(&kvm->slots_lock);
- vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
+ kvm_wake_nx_recovery_thread(kvm);
}
mutex_unlock(&kvm_lock);
}
@@ -7315,7 +7335,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
- vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
+ kvm_wake_nx_recovery_thread(kvm);
mutex_unlock(&kvm_lock);
}
@@ -7447,7 +7467,7 @@ static bool kvm_nx_huge_page_recovery_worker(void *data)
return true;
}
-static void kvm_mmu_start_lpage_recovery(struct once *once)
+static int kvm_mmu_start_lpage_recovery(struct once *once)
{
struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once);
struct kvm *kvm = container_of(ka, struct kvm, arch);
@@ -7457,21 +7477,21 @@ static void kvm_mmu_start_lpage_recovery(struct once *once)
kvm_nx_huge_page_recovery_worker, kvm_nx_huge_page_recovery_worker_kill,
kvm, "kvm-nx-lpage-recovery");
- if (kvm->arch.nx_huge_page_recovery_thread)
- vhost_task_start(kvm->arch.nx_huge_page_recovery_thread);
-}
-
-int kvm_mmu_post_init_vm(struct kvm *kvm)
-{
- if (nx_hugepage_mitigation_hard_disabled)
- return 0;
-
- call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery);
if (!kvm->arch.nx_huge_page_recovery_thread)
return -ENOMEM;
+
+ vhost_task_start(kvm->arch.nx_huge_page_recovery_thread);
return 0;
}
+int kvm_mmu_post_init_vm(struct kvm *kvm)
+{
+ if (nx_hugepage_mitigation_hard_disabled)
+ return 0;
+
+ return call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery);
+}
+
void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
{
if (kvm->arch.nx_huge_page_recovery_thread)
diff --git a/include/linux/call_once.h b/include/linux/call_once.h
index 6261aa0b3fb0..9d47ed50139b 100644
--- a/include/linux/call_once.h
+++ b/include/linux/call_once.h
@@ -26,20 +26,28 @@ do { \
__once_init((once), #once, &__key); \
} while (0)
-static inline void call_once(struct once *once, void (*cb)(struct once *))
+static inline int call_once(struct once *once, int (*cb)(struct once *))
{
+ int r;
+
/* Pairs with atomic_set_release() below. */
if (atomic_read_acquire(&once->state) == ONCE_COMPLETED)
- return;
+ return 0;
guard(mutex)(&once->lock);
WARN_ON(atomic_read(&once->state) == ONCE_RUNNING);
if (atomic_read(&once->state) != ONCE_NOT_STARTED)
- return;
+ return -EINVAL;
atomic_set(&once->state, ONCE_RUNNING);
- cb(once);
+ r = cb(once);
+ if (r) {
+ atomic_set(&once->state, ONCE_NOT_STARTED);
+ return r;
+ }
+
atomic_set_release(&once->state, ONCE_COMPLETED);
+ return 0;
}
#endif /* _LINUX_CALL_ONCE_H */
base-commit: f7bafceba76e9ab475b413578c1757ee18c3e44b
--
next prev parent reply other threads:[~2025-01-27 16:48 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-01-24 23:46 [PATCH] KVM: x86/mmu: Ensure NX huge page recovery thread is alive before waking Sean Christopherson
2025-01-25 0:50 ` Sean Christopherson
2025-01-25 4:11 ` Keith Busch
2025-01-27 16:48 ` Sean Christopherson [this message]
2025-01-27 17:04 ` Keith Busch
2025-01-27 17:19 ` Sean Christopherson
2025-01-27 18:22 ` Keith Busch
2025-01-28 15:41 ` Paolo Bonzini
2025-01-28 15:44 ` Keith Busch
2025-02-04 16:28 ` Paolo Bonzini
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=Z5e4w7IlEEk2cpH-@google.com \
--to=seanjc@google.com \
--cc=kbusch@kernel.org \
--cc=kvm@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=pbonzini@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.