From: Sean Christopherson <seanjc@google.com>
To: Keith Busch <kbusch@kernel.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>,
kvm@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: Re: [PATCH] KVM: x86/mmu: Ensure NX huge page recovery thread is alive before waking
Date: Mon, 27 Jan 2025 08:48:03 -0800 [thread overview]
Message-ID: <Z5e4w7IlEEk2cpH-@google.com> (raw)
In-Reply-To: <Z5RkcB_wf5Y74BUM@kbusch-mbp>
On Fri, Jan 24, 2025, Keith Busch wrote:
> On Fri, Jan 24, 2025 at 03:46:23PM -0800, Sean Christopherson wrote:
> > +static void kvm_wake_nx_recovery_thread(struct kvm *kvm)
> > +{
> > + /*
> > + * The NX recovery thread is spawned on-demand at the first KVM_RUN and
> > + * may not be valid even though the VM is globally visible. Do nothing,
> > + * as such a VM can't have any possible NX huge pages.
> > + */
> > + struct vhost_task *nx_thread = READ_ONCE(kvm->arch.nx_huge_page_recovery_thread);
> > +
> > + if (nx_thread)
> > + vhost_task_wake(nx_thread);
> > +}
...
> > + nx_thread = vhost_task_create(kvm_nx_huge_page_recovery_worker,
> > + kvm_nx_huge_page_recovery_worker_kill,
> > + kvm, "kvm-nx-lpage-recovery");
> >
> > - if (kvm->arch.nx_huge_page_recovery_thread)
> > - vhost_task_start(kvm->arch.nx_huge_page_recovery_thread);
> > + if (!nx_thread)
> > + return;
> > +
> > + vhost_task_start(nx_thread);
> > +
> > + /* Make the task visible only once it is fully started. */
> > + WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread);
>
> I believe the WRITE_ONCE needs to happen before the vhost_task_start to
> ensure the parameter update callback can see it before it's started.
It's not clear to me that calling vhost_task_wake() before vhost_task_start() is
allowed, which is why I deliberately waited until the task was started to make it
visible. Though FWIW, doing "vhost_task_wake(nx_thread)" before vhost_task_start()
doesn't explode.
Ha! There is another bug here, but we can smack 'em both with a bit of trickery
and do an optimized serialization in the process.
If vhost_task_create() fails, then the call_once() will "succeed" and mark the
structure as ONCE_COMPLETED. The first KVM_RUN will fail with -ENOMEM, but any
subsequent calls will succeed, including in-flight KVM_RUNs on other threads.
Odds are good userspace will terminate the VM on -ENOMEM, but that's not guaranteed,
e.g. if userspace has logic to retry a few times before giving up.
If call_once() and its callback are modified to return errors, then we can abuse
call_once() to serialize against kvm_mmu_start_lpage_recovery() when waking the
recovery thread. If the recovery thread is fully created, call_once() is a lockless
happy path, otherwise the wakeup path will serialize against the creation path
via the once's mutex.
Over two patches...
---
arch/x86/kvm/mmu/mmu.c | 46 ++++++++++++++++++++++++++++-----------
include/linux/call_once.h | 16 ++++++++++----
2 files changed, 45 insertions(+), 17 deletions(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index a45ae60e84ab..f3ad33cd68b3 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7120,6 +7120,26 @@ static void mmu_destroy_caches(void)
kmem_cache_destroy(mmu_page_header_cache);
}
+static int kvm_nx_recovery_thread_not_ready(struct once *once)
+{
+ return -ENOENT;
+}
+
+static void kvm_wake_nx_recovery_thread(struct kvm *kvm)
+{
+ /*
+ * The NX recovery thread is spawned on-demand at the first KVM_RUN and
+ * may not be started even though the VM is globally visible. Abuse
+ * call_once() to serialize against starting the recovery thread; if
+ * this task's callback is invoked, then the thread hasn't been created
+ * and the thread is guaranteed to see up-to-date parameters.
+ */
+ if (call_once(&kvm->arch.nx_once, kvm_nx_recovery_thread_not_ready))
+ return;
+
+ vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
+}
+
static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp)
{
if (nx_hugepage_mitigation_hard_disabled)
@@ -7180,7 +7200,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
kvm_mmu_zap_all_fast(kvm);
mutex_unlock(&kvm->slots_lock);
- vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
+ kvm_wake_nx_recovery_thread(kvm);
}
mutex_unlock(&kvm_lock);
}
@@ -7315,7 +7335,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
- vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
+ kvm_wake_nx_recovery_thread(kvm);
mutex_unlock(&kvm_lock);
}
@@ -7447,7 +7467,7 @@ static bool kvm_nx_huge_page_recovery_worker(void *data)
return true;
}
-static void kvm_mmu_start_lpage_recovery(struct once *once)
+static int kvm_mmu_start_lpage_recovery(struct once *once)
{
struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once);
struct kvm *kvm = container_of(ka, struct kvm, arch);
@@ -7457,21 +7477,21 @@ static void kvm_mmu_start_lpage_recovery(struct once *once)
kvm_nx_huge_page_recovery_worker, kvm_nx_huge_page_recovery_worker_kill,
kvm, "kvm-nx-lpage-recovery");
- if (kvm->arch.nx_huge_page_recovery_thread)
- vhost_task_start(kvm->arch.nx_huge_page_recovery_thread);
-}
-
-int kvm_mmu_post_init_vm(struct kvm *kvm)
-{
- if (nx_hugepage_mitigation_hard_disabled)
- return 0;
-
- call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery);
if (!kvm->arch.nx_huge_page_recovery_thread)
return -ENOMEM;
+
+ vhost_task_start(kvm->arch.nx_huge_page_recovery_thread);
return 0;
}
+int kvm_mmu_post_init_vm(struct kvm *kvm)
+{
+ if (nx_hugepage_mitigation_hard_disabled)
+ return 0;
+
+ return call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery);
+}
+
void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
{
if (kvm->arch.nx_huge_page_recovery_thread)
diff --git a/include/linux/call_once.h b/include/linux/call_once.h
index 6261aa0b3fb0..9d47ed50139b 100644
--- a/include/linux/call_once.h
+++ b/include/linux/call_once.h
@@ -26,20 +26,28 @@ do { \
__once_init((once), #once, &__key); \
} while (0)
-static inline void call_once(struct once *once, void (*cb)(struct once *))
+static inline int call_once(struct once *once, int (*cb)(struct once *))
{
+ int r;
+
/* Pairs with atomic_set_release() below. */
if (atomic_read_acquire(&once->state) == ONCE_COMPLETED)
- return;
+ return 0;
guard(mutex)(&once->lock);
WARN_ON(atomic_read(&once->state) == ONCE_RUNNING);
if (atomic_read(&once->state) != ONCE_NOT_STARTED)
- return;
+ return -EINVAL;
atomic_set(&once->state, ONCE_RUNNING);
- cb(once);
+ r = cb(once);
+ if (r) {
+ atomic_set(&once->state, ONCE_NOT_STARTED);
+ return r;
+ }
+
atomic_set_release(&once->state, ONCE_COMPLETED);
+ return 0;
}
#endif /* _LINUX_CALL_ONCE_H */
base-commit: f7bafceba76e9ab475b413578c1757ee18c3e44b
--
next prev parent reply other threads:[~2025-01-27 16:48 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-01-24 23:46 [PATCH] KVM: x86/mmu: Ensure NX huge page recovery thread is alive before waking Sean Christopherson
2025-01-25 0:50 ` Sean Christopherson
2025-01-25 4:11 ` Keith Busch
2025-01-27 16:48 ` Sean Christopherson [this message]
2025-01-27 17:04 ` Keith Busch
2025-01-27 17:19 ` Sean Christopherson
2025-01-27 18:22 ` Keith Busch
2025-01-28 15:41 ` Paolo Bonzini
2025-01-28 15:44 ` Keith Busch
2025-02-04 16:28 ` Paolo Bonzini
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=Z5e4w7IlEEk2cpH-@google.com \
--to=seanjc@google.com \
--cc=kbusch@kernel.org \
--cc=kvm@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=pbonzini@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox