From: Carsten Stollmaier <stollmc@amazon.com>
To: Sean Christopherson <seanjc@google.com>,
Paolo Bonzini <pbonzini@redhat.com>,
Thomas Gleixner <tglx@linutronix.de>,
Ingo Molnar <mingo@redhat.com>, Borislav Petkov <bp@alien8.de>,
Dave Hansen <dave.hansen@linux.intel.com>, <x86@kernel.org>,
"H. Peter Anvin" <hpa@zytor.com>
Cc: <nh-open-source@amazon.com>,
Carsten Stollmaier <stollmc@amazon.com>,
David Woodhouse <dwmw2@infradead.org>,
Peter Xu <peterx@redhat.com>,
Sebastian Biemueller <sbiemue@amazon.de>, <kvm@vger.kernel.org>,
<linux-kernel@vger.kernel.org>
Subject: [PATCH] KVM: x86: Use gfn_to_pfn_cache for steal_time
Date: Fri, 2 Aug 2024 11:44:01 +0000 [thread overview]
Message-ID: <20240802114402.96669-1-stollmc@amazon.com> (raw)
On vcpu_run, before entering the guest, the update of the steal time
information causes a page fault if the page is not present. In our
scenario, this is handled by do_user_addr_fault() and subsequently by
handle_userfault(), since the region is registered with userfaultfd.
handle_userfault() waits in TASK_INTERRUPTIBLE state, so it can be
interrupted by signals. do_user_addr_fault() then busy-retries the fault
if the pending signal is non-fatal, which leads to contention on the
mmap_lock.
Replace the gfn_to_hva_cache used for steal time with a gfn_to_pfn_cache,
which ensures the page is present for the memory access and thereby
avoids the contention on the mmap_lock.
Signed-off-by: Carsten Stollmaier <stollmc@amazon.com>
CC: David Woodhouse <dwmw2@infradead.org>
CC: Sean Christopherson <seanjc@google.com>
CC: Paolo Bonzini <pbonzini@redhat.com>
CC: Peter Xu <peterx@redhat.com>
CC: Sebastian Biemueller <sbiemue@amazon.de>
---
arch/x86/include/asm/kvm_host.h | 2 +-
arch/x86/kvm/x86.c | 115 +++++++++++++++-----------------
2 files changed, 54 insertions(+), 63 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 950a03e0181e..63d0c0cd7a8e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -898,7 +898,7 @@ struct kvm_vcpu_arch {
u8 preempted;
u64 msr_val;
u64 last_steal;
- struct gfn_to_hva_cache cache;
+ struct gfn_to_pfn_cache cache;
} st;
u64 l1_tsc_offset;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index af6c8cf6a37a..2b8adbadfc50 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3652,10 +3652,8 @@ EXPORT_SYMBOL_GPL(kvm_service_local_tlb_flush_requests);
static void record_steal_time(struct kvm_vcpu *vcpu)
{
- struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
- struct kvm_steal_time __user *st;
- struct kvm_memslots *slots;
- gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+ struct gfn_to_pfn_cache *gpc = &vcpu->arch.st.cache;
+ struct kvm_steal_time *st;
u64 steal;
u32 version;
@@ -3670,42 +3668,26 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
return;
- slots = kvm_memslots(vcpu->kvm);
-
- if (unlikely(slots->generation != ghc->generation ||
- gpa != ghc->gpa ||
- kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
+ read_lock(&gpc->lock);
+ while (!kvm_gpc_check(gpc, sizeof(*st))) {
/* We rely on the fact that it fits in a single page. */
BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) ||
- kvm_is_error_hva(ghc->hva) || !ghc->memslot)
+ read_unlock(&gpc->lock);
+
+ if (kvm_gpc_refresh(gpc, sizeof(*st)))
return;
+
+ read_lock(&gpc->lock);
}
- st = (struct kvm_steal_time __user *)ghc->hva;
+ st = (struct kvm_steal_time *)gpc->khva;
/*
* Doing a TLB flush here, on the guest's behalf, can avoid
* expensive IPIs.
*/
if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
- u8 st_preempted = 0;
- int err = -EFAULT;
-
- if (!user_access_begin(st, sizeof(*st)))
- return;
-
- asm volatile("1: xchgb %0, %2\n"
- "xor %1, %1\n"
- "2:\n"
- _ASM_EXTABLE_UA(1b, 2b)
- : "+q" (st_preempted),
- "+&r" (err),
- "+m" (st->preempted));
- if (err)
- goto out;
-
- user_access_end();
+ u8 st_preempted = xchg(&st->preempted, 0);
vcpu->arch.st.preempted = 0;
@@ -3713,39 +3695,32 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
st_preempted & KVM_VCPU_FLUSH_TLB);
if (st_preempted & KVM_VCPU_FLUSH_TLB)
kvm_vcpu_flush_tlb_guest(vcpu);
-
- if (!user_access_begin(st, sizeof(*st)))
- goto dirty;
} else {
- if (!user_access_begin(st, sizeof(*st)))
- return;
-
- unsafe_put_user(0, &st->preempted, out);
+ st->preempted = 0;
vcpu->arch.st.preempted = 0;
}
- unsafe_get_user(version, &st->version, out);
+ version = st->version;
if (version & 1)
version += 1; /* first time write, random junk */
version += 1;
- unsafe_put_user(version, &st->version, out);
+ st->version = version;
smp_wmb();
- unsafe_get_user(steal, &st->steal, out);
+ steal = st->steal;
steal += current->sched_info.run_delay -
vcpu->arch.st.last_steal;
vcpu->arch.st.last_steal = current->sched_info.run_delay;
- unsafe_put_user(steal, &st->steal, out);
+ st->steal = steal;
version += 1;
- unsafe_put_user(version, &st->version, out);
+ st->version = version;
+
+ kvm_gpc_mark_dirty_in_slot(gpc);
- out:
- user_access_end();
- dirty:
- mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
+ read_unlock(&gpc->lock);
}
static bool kvm_is_msr_to_save(u32 msr_index)
@@ -4020,8 +3995,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.st.msr_val = data;
- if (!(data & KVM_MSR_ENABLED))
- break;
+ if (data & KVM_MSR_ENABLED) {
+ kvm_gpc_activate(&vcpu->arch.st.cache, data & ~KVM_MSR_ENABLED,
+ sizeof(struct kvm_steal_time));
+ } else {
+ kvm_gpc_deactivate(&vcpu->arch.st.cache);
+ }
kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
@@ -5051,11 +5030,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
{
- struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
- struct kvm_steal_time __user *st;
- struct kvm_memslots *slots;
+ struct gfn_to_pfn_cache *gpc = &vcpu->arch.st.cache;
+ struct kvm_steal_time *st;
static const u8 preempted = KVM_VCPU_PREEMPTED;
- gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+ unsigned long flags;
/*
* The vCPU can be marked preempted if and only if the VM-Exit was on
@@ -5080,20 +5058,28 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
if (unlikely(current->mm != vcpu->kvm->mm))
return;
- slots = kvm_memslots(vcpu->kvm);
-
- if (unlikely(slots->generation != ghc->generation ||
- gpa != ghc->gpa ||
- kvm_is_error_hva(ghc->hva) || !ghc->memslot))
- return;
+ read_lock_irqsave(&gpc->lock, flags);
+ if (!kvm_gpc_check(gpc, sizeof(*st)))
+ goto out_unlock_gpc;
- st = (struct kvm_steal_time __user *)ghc->hva;
+ st = (struct kvm_steal_time *)gpc->khva;
BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
- if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
- vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+ st->preempted = preempted;
+ vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
- mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
+ kvm_gpc_mark_dirty_in_slot(gpc);
+
+out_unlock_gpc:
+ read_unlock_irqrestore(&gpc->lock, flags);
+}
+
+static void kvm_steal_time_reset(struct kvm_vcpu *vcpu)
+{
+ kvm_gpc_deactivate(&vcpu->arch.st.cache);
+ vcpu->arch.st.preempted = 0;
+ vcpu->arch.st.msr_val = 0;
+ vcpu->arch.st.last_steal = 0;
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -12219,6 +12205,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm);
+ kvm_gpc_init(&vcpu->arch.st.cache, vcpu->kvm);
+
if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
@@ -12331,6 +12319,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
int idx;
+ kvm_steal_time_reset(vcpu);
+
kvmclock_reset(vcpu);
kvm_x86_call(vcpu_free)(vcpu);
@@ -12401,7 +12391,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_make_request(KVM_REQ_EVENT, vcpu);
vcpu->arch.apf.msr_en_val = 0;
vcpu->arch.apf.msr_int_val = 0;
- vcpu->arch.st.msr_val = 0;
+
+ kvm_steal_time_reset(vcpu);
kvmclock_reset(vcpu);
--
2.40.1
Amazon Web Services Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 257764 B
Sitz: Berlin
Ust-ID: DE 365 538 597
next reply other threads:[~2024-08-02 11:44 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-08-02 11:44 Carsten Stollmaier [this message]
2024-08-02 12:03 ` [PATCH] KVM: x86: Use gfn_to_pfn_cache for steal_time David Woodhouse
2024-08-02 12:38 ` Matthew Wilcox
2024-08-02 12:53 ` David Woodhouse
2024-08-02 12:56 ` David Woodhouse
2024-08-02 16:06 ` David Woodhouse
2024-08-02 22:40 ` Peter Xu
2024-08-03 8:35 ` David Woodhouse
2024-08-04 13:31 ` Peter Xu
2024-08-17 0:22 ` Sean Christopherson
2024-08-20 10:11 ` David Woodhouse
2025-07-29 10:28 ` David Woodhouse
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240802114402.96669-1-stollmc@amazon.com \
--to=stollmc@amazon.com \
--cc=bp@alien8.de \
--cc=dave.hansen@linux.intel.com \
--cc=dwmw2@infradead.org \
--cc=hpa@zytor.com \
--cc=kvm@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@redhat.com \
--cc=nh-open-source@amazon.com \
--cc=pbonzini@redhat.com \
--cc=peterx@redhat.com \
--cc=sbiemue@amazon.de \
--cc=seanjc@google.com \
--cc=tglx@linutronix.de \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.