Re: [RFC] Para-virtualized TLB flush for PV-waiting vCPUs

kvm.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

From: Sean Christopherson <seanjc@google.com>
To: Kenta Ishiguro <kentaishiguro@sslab.ics.keio.ac.jp>
Cc: pbonzini@redhat.com, kvm@vger.kernel.org, vkuznets@redhat.com
Subject: Re: [RFC] Para-virtualized TLB flush for PV-waiting vCPUs
Date: Fri, 17 Jan 2025 13:34:17 -0800	[thread overview]
Message-ID: <Z4rM2abMZvurfFDO@google.com> (raw)
In-Reply-To: <20250106155652.484001-1-kentaishiguro@sslab.ics.keio.ac.jp>

On Tue, Jan 07, 2025, Kenta Ishiguro wrote:
> Signed-off-by: Kenta Ishiguro <kentaishiguro@sslab.ics.keio.ac.jp>
> ---
>  arch/x86/include/uapi/asm/kvm_para.h |  1 +
>  arch/x86/kernel/kvm.c                | 13 +++++++++++--
>  2 files changed, 12 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
> index a1efa7907a0b..db26e167a707 100644
> --- a/arch/x86/include/uapi/asm/kvm_para.h
> +++ b/arch/x86/include/uapi/asm/kvm_para.h
> @@ -70,6 +70,7 @@ struct kvm_steal_time {
>  
>  #define KVM_VCPU_PREEMPTED          (1 << 0)
>  #define KVM_VCPU_FLUSH_TLB          (1 << 1)
> +#define KVM_VCPU_IN_PVWAIT          (1 << 2)
>  
>  #define KVM_CLOCK_PAIRING_WALLCLOCK 0
>  struct kvm_clock_pairing {
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> index 21e9e4845354..f17057b7d263 100644
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -668,7 +668,8 @@ static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
>  		 */
>  		src = &per_cpu(steal_time, cpu);
>  		state = READ_ONCE(src->preempted);
> -		if ((state & KVM_VCPU_PREEMPTED)) {
> +		if ((state & KVM_VCPU_PREEMPTED) ||
> +		    (state & KVM_VCPU_IN_PVWAIT)) {
>  			if (try_cmpxchg(&src->preempted, &state,
>  					state | KVM_VCPU_FLUSH_TLB))

This is unsafe.  KVM_VCPU_PREEMPTED relies on it being set and cleared by the host
to ensure either the host observes KVM_VCPU_FLUSH_TLB before the vCPU enters the
guest, i.e. before the vCPU can possibly consume stale TLB entries.

If the host is already in the process of re-entering the guest on the waiting
vCPU, e.g. because it received a kick, then there will be no KVM_REQ_STEAL_UPDATE
in the host and so the host won't process KVM_VCPU_FLUSH_TLB.

I also see no reason to limit this to kvm_wait(); the logic you are relying on is
really just "let KVM do the flushes if the vCPU is in the host".  There's a balance
to be had, e.g. toggling a flag on every entry+exit would get expensive, but
toggling at kvm_arch_vcpu_put() and then letting kvm_arch_vcpu_load() request
a steal_time update seems pretty straightforward.

E.g. this will also elide IPIs if the vCPU happens to be in host userspace doing
emulation of some kind, or if the vCPU is blocking.

diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index a1efa7907a0b..e3a6e6ecf70b 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -70,6 +70,7 @@ struct kvm_steal_time {
 
 #define KVM_VCPU_PREEMPTED          (1 << 0)
 #define KVM_VCPU_FLUSH_TLB          (1 << 1)
+#define KVM_VCPU_IN_HOST           (1 << 2)
 
 #define KVM_CLOCK_PAIRING_WALLCLOCK 0
 struct kvm_clock_pairing {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index acdd72e89bb0..5e3dc209e86c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5018,8 +5018,11 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
        struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
        struct kvm_steal_time __user *st;
        struct kvm_memslots *slots;
-       static const u8 preempted = KVM_VCPU_PREEMPTED;
        gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+       u8 preempted = KVM_VCPU_IN_HOST;
+
+       if (vcpu->preempted)
+               preempted |= KVM_VCPU_PREEMPTED;
 
        /*
         * The vCPU can be marked preempted if and only if the VM-Exit was on
@@ -5037,7 +5040,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
        if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
                return;
 
-       if (vcpu->arch.st.preempted)
+       if (vcpu->arch.st.preempted == preempted)
                return;
 
        /* This happens on process exit */
@@ -5055,7 +5058,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
        BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
 
        if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
-               vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+               vcpu->arch.st.preempted = preempted;
 
        mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
 }
@@ -5064,7 +5067,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
        int idx;
 
-       if (vcpu->preempted) {
+       if (vcpu->preempted || !kvm_xen_msr_enabled(vcpu->kvm)) {
                /*
                 * Assume protected guests are in-kernel.  Inefficient yielding
                 * due to false positives is preferable to never yielding due

>  				__cpumask_clear_cpu(cpu, flushmask);
> @@ -1045,6 +1046,9 @@ static void kvm_kick_cpu(int cpu)
>  
>  static void kvm_wait(u8 *ptr, u8 val)
>  {
> +	u8 state;
> +	struct kvm_steal_time *src;
> +
>  	if (in_nmi())
>  		return;
>  
> @@ -1054,8 +1058,13 @@ static void kvm_wait(u8 *ptr, u8 val)
>  	 * in irq spinlock slowpath and no spurious interrupt occur to save us.
>  	 */
>  	if (irqs_disabled()) {
> -		if (READ_ONCE(*ptr) == val)
> +		if (READ_ONCE(*ptr) == val) {
> +			src = this_cpu_ptr(&steal_time);
> +			state = READ_ONCE(src->preempted);
> +			try_cmpxchg(&src->preempted, &state,
> +				    state | KVM_VCPU_IN_PVWAIT);
>  			halt();
> +		}
>  	} else {
>  		local_irq_disable();
>  
> -- 
> 2.25.1
>

next prev parent reply	other threads:[~2025-01-17 21:34 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-01-06 15:56 [RFC] Para-virtualized TLB flush for PV-waiting vCPUs Kenta Ishiguro
2025-01-17 21:34 ` Sean Christopherson [this message]
2025-01-19 16:27   ` Kenta Ishiguro
2025-01-21 21:35     ` Sean Christopherson
2025-01-20  6:06 ` Chao Gao
2025-01-21 20:59   ` Sean Christopherson

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:a1efa7907a0 dfblob:e3a6e6ecf70 dfblob:acdd72e89bb
dfblob:5e3dc209e86 )
 OR (
bs:"Re: [RFC] Para-virtualized TLB flush for PV-waiting vCPUs" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=Z4rM2abMZvurfFDO@google.com \
    --to=seanjc@google.com \
    --cc=kentaishiguro@sslab.ics.keio.ac.jp \
    --cc=kvm@vger.kernel.org \
    --cc=pbonzini@redhat.com \
    --cc=vkuznets@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).