* [PATCH v2] KVM: SVM: Set/clear SRSO's BP_SPEC_REDUCE on 0 <=> 1 VM count transitions
@ 2025-05-05 18:03 Sean Christopherson
2025-05-06 9:48 ` Yosry Ahmed
` (2 more replies)
0 siblings, 3 replies; 9+ messages in thread
From: Sean Christopherson @ 2025-05-05 18:03 UTC (permalink / raw)
To: Sean Christopherson, Paolo Bonzini
Cc: kvm, linux-kernel, Michael Larabel, Borislav Petkov
Set the magic BP_SPEC_REDUCE bit to mitigate SRSO when running VMs if and
only if KVM has at least one active VM. Leaving the bit set at all times
unfortunately degrades performance by a wee bit more than expected.
Use a dedicated spinlock and counter instead of hooking virtualization
enablement, as changing the behavior of kvm.enable_virt_at_load based on
SRSO_BP_SPEC_REDUCE is painful, and has its own drawbacks, e.g. could
result in performance issues for flows that are sensitive to VM creation
latency.
Defer setting BP_SPEC_REDUCE until VMRUN is imminent to avoid impacting
performance on CPUs that aren't running VMs, e.g. if a setup is using
housekeeping CPUs. Setting BP_SPEC_REDUCE in task context, i.e. without
blasting IPIs to all CPUs, also helps avoid serializing 1<=>N transitions
without incurring a gross amount of complexity (see the Link for details
on how ugly coordinating via IPIs gets).
Link: https://lore.kernel.org/all/aBOnzNCngyS_pQIW@google.com
Fixes: 8442df2b49ed ("x86/bugs: KVM: Add support for SRSO_MSR_FIX")
Reported-by: Michael Larabel <Michael@michaellarabel.com>
Closes: https://www.phoronix.com/review/linux-615-amd-regression
Cc: Borislav Petkov <bp@alien8.de>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
v2: Defer setting BP_SPEC_REDUCE until VMRUN is imminent, which in turn
allows for eliding the lock on 0<=>1 transitions as there is no race
with CPUs doing VMRUN before receiving the IPI to set the bit, and
having multiple tasks take the lock during svm_srso_vm_init() is a-ok.
v1: https://lore.kernel.org/all/20250502223456.887618-1-seanjc@google.com
arch/x86/kvm/svm/svm.c | 71 ++++++++++++++++++++++++++++++++++++++----
arch/x86/kvm/svm/svm.h | 2 ++
2 files changed, 67 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index cc1c721ba067..15f7a0703c16 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -607,9 +607,6 @@ static void svm_disable_virtualization_cpu(void)
kvm_cpu_svm_disable();
amd_pmu_disable_virt();
-
- if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
- msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
}
static int svm_enable_virtualization_cpu(void)
@@ -687,9 +684,6 @@ static int svm_enable_virtualization_cpu(void)
rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi);
}
- if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
- msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
-
return 0;
}
@@ -1518,6 +1512,63 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
__free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
}
+#ifdef CONFIG_CPU_MITIGATIONS
+static DEFINE_SPINLOCK(srso_lock);
+static atomic_t srso_nr_vms;
+
+static void svm_srso_clear_bp_spec_reduce(void *ign)
+{
+ struct svm_cpu_data *sd = this_cpu_ptr(&svm_data);
+
+ if (!sd->bp_spec_reduce_set)
+ return;
+
+ msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
+ sd->bp_spec_reduce_set = false;
+}
+
+static void svm_srso_vm_destroy(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
+ return;
+
+ if (atomic_dec_return(&srso_nr_vms))
+ return;
+
+ guard(spinlock)(&srso_lock);
+
+ /*
+ * Verify a new VM didn't come along, acquire the lock, and increment
+ * the count before this task acquired the lock.
+ */
+ if (atomic_read(&srso_nr_vms))
+ return;
+
+ on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1);
+}
+
+static void svm_srso_vm_init(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
+ return;
+
+ /*
+ * Acquire the lock on 0 => 1 transitions to ensure a potential 1 => 0
+ * transition, i.e. destroying the last VM, is fully complete, e.g. so
+ * that a delayed IPI doesn't clear BP_SPEC_REDUCE after a vCPU runs.
+ */
+ if (atomic_inc_not_zero(&srso_nr_vms))
+ return;
+
+ guard(spinlock)(&srso_lock);
+
+ atomic_inc(&srso_nr_vms);
+}
+#else
+static void svm_srso_vm_init(void) { }
+static void svm_srso_vm_destroy(void) { }
+#endif
+
static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1550,6 +1601,11 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
(!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm)))
kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
+ if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE) &&
+ !sd->bp_spec_reduce_set) {
+ sd->bp_spec_reduce_set = true;
+ msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
+ }
svm->guest_state_loaded = true;
}
@@ -5036,6 +5092,8 @@ static void svm_vm_destroy(struct kvm *kvm)
{
avic_vm_destroy(kvm);
sev_vm_destroy(kvm);
+
+ svm_srso_vm_destroy();
}
static int svm_vm_init(struct kvm *kvm)
@@ -5061,6 +5119,7 @@ static int svm_vm_init(struct kvm *kvm)
return ret;
}
+ svm_srso_vm_init();
return 0;
}
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index d4490eaed55d..f16b068c4228 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -335,6 +335,8 @@ struct svm_cpu_data {
u32 next_asid;
u32 min_asid;
+ bool bp_spec_reduce_set;
+
struct vmcb *save_area;
unsigned long save_area_pa;
base-commit: 45eb29140e68ffe8e93a5471006858a018480a45
--
2.49.0.967.g6a0df3ecc3-goog
^ permalink raw reply related [flat|nested] 9+ messages in thread* Re: [PATCH v2] KVM: SVM: Set/clear SRSO's BP_SPEC_REDUCE on 0 <=> 1 VM count transitions 2025-05-05 18:03 [PATCH v2] KVM: SVM: Set/clear SRSO's BP_SPEC_REDUCE on 0 <=> 1 VM count transitions Sean Christopherson @ 2025-05-06 9:48 ` Yosry Ahmed 2025-05-06 14:16 ` Sean Christopherson 2025-05-06 14:22 ` Borislav Petkov 2025-05-08 23:04 ` Sean Christopherson 2 siblings, 1 reply; 9+ messages in thread From: Yosry Ahmed @ 2025-05-06 9:48 UTC (permalink / raw) To: Sean Christopherson Cc: Paolo Bonzini, kvm, linux-kernel, Michael Larabel, Borislav Petkov On Mon, May 05, 2025 at 11:03:00AM -0700, Sean Christopherson wrote: > Set the magic BP_SPEC_REDUCE bit to mitigate SRSO when running VMs if and > only if KVM has at least one active VM. Leaving the bit set at all times > unfortunately degrades performance by a wee bit more than expected. > > Use a dedicated spinlock and counter instead of hooking virtualization > enablement, as changing the behavior of kvm.enable_virt_at_load based on > SRSO_BP_SPEC_REDUCE is painful, and has its own drawbacks, e.g. could > result in performance issues for flows that are sensitive to VM creation > latency. > > Defer setting BP_SPEC_REDUCE until VMRUN is imminent to avoid impacting > performance on CPUs that aren't running VMs, e.g. if a setup is using > housekeeping CPUs. Setting BP_SPEC_REDUCE in task context, i.e. without > blasting IPIs to all CPUs, also helps avoid serializing 1<=>N transitions > without incurring a gross amount of complexity (see the Link for details > on how ugly coordinating via IPIs gets). 
> > Link: https://lore.kernel.org/all/aBOnzNCngyS_pQIW@google.com > Fixes: 8442df2b49ed ("x86/bugs: KVM: Add support for SRSO_MSR_FIX") > Reported-by: Michael Larabel <Michael@michaellarabel.com> > Closes: https://www.phoronix.com/review/linux-615-amd-regression > Cc: Borislav Petkov <bp@alien8.de> > Signed-off-by: Sean Christopherson <seanjc@google.com> > --- > > v2: Defer setting BP_SPEC_REDUCE until VMRUN is imminent, which in turn > allows for eliding the lock on 0<=>1 transitions as there is no race > with CPUs doing VMRUN before receiving the IPI to set the bit, and > having multiple tasks take the lock during svm_srso_vm_init() is a-ok. > > v1: https://lore.kernel.org/all/20250502223456.887618-1-seanjc@google.com > > arch/x86/kvm/svm/svm.c | 71 ++++++++++++++++++++++++++++++++++++++---- > arch/x86/kvm/svm/svm.h | 2 ++ > 2 files changed, 67 insertions(+), 6 deletions(-) > > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c > index cc1c721ba067..15f7a0703c16 100644 > --- a/arch/x86/kvm/svm/svm.c > +++ b/arch/x86/kvm/svm/svm.c > @@ -607,9 +607,6 @@ static void svm_disable_virtualization_cpu(void) > kvm_cpu_svm_disable(); > > amd_pmu_disable_virt(); > - > - if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) > - msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); > } > > static int svm_enable_virtualization_cpu(void) > @@ -687,9 +684,6 @@ static int svm_enable_virtualization_cpu(void) > rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi); > } > > - if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) > - msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); > - > return 0; > } > > @@ -1518,6 +1512,63 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu) > __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); > } > > +#ifdef CONFIG_CPU_MITIGATIONS > +static DEFINE_SPINLOCK(srso_lock); > +static atomic_t srso_nr_vms; > + > +static void svm_srso_clear_bp_spec_reduce(void *ign) > +{ > + struct 
svm_cpu_data *sd = this_cpu_ptr(&svm_data); > + > + if (!sd->bp_spec_reduce_set) > + return; > + > + msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); > + sd->bp_spec_reduce_set = false; > +} > + > +static void svm_srso_vm_destroy(void) > +{ > + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) > + return; > + > + if (atomic_dec_return(&srso_nr_vms)) > + return; > + > + guard(spinlock)(&srso_lock); > + > + /* > + * Verify a new VM didn't come along, acquire the lock, and increment > + * the count before this task acquired the lock. > + */ > + if (atomic_read(&srso_nr_vms)) > + return; > + > + on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1); Just a passing-by comment. I get worried about sending IPIs while holding a spinlock because if someone ever tries to hold that spinlock with IRQs disabled, it may cause a deadlock. This is not the case for this lock, but it's not obvious (at least to me) that holding it in a different code path that doesn't send IPIs with IRQs disabled could cause a problem. You could add a comment, convert it to a mutex to make this scenario impossible, or dismiss my comment as being too paranoid/ridiculous :) ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v2] KVM: SVM: Set/clear SRSO's BP_SPEC_REDUCE on 0 <=> 1 VM count transitions 2025-05-06 9:48 ` Yosry Ahmed @ 2025-05-06 14:16 ` Sean Christopherson 2025-05-06 14:29 ` Yosry Ahmed 0 siblings, 1 reply; 9+ messages in thread From: Sean Christopherson @ 2025-05-06 14:16 UTC (permalink / raw) To: Yosry Ahmed Cc: Paolo Bonzini, kvm, linux-kernel, Michael Larabel, Borislav Petkov On Tue, May 06, 2025, Yosry Ahmed wrote: > On Mon, May 05, 2025 at 11:03:00AM -0700, Sean Christopherson wrote: > > +static void svm_srso_vm_destroy(void) > > +{ > > + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) > > + return; > > + > > + if (atomic_dec_return(&srso_nr_vms)) > > + return; > > + > > + guard(spinlock)(&srso_lock); > > + > > + /* > > + * Verify a new VM didn't come along, acquire the lock, and increment > > + * the count before this task acquired the lock. > > + */ > > + if (atomic_read(&srso_nr_vms)) > > + return; > > + > > + on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1); > > Just a passing-by comment. I get worried about sending IPIs while > holding a spinlock because if someone ever tries to hold that spinlock > with IRQs disabled, it may cause a deadlock. > > This is not the case for this lock, but it's not obvious (at least to > me) that holding it in a different code path that doesn't send IPIs with > IRQs disabled could cause a problem. > > You could add a comment, convert it to a mutex to make this scenario > impossible, Using a mutex doesn't make deadlock impossible, it's still perfectly legal to disable IRQs while holding a mutex. Similarly, I don't want to add a comment, because there is absolutely nothing special/unique about this situation/lock. E.g. KVM has tens of calls to smp_call_function_many_cond() while holding a spinlock equivalent, in the form of kvm_make_all_cpus_request() while holding mmu_lock. 
smp_call_function_many_cond() already asserts that IRQs are enabled, so I have zero concerns about this flow breaking in the future. > or dismiss my comment as being too paranoid/ridiculous :) I wouldn't say your thought process is too paranoid; when writing the code, I had to pause and think to remember whether or not using on_each_cpu() while holding a spinlock is allowed. But I do think the conclusion is wrong :-) ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v2] KVM: SVM: Set/clear SRSO's BP_SPEC_REDUCE on 0 <=> 1 VM count transitions 2025-05-06 14:16 ` Sean Christopherson @ 2025-05-06 14:29 ` Yosry Ahmed 2025-05-06 15:57 ` Sean Christopherson 0 siblings, 1 reply; 9+ messages in thread From: Yosry Ahmed @ 2025-05-06 14:29 UTC (permalink / raw) To: Sean Christopherson Cc: Paolo Bonzini, kvm, linux-kernel, Michael Larabel, Borislav Petkov On Tue, May 06, 2025 at 07:16:06AM -0700, Sean Christopherson wrote: > On Tue, May 06, 2025, Yosry Ahmed wrote: > > On Mon, May 05, 2025 at 11:03:00AM -0700, Sean Christopherson wrote: > > > +static void svm_srso_vm_destroy(void) > > > +{ > > > + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) > > > + return; > > > + > > > + if (atomic_dec_return(&srso_nr_vms)) > > > + return; > > > + > > > + guard(spinlock)(&srso_lock); > > > + > > > + /* > > > + * Verify a new VM didn't come along, acquire the lock, and increment > > > + * the count before this task acquired the lock. > > > + */ > > > + if (atomic_read(&srso_nr_vms)) > > > + return; > > > + > > > + on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1); > > > > Just a passing-by comment. I get worried about sending IPIs while > > holding a spinlock because if someone ever tries to hold that spinlock > > with IRQs disabled, it may cause a deadlock. > > > > This is not the case for this lock, but it's not obvious (at least to > > me) that holding it in a different code path that doesn't send IPIs with > > IRQs disabled could cause a problem. > > > > You could add a comment, convert it to a mutex to make this scenario > > impossible, > > Using a mutex doesn't make deadlock impossible, it's still perfectly legal to > disable IRQs while holding a mutex. Right, but it's illegal to hold a mutex while disabling IRQs. In this case, if the other CPU is already holding the lock then there's no risk of deadlock, right? 
> > Similarly, I don't want to add a comment, because there is absolutely nothing > special/unique about this situation/lock. E.g. KVM has tens of calls to > smp_call_function_many_cond() while holding a spinlock equivalent, in the form > of kvm_make_all_cpus_request() while holding mmu_lock. Agreed that it's not a unique situation at all. Ideally we'd have some debugging (lockdep?) magic that identifies that an IPI is being sent while a lock is held, and that this specific lock is never spinned on with IRQs disabled. > > smp_call_function_many_cond() already asserts that IRQs are disabled, so I have > zero concerns about this flow breaking in the future. That doesn't really help tho, the problem is if another CPU spins on the lock with IRQs disabled, regardless of whether or not it. Basically if CPU 1 acquires the lock and sends an IPI while CPU 2 disables IRQs and spins on the lock. > > > or dismiss my comment as being too paranoid/ridiculous :) > > I wouldn't say your thought process is too paranoid; when writing the code, I had > to pause and think to remember whether or not using on_each_cpu() while holding a > spinlock is allowed. But I do think the conclusion is wrong :-) That's fair. I think protection against this should be done more generically as I mentioned earlier, but it felt like it would be easy-ish to side-step it in this case. ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v2] KVM: SVM: Set/clear SRSO's BP_SPEC_REDUCE on 0 <=> 1 VM count transitions 2025-05-06 14:29 ` Yosry Ahmed @ 2025-05-06 15:57 ` Sean Christopherson 2025-05-07 7:05 ` Yosry Ahmed 0 siblings, 1 reply; 9+ messages in thread From: Sean Christopherson @ 2025-05-06 15:57 UTC (permalink / raw) To: Yosry Ahmed Cc: Paolo Bonzini, kvm, linux-kernel, Michael Larabel, Borislav Petkov On Tue, May 06, 2025, Yosry Ahmed wrote: > On Tue, May 06, 2025 at 07:16:06AM -0700, Sean Christopherson wrote: > > On Tue, May 06, 2025, Yosry Ahmed wrote: > > > On Mon, May 05, 2025 at 11:03:00AM -0700, Sean Christopherson wrote: > > > > +static void svm_srso_vm_destroy(void) > > > > +{ > > > > + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) > > > > + return; > > > > + > > > > + if (atomic_dec_return(&srso_nr_vms)) > > > > + return; > > > > + > > > > + guard(spinlock)(&srso_lock); > > > > + > > > > + /* > > > > + * Verify a new VM didn't come along, acquire the lock, and increment > > > > + * the count before this task acquired the lock. > > > > + */ > > > > + if (atomic_read(&srso_nr_vms)) > > > > + return; > > > > + > > > > + on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1); > > > > > > Just a passing-by comment. I get worried about sending IPIs while > > > holding a spinlock because if someone ever tries to hold that spinlock > > > with IRQs disabled, it may cause a deadlock. > > > > > > This is not the case for this lock, but it's not obvious (at least to > > > me) that holding it in a different code path that doesn't send IPIs with > > > IRQs disabled could cause a problem. > > > > > > You could add a comment, convert it to a mutex to make this scenario > > > impossible, > > > > Using a mutex doesn't make deadlock impossible, it's still perfectly legal to > > disable IRQs while holding a mutex. > > Right, but it's illegal to hold a mutex while disabling IRQs. Nit on the wording: it's illegal to take a mutex while IRQs are disabled. 
Disabling IRQs while already holding a mutex is fine. And it's also illegal to take a spinlock while IRQs are disabled, because spinlocks become sleepable mutexes with PREEMPT_RT=y. While PREEMPT_RT=y isn't super common, people do run KVM with PREEMPT_RT=y, and I'm guessing bots/CI would trip any such violation quite quickly. E.g. with IRQs disabled around the guard(spinlock)(&srso_lock): BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 in_atomic(): 0, irqs_disabled(): 1, non_block: 0, pid: 2799, name: qemu preempt_count: 0, expected: 0 RCU nest depth: 0, expected: 0 1 lock held by qemu/2799: #0: ffffffff8263f898 (srso_lock){....}-{3:3}, at: svm_vm_destroy+0x47/0xa0 irq event stamp: 9090 hardirqs last enabled at (9089): [<ffffffff81414087>] vprintk_store+0x467/0x4d0 hardirqs last disabled at (9090): [<ffffffff812fd1ce>] svm_vm_destroy+0x5e/0xa0 softirqs last enabled at (0): [<ffffffff8137585c>] copy_process+0xa1c/0x29f0 softirqs last disabled at (0): [<0000000000000000>] 0x0 Call Trace: <TASK> dump_stack_lvl+0x57/0x80 __might_resched.cold+0xcc/0xde rt_spin_lock+0x5b/0x170 svm_vm_destroy+0x47/0xa0 kvm_destroy_vm+0x180/0x310 kvm_vm_release+0x1d/0x30 __fput+0x10d/0x2f0 task_work_run+0x58/0x90 do_exit+0x325/0xa80 do_group_exit+0x32/0xa0 get_signal+0xb5b/0xbb0 arch_do_signal_or_restart+0x29/0x230 syscall_exit_to_user_mode+0xea/0x180 do_syscall_64+0x7a/0x220 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7fb50ae7fc4e </TASK> > In this case, if the other CPU is already holding the lock then there's no > risk of deadlock, right?
KVM has tens of calls to > > smp_call_function_many_cond() while holding a spinlock equivalent, in the form > > of kvm_make_all_cpus_request() while holding mmu_lock. > > Agreed that it's not a unique situation at all. Ideally we'd have some > debugging (lockdep?) magic that identifies that an IPI is being sent > while a lock is held, and that this specific lock is never spinned on > with IRQs disabled. Sleepable spinlocks aside, the lockdep_assert_irqs_enabled() in smp_call_function_many_cond() already provides sufficient of coverage for that case. And if code is using some other form of IPI communication *and* taking raw spinlocks, then I think it goes without saying that developers would need to be very, very careful. > > smp_call_function_many_cond() already asserts that IRQs are disabled, so I have > > zero concerns about this flow breaking in the future. > > That doesn't really help tho, the problem is if another CPU spins on the > lock with IRQs disabled, regardless of whether or not it. Basically if > CPU 1 acquires the lock and sends an IPI while CPU 2 disables IRQs and > spins on the lock. Given that svm_srso_vm_destroy() is guaranteed to call on_each_cpu() with the lock held at some point, I'm completely comfortable relying on its lockdep assertion. > > > or dismiss my comment as being too paranoid/ridiculous :) > > > > I wouldn't say your thought process is too paranoid; when writing the code, I had > > to pause and think to remember whether or not using on_each_cpu() while holding a > > spinlock is allowed. But I do think the conclusion is wrong :-) > > That's fair. I think protection against this should be done more generically > as I mentioned earlier, but it felt like it would be easy-ish to side-step it > in this case. Eh, modifying this code in such a way that it could deadlock without lockdep noticing would likely send up a comincal number of red flags during code review. ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v2] KVM: SVM: Set/clear SRSO's BP_SPEC_REDUCE on 0 <=> 1 VM count transitions 2025-05-06 15:57 ` Sean Christopherson @ 2025-05-07 7:05 ` Yosry Ahmed 2025-05-07 13:19 ` Sean Christopherson 0 siblings, 1 reply; 9+ messages in thread From: Yosry Ahmed @ 2025-05-07 7:05 UTC (permalink / raw) To: Sean Christopherson Cc: Paolo Bonzini, kvm, linux-kernel, Michael Larabel, Borislav Petkov On Tue, May 06, 2025 at 08:57:36AM -0700, Sean Christopherson wrote: > On Tue, May 06, 2025, Yosry Ahmed wrote: > > On Tue, May 06, 2025 at 07:16:06AM -0700, Sean Christopherson wrote: > > > On Tue, May 06, 2025, Yosry Ahmed wrote: > > > > On Mon, May 05, 2025 at 11:03:00AM -0700, Sean Christopherson wrote: > > > > > +static void svm_srso_vm_destroy(void) > > > > > +{ > > > > > + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) > > > > > + return; > > > > > + > > > > > + if (atomic_dec_return(&srso_nr_vms)) > > > > > + return; > > > > > + > > > > > + guard(spinlock)(&srso_lock); > > > > > + > > > > > + /* > > > > > + * Verify a new VM didn't come along, acquire the lock, and increment > > > > > + * the count before this task acquired the lock. > > > > > + */ > > > > > + if (atomic_read(&srso_nr_vms)) > > > > > + return; > > > > > + > > > > > + on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1); > > > > > > > > Just a passing-by comment. I get worried about sending IPIs while > > > > holding a spinlock because if someone ever tries to hold that spinlock > > > > with IRQs disabled, it may cause a deadlock. > > > > > > > > This is not the case for this lock, but it's not obvious (at least to > > > > me) that holding it in a different code path that doesn't send IPIs with > > > > IRQs disabled could cause a problem. > > > > > > > > You could add a comment, convert it to a mutex to make this scenario > > > > impossible, > > > > > > Using a mutex doesn't make deadlock impossible, it's still perfectly legal to > > > disable IRQs while holding a mutex. 
> > > > Right, but it's illegal to hold a mutex while disabling IRQs. > > Nit on the wording: it's illegal to take a mutex while IRQs are disabled. Disabling > IRQs while already holding a mutex is fine. Yes :) > > And it's also illegal to take a spinlock while IRQs are disabled, becauase spinlocks > become sleepable mutexes with PREEMPT_RT=y. While PREEMPT_RT=y isn't super common, > people do run KVM with PREEMPT_RT=y, and I'm guessing bots/CI would trip any such > violation quite quickly. But I think spin_lock_irq() and friends aren't a violation with PREEMPT_RT=y, so these wouldn't trip bots/CI AFAICT. > > E.g. with IRQs disabled around the guard(spinlock)(&srso_lock): > > BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 > in_atomic(): 0, irqs_disabled(): 1, non_block: 0, pid: 2799, name: qemu > preempt_count: 0, expected: 0 > RCU nest depth: 0, expected: 0 > 1 lock held by qemu/2799: > #0: ffffffff8263f898 (srso_lock){....}-{3:3}, at: svm_vm_destroy+0x47/0xa0 > irq event stamp: 9090 > hardirqs last enabled at (9089): [<ffffffff81414087>] vprintk_store+0x467/0x4d0 > hardirqs last disabled at (9090): [<ffffffff812fd1ce>] svm_vm_destroy+0x5e/0xa0 > softirqs last enabled at (0): [<ffffffff8137585c>] copy_process+0xa1c/0x29f0 > softirqs last disabled at (0): [<0000000000000000>] 0x0 > Call Trace: > <TASK> > dump_stack_lvl+0x57/0x80 > __might_resched.cold+0xcc/0xde > rt_spin_lock+0x5b/0x170 > svm_vm_destroy+0x47/0xa0 > kvm_destroy_vm+0x180/0x310 > kvm_vm_release+0x1d/0x30 > __fput+0x10d/0x2f0 > task_work_run+0x58/0x90 > do_exit+0x325/0xa80 > do_group_exit+0x32/0xa0 > get_signal+0xb5b/0xbb0 > arch_do_signal_or_restart+0x29/0x230 > syscall_exit_to_user_mode+0xea/0x180 > do_syscall_64+0x7a/0x220 > entry_SYSCALL_64_after_hwframe+0x76/0x7e > RIP: 0033:0x7fb50ae7fc4e > </TASK> > > > In this case, if the other CPU is already holding the lock then there's no > > risk of deadlock, right? 
> > Not on srso_lock, but there's still deadlock potential on the locks used to protect > the call_function_data structure. > > > > Similarly, I don't want to add a comment, because there is absolutely nothing > > > special/unique about this situation/lock. E.g. KVM has tens of calls to > > > smp_call_function_many_cond() while holding a spinlock equivalent, in the form > > > of kvm_make_all_cpus_request() while holding mmu_lock. > > > > Agreed that it's not a unique situation at all. Ideally we'd have some > > debugging (lockdep?) magic that identifies that an IPI is being sent > > while a lock is held, and that this specific lock is never spinned on > > with IRQs disabled. > > Sleepable spinlocks aside, the lockdep_assert_irqs_enabled() in > smp_call_function_many_cond() already provides sufficient of coverage for that > case. And if code is using some other form of IPI communication *and* taking raw > spinlocks, then I think it goes without saying that developers would need to be > very, very careful. I think we are not talking about the same thing, or I am misunderstanding you. The lockdep_assert_irqs_enabled() assertion in smp_call_function_many_cond() does not protect against this case AFAICT. Basically imagine that a new code path is added that does: spin_lock_irq(&srso_lock); /* Some trivial logic, no IPI sending */ spin_unlock_irq(&srso_lock); I believe spin_lock_irq() will disable IRQs (at least on some setups) then spin on the lock. Now imagine svm_srso_vm_destroy() is already holding the lock and sends the IPI from CPU 1, while CPU 2 is executing the above code with IRQs already disabled and spinning on the lock. This is the deadlock scenario I am talking about. The lockdep assertion in smp_call_function_many_cond() doesn't help because IRQs are enabled on CPU 1, the problem is that they are disabled on CPU 2. 
Lockdep can detect this by keeping track of the fact that some code paths acquire the lock with IRQs off while some code paths acquire the lock and send IPIs, I think. > > > > smp_call_function_many_cond() already asserts that IRQs are disabled, so I have > > > zero concerns about this flow breaking in the future. > > > > That doesn't really help tho, the problem is if another CPU spins on the > > lock with IRQs disabled, regardless of whether or not it. Basically if > > CPU 1 acquires the lock and sends an IPI while CPU 2 disables IRQs and > > spins on the lock. > > Given that svm_srso_vm_destroy() is guaranteed to call on_each_cpu() with the > lock held at some point, I'm completely comfortable relying on its lockdep > assertion. > > > > > or dismiss my comment as being too paranoid/ridiculous :) > > > > > > I wouldn't say your thought process is too paranoid; when writing the code, I had > > > to pause and think to remember whether or not using on_each_cpu() while holding a > > > spinlock is allowed. But I do think the conclusion is wrong :-) > > > > That's fair. I think protection against this should be done more generically > > as I mentioned earlier, but it felt like it would be easy-ish to side-step it > > in this case. > > Eh, modifying this code in such a way that it could deadlock without lockdep > noticing would likely send up a comincal number of red flags during code review. It just takes another code path using spin_lock_irq() or friends to deadlock AFAICT. Anyway, this has side tracked and I am probably taking more of your time that I originally wanted, I was just making an observation more-or-less :) ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v2] KVM: SVM: Set/clear SRSO's BP_SPEC_REDUCE on 0 <=> 1 VM count transitions 2025-05-07 7:05 ` Yosry Ahmed @ 2025-05-07 13:19 ` Sean Christopherson 0 siblings, 0 replies; 9+ messages in thread From: Sean Christopherson @ 2025-05-07 13:19 UTC (permalink / raw) To: Yosry Ahmed Cc: Paolo Bonzini, kvm, linux-kernel, Michael Larabel, Borislav Petkov On Wed, May 07, 2025, Yosry Ahmed wrote: > On Tue, May 06, 2025 at 08:57:36AM -0700, Sean Christopherson wrote: > > Sleepable spinlocks aside, the lockdep_assert_irqs_enabled() in > > smp_call_function_many_cond() already provides sufficient of coverage for that > > case. And if code is using some other form of IPI communication *and* taking raw > > spinlocks, then I think it goes without saying that developers would need to be > > very, very careful. > > I think we are not talking about the same thing, or I am > misunderstanding you. The lockdep_assert_irqs_enabled() assertion in > smp_call_function_many_cond() does not protect against this case AFAICT. > > Basically imagine that a new code path is added that does: > > spin_lock_irq(&srso_lock); > /* Some trivial logic, no IPI sending */ > spin_unlock_irq(&srso_lock); > > I believe spin_lock_irq() will disable IRQs (at least on some setups) > then spin on the lock. Yes, because the most common use case for spin_lock_irq() is to prevent deadlock due to the lock being taken in IRQ context. > Now imagine svm_srso_vm_destroy() is already holding the lock and sends > the IPI from CPU 1, while CPU 2 is executing the above code with IRQs > already disabled and spinning on the lock. > > This is the deadlock scenario I am talking about. The lockdep assertion > in smp_call_function_many_cond() doesn't help because IRQs are enabled > on CPU 1, the problem is that they are disabled on CPU 2. > > Lockdep can detect this by keeping track of the fact that some code > paths acquire the lock with IRQs off while some code paths acquire the > lock and send IPIs, I think. 
I understand the scenario, I just don't see any meaningful risk in this case, which in turn means I don't see any reason to use an inferior lock type (for this particular case) to protect the count. spin_lock_irq() isn't a tool that's used willy-nilly, and the usage of srso_lock is extremely limited. If we manage to merge code that does spin_lock_irq(&srso_lock), you have my full permission to mock my ineptitude :-) ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v2] KVM: SVM: Set/clear SRSO's BP_SPEC_REDUCE on 0 <=> 1 VM count transitions 2025-05-05 18:03 [PATCH v2] KVM: SVM: Set/clear SRSO's BP_SPEC_REDUCE on 0 <=> 1 VM count transitions Sean Christopherson 2025-05-06 9:48 ` Yosry Ahmed @ 2025-05-06 14:22 ` Borislav Petkov 2025-05-08 23:04 ` Sean Christopherson 2 siblings, 0 replies; 9+ messages in thread From: Borislav Petkov @ 2025-05-06 14:22 UTC (permalink / raw) To: Sean Christopherson; +Cc: Paolo Bonzini, kvm, linux-kernel, Michael Larabel On Mon, May 05, 2025 at 11:03:00AM -0700, Sean Christopherson wrote: > Set the magic BP_SPEC_REDUCE bit to mitigate SRSO when running VMs if and > only if KVM has at least one active VM. Leaving the bit set at all times > unfortunately degrades performance by a wee bit more than expected. > > Use a dedicated spinlock and counter instead of hooking virtualization > enablement, as changing the behavior of kvm.enable_virt_at_load based on > SRSO_BP_SPEC_REDUCE is painful, and has its own drawbacks, e.g. could > result in performance issues for flows that are sensitive to VM creation > latency. > > Defer setting BP_SPEC_REDUCE until VMRUN is imminent to avoid impacting > performance on CPUs that aren't running VMs, e.g. if a setup is using > housekeeping CPUs. Setting BP_SPEC_REDUCE in task context, i.e. without > blasting IPIs to all CPUs, also helps avoid serializing 1<=>N transitions > without incurring a gross amount of complexity (see the Link for details > on how ugly coordinating via IPIs gets). > > Link: https://lore.kernel.org/all/aBOnzNCngyS_pQIW@google.com > Fixes: 8442df2b49ed ("x86/bugs: KVM: Add support for SRSO_MSR_FIX") I guess Cc: <stable@kernel.org> as the above is in 6.14. > Reported-by: Michael Larabel <Michael@michaellarabel.com> > Closes: https://www.phoronix.com/review/linux-615-amd-regression > Cc: Borislav Petkov <bp@alien8.de> > Signed-off-by: Sean Christopherson <seanjc@google.com> LGTM, seems to work too on my machine. 
Tested-by: Borislav Petkov (AMD) <bp@alien8.de> Thx for sticking with this and improving it! -- Regards/Gruss, Boris. https://people.kernel.org/tglx/notes-about-netiquette ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v2] KVM: SVM: Set/clear SRSO's BP_SPEC_REDUCE on 0 <=> 1 VM count transitions 2025-05-05 18:03 [PATCH v2] KVM: SVM: Set/clear SRSO's BP_SPEC_REDUCE on 0 <=> 1 VM count transitions Sean Christopherson 2025-05-06 9:48 ` Yosry Ahmed 2025-05-06 14:22 ` Borislav Petkov @ 2025-05-08 23:04 ` Sean Christopherson 2 siblings, 0 replies; 9+ messages in thread From: Sean Christopherson @ 2025-05-08 23:04 UTC (permalink / raw) To: Sean Christopherson, Paolo Bonzini Cc: kvm, linux-kernel, Michael Larabel, Borislav Petkov On Mon, 05 May 2025 11:03:00 -0700, Sean Christopherson wrote: > Set the magic BP_SPEC_REDUCE bit to mitigate SRSO when running VMs if and > only if KVM has at least one active VM. Leaving the bit set at all times > unfortunately degrades performance by a wee bit more than expected. > > Use a dedicated spinlock and counter instead of hooking virtualization > enablement, as changing the behavior of kvm.enable_virt_at_load based on > SRSO_BP_SPEC_REDUCE is painful, and has its own drawbacks, e.g. could > result in performance issues for flows that are sensitive to VM creation > latency. > > [...] Applied to kvm-x86 fixes. Assuming -next doesn't explode overnight, I'll get a pull request sent to Paolo tomorrow. [1/1] KVM: SVM: Set/clear SRSO's BP_SPEC_REDUCE on 0 <=> 1 VM count transitions https://github.com/kvm-x86/linux/commit/e3417ab75ab2 -- https://github.com/kvm-x86/linux/tree/next ^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2025-05-08 23:04 UTC | newest] Thread overview: 9+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2025-05-05 18:03 [PATCH v2] KVM: SVM: Set/clear SRSO's BP_SPEC_REDUCE on 0 <=> 1 VM count transitions Sean Christopherson 2025-05-06 9:48 ` Yosry Ahmed 2025-05-06 14:16 ` Sean Christopherson 2025-05-06 14:29 ` Yosry Ahmed 2025-05-06 15:57 ` Sean Christopherson 2025-05-07 7:05 ` Yosry Ahmed 2025-05-07 13:19 ` Sean Christopherson 2025-05-06 14:22 ` Borislav Petkov 2025-05-08 23:04 ` Sean Christopherson
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox