* [PATCH v3 41/52] KVM: PPC: Book3S HV P9: Don't restore PSSCR if not needed
From: Nicholas Piggin @ 2021-10-04 16:00 UTC (permalink / raw)
To: kvm-ppc, linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20211004160049.1338837-1-npiggin@gmail.com>
This also moves the PSSCR update in nested entry to avoid a SPR
scoreboard stall.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/kvm/book3s_hv.c | 7 +++++--
arch/powerpc/kvm/book3s_hv_p9_entry.c | 26 +++++++++++++++++++-------
2 files changed, 24 insertions(+), 9 deletions(-)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index f4445cc5a29a..f10cb4167549 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3876,7 +3876,9 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, uns
if (unlikely(load_vcpu_state(vcpu, &host_os_sprs)))
msr = mfmsr(); /* TM restore can update msr */
- mtspr(SPRN_PSSCR_PR, vcpu->arch.psscr);
+ if (vcpu->arch.psscr != host_psscr)
+ mtspr(SPRN_PSSCR_PR, vcpu->arch.psscr);
+
kvmhv_save_hv_regs(vcpu, &hvregs);
hvregs.lpcr = lpcr;
vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
@@ -3917,7 +3919,6 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, uns
vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
vcpu->arch.psscr = mfspr(SPRN_PSSCR_PR);
- mtspr(SPRN_PSSCR_PR, host_psscr);
store_vcpu_state(vcpu);
@@ -3930,6 +3931,8 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, uns
timer_rearm_host_dec(*tb);
restore_p9_host_os_sprs(vcpu, &host_os_sprs);
+ if (vcpu->arch.psscr != host_psscr)
+ mtspr(SPRN_PSSCR_PR, host_psscr);
return trap;
}
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index 0f341011816c..eae9d806d704 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -649,6 +649,7 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
unsigned long host_dawr0;
unsigned long host_dawrx0;
unsigned long host_psscr;
+ unsigned long host_hpsscr;
unsigned long host_pidr;
unsigned long host_dawr1;
unsigned long host_dawrx1;
@@ -666,7 +667,9 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
host_hfscr = mfspr(SPRN_HFSCR);
host_ciabr = mfspr(SPRN_CIABR);
- host_psscr = mfspr(SPRN_PSSCR);
+ host_psscr = mfspr(SPRN_PSSCR_PR);
+ if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
+ host_hpsscr = mfspr(SPRN_PSSCR);
host_pidr = mfspr(SPRN_PID);
if (dawr_enabled()) {
@@ -750,8 +753,14 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
if (vcpu->arch.ciabr != host_ciabr)
mtspr(SPRN_CIABR, vcpu->arch.ciabr);
- mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC |
- (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
+
+ if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) {
+ mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC |
+ (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
+ } else {
+ if (vcpu->arch.psscr != host_psscr)
+ mtspr(SPRN_PSSCR_PR, vcpu->arch.psscr);
+ }
mtspr(SPRN_HFSCR, vcpu->arch.hfscr);
@@ -957,7 +966,7 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
vcpu->arch.ic = mfspr(SPRN_IC);
vcpu->arch.pid = mfspr(SPRN_PID);
- vcpu->arch.psscr = mfspr(SPRN_PSSCR) & PSSCR_GUEST_VIS;
+ vcpu->arch.psscr = mfspr(SPRN_PSSCR_PR);
vcpu->arch.shregs.sprg0 = mfspr(SPRN_SPRG0);
vcpu->arch.shregs.sprg1 = mfspr(SPRN_SPRG1);
@@ -1003,9 +1012,12 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr);
mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr);
- /* Preserve PSSCR[FAKE_SUSPEND] until we've called kvmppc_save_tm_hv */
- mtspr(SPRN_PSSCR, host_psscr |
- (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
+ if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) {
+ /* Preserve PSSCR[FAKE_SUSPEND] until we've called kvmppc_save_tm_hv */
+ mtspr(SPRN_PSSCR, host_hpsscr |
+ (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
+ }
+
mtspr(SPRN_HFSCR, host_hfscr);
if (vcpu->arch.ciabr != host_ciabr)
mtspr(SPRN_CIABR, host_ciabr);
--
2.23.0
^ permalink raw reply related
* [PATCH v3 42/52] KVM: PPC: Book3S HV P9: Avoid tlbsync sequence on radix guest exit
From: Nicholas Piggin @ 2021-10-04 16:00 UTC (permalink / raw)
To: kvm-ppc, linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20211004160049.1338837-1-npiggin@gmail.com>
Use the existing TLB flushing logic to IPI the previous CPU and run the
necessary barriers before running a guest vCPU on a new physical CPU,
to do the necessary radix GTSE barriers for handling the case of an
interrupted guest tlbie sequence.
This results in more IPIs than the TLB flush logic requires, but it's
a significant win for common case scheduling when the vCPU remains on
the same physical CPU.
This saves about 520 cycles (nearly 10%) on a guest entry+exit micro
benchmark on a POWER9.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/kvm/book3s_hv.c | 31 +++++++++++++++++++++++----
arch/powerpc/kvm/book3s_hv_p9_entry.c | 9 --------
2 files changed, 27 insertions(+), 13 deletions(-)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index f10cb4167549..d2a9c930f33e 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3025,6 +3025,25 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
smp_call_function_single(i, do_nothing, NULL, 1);
}
+static void do_migrate_away_vcpu(void *arg)
+{
+ struct kvm_vcpu *vcpu = arg;
+ struct kvm *kvm = vcpu->kvm;
+
+ /*
+ * If the guest has GTSE, it may execute tlbie, so do a eieio; tlbsync;
+ * ptesync sequence on the old CPU before migrating to a new one, in
+ * case we interrupted the guest between a tlbie ; eieio ;
+ * tlbsync; ptesync sequence.
+ *
+ * Otherwise, ptesync is sufficient.
+ */
+ if (kvm->arch.lpcr & LPCR_GTSE)
+ asm volatile("eieio; tlbsync; ptesync");
+ else
+ asm volatile("ptesync");
+}
+
static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
{
struct kvm_nested_guest *nested = vcpu->arch.nested;
@@ -3052,10 +3071,14 @@ static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
* so we use a single bit in .need_tlb_flush for all 4 threads.
*/
if (prev_cpu != pcpu) {
- if (prev_cpu >= 0 &&
- cpu_first_tlb_thread_sibling(prev_cpu) !=
- cpu_first_tlb_thread_sibling(pcpu))
- radix_flush_cpu(kvm, prev_cpu, vcpu);
+ if (prev_cpu >= 0) {
+ if (cpu_first_tlb_thread_sibling(prev_cpu) !=
+ cpu_first_tlb_thread_sibling(pcpu))
+ radix_flush_cpu(kvm, prev_cpu, vcpu);
+
+ smp_call_function_single(prev_cpu,
+ do_migrate_away_vcpu, vcpu, 1);
+ }
if (nested)
nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu;
else
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index eae9d806d704..a45ba584c734 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -1049,15 +1049,6 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
local_paca->kvm_hstate.in_guest = KVM_GUEST_MODE_NONE;
- if (kvm_is_radix(kvm)) {
- /*
- * Since this is radix, do a eieio; tlbsync; ptesync sequence
- * in case we interrupted the guest between a tlbie and a
- * ptesync.
- */
- asm volatile("eieio; tlbsync; ptesync");
- }
-
/*
* cp_abort is required if the processor supports local copy-paste
* to clear the copy buffer that was under control of the guest.
--
2.23.0
^ permalink raw reply related
* [PATCH v3 43/52] KVM: PPC: Book3S HV Nested: Avoid extra mftb() in nested entry
From: Nicholas Piggin @ 2021-10-04 16:00 UTC (permalink / raw)
To: kvm-ppc, linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20211004160049.1338837-1-npiggin@gmail.com>
mftb() is expensive and one can be avoided on nested guest dispatch.
If the time checking code distinguishes between the L0 timer and the
nested HV timer, then both can be tested in the same place with the
same mftb() value.
This also nicely illustrates the relationship between the L0 and nested
HV timers.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/include/asm/kvm_asm.h | 1 +
arch/powerpc/kvm/book3s_hv.c | 12 ++++++++++++
arch/powerpc/kvm/book3s_hv_nested.c | 5 -----
3 files changed, 13 insertions(+), 5 deletions(-)
diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index fbbf3cec92e9..d68d71987d5c 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -79,6 +79,7 @@
#define BOOK3S_INTERRUPT_FP_UNAVAIL 0x800
#define BOOK3S_INTERRUPT_DECREMENTER 0x900
#define BOOK3S_INTERRUPT_HV_DECREMENTER 0x980
+#define BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER 0x1980
#define BOOK3S_INTERRUPT_DOORBELL 0xa00
#define BOOK3S_INTERRUPT_SYSCALL 0xc00
#define BOOK3S_INTERRUPT_TRACE 0xd00
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index d2a9c930f33e..342b4f125d03 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1486,6 +1486,10 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
run->ready_for_interrupt_injection = 1;
switch (vcpu->arch.trap) {
/* We're good on these - the host merely wanted to get our attention */
+ case BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER:
+ WARN_ON_ONCE(1); /* Should never happen */
+ vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
+ fallthrough;
case BOOK3S_INTERRUPT_HV_DECREMENTER:
vcpu->stat.dec_exits++;
r = RESUME_GUEST;
@@ -1814,6 +1818,12 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
vcpu->stat.ext_intr_exits++;
r = RESUME_GUEST;
break;
+ /* These need to go to the nested HV */
+ case BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER:
+ vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
+ vcpu->stat.dec_exits++;
+ r = RESUME_HOST;
+ break;
/* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/
case BOOK3S_INTERRUPT_HMI:
case BOOK3S_INTERRUPT_PERFMON:
@@ -3975,6 +3985,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
return BOOK3S_INTERRUPT_HV_DECREMENTER;
if (next_timer < time_limit)
time_limit = next_timer;
+ else if (*tb >= time_limit) /* nested time limit */
+ return BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER;
vcpu->arch.ceded = 0;
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
index 7bed0b91245e..e57c08b968c0 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -375,11 +375,6 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
vcpu->arch.ret = RESUME_GUEST;
vcpu->arch.trap = 0;
do {
- if (mftb() >= hdec_exp) {
- vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
- r = RESUME_HOST;
- break;
- }
r = kvmhv_run_single_vcpu(vcpu, hdec_exp, lpcr);
} while (is_kvmppc_resume_guest(r));
--
2.23.0
^ permalink raw reply related
* [PATCH v3 44/52] KVM: PPC: Book3S HV P9: Improve mfmsr performance on entry
From: Nicholas Piggin @ 2021-10-04 16:00 UTC (permalink / raw)
To: kvm-ppc, linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20211004160049.1338837-1-npiggin@gmail.com>
Rearrange the MSR saving on entry so it does not follow the mtmsrd to
disable interrupts, avoiding a possible RAW scoreboard stall.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/include/asm/kvm_book3s_64.h | 2 +
arch/powerpc/kvm/book3s_hv.c | 18 ++-----
arch/powerpc/kvm/book3s_hv_p9_entry.c | 66 +++++++++++++++---------
3 files changed, 47 insertions(+), 39 deletions(-)
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 0a319ed9c2fd..96f0fda50a07 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -154,6 +154,8 @@ static inline bool kvmhv_vcpu_is_radix(struct kvm_vcpu *vcpu)
return radix;
}
+unsigned long kvmppc_msr_hard_disable_set_facilities(struct kvm_vcpu *vcpu, unsigned long msr);
+
int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr, u64 *tb);
#define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 342b4f125d03..0bbef4587f41 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3878,6 +3878,8 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, uns
s64 dec;
int trap;
+ msr = mfmsr();
+
save_p9_host_os_sprs(&host_os_sprs);
/*
@@ -3888,24 +3890,10 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, uns
*/
host_psscr = mfspr(SPRN_PSSCR_PR);
- hard_irq_disable();
+ kvmppc_msr_hard_disable_set_facilities(vcpu, msr);
if (lazy_irq_pending())
return 0;
- /* MSR bits may have been cleared by context switch */
- msr = 0;
- if (IS_ENABLED(CONFIG_PPC_FPU))
- msr |= MSR_FP;
- if (cpu_has_feature(CPU_FTR_ALTIVEC))
- msr |= MSR_VEC;
- if (cpu_has_feature(CPU_FTR_VSX))
- msr |= MSR_VSX;
- if ((cpu_has_feature(CPU_FTR_TM) ||
- cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) &&
- (vcpu->arch.hfscr & HFSCR_TM))
- msr |= MSR_TM;
- msr = msr_check_and_set(msr);
-
if (unlikely(load_vcpu_state(vcpu, &host_os_sprs)))
msr = mfmsr(); /* TM restore can update msr */
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index a45ba584c734..646f487ebf97 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -632,6 +632,44 @@ static void save_clear_guest_mmu(struct kvm *kvm, struct kvm_vcpu *vcpu)
}
}
+unsigned long kvmppc_msr_hard_disable_set_facilities(struct kvm_vcpu *vcpu, unsigned long msr)
+{
+ unsigned long msr_needed = 0;
+
+ msr &= ~MSR_EE;
+
+ /* MSR bits may have been cleared by context switch so must recheck */
+ if (IS_ENABLED(CONFIG_PPC_FPU))
+ msr_needed |= MSR_FP;
+ if (cpu_has_feature(CPU_FTR_ALTIVEC))
+ msr_needed |= MSR_VEC;
+ if (cpu_has_feature(CPU_FTR_VSX))
+ msr_needed |= MSR_VSX;
+ if ((cpu_has_feature(CPU_FTR_TM) ||
+ cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) &&
+ (vcpu->arch.hfscr & HFSCR_TM))
+ msr_needed |= MSR_TM;
+
+ /*
+ * This could be combined with MSR[RI] clearing, but that expands
+ * the unrecoverable window. It would be better to cover unrecoverable
+ * with KVM bad interrupt handling rather than use MSR[RI] at all.
+ *
+ * Much more difficult and less worthwhile to combine with IR/DR
+ * disable.
+ */
+ if ((msr & msr_needed) != msr_needed) {
+ msr |= msr_needed;
+ __mtmsrd(msr, 0);
+ } else {
+ __hard_irq_disable();
+ }
+ local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
+
+ return msr;
+}
+EXPORT_SYMBOL_GPL(kvmppc_msr_hard_disable_set_facilities);
+
int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr, u64 *tb)
{
struct p9_host_os_sprs host_os_sprs;
@@ -665,6 +703,9 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
vcpu->arch.ceded = 0;
+ /* Save MSR for restore, with EE clear. */
+ msr = mfmsr() & ~MSR_EE;
+
host_hfscr = mfspr(SPRN_HFSCR);
host_ciabr = mfspr(SPRN_CIABR);
host_psscr = mfspr(SPRN_PSSCR_PR);
@@ -686,35 +727,12 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
save_p9_host_os_sprs(&host_os_sprs);
- /*
- * This could be combined with MSR[RI] clearing, but that expands
- * the unrecoverable window. It would be better to cover unrecoverable
- * with KVM bad interrupt handling rather than use MSR[RI] at all.
- *
- * Much more difficult and less worthwhile to combine with IR/DR
- * disable.
- */
- hard_irq_disable();
+ msr = kvmppc_msr_hard_disable_set_facilities(vcpu, msr);
if (lazy_irq_pending()) {
trap = 0;
goto out;
}
- /* MSR bits may have been cleared by context switch */
- msr = 0;
- if (IS_ENABLED(CONFIG_PPC_FPU))
- msr |= MSR_FP;
- if (cpu_has_feature(CPU_FTR_ALTIVEC))
- msr |= MSR_VEC;
- if (cpu_has_feature(CPU_FTR_VSX))
- msr |= MSR_VSX;
- if ((cpu_has_feature(CPU_FTR_TM) ||
- cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) &&
- (vcpu->arch.hfscr & HFSCR_TM))
- msr |= MSR_TM;
- msr = msr_check_and_set(msr);
- /* Save MSR for restore. This is after hard disable, so EE is clear. */
-
if (unlikely(load_vcpu_state(vcpu, &host_os_sprs)))
msr = mfmsr(); /* MSR may have been updated */
--
2.23.0
^ permalink raw reply related
* [PATCH v3 45/52] KVM: PPC: Book3S HV P9: Optimise hash guest SLB saving
From: Nicholas Piggin @ 2021-10-04 16:00 UTC (permalink / raw)
To: kvm-ppc, linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20211004160049.1338837-1-npiggin@gmail.com>
slbmfee/slbmfev instructions are very expensive, moreso than a regular
mfspr instruction, so minimising them significantly improves hash guest
exit performance. The slbmfev is only required if slbmfee found a valid
SLB entry.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/kvm/book3s_hv_p9_entry.c | 22 ++++++++++++++++++----
1 file changed, 18 insertions(+), 4 deletions(-)
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index 646f487ebf97..99ce5805ea28 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -487,10 +487,22 @@ static void __accumulate_time(struct kvm_vcpu *vcpu, struct kvmhv_tb_accumulator
#define accumulate_time(vcpu, next) do {} while (0)
#endif
-static inline void mfslb(unsigned int idx, u64 *slbee, u64 *slbev)
+static inline u64 mfslbv(unsigned int idx)
{
- asm volatile("slbmfev %0,%1" : "=r" (*slbev) : "r" (idx));
- asm volatile("slbmfee %0,%1" : "=r" (*slbee) : "r" (idx));
+ u64 slbev;
+
+ asm volatile("slbmfev %0,%1" : "=r" (slbev) : "r" (idx));
+
+ return slbev;
+}
+
+static inline u64 mfslbe(unsigned int idx)
+{
+ u64 slbee;
+
+ asm volatile("slbmfee %0,%1" : "=r" (slbee) : "r" (idx));
+
+ return slbee;
}
static inline void mtslb(u64 slbee, u64 slbev)
@@ -620,8 +632,10 @@ static void save_clear_guest_mmu(struct kvm *kvm, struct kvm_vcpu *vcpu)
*/
for (i = 0; i < vcpu->arch.slb_nr; i++) {
u64 slbee, slbev;
- mfslb(i, &slbee, &slbev);
+
+ slbee = mfslbe(i);
if (slbee & SLB_ESID_V) {
+ slbev = mfslbv(i);
vcpu->arch.slb[nr].orige = slbee | i;
vcpu->arch.slb[nr].origv = slbev;
nr++;
--
2.23.0
^ permalink raw reply related
* [PATCH v3 46/52] KVM: PPC: Book3S HV P9: Avoid changing MSR[RI] in entry and exit
From: Nicholas Piggin @ 2021-10-04 16:00 UTC (permalink / raw)
To: kvm-ppc, linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20211004160049.1338837-1-npiggin@gmail.com>
kvm_hstate.in_guest provides the equivalent of MSR[RI]=0 protection,
and it covers the existing MSR[RI]=0 section in late entry and early
exit, so clearing and setting MSR[RI] in those cases does not
actually do anything useful.
Remove the RI manipulation and replace it with comments. Make the
in_guest memory accesses a bit closer to a proper critical section
pattern. This speeds up guest entry/exit performance.
This also removes the MSR[RI] warnings which aren't very interesting
and would cause crashes if they hit due to causing an interrupt in
non-recoverable code.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/kvm/book3s_hv_p9_entry.c | 50 ++++++++++++---------------
1 file changed, 23 insertions(+), 27 deletions(-)
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index 99ce5805ea28..5a71532a3adf 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -829,7 +829,15 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
* But TM could be split out if this would be a significant benefit.
*/
- local_paca->kvm_hstate.in_guest = KVM_GUEST_MODE_HV_P9;
+ /*
+ * MSR[RI] does not need to be cleared (and is not, for radix guests
+ * with no prefetch bug), because in_guest is set. If we take a SRESET
+ * or MCE with in_guest set but still in HV mode, then
+ * kvmppc_p9_bad_interrupt handles the interrupt, which effectively
+ * clears MSR[RI] and doesn't return.
+ */
+ WRITE_ONCE(local_paca->kvm_hstate.in_guest, KVM_GUEST_MODE_HV_P9);
+ barrier(); /* Open in_guest critical section */
/*
* Hash host, hash guest, or radix guest with prefetch bug, all have
@@ -841,14 +849,10 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
save_clear_host_mmu(kvm);
- if (kvm_is_radix(kvm)) {
+ if (kvm_is_radix(kvm))
switch_mmu_to_guest_radix(kvm, vcpu, lpcr);
- if (!cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
- __mtmsrd(0, 1); /* clear RI */
-
- } else {
+ else
switch_mmu_to_guest_hpt(kvm, vcpu, lpcr);
- }
/* TLBIEL uses LPID=LPIDR, so run this after setting guest LPID */
kvmppc_check_need_tlb_flush(kvm, vc->pcpu, nested);
@@ -903,19 +907,16 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
vcpu->arch.regs.gpr[3] = local_paca->kvm_hstate.scratch2;
/*
- * Only set RI after reading machine check regs (DAR, DSISR, SRR0/1)
- * and hstate scratch (which we need to move into exsave to make
- * re-entrant vs SRESET/MCE)
+ * After reading machine check regs (DAR, DSISR, SRR0/1) and hstate
+ * scratch (which we need to move into exsave to make re-entrant vs
+ * SRESET/MCE), register state is protected from reentrancy. However
+ * timebase, MMU, among other state is still set to guest, so don't
+ * enable MSR[RI] here. It gets enabled at the end, after in_guest
+ * is cleared.
+ *
+ * It is possible an NMI could come in here, which is why it is
+ * important to save the above state early so it can be debugged.
*/
- if (ri_set) {
- if (unlikely(!(mfmsr() & MSR_RI))) {
- __mtmsrd(MSR_RI, 1);
- WARN_ON_ONCE(1);
- }
- } else {
- WARN_ON_ONCE(mfmsr() & MSR_RI);
- __mtmsrd(MSR_RI, 1);
- }
vcpu->arch.regs.gpr[9] = exsave[EX_R9/sizeof(u64)];
vcpu->arch.regs.gpr[10] = exsave[EX_R10/sizeof(u64)];
@@ -973,13 +974,6 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
*/
mtspr(SPRN_HSRR0, vcpu->arch.regs.nip);
mtspr(SPRN_HSRR1, vcpu->arch.shregs.msr);
-
- /*
- * tm_return_to_guest re-loads SRR0/1, DAR,
- * DSISR after RI is cleared, in case they had
- * been clobbered by a MCE.
- */
- __mtmsrd(0, 1); /* clear RI */
goto tm_return_to_guest;
}
}
@@ -1079,7 +1073,9 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
restore_p9_host_os_sprs(vcpu, &host_os_sprs);
- local_paca->kvm_hstate.in_guest = KVM_GUEST_MODE_NONE;
+ barrier(); /* Close in_guest critical section */
+ WRITE_ONCE(local_paca->kvm_hstate.in_guest, KVM_GUEST_MODE_NONE);
+ /* Interrupts are recoverable at this point */
/*
* cp_abort is required if the processor supports local copy-paste
--
2.23.0
^ permalink raw reply related
* [PATCH v3 47/52] KVM: PPC: Book3S HV P9: Add unlikely annotation for !mmu_ready
From: Nicholas Piggin @ 2021-10-04 16:00 UTC (permalink / raw)
To: kvm-ppc, linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20211004160049.1338837-1-npiggin@gmail.com>
The mmu will almost always be ready.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/kvm/book3s_hv.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 0bbef4587f41..6e072e2e130a 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4408,7 +4408,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
vc->runner = vcpu;
/* See if the MMU is ready to go */
- if (!kvm->arch.mmu_ready) {
+ if (unlikely(!kvm->arch.mmu_ready)) {
r = kvmhv_setup_mmu(vcpu);
if (r) {
run->exit_reason = KVM_EXIT_FAIL_ENTRY;
--
2.23.0
^ permalink raw reply related
* [PATCH v3 48/52] KVM: PPC: Book3S HV P9: Avoid cpu_in_guest atomics on entry and exit
From: Nicholas Piggin @ 2021-10-04 16:00 UTC (permalink / raw)
To: kvm-ppc, linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20211004160049.1338837-1-npiggin@gmail.com>
cpu_in_guest is set to determine if a CPU needs to be IPI'ed to exit
the guest and notice the need_tlb_flush bit.
This can be implemented as a global per-CPU pointer to the currently
running guest instead of per-guest cpumasks, saving 2 atomics per
entry/exit. P7/8 doesn't require cpu_in_guest, nor does a nested HV
(only the L0 does), so move it to the P9 HV path.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/include/asm/kvm_book3s_64.h | 1 -
arch/powerpc/include/asm/kvm_host.h | 1 -
arch/powerpc/kvm/book3s_hv.c | 38 +++++++++++++-----------
3 files changed, 21 insertions(+), 19 deletions(-)
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 96f0fda50a07..fe07558173ef 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -44,7 +44,6 @@ struct kvm_nested_guest {
struct mutex tlb_lock; /* serialize page faults and tlbies */
struct kvm_nested_guest *next;
cpumask_t need_tlb_flush;
- cpumask_t cpu_in_guest;
short prev_cpu[NR_CPUS];
u8 radix; /* is this nested guest radix */
};
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 92925f82a1e3..4de418f6c0a2 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -287,7 +287,6 @@ struct kvm_arch {
u32 online_vcores;
atomic_t hpte_mod_interest;
cpumask_t need_tlb_flush;
- cpumask_t cpu_in_guest;
u8 radix;
u8 fwnmi_enabled;
u8 secure_guest;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6e072e2e130a..6574e8a3731e 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3009,30 +3009,33 @@ static void kvmppc_release_hwthread(int cpu)
tpaca->kvm_hstate.kvm_split_mode = NULL;
}
+static DEFINE_PER_CPU(struct kvm *, cpu_in_guest);
+
static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
{
struct kvm_nested_guest *nested = vcpu->arch.nested;
- cpumask_t *cpu_in_guest;
int i;
cpu = cpu_first_tlb_thread_sibling(cpu);
- if (nested) {
+ if (nested)
cpumask_set_cpu(cpu, &nested->need_tlb_flush);
- cpu_in_guest = &nested->cpu_in_guest;
- } else {
+ else
cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
- cpu_in_guest = &kvm->arch.cpu_in_guest;
- }
/*
- * Make sure setting of bit in need_tlb_flush precedes
- * testing of cpu_in_guest bits. The matching barrier on
- * the other side is the first smp_mb() in kvmppc_run_core().
+ * Make sure setting of bit in need_tlb_flush precedes testing of
+ * cpu_in_guest. The matching barrier on the other side is hwsync
+ * when switching to guest MMU mode, which happens between
+ * cpu_in_guest being set to the guest kvm, and need_tlb_flush bit
+ * being tested.
*/
smp_mb();
for (i = cpu; i <= cpu_last_tlb_thread_sibling(cpu);
- i += cpu_tlb_thread_sibling_step())
- if (cpumask_test_cpu(i, cpu_in_guest))
+ i += cpu_tlb_thread_sibling_step()) {
+ struct kvm *running = *per_cpu_ptr(&cpu_in_guest, i);
+
+ if (running == kvm)
smp_call_function_single(i, do_nothing, NULL, 1);
+ }
}
static void do_migrate_away_vcpu(void *arg)
@@ -3100,7 +3103,6 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
{
int cpu;
struct paca_struct *tpaca;
- struct kvm *kvm = vc->kvm;
cpu = vc->pcpu;
if (vcpu) {
@@ -3111,7 +3113,6 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
cpu += vcpu->arch.ptid;
vcpu->cpu = vc->pcpu;
vcpu->arch.thread_cpu = cpu;
- cpumask_set_cpu(cpu, &kvm->arch.cpu_in_guest);
}
tpaca = paca_ptrs[cpu];
tpaca->kvm_hstate.kvm_vcpu = vcpu;
@@ -3829,7 +3830,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
kvmppc_release_hwthread(pcpu + i);
if (sip && sip->napped[i])
kvmppc_ipi_thread(pcpu + i);
- cpumask_clear_cpu(pcpu + i, &vc->kvm->arch.cpu_in_guest);
}
spin_unlock(&vc->lock);
@@ -3997,8 +3997,14 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
}
} else {
+ struct kvm *kvm = vcpu->kvm;
+
kvmppc_xive_push_vcpu(vcpu);
+
+ __this_cpu_write(cpu_in_guest, kvm);
trap = kvmhv_vcpu_entry_p9(vcpu, time_limit, lpcr, tb);
+ __this_cpu_write(cpu_in_guest, NULL);
+
if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
!(vcpu->arch.shregs.msr & MSR_PR)) {
unsigned long req = kvmppc_get_gpr(vcpu, 3);
@@ -4023,7 +4029,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
}
kvmppc_xive_pull_vcpu(vcpu);
- if (kvm_is_radix(vcpu->kvm))
+ if (kvm_is_radix(kvm))
vcpu->arch.slb_max = 0;
}
@@ -4500,8 +4506,6 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
powerpc_local_irq_pmu_restore(flags);
- cpumask_clear_cpu(pcpu, &kvm->arch.cpu_in_guest);
-
preempt_enable();
/*
--
2.23.0
^ permalink raw reply related
* [PATCH v3 49/52] KVM: PPC: Book3S HV P9: Remove most of the vcore logic
From: Nicholas Piggin @ 2021-10-04 16:00 UTC (permalink / raw)
To: kvm-ppc, linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20211004160049.1338837-1-npiggin@gmail.com>
The P9 path always uses one vcpu per vcore, so none of the vcore, locks,
stolen time, blocking logic, shared waitq, etc., is required.
Remove most of it.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/kvm/book3s_hv.c | 147 ++++++++++++++++++++---------------
1 file changed, 85 insertions(+), 62 deletions(-)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6574e8a3731e..d614f83c7b3f 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -276,6 +276,8 @@ static void kvmppc_core_start_stolen(struct kvmppc_vcore *vc, u64 tb)
{
unsigned long flags;
+ WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
+
spin_lock_irqsave(&vc->stoltb_lock, flags);
vc->preempt_tb = tb;
spin_unlock_irqrestore(&vc->stoltb_lock, flags);
@@ -285,6 +287,8 @@ static void kvmppc_core_end_stolen(struct kvmppc_vcore *vc, u64 tb)
{
unsigned long flags;
+ WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
+
spin_lock_irqsave(&vc->stoltb_lock, flags);
if (vc->preempt_tb != TB_NIL) {
vc->stolen_tb += tb - vc->preempt_tb;
@@ -297,7 +301,12 @@ static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
unsigned long flags;
- u64 now = mftb();
+ u64 now;
+
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ return;
+
+ now = mftb();
/*
* We can test vc->runner without taking the vcore lock,
@@ -321,7 +330,12 @@ static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
unsigned long flags;
- u64 now = mftb();
+ u64 now;
+
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ return;
+
+ now = mftb();
if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
kvmppc_core_start_stolen(vc, now);
@@ -673,6 +687,8 @@ static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now)
u64 p;
unsigned long flags;
+ WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
+
spin_lock_irqsave(&vc->stoltb_lock, flags);
p = vc->stolen_tb;
if (vc->vcore_state != VCORE_INACTIVE &&
@@ -695,13 +711,19 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
dt = vcpu->arch.dtl_ptr;
vpa = vcpu->arch.vpa.pinned_addr;
now = tb;
- core_stolen = vcore_stolen_time(vc, now);
- stolen = core_stolen - vcpu->arch.stolen_logged;
- vcpu->arch.stolen_logged = core_stolen;
- spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
- stolen += vcpu->arch.busy_stolen;
- vcpu->arch.busy_stolen = 0;
- spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
+
+ if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+ stolen = 0;
+ } else {
+ core_stolen = vcore_stolen_time(vc, now);
+ stolen = core_stolen - vcpu->arch.stolen_logged;
+ vcpu->arch.stolen_logged = core_stolen;
+ spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
+ stolen += vcpu->arch.busy_stolen;
+ vcpu->arch.busy_stolen = 0;
+ spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
+ }
+
if (!dt || !vpa)
return;
memset(dt, 0, sizeof(struct dtl_entry));
@@ -898,13 +920,14 @@ static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target)
* mode handler is not called but no other threads are in the
* source vcore.
*/
-
- spin_lock(&vcore->lock);
- if (target->arch.state == KVMPPC_VCPU_RUNNABLE &&
- vcore->vcore_state != VCORE_INACTIVE &&
- vcore->runner)
- target = vcore->runner;
- spin_unlock(&vcore->lock);
+ if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+ spin_lock(&vcore->lock);
+ if (target->arch.state == KVMPPC_VCPU_RUNNABLE &&
+ vcore->vcore_state != VCORE_INACTIVE &&
+ vcore->runner)
+ target = vcore->runner;
+ spin_unlock(&vcore->lock);
+ }
return kvm_vcpu_yield_to(target);
}
@@ -3125,13 +3148,6 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
kvmppc_ipi_thread(cpu);
}
-/* Old path does this in asm */
-static void kvmppc_stop_thread(struct kvm_vcpu *vcpu)
-{
- vcpu->cpu = -1;
- vcpu->arch.thread_cpu = -1;
-}
-
static void kvmppc_wait_for_nap(int n_threads)
{
int cpu = smp_processor_id();
@@ -3220,6 +3236,8 @@ static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc)
{
struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
+ WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
+
vc->vcore_state = VCORE_PREEMPT;
vc->pcpu = smp_processor_id();
if (vc->num_threads < threads_per_vcore(vc->kvm)) {
@@ -3236,6 +3254,8 @@ static void kvmppc_vcore_end_preempt(struct kvmppc_vcore *vc)
{
struct preempted_vcore_list *lp;
+ WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
+
kvmppc_core_end_stolen(vc, mftb());
if (!list_empty(&vc->preempt_list)) {
lp = &per_cpu(preempted_vcores, vc->pcpu);
@@ -3964,7 +3984,6 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, uns
static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
unsigned long lpcr, u64 *tb)
{
- struct kvmppc_vcore *vc = vcpu->arch.vcore;
u64 next_timer;
int trap;
@@ -3980,9 +3999,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
kvmppc_subcore_enter_guest();
- vc->entry_exit_map = 1;
- vc->in_guest = 1;
-
vcpu_vpa_increment_dispatch(vcpu);
if (kvmhv_on_pseries()) {
@@ -4035,9 +4051,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
vcpu_vpa_increment_dispatch(vcpu);
- vc->entry_exit_map = 0x101;
- vc->in_guest = 0;
-
kvmppc_subcore_exit_guest();
return trap;
@@ -4103,6 +4116,13 @@ static bool kvmppc_vcpu_woken(struct kvm_vcpu *vcpu)
return false;
}
+static bool kvmppc_vcpu_check_block(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu->arch.ceded || kvmppc_vcpu_woken(vcpu))
+ return true;
+ return false;
+}
+
/*
* Check to see if any of the runnable vcpus on the vcore have pending
* exceptions or are no longer ceded
@@ -4113,7 +4133,7 @@ static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
int i;
for_each_runnable_thread(i, vcpu, vc) {
- if (!vcpu->arch.ceded || kvmppc_vcpu_woken(vcpu))
+ if (kvmppc_vcpu_check_block(vcpu))
return 1;
}
@@ -4130,6 +4150,8 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
int do_sleep = 1;
u64 block_ns;
+ WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
+
/* Poll for pending exceptions and ceded state */
cur = start_poll = ktime_get();
if (vc->halt_poll_ns) {
@@ -4407,11 +4429,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
vcpu->arch.ceded = 0;
vcpu->arch.run_task = current;
vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
- vcpu->arch.busy_preempt = TB_NIL;
vcpu->arch.last_inst = KVM_INST_FETCH_FAILED;
- vc->runnable_threads[0] = vcpu;
- vc->n_runnable = 1;
- vc->runner = vcpu;
/* See if the MMU is ready to go */
if (unlikely(!kvm->arch.mmu_ready)) {
@@ -4429,11 +4447,8 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
kvmppc_update_vpas(vcpu);
- init_vcore_to_run(vc);
-
preempt_disable();
pcpu = smp_processor_id();
- vc->pcpu = pcpu;
if (kvm_is_radix(kvm))
kvmppc_prepare_radix_vcpu(vcpu, pcpu);
@@ -4462,21 +4477,23 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
goto out;
}
- tb = mftb();
+ if (vcpu->arch.timer_running) {
+ hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+ vcpu->arch.timer_running = 0;
+ }
- vcpu->arch.stolen_logged = vcore_stolen_time(vc, tb);
- vc->preempt_tb = TB_NIL;
+ tb = mftb();
- kvmppc_clear_host_core(pcpu);
+ vcpu->cpu = pcpu;
+ vcpu->arch.thread_cpu = pcpu;
+ local_paca->kvm_hstate.kvm_vcpu = vcpu;
+ local_paca->kvm_hstate.ptid = 0;
+ local_paca->kvm_hstate.fake_suspend = 0;
- local_paca->kvm_hstate.napping = 0;
- local_paca->kvm_hstate.kvm_split_mode = NULL;
- kvmppc_start_thread(vcpu, vc);
+ vc->pcpu = pcpu; // for kvmppc_create_dtl_entry
kvmppc_create_dtl_entry(vcpu, vc, tb);
- trace_kvm_guest_enter(vcpu);
- vc->vcore_state = VCORE_RUNNING;
- trace_kvmppc_run_core(vc, 0);
+ trace_kvm_guest_enter(vcpu);
guest_enter_irqoff();
@@ -4498,11 +4515,10 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
set_irq_happened(trap);
- kvmppc_set_host_core(pcpu);
-
guest_exit_irqoff();
- kvmppc_stop_thread(vcpu);
+ vcpu->cpu = -1;
+ vcpu->arch.thread_cpu = -1;
powerpc_local_irq_pmu_restore(flags);
@@ -4529,28 +4545,31 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
}
vcpu->arch.ret = r;
- if (is_kvmppc_resume_guest(r) && vcpu->arch.ceded &&
- !kvmppc_vcpu_woken(vcpu)) {
+ if (is_kvmppc_resume_guest(r) && !kvmppc_vcpu_check_block(vcpu)) {
kvmppc_set_timer(vcpu);
- while (vcpu->arch.ceded && !kvmppc_vcpu_woken(vcpu)) {
+
+ prepare_to_rcuwait(&vcpu->wait);
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
if (signal_pending(current)) {
vcpu->stat.signal_exits++;
run->exit_reason = KVM_EXIT_INTR;
vcpu->arch.ret = -EINTR;
break;
}
- spin_lock(&vc->lock);
- kvmppc_vcore_blocked(vc);
- spin_unlock(&vc->lock);
+
+ if (kvmppc_vcpu_check_block(vcpu))
+ break;
+
+ trace_kvmppc_vcore_blocked(vc, 0);
+ schedule();
+ trace_kvmppc_vcore_blocked(vc, 1);
}
+ finish_rcuwait(&vcpu->wait);
}
vcpu->arch.ceded = 0;
- vc->vcore_state = VCORE_INACTIVE;
- trace_kvmppc_run_core(vc, 1);
-
done:
- kvmppc_remove_runnable(vc, vcpu, tb);
trace_kvmppc_run_vcpu_exit(vcpu);
return vcpu->arch.ret;
@@ -4632,7 +4651,8 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
kvmppc_save_current_sprs();
- vcpu->arch.waitp = &vcpu->arch.vcore->wait;
+ if (!cpu_has_feature(CPU_FTR_ARCH_300))
+ vcpu->arch.waitp = &vcpu->arch.vcore->wait;
vcpu->arch.pgdir = kvm->mm->pgd;
vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
@@ -5094,6 +5114,9 @@ void kvmppc_alloc_host_rm_ops(void)
int cpu, core;
int size;
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ return;
+
/* Not the first time here ? */
if (kvmppc_host_rm_ops_hv != NULL)
return;
--
2.23.0
^ permalink raw reply related
* [PATCH v3 50/52] KVM: PPC: Book3S HV P9: Tidy kvmppc_create_dtl_entry
From: Nicholas Piggin @ 2021-10-04 16:00 UTC (permalink / raw)
To: kvm-ppc, linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20211004160049.1338837-1-npiggin@gmail.com>
This goes further to removing vcores from the P9 path. Also avoid the
memset in favour of explicitly initialising all fields.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/kvm/book3s_hv.c | 61 +++++++++++++++++++++---------------
1 file changed, 35 insertions(+), 26 deletions(-)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index d614f83c7b3f..57bf49c90e73 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -698,41 +698,30 @@ static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now)
return p;
}
-static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
- struct kvmppc_vcore *vc, u64 tb)
+static void __kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
+ unsigned int pcpu, u64 now,
+ unsigned long stolen)
{
struct dtl_entry *dt;
struct lppaca *vpa;
- unsigned long stolen;
- unsigned long core_stolen;
- u64 now;
- unsigned long flags;
dt = vcpu->arch.dtl_ptr;
vpa = vcpu->arch.vpa.pinned_addr;
- now = tb;
-
- if (cpu_has_feature(CPU_FTR_ARCH_300)) {
- stolen = 0;
- } else {
- core_stolen = vcore_stolen_time(vc, now);
- stolen = core_stolen - vcpu->arch.stolen_logged;
- vcpu->arch.stolen_logged = core_stolen;
- spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
- stolen += vcpu->arch.busy_stolen;
- vcpu->arch.busy_stolen = 0;
- spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
- }
if (!dt || !vpa)
return;
- memset(dt, 0, sizeof(struct dtl_entry));
+
dt->dispatch_reason = 7;
- dt->processor_id = cpu_to_be16(vc->pcpu + vcpu->arch.ptid);
- dt->timebase = cpu_to_be64(now + vc->tb_offset);
+ dt->preempt_reason = 0;
+ dt->processor_id = cpu_to_be16(pcpu + vcpu->arch.ptid);
dt->enqueue_to_dispatch_time = cpu_to_be32(stolen);
+ dt->ready_to_enqueue_time = 0;
+ dt->waiting_to_ready_time = 0;
+ dt->timebase = cpu_to_be64(now);
+ dt->fault_addr = 0;
dt->srr0 = cpu_to_be64(kvmppc_get_pc(vcpu));
dt->srr1 = cpu_to_be64(vcpu->arch.shregs.msr);
+
++dt;
if (dt == vcpu->arch.dtl.pinned_end)
dt = vcpu->arch.dtl.pinned_addr;
@@ -743,6 +732,27 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
vcpu->arch.dtl.dirty = true;
}
+static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
+ struct kvmppc_vcore *vc)
+{
+ unsigned long stolen;
+ unsigned long core_stolen;
+ u64 now;
+ unsigned long flags;
+
+ now = mftb();
+
+ core_stolen = vcore_stolen_time(vc, now);
+ stolen = core_stolen - vcpu->arch.stolen_logged;
+ vcpu->arch.stolen_logged = core_stolen;
+ spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
+ stolen += vcpu->arch.busy_stolen;
+ vcpu->arch.busy_stolen = 0;
+ spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
+
+ __kvmppc_create_dtl_entry(vcpu, vc->pcpu, now + vc->tb_offset, stolen);
+}
+
/* See if there is a doorbell interrupt pending for a vcpu */
static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu)
{
@@ -3750,7 +3760,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
pvc->pcpu = pcpu + thr;
for_each_runnable_thread(i, vcpu, pvc) {
kvmppc_start_thread(vcpu, pvc);
- kvmppc_create_dtl_entry(vcpu, pvc, mftb());
+ kvmppc_create_dtl_entry(vcpu, pvc);
trace_kvm_guest_enter(vcpu);
if (!vcpu->arch.ptid)
thr0_done = true;
@@ -4313,7 +4323,7 @@ static int kvmppc_run_vcpu(struct kvm_vcpu *vcpu)
if ((vc->vcore_state == VCORE_PIGGYBACK ||
vc->vcore_state == VCORE_RUNNING) &&
!VCORE_IS_EXITING(vc)) {
- kvmppc_create_dtl_entry(vcpu, vc, mftb());
+ kvmppc_create_dtl_entry(vcpu, vc);
kvmppc_start_thread(vcpu, vc);
trace_kvm_guest_enter(vcpu);
} else if (vc->vcore_state == VCORE_SLEEPING) {
@@ -4490,8 +4500,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
local_paca->kvm_hstate.ptid = 0;
local_paca->kvm_hstate.fake_suspend = 0;
- vc->pcpu = pcpu; // for kvmppc_create_dtl_entry
- kvmppc_create_dtl_entry(vcpu, vc, tb);
+ __kvmppc_create_dtl_entry(vcpu, pcpu, tb + vc->tb_offset, 0);
trace_kvm_guest_enter(vcpu);
--
2.23.0
^ permalink raw reply related
* [PATCH v3 51/52] KVM: PPC: Book3S HV P9: Stop using vc->dpdes
From: Nicholas Piggin @ 2021-10-04 16:00 UTC (permalink / raw)
To: kvm-ppc, linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20211004160049.1338837-1-npiggin@gmail.com>
The P9 path uses vc->dpdes only for msgsndp / SMT emulation. This adds
an ordering requirement between vcpu->doorbell_request and vc->dpdes for
no real benefit. Use vcpu->doorbell_request directly.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/kvm/book3s_hv.c | 18 ++++++++++--------
arch/powerpc/kvm/book3s_hv_builtin.c | 2 ++
arch/powerpc/kvm/book3s_hv_p9_entry.c | 14 ++++++++++----
3 files changed, 22 insertions(+), 12 deletions(-)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 57bf49c90e73..351018f617fb 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -761,6 +761,8 @@ static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu)
if (vcpu->arch.doorbell_request)
return true;
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ return false;
/*
* Ensure that the read of vcore->dpdes comes after the read
* of vcpu->doorbell_request. This barrier matches the
@@ -2185,8 +2187,10 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
* either vcore->dpdes or doorbell_request.
* On POWER8, doorbell_request is 0.
*/
- *val = get_reg_val(id, vcpu->arch.vcore->dpdes |
- vcpu->arch.doorbell_request);
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ *val = get_reg_val(id, vcpu->arch.doorbell_request);
+ else
+ *val = get_reg_val(id, vcpu->arch.vcore->dpdes);
break;
case KVM_REG_PPC_VTB:
*val = get_reg_val(id, vcpu->arch.vcore->vtb);
@@ -2423,7 +2427,10 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
vcpu->arch.pspb = set_reg_val(id, *val);
break;
case KVM_REG_PPC_DPDES:
- vcpu->arch.vcore->dpdes = set_reg_val(id, *val);
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ vcpu->arch.doorbell_request = set_reg_val(id, *val) & 1;
+ else
+ vcpu->arch.vcore->dpdes = set_reg_val(id, *val);
break;
case KVM_REG_PPC_VTB:
vcpu->arch.vcore->vtb = set_reg_val(id, *val);
@@ -4472,11 +4479,6 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
if (!nested) {
kvmppc_core_prepare_to_enter(vcpu);
- if (vcpu->arch.doorbell_request) {
- vc->dpdes = 1;
- smp_wmb();
- vcpu->arch.doorbell_request = 0;
- }
if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
&vcpu->arch.pending_exceptions))
lpcr |= LPCR_MER;
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index fcf4760a3a0e..a4fc4b2d3806 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -649,6 +649,8 @@ void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu)
int ext;
unsigned long lpcr;
+ WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
+
/* Insert EXTERNAL bit into LPCR at the MER bit position */
ext = (vcpu->arch.pending_exceptions >> BOOK3S_IRQPRIO_EXTERNAL) & 1;
lpcr = mfspr(SPRN_LPCR);
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index 5a71532a3adf..fbecbdc42c26 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -705,6 +705,7 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
unsigned long host_pidr;
unsigned long host_dawr1;
unsigned long host_dawrx1;
+ unsigned long dpdes;
hdec = time_limit - *tb;
if (hdec < 0)
@@ -767,8 +768,10 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
if (vc->pcr)
mtspr(SPRN_PCR, vc->pcr | PCR_MASK);
- if (vc->dpdes)
- mtspr(SPRN_DPDES, vc->dpdes);
+ if (vcpu->arch.doorbell_request) {
+ vcpu->arch.doorbell_request = 0;
+ mtspr(SPRN_DPDES, 1);
+ }
if (dawr_enabled()) {
if (vcpu->arch.dawr0 != host_dawr0)
@@ -999,7 +1002,10 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2);
vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3);
- vc->dpdes = mfspr(SPRN_DPDES);
+ dpdes = mfspr(SPRN_DPDES);
+ if (dpdes)
+ vcpu->arch.doorbell_request = 1;
+
vc->vtb = mfspr(SPRN_VTB);
dec = mfspr(SPRN_DEC);
@@ -1061,7 +1067,7 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
}
}
- if (vc->dpdes)
+ if (dpdes)
mtspr(SPRN_DPDES, 0);
if (vc->pcr)
mtspr(SPRN_PCR, PCR_MASK);
--
2.23.0
^ permalink raw reply related
* [PATCH v3 52/52] KVM: PPC: Book3S HV P9: Remove subcore HMI handling
From: Nicholas Piggin @ 2021-10-04 16:00 UTC (permalink / raw)
To: kvm-ppc, linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20211004160049.1338837-1-npiggin@gmail.com>
On POWER9 and newer, rather than the complex HMI synchronisation and
subcore state, have each thread un-apply the guest TB offset before
calling into the early HMI handler.
This allows the subcore state to be avoided, including subcore enter
/ exit guest, which includes an expensive divide that shows up
slightly in profiles.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/include/asm/kvm_ppc.h | 1 +
arch/powerpc/kvm/book3s_hv.c | 12 +++---
arch/powerpc/kvm/book3s_hv_hmi.c | 7 +++-
arch/powerpc/kvm/book3s_hv_p9_entry.c | 2 +-
arch/powerpc/kvm/book3s_hv_ras.c | 54 +++++++++++++++++++++++++++
5 files changed, 67 insertions(+), 9 deletions(-)
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 671fbd1a765e..70ffcb3c91bf 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -760,6 +760,7 @@ void kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu);
void kvmppc_subcore_enter_guest(void);
void kvmppc_subcore_exit_guest(void);
long kvmppc_realmode_hmi_handler(void);
+long kvmppc_p9_realmode_hmi_handler(struct kvm_vcpu *vcpu);
long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
long pte_index, unsigned long pteh, unsigned long ptel);
long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 351018f617fb..449ac0a19ceb 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4014,8 +4014,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
vcpu->arch.ceded = 0;
- kvmppc_subcore_enter_guest();
-
vcpu_vpa_increment_dispatch(vcpu);
if (kvmhv_on_pseries()) {
@@ -4068,8 +4066,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
vcpu_vpa_increment_dispatch(vcpu);
- kvmppc_subcore_exit_guest();
-
return trap;
}
@@ -6069,9 +6065,11 @@ static int kvmppc_book3s_init_hv(void)
if (r)
return r;
- r = kvm_init_subcore_bitmap();
- if (r)
- return r;
+ if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+ r = kvm_init_subcore_bitmap();
+ if (r)
+ return r;
+ }
/*
* We need a way of accessing the XICS interrupt controller,
diff --git a/arch/powerpc/kvm/book3s_hv_hmi.c b/arch/powerpc/kvm/book3s_hv_hmi.c
index 9af660476314..1ec50c69678b 100644
--- a/arch/powerpc/kvm/book3s_hv_hmi.c
+++ b/arch/powerpc/kvm/book3s_hv_hmi.c
@@ -20,10 +20,15 @@ void wait_for_subcore_guest_exit(void)
/*
* NULL bitmap pointer indicates that KVM module hasn't
- * been loaded yet and hence no guests are running.
+ * been loaded yet and hence no guests are running, or running
+ * on POWER9 or newer CPU.
+ *
* If no KVM is in use, no need to co-ordinate among threads
* as all of them will always be in host and no one is going
* to modify TB other than the opal hmi handler.
+ *
+ * POWER9 and newer don't need this synchronisation.
+ *
* Hence, just return from here.
*/
if (!local_paca->sibling_subcore_state)
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index fbecbdc42c26..86a222f97e8e 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -938,7 +938,7 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
kvmppc_realmode_machine_check(vcpu);
} else if (unlikely(trap == BOOK3S_INTERRUPT_HMI)) {
- kvmppc_realmode_hmi_handler();
+ kvmppc_p9_realmode_hmi_handler(vcpu);
} else if (trap == BOOK3S_INTERRUPT_H_EMUL_ASSIST) {
vcpu->arch.emul_inst = mfspr(SPRN_HEIR);
diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c
index d4bca93b79f6..ccfd96965630 100644
--- a/arch/powerpc/kvm/book3s_hv_ras.c
+++ b/arch/powerpc/kvm/book3s_hv_ras.c
@@ -136,6 +136,60 @@ void kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu)
vcpu->arch.mce_evt = mce_evt;
}
+
+long kvmppc_p9_realmode_hmi_handler(struct kvm_vcpu *vcpu)
+{
+ struct kvmppc_vcore *vc = vcpu->arch.vcore;
+ long ret = 0;
+
+ /*
+ * Unapply and clear the offset first. That way, if the TB was not
+ * resynced then it will remain in host-offset, and if it was resynced
+ * then it is brought into host-offset. Then the tb offset is
+ * re-applied before continuing with the KVM exit.
+ *
+ * This way, we don't need to actually know whether not OPAL resynced
+ * the timebase or do any of the complicated dance that the P7/8
+ * path requires.
+ */
+ if (vc->tb_offset_applied) {
+ u64 new_tb = mftb() - vc->tb_offset_applied;
+ mtspr(SPRN_TBU40, new_tb);
+ if ((mftb() & 0xffffff) < (new_tb & 0xffffff)) {
+ new_tb += 0x1000000;
+ mtspr(SPRN_TBU40, new_tb);
+ }
+ vc->tb_offset_applied = 0;
+ }
+
+ local_paca->hmi_irqs++;
+
+ if (hmi_handle_debugtrig(NULL) >= 0) {
+ ret = 1;
+ goto out;
+ }
+
+ if (ppc_md.hmi_exception_early)
+ ppc_md.hmi_exception_early(NULL);
+
+out:
+ if (vc->tb_offset) {
+ u64 new_tb = mftb() + vc->tb_offset;
+ mtspr(SPRN_TBU40, new_tb);
+ if ((mftb() & 0xffffff) < (new_tb & 0xffffff)) {
+ new_tb += 0x1000000;
+ mtspr(SPRN_TBU40, new_tb);
+ }
+ vc->tb_offset_applied = vc->tb_offset;
+ }
+
+ return ret;
+}
+
+/*
+ * The following subcore HMI handling is all only for pre-POWER9 CPUs.
+ */
+
/* Check if dynamic split is in force and return subcore size accordingly. */
static inline int kvmppc_cur_subcore_size(void)
{
--
2.23.0
^ permalink raw reply related
* [RFC PATCH] KVM: PPC: Book3S HV P9: Move H_CEDE logic mostly to one place
From: Nicholas Piggin @ 2021-10-04 16:10 UTC (permalink / raw)
To: kvm-ppc, linuxppc-dev; +Cc: Nicholas Piggin
Move the vcpu->arch.ceded, hrtimer, and blocking handling to one place,
except the xive escalation rearm case. The only special case is the
xive handling, as it is to be done before the xive context is pulled.
This means the P9 path does not run with ceded==1 or the hrtimer armed
except in the kvmhv_handle_cede function, and hopefully cede handling is
a bit more understandable.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/include/asm/kvm_ppc.h | 4 +-
arch/powerpc/kvm/book3s_hv.c | 137 +++++++++++++-------------
arch/powerpc/kvm/book3s_hv_p9_entry.c | 3 +-
arch/powerpc/kvm/book3s_xive.c | 26 ++---
4 files changed, 88 insertions(+), 82 deletions(-)
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 70ffcb3c91bf..e2e6cee9dddf 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -674,7 +674,7 @@ extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
int level, bool line_status);
extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu);
extern void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu);
-extern void kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu);
+extern bool kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu);
static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu)
{
@@ -712,7 +712,7 @@ static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 ir
int level, bool line_status) { return -ENODEV; }
static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { }
static inline void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu) { }
-static inline void kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu) { }
+static inline bool kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu) { return false; }
static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu)
{ return 0; }
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 36c54f483a02..230f10b67f98 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1019,6 +1019,8 @@ static long kvmppc_h_rpt_invalidate(struct kvm_vcpu *vcpu,
return H_SUCCESS;
}
+static int kvmhv_handle_cede(struct kvm_vcpu *vcpu);
+
int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
{
struct kvm *kvm = vcpu->kvm;
@@ -1080,7 +1082,9 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
break;
case H_CEDE:
+ ret = kvmhv_handle_cede(vcpu);
break;
+
case H_PROD:
target = kvmppc_get_gpr(vcpu, 4);
tvcpu = kvmppc_find_vcpu(kvm, target);
@@ -1292,25 +1296,6 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
return RESUME_GUEST;
}
-/*
- * Handle H_CEDE in the P9 path where we don't call the real-mode hcall
- * handlers in book3s_hv_rmhandlers.S.
- *
- * This has to be done early, not in kvmppc_pseries_do_hcall(), so
- * that the cede logic in kvmppc_run_single_vcpu() works properly.
- */
-static void kvmppc_cede(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.shregs.msr |= MSR_EE;
- vcpu->arch.ceded = 1;
- smp_mb();
- if (vcpu->arch.prodded) {
- vcpu->arch.prodded = 0;
- smp_mb();
- vcpu->arch.ceded = 0;
- }
-}
-
static int kvmppc_hcall_impl_hv(unsigned long cmd)
{
switch (cmd) {
@@ -2971,7 +2956,7 @@ static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu)
return 1;
}
-static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
+static bool kvmppc_set_timer(struct kvm_vcpu *vcpu)
{
unsigned long dec_nsec, now;
@@ -2980,11 +2965,12 @@ static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
/* decrementer has already gone negative */
kvmppc_core_queue_dec(vcpu);
kvmppc_core_prepare_to_enter(vcpu);
- return;
+ return false;
}
dec_nsec = tb_to_ns(kvmppc_dec_expires_host_tb(vcpu) - now);
hrtimer_start(&vcpu->arch.dec_timer, dec_nsec, HRTIMER_MODE_REL);
vcpu->arch.timer_running = 1;
+ return true;
}
extern int __kvmppc_vcore_entry(void);
@@ -4015,21 +4001,11 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
else if (*tb >= time_limit) /* nested time limit */
return BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER;
- vcpu->arch.ceded = 0;
-
vcpu_vpa_increment_dispatch(vcpu);
if (kvmhv_on_pseries()) {
trap = kvmhv_vcpu_entry_p9_nested(vcpu, time_limit, lpcr, tb);
- /* H_CEDE has to be handled now, not later */
- if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
- kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
- kvmppc_cede(vcpu);
- kvmppc_set_gpr(vcpu, 3, 0);
- trap = 0;
- }
-
} else {
struct kvm *kvm = vcpu->kvm;
@@ -4043,12 +4019,14 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
!(vcpu->arch.shregs.msr & MSR_PR)) {
unsigned long req = kvmppc_get_gpr(vcpu, 3);
- /* H_CEDE has to be handled now, not later */
+ /* Have to rearm before pulling xive, may abort cede */
if (req == H_CEDE) {
- kvmppc_cede(vcpu);
- kvmppc_xive_rearm_escalation(vcpu); /* may un-cede */
- kvmppc_set_gpr(vcpu, 3, 0);
- trap = 0;
+ if (kvmppc_xive_rearm_escalation(vcpu)) {
+ vcpu->arch.shregs.msr |= MSR_EE;
+ vcpu->arch.prodded = 0;
+ kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
+ trap = 0;
+ }
/* XICS hcalls must be handled before xive is pulled */
} else if (hcall_is_xics(req)) {
@@ -4423,6 +4401,63 @@ static int kvmppc_run_vcpu(struct kvm_vcpu *vcpu)
return vcpu->arch.ret;
}
+/* H_CEDE for the P9 path, P7/8 is handled by realmode handlers */
+static int kvmhv_handle_cede(struct kvm_vcpu *vcpu)
+{
+ struct kvmppc_vcore *vc = vcpu->arch.vcore;
+ struct kvm_run *run = vcpu->run;
+
+ kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
+ vcpu->arch.trap = 0;
+ vcpu->arch.shregs.msr |= MSR_EE;
+
+ vcpu->arch.ceded = 1;
+ smp_mb();
+ if (vcpu->arch.prodded) {
+ vcpu->arch.prodded = 0;
+ smp_mb();
+ vcpu->arch.ceded = 0;
+ return RESUME_GUEST;
+ }
+
+ if (kvmppc_vcpu_woken(vcpu)) {
+ vcpu->arch.ceded = 0;
+ return RESUME_GUEST;
+ }
+
+ if (!kvmppc_set_timer(vcpu)) {
+ vcpu->arch.ceded = 0;
+ return RESUME_GUEST;
+ }
+
+ prepare_to_rcuwait(&vcpu->wait);
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (signal_pending(current)) {
+ vcpu->stat.signal_exits++;
+ run->exit_reason = KVM_EXIT_INTR;
+ vcpu->arch.ret = -EINTR;
+ break;
+ }
+
+ if (kvmppc_vcpu_woken(vcpu) || !vcpu->arch.ceded)
+ break;
+
+ trace_kvmppc_vcore_blocked(vc, 0);
+ schedule();
+ trace_kvmppc_vcore_blocked(vc, 1);
+ }
+ finish_rcuwait(&vcpu->wait);
+
+ if (vcpu->arch.timer_running) {
+ hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+ vcpu->arch.timer_running = 0;
+ }
+
+ vcpu->arch.ceded = 0;
+ return RESUME_GUEST;
+}
+
int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
unsigned long lpcr)
{
@@ -4442,7 +4477,6 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
vcpu->arch.trap = 0;
vc = vcpu->arch.vcore;
- vcpu->arch.ceded = 0;
vcpu->arch.run_task = current;
vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
vcpu->arch.last_inst = KVM_INST_FETCH_FAILED;
@@ -4488,11 +4522,6 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
goto out;
}
- if (vcpu->arch.timer_running) {
- hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
- vcpu->arch.timer_running = 0;
- }
-
tb = mftb();
vcpu->cpu = pcpu;
@@ -4555,30 +4584,6 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
}
vcpu->arch.ret = r;
- if (is_kvmppc_resume_guest(r) && !kvmppc_vcpu_check_block(vcpu)) {
- kvmppc_set_timer(vcpu);
-
- prepare_to_rcuwait(&vcpu->wait);
- for (;;) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (signal_pending(current)) {
- vcpu->stat.signal_exits++;
- run->exit_reason = KVM_EXIT_INTR;
- vcpu->arch.ret = -EINTR;
- break;
- }
-
- if (kvmppc_vcpu_check_block(vcpu))
- break;
-
- trace_kvmppc_vcore_blocked(vc, 0);
- schedule();
- trace_kvmppc_vcore_blocked(vc, 1);
- }
- finish_rcuwait(&vcpu->wait);
- }
- vcpu->arch.ceded = 0;
-
done:
trace_kvmppc_run_vcpu_exit(vcpu);
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index 86a222f97e8e..a1fdfcba608f 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -713,11 +713,10 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
WARN_ON_ONCE(vcpu->arch.shregs.msr & MSR_HV);
WARN_ON_ONCE(!(vcpu->arch.shregs.msr & MSR_ME));
+ WARN_ON_ONCE(vcpu->arch.ceded);
start_timing(vcpu, &vcpu->arch.rm_entry);
- vcpu->arch.ceded = 0;
-
/* Save MSR for restore, with EE clear. */
msr = mfmsr() & ~MSR_EE;
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index a18db9e16ea4..7d45764fd6a8 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -179,37 +179,39 @@ void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvmppc_xive_pull_vcpu);
-void kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu)
+/* Return true if H_CEDE is to be aborted */
+bool kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu)
{
void __iomem *esc_vaddr = (void __iomem *)vcpu->arch.xive_esc_vaddr;
if (!esc_vaddr)
- return;
+ return false;
/* we are using XIVE with single escalation */
if (vcpu->arch.xive_esc_on) {
/*
- * If we still have a pending escalation, abort the cede,
- * and we must set PQ to 10 rather than 00 so that we don't
- * potentially end up with two entries for the escalation
- * interrupt in the XIVE interrupt queue. In that case
- * we also don't want to set xive_esc_on to 1 here in
- * case we race with xive_esc_irq().
- */
- vcpu->arch.ceded = 0;
- /*
+ * If we still have a pending escalation, return true to abort
+ * the cede, and we must set PQ to 10 rather than 00 so that we
+ * don't potentially end up with two entries for the escalation
+ * interrupt in the XIVE interrupt queue. In that case we also
+ * don't want to set xive_esc_on to 1 here in case we race with
+ * xive_esc_irq().
+ *
* The escalation interrupts are special as we don't EOI them.
* There is no need to use the load-after-store ordering offset
* to set PQ to 10 as we won't use StoreEOI.
*/
__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_10);
+ mb();
+ return true;
} else {
vcpu->arch.xive_esc_on = true;
mb();
__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_00);
+ mb();
+ return false;
}
- mb();
}
EXPORT_SYMBOL_GPL(kvmppc_xive_rearm_escalation);
--
2.23.0
^ permalink raw reply related
* Re: [PATCH 3/3] powerpc: Set crashkernel offset to mid of RMA region
From: Aneesh Kumar K.V @ 2021-10-04 16:06 UTC (permalink / raw)
To: Sourabh Jain, mpe
Cc: Abdul haleem, mahesh, linux-kernel, hbathini, linuxppc-dev
In-Reply-To: <20211004151142.256251-4-sourabhjain@linux.ibm.com>
On 10/4/21 20:41, Sourabh Jain wrote:
> On large config LPARs (having 192 and more cores), Linux fails to boot
> due to insufficient memory in the first memory block. It is due to the
> reserve crashkernel area starts at 128MB offset by default and which
> doesn't leave enough space in the first memory block to accommodate
> memory for other essential system resources.
>
> Given that the RMA region size can be 512MB or more, setting the
> crashkernel offset to mid of RMA size will leave enough space to
> kernel to allocate memory for other system resources in the first
> memory block.
>
> Signed-off-by: Sourabh Jain <sourabhjain@linux.ibm.com>
> Reported-and-tested-by: Abdul haleem <abdhalee@linux.vnet.ibm.com>
> ---
> arch/powerpc/kernel/rtas.c | 3 +++
> arch/powerpc/kexec/core.c | 13 +++++++++----
> 2 files changed, 12 insertions(+), 4 deletions(-)
>
> diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
> index ff80bbad22a5..ce5e62bb4d8e 100644
> --- a/arch/powerpc/kernel/rtas.c
> +++ b/arch/powerpc/kernel/rtas.c
> @@ -1235,6 +1235,9 @@ int __init early_init_dt_scan_rtas(unsigned long node,
> entryp = of_get_flat_dt_prop(node, "linux,rtas-entry", NULL);
> sizep = of_get_flat_dt_prop(node, "rtas-size", NULL);
>
> + if (of_get_flat_dt_prop(node, "ibm,hypertas-functions", NULL))
> + powerpc_firmware_features |= FW_FEATURE_LPAR;
> +
The equivalent check that we currently do more than checking
ibm,hypertas-functions.
if (!strcmp(uname, "rtas") || !strcmp(uname, "rtas@0")) {
prop = of_get_flat_dt_prop(node, "ibm,hypertas-functions",
&len);
if (prop) {
powerpc_firmware_features |= FW_FEATURE_LPAR;
fw_hypertas_feature_init(prop, len);
}
also do we expect other firmware features to be set along with
FW_FEATURE_LPAR?
> if (basep && entryp && sizep) {
> rtas.base = *basep;
> rtas.entry = *entryp;
> diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c
> index 48525e8b5730..f69cf3e370ec 100644
> --- a/arch/powerpc/kexec/core.c
> +++ b/arch/powerpc/kexec/core.c
> @@ -147,11 +147,16 @@ void __init reserve_crashkernel(void)
> if (!crashk_res.start) {
> #ifdef CONFIG_PPC64
> /*
> - * On 64bit we split the RMO in half but cap it at half of
> - * a small SLB (128MB) since the crash kernel needs to place
> - * itself and some stacks to be in the first segment.
> + * crash kernel needs to placed in the first segment. On LPAR
> + * setting crash kernel start to mid of RMA size (512MB or more)
> + * would help primary kernel to boot properly on large config
> + * LPAR (with core count 192 or more) and for the reset keep
> + * cap the crash kernel start at 128MB offse.
> */
> - crashk_res.start = min(0x8000000ULL, (ppc64_rma_size / 2));
> + if (firmware_has_feature(FW_FEATURE_LPAR))
> + crashk_res.start = ppc64_rma_size / 2;
> + else
> + crashk_res.start = min(0x8000000ULL, (ppc64_rma_size / 2));
> #else
> crashk_res.start = KDUMP_KERNELBASE;
> #endif
>
^ permalink raw reply
* Re: [PATCH 2/5] memory: fsl_ifc: populate child devices without relying on simple-bus
From: Rob Herring @ 2021-10-04 16:27 UTC (permalink / raw)
To: Li Yang
Cc: devicetree, Shawn Guo, linux-kernel, Krzysztof Kozlowski,
linuxppc-dev, linux-arm-kernel
In-Reply-To: <20211001000924.15421-3-leoyang.li@nxp.com>
On Thu, Sep 30, 2021 at 07:09:21PM -0500, Li Yang wrote:
> After we update the binding to not use simple-bus compatible for the
> controller, we need the driver to populate the child devices explicitly.
>
> Signed-off-by: Li Yang <leoyang.li@nxp.com>
> ---
> drivers/memory/fsl_ifc.c | 9 +++++++++
> 1 file changed, 9 insertions(+)
>
> diff --git a/drivers/memory/fsl_ifc.c b/drivers/memory/fsl_ifc.c
> index d062c2f8250f..251d713cd50b 100644
> --- a/drivers/memory/fsl_ifc.c
> +++ b/drivers/memory/fsl_ifc.c
> @@ -88,6 +88,7 @@ static int fsl_ifc_ctrl_remove(struct platform_device *dev)
> {
> struct fsl_ifc_ctrl *ctrl = dev_get_drvdata(&dev->dev);
>
> + of_platform_depopulate(&dev->dev);
> free_irq(ctrl->nand_irq, ctrl);
> free_irq(ctrl->irq, ctrl);
>
> @@ -285,6 +286,14 @@ static int fsl_ifc_ctrl_probe(struct platform_device *dev)
> }
> }
>
> + /* legacy dts may still use "simple-bus" compatible */
> + if (!of_device_is_compatible(dev->dev.of_node, "simple-bus")) {
> + ret = of_platform_populate(dev->dev.of_node, NULL, NULL,
> + &dev->dev);
There's no need to make this conditional. of_platform_populate() is safe
to call multiple times. If that doesn't work, it's a bug.
Rob
^ permalink raw reply
* Re: [PATCH v3 2/2] powerpc/powermac: constify device_node in of_irq_parse_oldworld()
From: Rob Herring @ 2021-10-04 17:00 UTC (permalink / raw)
To: Krzysztof Kozlowski
Cc: devicetree, Frank Rowand, linux-kernel, Rob Herring,
Paul Mackerras, linuxppc-dev
In-Reply-To: <20210924105653.46963-2-krzysztof.kozlowski@canonical.com>
On Fri, 24 Sep 2021 12:56:53 +0200, Krzysztof Kozlowski wrote:
> The of_irq_parse_oldworld() does not modify passed device_node so make
> it a pointer to const for safety. Drop the extern while modifying the
> line.
>
> Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
>
> ---
>
> Changes since v1:
> 1. Drop extern.
> ---
> arch/powerpc/platforms/powermac/pic.c | 2 +-
> include/linux/of_irq.h | 6 +++---
> 2 files changed, 4 insertions(+), 4 deletions(-)
>
Acked-by: Rob Herring <robh@kernel.org>
^ permalink raw reply
* Re: [PATCH 3/3] powerpc: Set crashkernel offset to mid of RMA region
From: Sourabh Jain @ 2021-10-04 17:17 UTC (permalink / raw)
To: Aneesh Kumar K.V, mpe
Cc: Abdul haleem, mahesh, linux-kernel, hbathini, linuxppc-dev
In-Reply-To: <f13e218e-4e38-4076-672f-d555d7abfc02@linux.ibm.com>
Hello Aneesh,
@@ -1235,6 +1235,9 @@ int __init early_init_dt_scan_rtas(unsigned long
node,
>> entryp = of_get_flat_dt_prop(node, "linux,rtas-entry", NULL);
>> sizep = of_get_flat_dt_prop(node, "rtas-size", NULL);
>> + if (of_get_flat_dt_prop(node, "ibm,hypertas-functions", NULL))
>> + powerpc_firmware_features |= FW_FEATURE_LPAR;
>> +
>
> The equivalent check that we currently do more than checking
> ibm,hypertas-functions.
>
> if (!strcmp(uname, "rtas") || !strcmp(uname, "rtas@0")) {
> prop = of_get_flat_dt_prop(node, "ibm,hypertas-functions",
> &len);
> if (prop) {
> powerpc_firmware_features |= FW_FEATURE_LPAR;
> fw_hypertas_feature_init(prop, len);
> }
>
If ibm,hypertas-functions prop has to be part of rtas or rtas@0 node to
decide we are on LPAR then how about splitting the probe_fw_features
functions into two functions, one to detect FW_FEATURE_LPAR and another
function to do the rest?
>
> also do we expect other firmware features to be set along with
> FW_FEATURE_LPAR?
No only FW_FEATURE_LPAR feature so that kernel can decide the
crashkernel offset accordingly.
Thanks for the review.
- Sourabh Jain
^ permalink raw reply
* Re: [PATCH 01/10] dt-bindings: i2c: Add Apple I2C controller bindings
From: Rob Herring @ 2021-10-04 18:01 UTC (permalink / raw)
To: Sven Peter
Cc: devicetree, Arnd Bergmann, Hector Martin, linux-kernel, linux-i2c,
Rob Herring, Paul Mackerras, Alyssa Rosenzweig, Olof Johansson,
Mohamed Mediouni, Stan Skowronek, linuxppc-dev, linux-arm-kernel,
Mark Kettenis
In-Reply-To: <20210926095847.38261-2-sven@svenpeter.dev>
On Sun, 26 Sep 2021 11:58:38 +0200, Sven Peter wrote:
> The Apple I2C controller is based on the PASemi I2C controller.
> It is present on Apple SoCs such as the M1.
>
> Signed-off-by: Sven Peter <sven@svenpeter.dev>
> ---
> .../devicetree/bindings/i2c/apple,i2c.yaml | 61 +++++++++++++++++++
> MAINTAINERS | 1 +
> 2 files changed, 62 insertions(+)
> create mode 100644 Documentation/devicetree/bindings/i2c/apple,i2c.yaml
>
Reviewed-by: Rob Herring <robh@kernel.org>
^ permalink raw reply
* Re: [PATCH 1/9] powerpc/lib: Add helper to check if offset is within conditional branch range
From: Naveen N. Rao @ 2021-10-04 18:02 UTC (permalink / raw)
To: Song Liu
Cc: Daniel Borkmann, Johan Almbladh, Nicholas Piggin, bpf,
linuxppc-dev, Alexei Starovoitov
In-Reply-To: <CAPhsuW4Qv5e=x6WMV1EYy+NUdu+i+i+kGY2E3WAhV66a115C=Q@mail.gmail.com>
Hi Song,
Thanks for the reviews.
Song Liu wrote:
> On Fri, Oct 1, 2021 at 2:16 PM Naveen N. Rao
> <naveen.n.rao@linux.vnet.ibm.com> wrote:
>>
>> Add a helper to check if a given offset is within the branch range for a
>> powerpc conditional branch instruction, and update some sites to use the
>> new helper.
>>
>> Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
>
> Acked-by: Song Liu <songliubraving@fb.com>
>
> With one nitpick:
>
>> ---
>> arch/powerpc/include/asm/code-patching.h | 1 +
>> arch/powerpc/lib/code-patching.c | 7 ++++++-
>> arch/powerpc/net/bpf_jit.h | 7 +------
>> 3 files changed, 8 insertions(+), 7 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h
>> index a95f63788c6b14..4ba834599c4d4c 100644
>> --- a/arch/powerpc/include/asm/code-patching.h
>> +++ b/arch/powerpc/include/asm/code-patching.h
>> @@ -23,6 +23,7 @@
>> #define BRANCH_ABSOLUTE 0x2
>>
>> bool is_offset_in_branch_range(long offset);
>> +bool is_offset_in_cond_branch_range(long offset);
>> int create_branch(struct ppc_inst *instr, const u32 *addr,
>> unsigned long target, int flags);
>> int create_cond_branch(struct ppc_inst *instr, const u32 *addr,
>> diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
>> index f9a3019e37b43c..e2342b9a1ab9c9 100644
>> --- a/arch/powerpc/lib/code-patching.c
>> +++ b/arch/powerpc/lib/code-patching.c
>> @@ -228,6 +228,11 @@ bool is_offset_in_branch_range(long offset)
>> return (offset >= -0x2000000 && offset <= 0x1fffffc && !(offset & 0x3));
>> }
>>
>> +bool is_offset_in_cond_branch_range(long offset)
>> +{
>> + return offset >= -0x8000 && offset <= 0x7FFF && !(offset & 0x3);
>> +}
>
> Why not inline this one?
Good point. This was modeled after the existing
is_offset_in_branch_range(), and I guess both of those helpers can be
inlined. I'll do a separate patch for that.
- Naveen
^ permalink raw reply
* Re: [PATCH 1/9] powerpc/lib: Add helper to check if offset is within conditional branch range
From: Naveen N. Rao @ 2021-10-04 18:03 UTC (permalink / raw)
To: Alexei Starovoitov, Christophe Leroy, Daniel Borkmann,
Johan Almbladh, Michael Ellerman, Nicholas Piggin
Cc: bpf, linuxppc-dev
In-Reply-To: <6745a836-1991-24d0-f02a-437f06052c63@csgroup.eu>
Hi Christophe,
Thanks for the reviews.
Christophe Leroy wrote:
>
>
> Le 01/10/2021 à 23:14, Naveen N. Rao a écrit :
>> Add a helper to check if a given offset is within the branch range for a
>> powerpc conditional branch instruction, and update some sites to use the
>> new helper.
>>
>> Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
>> ---
>> arch/powerpc/include/asm/code-patching.h | 1 +
>> arch/powerpc/lib/code-patching.c | 7 ++++++-
>> arch/powerpc/net/bpf_jit.h | 7 +------
>> 3 files changed, 8 insertions(+), 7 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h
>> index a95f63788c6b14..4ba834599c4d4c 100644
>> --- a/arch/powerpc/include/asm/code-patching.h
>> +++ b/arch/powerpc/include/asm/code-patching.h
>> @@ -23,6 +23,7 @@
>> #define BRANCH_ABSOLUTE 0x2
>>
>> bool is_offset_in_branch_range(long offset);
>> +bool is_offset_in_cond_branch_range(long offset);
>> int create_branch(struct ppc_inst *instr, const u32 *addr,
>> unsigned long target, int flags);
>> int create_cond_branch(struct ppc_inst *instr, const u32 *addr,
>> diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
>> index f9a3019e37b43c..e2342b9a1ab9c9 100644
>> --- a/arch/powerpc/lib/code-patching.c
>> +++ b/arch/powerpc/lib/code-patching.c
>> @@ -228,6 +228,11 @@ bool is_offset_in_branch_range(long offset)
>> return (offset >= -0x2000000 && offset <= 0x1fffffc && !(offset & 0x3));
>> }
>>
>> +bool is_offset_in_cond_branch_range(long offset)
>> +{
>> + return offset >= -0x8000 && offset <= 0x7FFF && !(offset & 0x3);
>> +}
>
> Would be better without capital letters in numbers, in extenso 0x7fff
> instead of 0x7FFF
Ack.
- Naveen
^ permalink raw reply
* Re: [PATCH 2/9] powerpc/bpf: Validate branch ranges
From: Naveen N. Rao @ 2021-10-04 18:11 UTC (permalink / raw)
To: Alexei Starovoitov, Christophe Leroy, Daniel Borkmann,
Johan Almbladh, Michael Ellerman, Nicholas Piggin
Cc: bpf, linuxppc-dev
In-Reply-To: <213cac08-b0d2-447f-8448-ab31cc7b1d47@csgroup.eu>
Christophe Leroy wrote:
>
>
> Le 01/10/2021 à 23:14, Naveen N. Rao a écrit :
>> Add checks to ensure that we never emit branch instructions with
>> truncated branch offsets.
>>
>> Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
>> Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
>> ---
>> arch/powerpc/net/bpf_jit.h | 26 ++++++++++++++++++++------
>> arch/powerpc/net/bpf_jit_comp.c | 6 +++++-
>> arch/powerpc/net/bpf_jit_comp32.c | 8 ++++++--
>> arch/powerpc/net/bpf_jit_comp64.c | 8 ++++++--
>> 4 files changed, 37 insertions(+), 11 deletions(-)
>>
>> diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
>> index 935ea95b66359e..7e9b978b768ed9 100644
>> --- a/arch/powerpc/net/bpf_jit.h
>> +++ b/arch/powerpc/net/bpf_jit.h
>> @@ -24,16 +24,30 @@
>> #define EMIT(instr) PLANT_INSTR(image, ctx->idx, instr)
>>
>> /* Long jump; (unconditional 'branch') */
>> -#define PPC_JMP(dest) EMIT(PPC_INST_BRANCH | \
>> - (((dest) - (ctx->idx * 4)) & 0x03fffffc))
>> +#define PPC_JMP(dest) \
>> + do { \
>> + long offset = (long)(dest) - (ctx->idx * 4); \
>> + if (!is_offset_in_branch_range(offset)) { \
>> + pr_err_ratelimited("Branch offset 0x%lx (@%u) out of range\n", offset, ctx->idx); \
>
> Does it really deserves a KERN_ERR ?
The intent is to ensure that we handle this when JIT'ing the BPF
instruction. One of the subsequent patches fixes the only scenario where
we can hit this today. In practice, we should never hit this and if we
do see this, then it is a bug with the JIT.
> Isn't that something that can trigger with a userland request ?
This can't be triggered by unprivileged BPF programs since those are
limited to 4096 BPF instructions. You need root privileges to load large
enough BPF programs that can trigger out of range branches.
- Naveen
^ permalink raw reply
* Re: [PATCH 3/9] powerpc/bpf: Remove unused SEEN_STACK
From: Naveen N. Rao @ 2021-10-04 18:11 UTC (permalink / raw)
To: Alexei Starovoitov, Christophe Leroy, Daniel Borkmann,
Johan Almbladh, Michael Ellerman, Nicholas Piggin
Cc: bpf, linuxppc-dev
In-Reply-To: <bdadfd21-7e39-5984-43b9-818f1660ccaf@csgroup.eu>
Christophe Leroy wrote:
>
>
> Le 01/10/2021 à 23:14, Naveen N. Rao a écrit :
>> From: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
>>
>> SEEN_STACK is unused on PowerPC. Remove it. Also, have
>> SEEN_TAILCALL use 0x40000000.
>
> Why change SEEN_TAILCALL ? Would it be a problem to leave it as is ?
>
>>
>> Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
>> Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
I prefer the bit usage to be contiguous. Changing SEEN_TAILCALL isn't a
problem either.
- Naveen
^ permalink raw reply
* Re: [PATCH 6/9] powerpc/bpf: Fix BPF_SUB when imm == 0x80000000
From: Naveen N. Rao @ 2021-10-04 18:18 UTC (permalink / raw)
To: Alexei Starovoitov, Christophe Leroy, Daniel Borkmann,
Johan Almbladh, Michael Ellerman, Nicholas Piggin
Cc: bpf, linuxppc-dev
In-Reply-To: <43626c62-9a3a-bbba-8cbc-11efb0468b4b@csgroup.eu>
Christophe Leroy wrote:
>
>
> Le 01/10/2021 à 23:14, Naveen N. Rao a écrit :
>> We aren't handling subtraction involving an immediate value of
>> 0x80000000 properly. Fix the same.
>>
>> Fixes: 156d0e290e969c ("powerpc/ebpf/jit: Implement JIT compiler for extended BPF")
>> Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
>> ---
>> arch/powerpc/net/bpf_jit_comp64.c | 16 ++++++++--------
>> 1 file changed, 8 insertions(+), 8 deletions(-)
>>
>> diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
>> index ffb7a2877a8469..4641a50e82d50d 100644
>> --- a/arch/powerpc/net/bpf_jit_comp64.c
>> +++ b/arch/powerpc/net/bpf_jit_comp64.c
>> @@ -333,15 +333,15 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *
>> case BPF_ALU | BPF_SUB | BPF_K: /* (u32) dst -= (u32) imm */
>> case BPF_ALU64 | BPF_ADD | BPF_K: /* dst += imm */
>> case BPF_ALU64 | BPF_SUB | BPF_K: /* dst -= imm */
>> - if (BPF_OP(code) == BPF_SUB)
>> - imm = -imm;
>> - if (imm) {
>> - if (imm >= -32768 && imm < 32768)
>> - EMIT(PPC_RAW_ADDI(dst_reg, dst_reg, IMM_L(imm)));
>> - else {
>> - PPC_LI32(b2p[TMP_REG_1], imm);
>> + if (imm > -32768 && imm < 32768) {
>> + EMIT(PPC_RAW_ADDI(dst_reg, dst_reg,
>> + BPF_OP(code) == BPF_SUB ? IMM_L(-imm) : IMM_L(imm)));
>> + } else {
>> + PPC_LI32(b2p[TMP_REG_1], imm);
>> + if (BPF_OP(code) == BPF_SUB)
>> + EMIT(PPC_RAW_SUB(dst_reg, dst_reg, b2p[TMP_REG_1]));
>> + else
>> EMIT(PPC_RAW_ADD(dst_reg, dst_reg, b2p[TMP_REG_1]));
>> - }
>> }
>> goto bpf_alu32_trunc;
>
> There is now so few code common to both BPF_ADD and BPF_SUB that you
> should make them different cases.
>
> While at it, why not also use ADDIS if imm is 32 bits ? That would be an
> ADDIS/ADDI instead of LIS/ORI/ADD
Sure. I wanted to limit the change for this fix. We can do a separate
patch to optimize code generation for BPF_ADD.
- Naveen
^ permalink raw reply
* Re: [PATCH 0/9] powerpc/bpf: Various fixes
From: Naveen N. Rao @ 2021-10-04 18:19 UTC (permalink / raw)
To: Johan Almbladh
Cc: bpf, linuxppc-dev, Alexei Starovoitov, Daniel Borkmann,
Nicholas Piggin
In-Reply-To: <CAM1=_QSrOZBk+_h224cdxrZw7djqUqO9jytYNV--9V-KTJmt9Q@mail.gmail.com>
Hi Johan,
Johan Almbladh wrote:
> On Fri, Oct 1, 2021 at 11:15 PM Naveen N. Rao
> <naveen.n.rao@linux.vnet.ibm.com> wrote:
>>
>> Various fixes to the eBPF JIT for powerpc, thanks to some new tests
>> added by Johan. This series fixes all failures in test_bpf on powerpc64.
>> There are still some failures on powerpc32 to be looked into.
>
> Great work! I have tested it on powerpc64 in QEMU, which is the same
> setup that previously triggered an illegal instruction, and all tests
> pass now. On powerpc32 there are still some issues left as you say.
Thanks for the review, and the test!
- Naveen
^ permalink raw reply
* Re: [PATCH 4/9] powerpc/bpf: Handle large branch ranges with BPF_EXIT
From: Naveen N. Rao @ 2021-10-04 18:24 UTC (permalink / raw)
To: Alexei Starovoitov, Christophe Leroy, Daniel Borkmann,
Johan Almbladh, Michael Ellerman, Nicholas Piggin
Cc: bpf, linuxppc-dev
In-Reply-To: <e37766fd-8c52-6961-39a8-2de44a769204@csgroup.eu>
Christophe Leroy wrote:
>
>
> Le 01/10/2021 à 23:14, Naveen N. Rao a écrit :
>> In some scenarios, it is possible that the program epilogue is outside
>> the branch range for a BPF_EXIT instruction. Instead of rejecting such
>> programs, emit an indirect branch. We track the size of the bpf program
>> emitted after the initial run and do a second pass since BPF_EXIT can
>> end up emitting different number of instructions depending on the
>> program size.
>>
>> Suggested-by: Jordan Niethe <jniethe5@gmail.com>
>> Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
>> ---
>> arch/powerpc/net/bpf_jit.h | 3 +++
>> arch/powerpc/net/bpf_jit_comp.c | 22 +++++++++++++++++++++-
>> arch/powerpc/net/bpf_jit_comp32.c | 2 +-
>> arch/powerpc/net/bpf_jit_comp64.c | 2 +-
>> 4 files changed, 26 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
>> index 89bd744c2bffd4..4023de1698b9f5 100644
>> --- a/arch/powerpc/net/bpf_jit.h
>> +++ b/arch/powerpc/net/bpf_jit.h
>> @@ -126,6 +126,7 @@
>>
>> #define SEEN_FUNC 0x20000000 /* might call external helpers */
>> #define SEEN_TAILCALL 0x40000000 /* uses tail calls */
>> +#define SEEN_BIG_PROG 0x80000000 /* large prog, >32MB */
>>
>> #define SEEN_VREG_MASK 0x1ff80000 /* Volatile registers r3-r12 */
>> #define SEEN_NVREG_MASK 0x0003ffff /* Non volatile registers r14-r31 */
>> @@ -179,6 +180,8 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *
>> void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx);
>> void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx);
>> void bpf_jit_realloc_regs(struct codegen_context *ctx);
>> +int bpf_jit_emit_exit_insn(u32 *image, struct codegen_context *ctx,
>> + int tmp_reg, unsigned long exit_addr);
>>
>> #endif
>>
>> diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
>> index fcbf7a917c566e..3204872fbf2738 100644
>> --- a/arch/powerpc/net/bpf_jit_comp.c
>> +++ b/arch/powerpc/net/bpf_jit_comp.c
>> @@ -72,6 +72,21 @@ static int bpf_jit_fixup_subprog_calls(struct bpf_prog *fp, u32 *image,
>> return 0;
>> }
>>
>> +int bpf_jit_emit_exit_insn(u32 *image, struct codegen_context *ctx,
>> + int tmp_reg, unsigned long exit_addr)
>> +{
>> + if (!(ctx->seen & SEEN_BIG_PROG) && is_offset_in_branch_range(exit_addr)) {
>> + PPC_JMP(exit_addr);
>> + } else {
>> + ctx->seen |= SEEN_BIG_PROG;
>> + PPC_FUNC_ADDR(tmp_reg, (unsigned long)image + exit_addr);
>> + EMIT(PPC_RAW_MTCTR(tmp_reg));
>> + EMIT(PPC_RAW_BCTR());
>> + }
>> +
>> + return 0;
>> +}
>> +
>> struct powerpc64_jit_data {
>> struct bpf_binary_header *header;
>> u32 *addrs;
>> @@ -155,12 +170,17 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
>> goto out_addrs;
>> }
>>
>> + if (!is_offset_in_branch_range((long)cgctx.idx * 4))
>> + cgctx.seen |= SEEN_BIG_PROG;
>> +
>> /*
>> * If we have seen a tail call, we need a second pass.
>> * This is because bpf_jit_emit_common_epilogue() is called
>> * from bpf_jit_emit_tail_call() with a not yet stable ctx->seen.
>> + * We also need a second pass if we ended up with too large
>> + * a program so as to fix branches.
>> */
>> - if (cgctx.seen & SEEN_TAILCALL) {
>> + if (cgctx.seen & (SEEN_TAILCALL | SEEN_BIG_PROG)) {
>> cgctx.idx = 0;
>> if (bpf_jit_build_body(fp, 0, &cgctx, addrs, false)) {
>> fp = org_fp;
>> diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c
>> index a74d52204f8da2..d2a67574a23066 100644
>> --- a/arch/powerpc/net/bpf_jit_comp32.c
>> +++ b/arch/powerpc/net/bpf_jit_comp32.c
>> @@ -852,7 +852,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *
>> * we'll just fall through to the epilogue.
>> */
>> if (i != flen - 1)
>> - PPC_JMP(exit_addr);
>> + bpf_jit_emit_exit_insn(image, ctx, tmp_reg, exit_addr);
>
> On ppc32, if you use tmp_reg you must flag it. But I think you could use
> r0 instead.
Indeed. Can we drop tracking of the temp registers and using them while
remapping registers? Are you seeing significant benefits with re-use of
those temp registers?
- Naveen
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox