LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 07/13] KVM: PPC: Book3S 64: add hcall interrupt handler
From: Nicholas Piggin @ 2021-02-19  6:35 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Nicholas Piggin, Fabiano Rosas
In-Reply-To: <20210219063542.1425130-1-npiggin@gmail.com>

Add a separate hcall entry point. This can be used to deal with the
different calling convention.

Reviewed-by: Fabiano Rosas <farosas@linux.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/exceptions-64s.S | 4 ++--
 arch/powerpc/kvm/book3s_64_entry.S   | 6 +++++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 96f22c582213..a61a45704925 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -2023,13 +2023,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	 * Requires __LOAD_FAR_HANDLER beause kvmppc_interrupt lives
 	 * outside the head section.
 	 */
-	__LOAD_FAR_HANDLER(r10, kvmppc_interrupt)
+	__LOAD_FAR_HANDLER(r10, kvmppc_hcall)
 	mtctr   r10
 	ld	r10,PACA_EXGEN+EX_R10(r13)
 	bctr
 #else
 	ld	r10,PACA_EXGEN+EX_R10(r13)
-	b       kvmppc_interrupt
+	b       kvmppc_hcall
 #endif
 #endif
 
diff --git a/arch/powerpc/kvm/book3s_64_entry.S b/arch/powerpc/kvm/book3s_64_entry.S
index 820d103e5f50..53addbbe7b1a 100644
--- a/arch/powerpc/kvm/book3s_64_entry.S
+++ b/arch/powerpc/kvm/book3s_64_entry.S
@@ -7,9 +7,13 @@
 #include <asm/reg.h>
 
 /*
- * This is branched to from interrupt handlers in exception-64s.S which set
+ * These are branched to from interrupt handlers in exception-64s.S which set
  * IKVM_REAL or IKVM_VIRT, if HSTATE_IN_GUEST was found to be non-zero.
  */
+.global	kvmppc_hcall
+.balign IFETCH_ALIGN_BYTES
+kvmppc_hcall:
+
 .global	kvmppc_interrupt
 .balign IFETCH_ALIGN_BYTES
 kvmppc_interrupt:
-- 
2.23.0


^ permalink raw reply related

* [PATCH 06/13] KVM: PPC: Book3S 64: Move GUEST_MODE_SKIP test into KVM
From: Nicholas Piggin @ 2021-02-19  6:35 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Nicholas Piggin
In-Reply-To: <20210219063542.1425130-1-npiggin@gmail.com>

Move the GUEST_MODE_SKIP logic into KVM code. This is quite a KVM
internal detail that has no real need to be in common handlers.

Also add a comment explaining why this thing exists.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/exceptions-64s.S | 60 --------------------------
 arch/powerpc/kvm/book3s_64_entry.S   | 64 ++++++++++++++++++++++++----
 2 files changed, 56 insertions(+), 68 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index a1640d6ea65d..96f22c582213 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -133,7 +133,6 @@ name:
 #define IBRANCH_TO_COMMON	.L_IBRANCH_TO_COMMON_\name\() /* ENTRY branch to common */
 #define IREALMODE_COMMON	.L_IREALMODE_COMMON_\name\() /* Common runs in realmode */
 #define IMASK		.L_IMASK_\name\()	/* IRQ soft-mask bit */
-#define IKVM_SKIP	.L_IKVM_SKIP_\name\()	/* Generate KVM skip handler */
 #define IKVM_REAL	.L_IKVM_REAL_\name\()	/* Real entry tests KVM */
 #define __IKVM_REAL(name)	.L_IKVM_REAL_ ## name
 #define IKVM_VIRT	.L_IKVM_VIRT_\name\()	/* Virt entry tests KVM */
@@ -191,9 +190,6 @@ do_define_int n
 	.ifndef IMASK
 		IMASK=0
 	.endif
-	.ifndef IKVM_SKIP
-		IKVM_SKIP=0
-	.endif
 	.ifndef IKVM_REAL
 		IKVM_REAL=0
 	.endif
@@ -254,15 +250,10 @@ do_define_int n
 	.balign IFETCH_ALIGN_BYTES
 \name\()_kvm:
 
-	.if IKVM_SKIP
-	cmpwi	r10,KVM_GUEST_MODE_SKIP
-	beq	89f
-	.else
 BEGIN_FTR_SECTION
 	ld	r10,IAREA+EX_CFAR(r13)
 	std	r10,HSTATE_CFAR(r13)
 END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
-	.endif
 
 	ld	r10,IAREA+EX_CTR(r13)
 	mtctr	r10
@@ -289,27 +280,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	ori	r12,r12,(IVEC)
 	.endif
 	b	kvmppc_interrupt
-
-	.if IKVM_SKIP
-89:	mtocrf	0x80,r9
-	ld	r10,IAREA+EX_CTR(r13)
-	mtctr	r10
-	ld	r9,IAREA+EX_R9(r13)
-	ld	r10,IAREA+EX_R10(r13)
-	ld	r11,IAREA+EX_R11(r13)
-	ld	r12,IAREA+EX_R12(r13)
-	.if IHSRR_IF_HVMODE
-	BEGIN_FTR_SECTION
-	b	kvmppc_skip_Hinterrupt
-	FTR_SECTION_ELSE
-	b	kvmppc_skip_interrupt
-	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
-	.elseif IHSRR
-	b	kvmppc_skip_Hinterrupt
-	.else
-	b	kvmppc_skip_interrupt
-	.endif
-	.endif
 .endm
 
 #else
@@ -1128,7 +1098,6 @@ INT_DEFINE_BEGIN(machine_check)
 	ISET_RI=0
 	IDAR=1
 	IDSISR=1
-	IKVM_SKIP=1
 	IKVM_REAL=1
 INT_DEFINE_END(machine_check)
 
@@ -1419,7 +1388,6 @@ INT_DEFINE_BEGIN(data_access)
 	IVEC=0x300
 	IDAR=1
 	IDSISR=1
-	IKVM_SKIP=1
 	IKVM_REAL=1
 INT_DEFINE_END(data_access)
 
@@ -1465,7 +1433,6 @@ INT_DEFINE_BEGIN(data_access_slb)
 	IVEC=0x380
 	IRECONCILE=0
 	IDAR=1
-	IKVM_SKIP=1
 	IKVM_REAL=1
 INT_DEFINE_END(data_access_slb)
 
@@ -2111,7 +2078,6 @@ INT_DEFINE_BEGIN(h_data_storage)
 	IHSRR=1
 	IDAR=1
 	IDSISR=1
-	IKVM_SKIP=1
 	IKVM_REAL=1
 	IKVM_VIRT=1
 INT_DEFINE_END(h_data_storage)
@@ -3088,32 +3054,6 @@ EXPORT_SYMBOL(do_uaccess_flush)
 MASKED_INTERRUPT
 MASKED_INTERRUPT hsrr=1
 
-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-kvmppc_skip_interrupt:
-	/*
-	 * Here all GPRs are unchanged from when the interrupt happened
-	 * except for r13, which is saved in SPRG_SCRATCH0.
-	 */
-	mfspr	r13, SPRN_SRR0
-	addi	r13, r13, 4
-	mtspr	SPRN_SRR0, r13
-	GET_SCRATCH0(r13)
-	RFI_TO_KERNEL
-	b	.
-
-kvmppc_skip_Hinterrupt:
-	/*
-	 * Here all GPRs are unchanged from when the interrupt happened
-	 * except for r13, which is saved in SPRG_SCRATCH0.
-	 */
-	mfspr	r13, SPRN_HSRR0
-	addi	r13, r13, 4
-	mtspr	SPRN_HSRR0, r13
-	GET_SCRATCH0(r13)
-	HRFI_TO_KERNEL
-	b	.
-#endif
-
 	/*
 	 * Relocation-on interrupts: A subset of the interrupts can be delivered
 	 * with IR=1/DR=1, if AIL==2 and MSR.HV won't be changed by delivering
diff --git a/arch/powerpc/kvm/book3s_64_entry.S b/arch/powerpc/kvm/book3s_64_entry.S
index 147ebf1c3c1f..820d103e5f50 100644
--- a/arch/powerpc/kvm/book3s_64_entry.S
+++ b/arch/powerpc/kvm/book3s_64_entry.S
@@ -1,9 +1,10 @@
+#include <asm/asm-offsets.h>
 #include <asm/cache.h>
-#include <asm/ppc_asm.h>
+#include <asm/exception-64s.h>
 #include <asm/kvm_asm.h>
-#include <asm/reg.h>
-#include <asm/asm-offsets.h>
 #include <asm/kvm_book3s_asm.h>
+#include <asm/ppc_asm.h>
+#include <asm/reg.h>
 
 /*
  * This is branched to from interrupt handlers in exception-64s.S which set
@@ -19,17 +20,64 @@ kvmppc_interrupt:
 	 * guest R12 saved in shadow VCPU SCRATCH0
 	 * guest R13 saved in SPRN_SCRATCH0
 	 */
+	std	r9,HSTATE_SCRATCH2(r13)
+	lbz	r9,HSTATE_IN_GUEST(r13)
+	cmpwi	r9,KVM_GUEST_MODE_SKIP
+	beq-	.Lmaybe_skip
+.Lno_skip:
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-	std	r9, HSTATE_SCRATCH2(r13)
-	lbz	r9, HSTATE_IN_GUEST(r13)
-	cmpwi	r9, KVM_GUEST_MODE_HOST_HV
+	cmpwi	r9,KVM_GUEST_MODE_HOST_HV
 	beq	kvmppc_bad_host_intr
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
-	cmpwi	r9, KVM_GUEST_MODE_GUEST
-	ld	r9, HSTATE_SCRATCH2(r13)
+	cmpwi	r9,KVM_GUEST_MODE_GUEST
+	ld	r9,HSTATE_SCRATCH2(r13)
 	beq	kvmppc_interrupt_pr
 #endif
 	b	kvmppc_interrupt_hv
 #else
 	b	kvmppc_interrupt_pr
 #endif
+
+/*
+ * KVM uses a trick where it is running in MSR[HV]=1 mode in real-mode with the
+ * guest MMU context loaded, and it sets KVM_GUEST_MODE_SKIP and enables
+ * MSR[DR]=1 while leaving MSR[IR]=0, so it continues to fetch HV instructions
+ * but loads and stores will access the guest context. This is used to load
+ * the faulting instruction without walking page tables.
+ *
+ * However the guest context may not be able to translate, or it may cause a
+ * machine check or other issue, which will result in a fault in the host
+ * (even with KVM-HV).
+ *
+ * These faults are caught here and if the fault was (or was likely) due to
+ * that load, then we just return with the PC advanced +4 and skip the load,
+ * which then goes via the slow path.
+ */
+.Lmaybe_skip:
+	cmpwi	r12,BOOK3S_INTERRUPT_MACHINE_CHECK
+	beq	1f
+	cmpwi	r12,BOOK3S_INTERRUPT_DATA_STORAGE
+	beq	1f
+	cmpwi	r12,BOOK3S_INTERRUPT_DATA_SEGMENT
+	beq	1f
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	cmpwi	r12,BOOK3S_INTERRUPT_H_DATA_STORAGE | 0x2
+	beq	2f
+#endif
+	b	.Lno_skip
+1:	mfspr	r9,SPRN_SRR0
+	addi	r9,r9,4
+	mtspr	SPRN_SRR0,r9
+	ld	r12,HSTATE_SCRATCH0(r13)
+	ld	r9,HSTATE_SCRATCH2(r13)
+	GET_SCRATCH0(r13)
+	RFI_TO_KERNEL
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+2:	mfspr	r9,SPRN_HSRR0
+	addi	r9,r9,4
+	mtspr	SPRN_HSRR0,r9
+	ld	r12,HSTATE_SCRATCH0(r13)
+	ld	r9,HSTATE_SCRATCH2(r13)
+	GET_SCRATCH0(r13)
+	HRFI_TO_KERNEL
+#endif
-- 
2.23.0


^ permalink raw reply related

* [PATCH 05/13] KVM: PPC: Book3S 64: move KVM interrupt entry to a common entry point
From: Nicholas Piggin @ 2021-02-19  6:35 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Nicholas Piggin, Fabiano Rosas
In-Reply-To: <20210219063542.1425130-1-npiggin@gmail.com>

Rather than bifurcate the call depending on whether or not HV is
possible, and have the HV entry test for PR, just make a single
common point which does the demultiplexing. This makes it simpler
to add another type of exit handler.

Reviewed-by: Fabiano Rosas <farosas@linux.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/exceptions-64s.S    |  8 +-----
 arch/powerpc/kvm/Makefile               |  3 +++
 arch/powerpc/kvm/book3s_64_entry.S      | 35 +++++++++++++++++++++++++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 11 ++------
 4 files changed, 41 insertions(+), 16 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_64_entry.S

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 5bc689a546ae..a1640d6ea65d 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -212,7 +212,6 @@ do_define_int n
 .endm
 
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 /*
  * All interrupts which set HSRR registers, as well as SRESET and MCE and
  * syscall when invoked with "sc 1" switch to MSR[HV]=1 (HVMODE) to be taken,
@@ -242,13 +241,8 @@ do_define_int n
 
 /*
  * If an interrupt is taken while a guest is running, it is immediately routed
- * to KVM to handle. If both HV and PR KVM arepossible, KVM interrupts go first
- * to kvmppc_interrupt_hv, which handles the PR guest case.
+ * to KVM to handle.
  */
-#define kvmppc_interrupt kvmppc_interrupt_hv
-#else
-#define kvmppc_interrupt kvmppc_interrupt_pr
-#endif
 
 .macro KVMTEST name
 	lbz	r10,HSTATE_IN_GUEST(r13)
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 2bfeaa13befb..cdd119028f64 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -59,6 +59,9 @@ kvm-pr-y := \
 kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
 	tm.o
 
+kvm-book3s_64-builtin-objs-y += \
+	book3s_64_entry.o
+
 ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
 kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
 	book3s_rmhandlers.o
diff --git a/arch/powerpc/kvm/book3s_64_entry.S b/arch/powerpc/kvm/book3s_64_entry.S
new file mode 100644
index 000000000000..147ebf1c3c1f
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_entry.S
@@ -0,0 +1,35 @@
+#include <asm/cache.h>
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/asm-offsets.h>
+#include <asm/kvm_book3s_asm.h>
+
+/*
+ * This is branched to from interrupt handlers in exception-64s.S which set
+ * IKVM_REAL or IKVM_VIRT, if HSTATE_IN_GUEST was found to be non-zero.
+ */
+.global	kvmppc_interrupt
+.balign IFETCH_ALIGN_BYTES
+kvmppc_interrupt:
+	/*
+	 * Register contents:
+	 * R12		= (guest CR << 32) | interrupt vector
+	 * R13		= PACA
+	 * guest R12 saved in shadow VCPU SCRATCH0
+	 * guest R13 saved in SPRN_SCRATCH0
+	 */
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	std	r9, HSTATE_SCRATCH2(r13)
+	lbz	r9, HSTATE_IN_GUEST(r13)
+	cmpwi	r9, KVM_GUEST_MODE_HOST_HV
+	beq	kvmppc_bad_host_intr
+#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
+	cmpwi	r9, KVM_GUEST_MODE_GUEST
+	ld	r9, HSTATE_SCRATCH2(r13)
+	beq	kvmppc_interrupt_pr
+#endif
+	b	kvmppc_interrupt_hv
+#else
+	b	kvmppc_interrupt_pr
+#endif
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 3988873b044c..bbf786a0c0d6 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1255,16 +1255,8 @@ kvmppc_interrupt_hv:
 	 * R13		= PACA
 	 * guest R12 saved in shadow VCPU SCRATCH0
 	 * guest R13 saved in SPRN_SCRATCH0
+	 * guest R9 saved in HSTATE_SCRATCH2
 	 */
-	std	r9, HSTATE_SCRATCH2(r13)
-	lbz	r9, HSTATE_IN_GUEST(r13)
-	cmpwi	r9, KVM_GUEST_MODE_HOST_HV
-	beq	kvmppc_bad_host_intr
-#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
-	cmpwi	r9, KVM_GUEST_MODE_GUEST
-	ld	r9, HSTATE_SCRATCH2(r13)
-	beq	kvmppc_interrupt_pr
-#endif
 	/* We're now back in the host but in guest MMU context */
 	li	r9, KVM_GUEST_MODE_HOST_HV
 	stb	r9, HSTATE_IN_GUEST(r13)
@@ -3262,6 +3254,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_P9_TM_HV_ASSIST)
  * cfar is saved in HSTATE_CFAR(r13)
  * ppr is saved in HSTATE_PPR(r13)
  */
+.global kvmppc_bad_host_intr
 kvmppc_bad_host_intr:
 	/*
 	 * Switch to the emergency stack, but start half-way down in
-- 
2.23.0


^ permalink raw reply related

* [PATCH 04/13] KVM: PPC: Book3S 64: remove unused kvmppc_h_protect argument
From: Nicholas Piggin @ 2021-02-19  6:35 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Nicholas Piggin
In-Reply-To: <20210219063542.1425130-1-npiggin@gmail.com>

The va argument is not used in the function or set by its asm caller,
so remove it to be safe.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/kvm_ppc.h  | 3 +--
 arch/powerpc/kvm/book3s_hv_rm_mmu.c | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 0a056c64c317..45b7610773b1 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -765,8 +765,7 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
                      unsigned long pte_index, unsigned long avpn);
 long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu);
 long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
-                      unsigned long pte_index, unsigned long avpn,
-                      unsigned long va);
+                      unsigned long pte_index, unsigned long avpn);
 long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
                    unsigned long pte_index);
 long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags,
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index f87237927096..956522b6ea15 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -667,8 +667,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 }
 
 long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
-		      unsigned long pte_index, unsigned long avpn,
-		      unsigned long va)
+		      unsigned long pte_index, unsigned long avpn)
 {
 	struct kvm *kvm = vcpu->kvm;
 	__be64 *hpte;
-- 
2.23.0


^ permalink raw reply related

* [PATCH 03/13] KVM: PPC: Book3S HV: Ensure MSR[ME] is always set in guest MSR
From: Nicholas Piggin @ 2021-02-19  6:35 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Nicholas Piggin
In-Reply-To: <20210219063542.1425130-1-npiggin@gmail.com>

Rather than add the ME bit to the MSR when the guest is entered, make
it clear that the hypervisor does not allow the guest to clear the bit.

The ME addition is kept in the code for now, but a future patch will
warn if it's not present.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kvm/book3s_hv_builtin.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index dad118760a4e..ae8f291c5c48 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -661,6 +661,13 @@ static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
 
 void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr)
 {
+	/*
+	 * Guest must always run with machine check interrupt
+	 * enabled.
+	 */
+	if (!(msr & MSR_ME))
+		msr |= MSR_ME;
+
 	/*
 	 * Check for illegal transactional state bit combination
 	 * and if we find it, force the TS field to a safe state.
-- 
2.23.0


^ permalink raw reply related

* [PATCH 02/13] powerpc/64s: remove KVM SKIP test from instruction breakpoint handler
From: Nicholas Piggin @ 2021-02-19  6:35 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Nicholas Piggin
In-Reply-To: <20210219063542.1425130-1-npiggin@gmail.com>

The code being executed in KVM_GUEST_MODE_SKIP is hypervisor code with
MSR[IR]=0, so the faults of concern are the d-side ones caused by access
to guest context by the hypervisor.

Instruction breakpoint interrupts are not a concern here. It's unlikely
any good would come of causing breaks in this code, but skipping the
instruction that caused it won't help matters (e.g., skip the mtmsr that
sets MSR[DR]=0 or clears KVM_GUEST_MODE_SKIP).

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/exceptions-64s.S | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 5d0ad3b38e90..5bc689a546ae 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -2597,7 +2597,6 @@ EXC_VIRT_NONE(0x5200, 0x100)
 INT_DEFINE_BEGIN(instruction_breakpoint)
 	IVEC=0x1300
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
-	IKVM_SKIP=1
 	IKVM_REAL=1
 #endif
 INT_DEFINE_END(instruction_breakpoint)
-- 
2.23.0

^ permalink raw reply related

* [PATCH 01/13] powerpc/64s: Remove KVM handler support from CBE_RAS interrupts
From: Nicholas Piggin @ 2021-02-19  6:35 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Nicholas Piggin
In-Reply-To: <20210219063542.1425130-1-npiggin@gmail.com>

Cell does not support KVM.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/exceptions-64s.S | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 39cbea495154..5d0ad3b38e90 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -2574,8 +2574,6 @@ EXC_VIRT_NONE(0x5100, 0x100)
 INT_DEFINE_BEGIN(cbe_system_error)
 	IVEC=0x1200
 	IHSRR=1
-	IKVM_SKIP=1
-	IKVM_REAL=1
 INT_DEFINE_END(cbe_system_error)
 
 EXC_REAL_BEGIN(cbe_system_error, 0x1200, 0x100)
@@ -2745,8 +2743,6 @@ EXC_COMMON_BEGIN(denorm_exception_common)
 INT_DEFINE_BEGIN(cbe_maintenance)
 	IVEC=0x1600
 	IHSRR=1
-	IKVM_SKIP=1
-	IKVM_REAL=1
 INT_DEFINE_END(cbe_maintenance)
 
 EXC_REAL_BEGIN(cbe_maintenance, 0x1600, 0x100)
@@ -2798,8 +2794,6 @@ EXC_COMMON_BEGIN(altivec_assist_common)
 INT_DEFINE_BEGIN(cbe_thermal)
 	IVEC=0x1800
 	IHSRR=1
-	IKVM_SKIP=1
-	IKVM_REAL=1
 INT_DEFINE_END(cbe_thermal)
 
 EXC_REAL_BEGIN(cbe_thermal, 0x1800, 0x100)
-- 
2.23.0


^ permalink raw reply related

* [PATCH 00/13] KVM: PPC: Book3S: C-ify the P9 entry/exit code
From: Nicholas Piggin @ 2021-02-19  6:35 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Nicholas Piggin

This has a lot more implemented, things tidied up, and more things split
out. It's also implemented on top of powerpc next and kvm next which
have a few prerequisite patches (mainly removing EXSLB).

I've got a bunch more things after this including implementing HPT
guests with radix host in the "new" path -- whether we ever actually
want to do that, or port the legacy path up to C, or just leave it to
maintenance mode, I was just testing the waters there and making sure I
wasn't doing something fundamentally incompatible with hash.

I won't post anything further than this for now because I think it's a
good start and gets the total asm code for KVM entry and exit down to
about 160 lines plus the shim for the legacy paths. So would like to
concentrate on getting this in before juggling things around too much
or adding new things.

Thanks,
Nick

Nicholas Piggin (13):
  powerpc/64s: Remove KVM handler support from CBE_RAS interrupts
  powerpc/64s: remove KVM SKIP test from instruction breakpoint handler
  KVM: PPC: Book3S HV: Ensure MSR[ME] is always set in guest MSR
  KVM: PPC: Book3S 64: remove unused kvmppc_h_protect argument
  KVM: PPC: Book3S 64: move KVM interrupt entry to a common entry point
  KVM: PPC: Book3S 64: Move GUEST_MODE_SKIP test into KVM
  KVM: PPC: Book3S 64: add hcall interrupt handler
  KVM: PPC: Book3S HV: Move hcall early register setup to KVM
  KVM: PPC: Book3S HV: Move interrupt early register setup to KVM
  KVM: PPC: Book3S HV: move bad_host_intr check to HV handler
  KVM: PPC: Book3S HV: Minimise hcall handler calling convention
    differences
  KVM: PPC: Book3S HV: Move radix MMU switching together in the P9 path
  KVM: PPC: Book3S HV: Implement the rest of the P9 entry/exit handling
    in C

 arch/powerpc/include/asm/asm-prototypes.h |   3 +-
 arch/powerpc/include/asm/exception-64s.h  |  13 +
 arch/powerpc/include/asm/kvm_asm.h        |   3 +-
 arch/powerpc/include/asm/kvm_book3s_64.h  |   2 +
 arch/powerpc/include/asm/kvm_ppc.h        |   5 +-
 arch/powerpc/kernel/exceptions-64s.S      | 257 +++----------------
 arch/powerpc/kernel/security.c            |   5 +-
 arch/powerpc/kvm/Makefile                 |   6 +
 arch/powerpc/kvm/book3s_64_entry.S        | 295 ++++++++++++++++++++++
 arch/powerpc/kvm/book3s_hv.c              |  69 +++--
 arch/powerpc/kvm/book3s_hv_builtin.c      |   7 +
 arch/powerpc/kvm/book3s_hv_interrupt.c    | 208 +++++++++++++++
 arch/powerpc/kvm/book3s_hv_rm_mmu.c       |   3 +-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 125 +--------
 arch/powerpc/kvm/book3s_segment.S         |   7 +
 arch/powerpc/kvm/book3s_xive.c            |  32 +++
 16 files changed, 670 insertions(+), 370 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_64_entry.S
 create mode 100644 arch/powerpc/kvm/book3s_hv_interrupt.c

-- 
2.23.0


^ permalink raw reply

* Re: [RFC PATCH 2/9] KVM: PPC: Book3S 64: Move GUEST_MODE_SKIP test into KVM
From: Daniel Axtens @ 2021-02-19  6:03 UTC (permalink / raw)
  To: Nicholas Piggin, kvm-ppc; +Cc: linuxppc-dev, Nicholas Piggin
In-Reply-To: <20210202030313.3509446-3-npiggin@gmail.com>

Hi Nick,

> +maybe_skip:
> +	cmpwi	r12,0x200
> +	beq	1f
> +	cmpwi	r12,0x300
> +	beq	1f
> +	cmpwi	r12,0x380
> +	beq	1f
> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> +	/* XXX: cbe stuff? instruction breakpoint? */
> +	cmpwi	r12,0xe02
> +	beq	2f
> +#endif
> +	b	no_skip
> +1:	mfspr	r9,SPRN_SRR0
> +	addi	r9,r9,4
> +	mtspr	SPRN_SRR0,r9
> +	ld	r12,HSTATE_SCRATCH0(r13)
> +	ld	r9,HSTATE_SCRATCH2(r13)
> +	GET_SCRATCH0(r13)
> +	RFI_TO_KERNEL
> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> +2:	mfspr	r9,SPRN_HSRR0
> +	addi	r9,r9,4
> +	mtspr	SPRN_HSRR0,r9
> +	ld	r12,HSTATE_SCRATCH0(r13)
> +	ld	r9,HSTATE_SCRATCH2(r13)
> +	GET_SCRATCH0(r13)
> +	HRFI_TO_KERNEL
> +#endif

If I understand correctly, label 1 is the kvmppc_skip_interrupt and
label 2 is the kvmppc_skip_Hinterrupt. Would it be easier to understand
if we used symbolic labels, or do you think the RFI_TO_KERNEL vs
HRFI_TO_KERNEL and other changes are sufficient?

Apart from that, I haven't checked the precise copy-paste to make sure
nothing has changed by accident, but I am able to follow the general
idea of the patch and am vigorously in favour of anything that
simplifies our exception/interrupt paths!

Kind regards,
Daniel

> -- 
> 2.23.0

^ permalink raw reply

* Re: [RFC PATCH 1/9] KVM: PPC: Book3S 64: move KVM interrupt entry to a common entry point
From: Daniel Axtens @ 2021-02-19  5:18 UTC (permalink / raw)
  To: Nicholas Piggin, kvm-ppc; +Cc: linuxppc-dev, Nicholas Piggin
In-Reply-To: <20210202030313.3509446-2-npiggin@gmail.com>

Hi Nick,

> +++ b/arch/powerpc/kvm/book3s_64_entry.S
> @@ -0,0 +1,34 @@
> +#include <asm/cache.h>
> +#include <asm/ppc_asm.h>
> +#include <asm/kvm_asm.h>
> +#include <asm/reg.h>
> +#include <asm/asm-offsets.h>
> +#include <asm/kvm_book3s_asm.h>
> +
> +/*
> + * We come here from the first-level interrupt handlers.
> + */
> +.global	kvmppc_interrupt
> +.balign IFETCH_ALIGN_BYTES
> +kvmppc_interrupt:
> +	/*
> +	 * Register contents:

Clearly r9 contains some data at this point, and I think it's guest r9
because of what you say later on in
book3s_hv_rmhandlers.S::kvmppc_interrupt_hv. Is that right? Should that
be documented in this comment as well?

> +	 * R12		= (guest CR << 32) | interrupt vector
> +	 * R13		= PACA
> +	 * guest R12 saved in shadow VCPU SCRATCH0
> +	 * guest R13 saved in SPRN_SCRATCH0
> +	 */
> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> +	std	r9, HSTATE_SCRATCH2(r13)
> +	lbz	r9, HSTATE_IN_GUEST(r13)
> +	cmpwi	r9, KVM_GUEST_MODE_HOST_HV
> +	beq	kvmppc_bad_host_intr
> +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
> +	cmpwi	r9, KVM_GUEST_MODE_GUEST
> +	ld	r9, HSTATE_SCRATCH2(r13)
> +	beq	kvmppc_interrupt_pr
> +#endif
> +	b	kvmppc_interrupt_hv
> +#else
> +	b	kvmppc_interrupt_pr
> +#endif

Apart from that I had a look and convinced myself that the code will
behave the same as before. On that basis:

Reviewed-by: Daniel Axtens <dja@axtens.net>

Kind regards,
Daniel

^ permalink raw reply

* Re: [PATCH v12 13/14] mm/vmalloc: Hugepage vmalloc mappings
From: Ding Tianhong @ 2021-02-19  3:45 UTC (permalink / raw)
  To: Nicholas Piggin, linux-mm, Andrew Morton
  Cc: linux-arch, linux-kernel, Christoph Hellwig, Jonathan Cameron,
	Rick Edgecombe, linuxppc-dev
In-Reply-To: <20210202110515.3575274-14-npiggin@gmail.com>

Hi Nicholas:

I ran into a problem with this patch, like this:

kva = vmalloc(3*1024k);

remap_vmalloc_range(xxx, kva, xxx)

It failed because the check on page_count(page) finds it to be zero and returns, which breaks the logic of some current modules,
because the new huge page is not a valid compound page.

I think some users are really not used to this change in vmalloc, where small pages are transparently replaced by hugepages
when the size is bigger than PMD_SIZE.

Can we think about providing a new static huge page interface to fix it? For example, use a new vmalloc_huge_xxx function to distinguish it from the current function,
so that the user could choose to use the transparent hugepage or the static hugepage for vmalloc.

Thanks
Ding


On 2021/2/2 19:05, Nicholas Piggin wrote:
> Support huge page vmalloc mappings. Config option HAVE_ARCH_HUGE_VMALLOC
> enables support on architectures that define HAVE_ARCH_HUGE_VMAP and
> supports PMD sized vmap mappings.
> 
> vmalloc will attempt to allocate PMD-sized pages if allocating PMD size
> or larger, and fall back to small pages if that was unsuccessful.
> 
> Architectures must ensure that any arch specific vmalloc allocations
> that require PAGE_SIZE mappings (e.g., module allocations vs strict
> module rwx) use the VM_NOHUGE flag to inhibit larger mappings.
> 
> This can result in more internal fragmentation and memory overhead for a
> given allocation, an option nohugevmalloc is added to disable at boot.
> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>  arch/Kconfig            |  11 ++
>  include/linux/vmalloc.h |  21 ++++
>  mm/page_alloc.c         |   5 +-
>  mm/vmalloc.c            | 215 +++++++++++++++++++++++++++++++---------
>  4 files changed, 205 insertions(+), 47 deletions(-)
> 
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 24862d15f3a3..eef170e0c9b8 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -724,6 +724,17 @@ config HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
>  config HAVE_ARCH_HUGE_VMAP
>  	bool
>  
> +#
> +#  Archs that select this would be capable of PMD-sized vmaps (i.e.,
> +#  arch_vmap_pmd_supported() returns true), and they must make no assumptions
> +#  that vmalloc memory is mapped with PAGE_SIZE ptes. The VM_NO_HUGE_VMAP flag
> +#  can be used to prohibit arch-specific allocations from using hugepages to
> +#  help with this (e.g., modules may require it).
> +#
> +config HAVE_ARCH_HUGE_VMALLOC
> +	depends on HAVE_ARCH_HUGE_VMAP
> +	bool
> +
>  config ARCH_WANT_HUGE_PMD_SHARE
>  	bool
>  
> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index 99ea72d547dc..93270adf5db5 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -25,6 +25,7 @@ struct notifier_block;		/* in notifier.h */
>  #define VM_NO_GUARD		0x00000040      /* don't add guard page */
>  #define VM_KASAN		0x00000080      /* has allocated kasan shadow memory */
>  #define VM_MAP_PUT_PAGES	0x00000100	/* put pages and free array in vfree */
> +#define VM_NO_HUGE_VMAP		0x00000200	/* force PAGE_SIZE pte mapping */
>  
>  /*
>   * VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC.
> @@ -59,6 +60,9 @@ struct vm_struct {
>  	unsigned long		size;
>  	unsigned long		flags;
>  	struct page		**pages;
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
> +	unsigned int		page_order;
> +#endif
>  	unsigned int		nr_pages;
>  	phys_addr_t		phys_addr;
>  	const void		*caller;
> @@ -193,6 +197,22 @@ void free_vm_area(struct vm_struct *area);
>  extern struct vm_struct *remove_vm_area(const void *addr);
>  extern struct vm_struct *find_vm_area(const void *addr);
>  
> +static inline bool is_vm_area_hugepages(const void *addr)
> +{
> +	/*
> +	 * This may not 100% tell if the area is mapped with > PAGE_SIZE
> +	 * page table entries, if for some reason the architecture indicates
> +	 * larger sizes are available but decides not to use them, nothing
> +	 * prevents that. This only indicates the size of the physical page
> +	 * allocated in the vmalloc layer.
> +	 */
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
> +	return find_vm_area(addr)->page_order > 0;
> +#else
> +	return false;
> +#endif
> +}
> +
>  #ifdef CONFIG_MMU
>  int vmap_range(unsigned long addr, unsigned long end,
>  			phys_addr_t phys_addr, pgprot_t prot,
> @@ -210,6 +230,7 @@ static inline void set_vm_flush_reset_perms(void *addr)
>  	if (vm)
>  		vm->flags |= VM_FLUSH_RESET_PERMS;
>  }
> +
>  #else
>  static inline int
>  map_kernel_range_noflush(unsigned long start, unsigned long size,
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 519a60d5b6f7..1116ce45744b 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -72,6 +72,7 @@
>  #include <linux/padata.h>
>  #include <linux/khugepaged.h>
>  #include <linux/buffer_head.h>
> +#include <linux/vmalloc.h>
>  
>  #include <asm/sections.h>
>  #include <asm/tlbflush.h>
> @@ -8240,6 +8241,7 @@ void *__init alloc_large_system_hash(const char *tablename,
>  	void *table = NULL;
>  	gfp_t gfp_flags;
>  	bool virt;
> +	bool huge;
>  
>  	/* allow the kernel cmdline to have a say */
>  	if (!numentries) {
> @@ -8307,6 +8309,7 @@ void *__init alloc_large_system_hash(const char *tablename,
>  		} else if (get_order(size) >= MAX_ORDER || hashdist) {
>  			table = __vmalloc(size, gfp_flags);
>  			virt = true;
> +			huge = is_vm_area_hugepages(table);
>  		} else {
>  			/*
>  			 * If bucketsize is not a power-of-two, we may free
> @@ -8323,7 +8326,7 @@ void *__init alloc_large_system_hash(const char *tablename,
>  
>  	pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
>  		tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
> -		virt ? "vmalloc" : "linear");
> +		virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
>  
>  	if (_hash_shift)
>  		*_hash_shift = log2qty;
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 47ab4338cfff..e9a28de04182 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -42,6 +42,19 @@
>  #include "internal.h"
>  #include "pgalloc-track.h"
>  
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
> +static bool __ro_after_init vmap_allow_huge = true;
> +
> +static int __init set_nohugevmalloc(char *str)
> +{
> +	vmap_allow_huge = false;
> +	return 0;
> +}
> +early_param("nohugevmalloc", set_nohugevmalloc);
> +#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
> +static const bool vmap_allow_huge = false;
> +#endif	/* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
> +
>  bool is_vmalloc_addr(const void *x)
>  {
>  	unsigned long addr = (unsigned long)x;
> @@ -483,31 +496,12 @@ static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
>  	return 0;
>  }
>  
> -/**
> - * map_kernel_range_noflush - map kernel VM area with the specified pages
> - * @addr: start of the VM area to map
> - * @size: size of the VM area to map
> - * @prot: page protection flags to use
> - * @pages: pages to map
> - *
> - * Map PFN_UP(@size) pages at @addr.  The VM area @addr and @size specify should
> - * have been allocated using get_vm_area() and its friends.
> - *
> - * NOTE:
> - * This function does NOT do any cache flushing.  The caller is responsible for
> - * calling flush_cache_vmap() on to-be-mapped areas before calling this
> - * function.
> - *
> - * RETURNS:
> - * 0 on success, -errno on failure.
> - */
> -int map_kernel_range_noflush(unsigned long addr, unsigned long size,
> -			     pgprot_t prot, struct page **pages)
> +static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
> +		pgprot_t prot, struct page **pages)
>  {
>  	unsigned long start = addr;
> -	unsigned long end = addr + size;
> -	unsigned long next;
>  	pgd_t *pgd;
> +	unsigned long next;
>  	int err = 0;
>  	int nr = 0;
>  	pgtbl_mod_mask mask = 0;
> @@ -529,6 +523,66 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size,
>  	return 0;
>  }
>  
> +static int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
> +		pgprot_t prot, struct page **pages, unsigned int page_shift)
> +{
> +	unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
> +
> +	WARN_ON(page_shift < PAGE_SHIFT);
> +
> +	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
> +			page_shift == PAGE_SHIFT)
> +		return vmap_small_pages_range_noflush(addr, end, prot, pages);
> +
> +	for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
> +		int err;
> +
> +		err = vmap_range_noflush(addr, addr + (1UL << page_shift),
> +					__pa(page_address(pages[i])), prot,
> +					page_shift);
> +		if (err)
> +			return err;
> +
> +		addr += 1UL << page_shift;
> +	}
> +
> +	return 0;
> +}
> +
> +static int vmap_pages_range(unsigned long addr, unsigned long end,
> +		pgprot_t prot, struct page **pages, unsigned int page_shift)
> +{
> +	int err;
> +
> +	err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
> +	flush_cache_vmap(addr, end);
> +	return err;
> +}
> +
> +/**
> + * map_kernel_range_noflush - map kernel VM area with the specified pages
> + * @addr: start of the VM area to map
> + * @size: size of the VM area to map
> + * @prot: page protection flags to use
> + * @pages: pages to map
> + *
> + * Map PFN_UP(@size) pages at @addr.  The VM area @addr and @size specify should
> + * have been allocated using get_vm_area() and its friends.
> + *
> + * NOTE:
> + * This function does NOT do any cache flushing.  The caller is responsible for
> + * calling flush_cache_vmap() on to-be-mapped areas before calling this
> + * function.
> + *
> + * RETURNS:
> + * 0 on success, -errno on failure.
> + */
> +int map_kernel_range_noflush(unsigned long addr, unsigned long size,
> +			     pgprot_t prot, struct page **pages)
> +{
> +	return vmap_pages_range_noflush(addr, addr + size, prot, pages, PAGE_SHIFT);
> +}
> +
>  int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
>  		struct page **pages)
>  {
> @@ -2112,6 +2166,24 @@ EXPORT_SYMBOL(vm_map_ram);
>  
>  static struct vm_struct *vmlist __initdata;
>  
> +static inline unsigned int vm_area_page_order(struct vm_struct *vm)
> +{
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
> +	return vm->page_order;
> +#else
> +	return 0;
> +#endif
> +}
> +
> +static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
> +{
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
> +	vm->page_order = order;
> +#else
> +	BUG_ON(order != 0);
> +#endif
> +}
> +
>  /**
>   * vm_area_add_early - add vmap area early during boot
>   * @vm: vm_struct to add
> @@ -2422,6 +2494,7 @@ static inline void set_area_direct_map(const struct vm_struct *area,
>  {
>  	int i;
>  
> +	/* HUGE_VMALLOC passes small pages to set_direct_map */
>  	for (i = 0; i < area->nr_pages; i++)
>  		if (page_address(area->pages[i]))
>  			set_direct_map(area->pages[i]);
> @@ -2431,6 +2504,7 @@ static inline void set_area_direct_map(const struct vm_struct *area,
>  static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
>  {
>  	unsigned long start = ULONG_MAX, end = 0;
> +	unsigned int page_order = vm_area_page_order(area);
>  	int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
>  	int flush_dmap = 0;
>  	int i;
> @@ -2455,11 +2529,14 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
>  	 * map. Find the start and end range of the direct mappings to make sure
>  	 * the vm_unmap_aliases() flush includes the direct map.
>  	 */
> -	for (i = 0; i < area->nr_pages; i++) {
> +	for (i = 0; i < area->nr_pages; i += 1U << page_order) {
>  		unsigned long addr = (unsigned long)page_address(area->pages[i]);
>  		if (addr) {
> +			unsigned long page_size;
> +
> +			page_size = PAGE_SIZE << page_order;
>  			start = min(addr, start);
> -			end = max(addr + PAGE_SIZE, end);
> +			end = max(addr + page_size, end);
>  			flush_dmap = 1;
>  		}
>  	}
> @@ -2500,13 +2577,14 @@ static void __vunmap(const void *addr, int deallocate_pages)
>  	vm_remove_mappings(area, deallocate_pages);
>  
>  	if (deallocate_pages) {
> +		unsigned int page_order = vm_area_page_order(area);
>  		int i;
>  
> -		for (i = 0; i < area->nr_pages; i++) {
> +		for (i = 0; i < area->nr_pages; i += 1U << page_order) {
>  			struct page *page = area->pages[i];
>  
>  			BUG_ON(!page);
> -			__free_pages(page, 0);
> +			__free_pages(page, page_order);
>  		}
>  		atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
>  
> @@ -2697,15 +2775,19 @@ EXPORT_SYMBOL_GPL(vmap_pfn);
>  #endif /* CONFIG_VMAP_PFN */
>  
>  static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
> -				 pgprot_t prot, int node)
> +				 pgprot_t prot, unsigned int page_shift,
> +				 int node)
>  {
>  	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
> -	unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
> +	unsigned long addr = (unsigned long)area->addr;
> +	unsigned long size = get_vm_area_size(area);
>  	unsigned long array_size;
> -	unsigned int i;
> +	unsigned int nr_small_pages = size >> PAGE_SHIFT;
> +	unsigned int page_order;
>  	struct page **pages;
> +	unsigned int i;
>  
> -	array_size = (unsigned long)nr_pages * sizeof(struct page *);
> +	array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
>  	gfp_mask |= __GFP_NOWARN;
>  	if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
>  		gfp_mask |= __GFP_HIGHMEM;
> @@ -2724,30 +2806,37 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>  	}
>  
>  	area->pages = pages;
> -	area->nr_pages = nr_pages;
> +	area->nr_pages = nr_small_pages;
> +	set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
>  
> -	for (i = 0; i < area->nr_pages; i++) {
> -		struct page *page;
> +	page_order = vm_area_page_order(area);
>  
> -		if (node == NUMA_NO_NODE)
> -			page = alloc_page(gfp_mask);
> -		else
> -			page = alloc_pages_node(node, gfp_mask, 0);
> +	/*
> +	 * Careful, we allocate and map page_order pages, but tracking is done
> +	 * per PAGE_SIZE page so as to keep the vm_struct APIs independent of
> +	 * the physical/mapped size.
> +	 */
> +	for (i = 0; i < area->nr_pages; i += 1U << page_order) {
> +		struct page *page;
> +		int p;
>  
> +		page = alloc_pages_node(node, gfp_mask, page_order);
>  		if (unlikely(!page)) {
>  			/* Successfully allocated i pages, free them in __vfree() */
>  			area->nr_pages = i;
>  			atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
>  			goto fail;
>  		}
> -		area->pages[i] = page;
> +
> +		for (p = 0; p < (1U << page_order); p++)
> +			area->pages[i + p] = page + p;
> +
>  		if (gfpflags_allow_blocking(gfp_mask))
>  			cond_resched();
>  	}
>  	atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
>  
> -	if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area),
> -			prot, pages) < 0)
> +	if (vmap_pages_range(addr, addr + size, prot, pages, page_shift) < 0)
>  		goto fail;
>  
>  	return area->addr;
> @@ -2755,7 +2844,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>  fail:
>  	warn_alloc(gfp_mask, NULL,
>  			  "vmalloc: allocation failure, allocated %ld of %ld bytes",
> -			  (area->nr_pages*PAGE_SIZE), area->size);
> +			  (area->nr_pages*PAGE_SIZE), size);
>  	__vfree(area->addr);
>  	return NULL;
>  }
> @@ -2786,19 +2875,43 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
>  	struct vm_struct *area;
>  	void *addr;
>  	unsigned long real_size = size;
> +	unsigned long real_align = align;
> +	unsigned int shift = PAGE_SHIFT;
>  
> -	size = PAGE_ALIGN(size);
>  	if (!size || (size >> PAGE_SHIFT) > totalram_pages())
>  		goto fail;
>  
> -	area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED |
> +	if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP) &&
> +			arch_vmap_pmd_supported(prot)) {
> +		unsigned long size_per_node;
> +
> +		/*
> +		 * Try huge pages. Only try for PAGE_KERNEL allocations,
> +		 * others like modules don't yet expect huge pages in
> +		 * their allocations due to apply_to_page_range not
> +		 * supporting them.
> +		 */
> +
> +		size_per_node = size;
> +		if (node == NUMA_NO_NODE)
> +			size_per_node /= num_online_nodes();
> +		if (size_per_node >= PMD_SIZE) {
> +			shift = PMD_SHIFT;
> +			align = max(real_align, 1UL << shift);
> +			size = ALIGN(real_size, 1UL << shift);
> +		}
> +	}
> +
> +again:
> +	size = PAGE_ALIGN(size);
> +	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
>  				vm_flags, start, end, node, gfp_mask, caller);
>  	if (!area)
>  		goto fail;
>  
> -	addr = __vmalloc_area_node(area, gfp_mask, prot, node);
> +	addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
>  	if (!addr)
> -		return NULL;
> +		goto fail;
>  
>  	/*
>  	 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
> @@ -2812,8 +2925,18 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
>  	return addr;
>  
>  fail:
> -	warn_alloc(gfp_mask, NULL,
> +	if (shift > PAGE_SHIFT) {
> +		shift = PAGE_SHIFT;
> +		align = real_align;
> +		size = real_size;
> +		goto again;
> +	}
> +
> +	if (!area) {
> +		/* Warn for area allocation, page allocations already warn */
> +		warn_alloc(gfp_mask, NULL,
>  			  "vmalloc: allocation failure: %lu bytes", real_size);
> +	}
>  	return NULL;
>  }
>  
> 


^ permalink raw reply

* Re: [PATCH] ASoC: imx-hdmi: no need to set .owner when using module_platform_driver
From: Shengjiu Wang @ 2021-02-19  2:55 UTC (permalink / raw)
  To: Tian Tao
  Cc: alsa-devel, Timur Tabi, Xiubo Li, linuxppc-dev, s.hauer,
	Takashi Iwai, Jaroslav Kysela, Nicolin Chen, shawnguo,
	linux-arm-kernel
In-Reply-To: <1612756287-4601-1-git-send-email-tiantao6@hisilicon.com>

On Thu, Feb 11, 2021 at 5:21 PM Tian Tao <tiantao6@hisilicon.com> wrote:
>
> the module_platform_driver will call platform_driver_register.
> and It will set the .owner to THIS_MODULE
>
> Signed-off-by: Tian Tao <tiantao6@hisilicon.com>

Acked-by: Shengjiu Wang <shengjiu.wang@gmail.com>

^ permalink raw reply

* Re: [PATCH] of: error: 'const struct kimage' has no member named 'arch'
From: Lakshmi Ramasubramanian @ 2021-02-19  2:53 UTC (permalink / raw)
  To: Thiago Jung Bauermann
  Cc: sashal, robh, sfr, gregkh, linuxppc-dev, linux-kernel, Mimi Zohar,
	takahiro.akashi, devicetree, james.morse, catalin.marinas, joe,
	linux-integrity, will, linux-arm-kernel
In-Reply-To: <87k0r4yi4s.fsf@manicouagan.localdomain>

On 2/18/21 5:13 PM, Thiago Jung Bauermann wrote:
> 
> Lakshmi Ramasubramanian <nramas@linux.microsoft.com> writes:
> 
>> On 2/18/21 4:07 PM, Mimi Zohar wrote:
>>
>> Hi Mimi,
>>
>>> On Thu, 2021-02-18 at 14:33 -0800, Lakshmi Ramasubramanian wrote:
>>>> of_kexec_alloc_and_setup_fdt() defined in drivers/of/kexec.c builds
>>>> a new device tree object that includes architecture specific data
>>>> for kexec system call.  This should be defined only if the architecture
>>>> being built defines kexec architecture structure "struct kimage_arch".
>>>>
>>>> Define a new boolean config OF_KEXEC that is enabled if
>>>> CONFIG_KEXEC_FILE and CONFIG_OF_FLATTREE are enabled, and
>>>> the architecture is arm64 or powerpc64.  Build drivers/of/kexec.c
>>>> if CONFIG_OF_KEXEC is enabled.
>>>>
>>>> Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
>>>> Fixes: 33488dc4d61f ("of: Add a common kexec FDT setup function")
>>>> Reported-by: kernel test robot <lkp@intel.com>
>>>> ---
>>>>    drivers/of/Kconfig  | 6 ++++++
>>>>    drivers/of/Makefile | 7 +------
>>>>    2 files changed, 7 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig
>>>> index 18450437d5d5..f2e8fa54862a 100644
>>>> --- a/drivers/of/Kconfig
>>>> +++ b/drivers/of/Kconfig
>>>> @@ -100,4 +100,10 @@ config OF_DMA_DEFAULT_COHERENT
>>>>    	# arches should select this if DMA is coherent by default for OF devices
>>>>    	bool
>>>>    +config OF_KEXEC
>>>> +	bool
>>>> +	depends on KEXEC_FILE
>>>> +	depends on OF_FLATTREE
>>>> +	default y if ARM64 || PPC64
>>>> +
>>>>    endif # OF
>>>> diff --git a/drivers/of/Makefile b/drivers/of/Makefile
>>>> index c13b982084a3..287579dd1695 100644
>>>> --- a/drivers/of/Makefile
>>>> +++ b/drivers/of/Makefile
>>>> @@ -13,11 +13,6 @@ obj-$(CONFIG_OF_RESERVED_MEM) += of_reserved_mem.o
>>>>    obj-$(CONFIG_OF_RESOLVE)  += resolver.o
>>>>    obj-$(CONFIG_OF_OVERLAY) += overlay.o
>>>>    obj-$(CONFIG_OF_NUMA) += of_numa.o
>>>> -
>>>> -ifdef CONFIG_KEXEC_FILE
>>>> -ifdef CONFIG_OF_FLATTREE
>>>> -obj-y	+= kexec.o
>>>> -endif
>>>> -endif
>>>> +obj-$(CONFIG_OF_KEXEC) += kexec.o
>>>>      obj-$(CONFIG_OF_UNITTEST) += unittest-data/
>>> Is it possible to reuse CONFIG_HAVE_IMA_KEXEC here?
>>>
>>
>> For ppc64 CONFIG_HAVE_IMA_KEXEC is selected when CONFIG_KEXEC_FILE is enabled.
>> So I don't see a problem in reusing CONFIG_HAVE_IMA_KEXEC for ppc.
>>
>> But for arm64, CONFIG_HAVE_IMA_KEXEC is enabled in the final patch in the patch
>> set (the one for carrying forward IMA log across kexec for arm64). arm64 calls
>> of_kexec_alloc_and_setup_fdt() prior to enabling CONFIG_HAVE_IMA_KEXEC and hence
>> breaks the build for arm64.
> 
> One problem is that I believe that this patch won't placate the robot,
> because IIUC it generates config files at random and this change still
> allows hppa and s390 to enable CONFIG_OF_KEXEC.

I enabled CONFIG_OF_KEXEC for s390. With my patch applied, 
CONFIG_OF_KEXEC is removed. So I think the robot enabling this config 
would not be a problem.

> 
> Perhaps a new CONFIG_HAVE_KIMAGE_ARCH option? Not having that option
> would still allow building kexec.o, but would be used inside kexec.c to
> avoid accessing kimage.arch members.
> 

I think this is a good idea - a new CONFIG_HAVE_KIMAGE_ARCH, which will 
be selected by arm64 and ppc for now. I tried this, and it fixes the 
build issue.

Although, the name for the new config can be misleading since PARISC, 
for instance, also defines "struct kimage_arch". Perhaps, 
CONFIG_HAVE_ELF_KIMAGE_ARCH since of_kexec_alloc_and_setup_fdt() is 
accessing ELF specific fields in "struct kimage_arch"?

Rob/Mimi - please let us know which approach you think is better.

thanks,
  -lakshmi

^ permalink raw reply

* Re: [PATCH 5/6] powerpc/mm/64s/hash: Add real-mode change_memory_range() for hash LPAR
From: Daniel Axtens @ 2021-02-19  2:43 UTC (permalink / raw)
  To: Michael Ellerman, linuxppc-dev; +Cc: aneesh.kumar
In-Reply-To: <20210211135130.3474832-5-mpe@ellerman.id.au>

Michael Ellerman <mpe@ellerman.id.au> writes:

> When we enabled STRICT_KERNEL_RWX we received some reports of boot
> failures when using the Hash MMU and running under phyp. The crashes
> are intermittent, and often exhibit as a completely unresponsive
> system, or possibly an oops.
>
> One example, which was caught in xmon:
>
>   [   14.068327][    T1] devtmpfs: mounted
>   [   14.069302][    T1] Freeing unused kernel memory: 5568K
>   [   14.142060][  T347] BUG: Unable to handle kernel instruction fetch
>   [   14.142063][    T1] Run /sbin/init as init process
>   [   14.142074][  T347] Faulting instruction address: 0xc000000000004400
>   cpu 0x2: Vector: 400 (Instruction Access) at [c00000000c7475e0]
>       pc: c000000000004400: exc_virt_0x4400_instruction_access+0x0/0x80
>       lr: c0000000001862d4: update_rq_clock+0x44/0x110
>       sp: c00000000c747880
>      msr: 8000000040001031
>     current = 0xc00000000c60d380
>     paca    = 0xc00000001ec9de80   irqmask: 0x03   irq_happened: 0x01
>       pid   = 347, comm = kworker/2:1
>   ...
>   enter ? for help
>   [c00000000c747880] c0000000001862d4 update_rq_clock+0x44/0x110 (unreliable)
>   [c00000000c7478f0] c000000000198794 update_blocked_averages+0xb4/0x6d0
>   [c00000000c7479f0] c000000000198e40 update_nohz_stats+0x90/0xd0
>   [c00000000c747a20] c0000000001a13b4 _nohz_idle_balance+0x164/0x390
>   [c00000000c747b10] c0000000001a1af8 newidle_balance+0x478/0x610
>   [c00000000c747be0] c0000000001a1d48 pick_next_task_fair+0x58/0x480
>   [c00000000c747c40] c000000000eaab5c __schedule+0x12c/0x950
>   [c00000000c747cd0] c000000000eab3e8 schedule+0x68/0x120
>   [c00000000c747d00] c00000000016b730 worker_thread+0x130/0x640
>   [c00000000c747da0] c000000000174d50 kthread+0x1a0/0x1b0
>   [c00000000c747e10] c00000000000e0f0 ret_from_kernel_thread+0x5c/0x6c
>
> This shows that CPU 2, which was idle, woke up and then appears to
> randomly take an instruction fault on a completely valid area of
> kernel text.
>
> The cause turns out to be the call to hash__mark_rodata_ro(), late in
> boot. Due to the way we layout text and rodata, that function actually
> changes the permissions for all of text and rodata to read-only plus
> execute.
>
> To do the permission change we use a hypervisor call, H_PROTECT. On
> phyp that appears to be implemented by briefly removing the mapping of
> the kernel text, before putting it back with the updated permissions.
> If any other CPU is executing during that window, it will see spurious
> faults on the kernel text and/or data, leading to crashes.

Jordan asked why we saw this on phyp but not under KVM? We had a look at
book3s_hv_rm_mmu.c but the code is a bit too obtuse for me to reason
about!

Nick suggests that the KVM hypervisor is invalidating the HPTE, but
because we run guests in VPM mode, the hypervisor would catch the page
fault and not reflect it down to the guest. It looks like Linux-as-a-HV
will take HPTE_V_HVLOCK, and then because it's running in VPM mode, the
hypervisor will catch the fault and not pass it to the guest. But if
phyp runs with VPM mode off, the guest will see the fault before the
hypervisor. (we think this is what's going on anyway.)

We spent a while pondering if phyp is doing something buggy or not...
Looking at the PAPR definition of H_PROTECT, that claims the hypervisor
will do the 'architected “Modifying a Page Table Entry General Case”
sequence'. Section 5.10.1.2 of Book IIIS of the ISAv3 defines that, and the
non-atomic hardware sequence does indeed modify the PTE by going through
the invalid state. So it looks like if phyp is running without VPM mode
it's technically not buggy.

Hopefully I'll get to have a look at the rest of the patch shortly!

Kind regards,
Daniel

> To fix it we use stop machine to collect all other CPUs, and then have
> them drop into real mode (MMU off), while we change the mapping. That
> way they are unaffected by the mapping temporarily disappearing.
>
> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
> ---
>  arch/powerpc/mm/book3s64/hash_pgtable.c | 105 +++++++++++++++++++++++-
>  1 file changed, 104 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c
> index 3663d3cdffac..01de985df2c4 100644
> --- a/arch/powerpc/mm/book3s64/hash_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c
> @@ -8,6 +8,7 @@
>  #include <linux/sched.h>
>  #include <linux/mm_types.h>
>  #include <linux/mm.h>
> +#include <linux/stop_machine.h>
>  
>  #include <asm/sections.h>
>  #include <asm/mmu.h>
> @@ -400,6 +401,19 @@ EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage);
>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>  
>  #ifdef CONFIG_STRICT_KERNEL_RWX
> +
> +struct change_memory_parms {
> +	unsigned long start, end, newpp;
> +	unsigned int step, nr_cpus, master_cpu;
> +	atomic_t cpu_counter;
> +};
> +
> +// We'd rather this was on the stack but it has to be in the RMO
> +static struct change_memory_parms chmem_parms;
> +
> +// And therefore we need a lock to protect it from concurrent use
> +static DEFINE_MUTEX(chmem_lock);
> +
>  static void change_memory_range(unsigned long start, unsigned long end,
>  				unsigned int step, unsigned long newpp)
>  {
> @@ -414,6 +428,73 @@ static void change_memory_range(unsigned long start, unsigned long end,
>  							mmu_kernel_ssize);
>  }
>  
> +static int notrace chmem_secondary_loop(struct change_memory_parms *parms)
> +{
> +	unsigned long msr, tmp, flags;
> +	int *p;
> +
> +	p = &parms->cpu_counter.counter;
> +
> +	local_irq_save(flags);
> +	__hard_EE_RI_disable();
> +
> +	asm volatile (
> +	// Switch to real mode and leave interrupts off
> +	"mfmsr	%[msr]			;"
> +	"li	%[tmp], %[MSR_IR_DR]	;"
> +	"andc	%[tmp], %[msr], %[tmp]	;"
> +	"mtmsrd %[tmp]			;"
> +
> +	// Tell the master we are in real mode
> +	"1:				"
> +	"lwarx	%[tmp], 0, %[p]		;"
> +	"addic	%[tmp], %[tmp], -1	;"
> +	"stwcx.	%[tmp], 0, %[p]		;"
> +	"bne-	1b			;"
> +
> +	// Spin until the counter goes to zero
> +	"2:				;"
> +	"lwz	%[tmp], 0(%[p])		;"
> +	"cmpwi	%[tmp], 0		;"
> +	"bne-	2b			;"
> +
> +	// Switch back to virtual mode
> +	"mtmsrd %[msr]			;"
> +
> +	: // outputs
> +	  [msr] "=&r" (msr), [tmp] "=&b" (tmp), "+m" (*p)
> +	: // inputs
> +	  [p] "b" (p), [MSR_IR_DR] "i" (MSR_IR | MSR_DR)
> +	: // clobbers
> +	  "cc", "xer"
> +	);
> +
> +	local_irq_restore(flags);
> +
> +	return 0;
> +}
> +
> +static int change_memory_range_fn(void *data)
> +{
> +	struct change_memory_parms *parms = data;
> +
> +	if (parms->master_cpu != smp_processor_id())
> +		return chmem_secondary_loop(parms);
> +
> +	// Wait for all but one CPU (this one) to call-in
> +	while (atomic_read(&parms->cpu_counter) > 1)
> +		barrier();
> +
> +	change_memory_range(parms->start, parms->end, parms->step, parms->newpp);
> +
> +	mb();
> +
> +	// Signal the other CPUs that we're done
> +	atomic_dec(&parms->cpu_counter);
> +
> +	return 0;
> +}
> +
>  static bool hash__change_memory_range(unsigned long start, unsigned long end,
>  				      unsigned long newpp)
>  {
> @@ -428,7 +509,29 @@ static bool hash__change_memory_range(unsigned long start, unsigned long end,
>  	if (start >= end)
>  		return false;
>  
> -	change_memory_range(start, end, step, newpp);
> +	if (firmware_has_feature(FW_FEATURE_LPAR)) {
> +		mutex_lock(&chmem_lock);
> +
> +		chmem_parms.start = start;
> +		chmem_parms.end = end;
> +		chmem_parms.step = step;
> +		chmem_parms.newpp = newpp;
> +		chmem_parms.master_cpu = smp_processor_id();
> +
> +		cpus_read_lock();
> +
> +		atomic_set(&chmem_parms.cpu_counter, num_online_cpus());
> +
> +		// Ensure state is consistent before we call the other CPUs
> +		mb();
> +
> +		stop_machine_cpuslocked(change_memory_range_fn, &chmem_parms,
> +					cpu_online_mask);
> +
> +		cpus_read_unlock();
> +		mutex_unlock(&chmem_lock);
> +	} else
> +		change_memory_range(start, end, step, newpp);
>  
>  	return true;
>  }
> -- 
> 2.25.1

^ permalink raw reply

* Re: [PATCH 4/6] powerpc/mm/64s/hash: Factor out change_memory_range()
From: Daniel Axtens @ 2021-02-19  2:08 UTC (permalink / raw)
  To: Michael Ellerman, linuxppc-dev; +Cc: aneesh.kumar
In-Reply-To: <20210211135130.3474832-4-mpe@ellerman.id.au>

Michael Ellerman <mpe@ellerman.id.au> writes:

> Pull the loop calling hpte_updateboltedpp() out of
> hash__change_memory_range() into a helper function. We need it to be a
> separate function for the next patch.
>
> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
> ---
>  arch/powerpc/mm/book3s64/hash_pgtable.c | 23 +++++++++++++++--------
>  1 file changed, 15 insertions(+), 8 deletions(-)
>
> diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c
> index 03819c259f0a..3663d3cdffac 100644
> --- a/arch/powerpc/mm/book3s64/hash_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c
> @@ -400,10 +400,23 @@ EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage);
>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>  
>  #ifdef CONFIG_STRICT_KERNEL_RWX
> +static void change_memory_range(unsigned long start, unsigned long end,
> +				unsigned int step, unsigned long newpp)

Looking at the call paths, this gets called only in bare metal, not
virtualised: should the name reflect that?

> +{
> +	unsigned long idx;
> +
> +	pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n",
> +		 start, end, newpp, step);
> +
> +	for (idx = start; idx < end; idx += step)
> +		/* Not sure if we can do much with the return value */

Hmm, I realise this comment isn't changed, but it did make me wonder
what the return value was!

It turns out that the function doesn't actually return anything.

Tracking back the history of hpte_updateboltedpp, it looks like it has
not had a return value since the start of git history:

^1da177e4c3f4 include/asm-ppc64/machdep.h    void            (*hpte_updateboltedpp)(unsigned long newpp, 
3c726f8dee6f5 include/asm-powerpc/machdep.h                                         unsigned long ea,
1189be6508d45 include/asm-powerpc/machdep.h                                        int psize, int ssize);

The comment comes from commit cd65d6971334 ("powerpc/mm/hash: Implement
mark_rodata_ro() for hash") where Balbir added the comment, but again I
can't figure out what sort of return value there would be to ignore.

Should we drop the comment? (or return something from hpte_updateboltedpp)

> +		mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize,
> +							mmu_kernel_ssize);
> +}
> +
>  static bool hash__change_memory_range(unsigned long start, unsigned long end,
>  				      unsigned long newpp)
>  {
> -	unsigned long idx;
>  	unsigned int step, shift;
>  
>  	shift = mmu_psize_defs[mmu_linear_psize].shift;
> @@ -415,13 +428,7 @@ static bool hash__change_memory_range(unsigned long start, unsigned long end,
>  	if (start >= end)
>  		return false;
>  
> -	pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n",
> -		 start, end, newpp, step);
> -
> -	for (idx = start; idx < end; idx += step)
> -		/* Not sure if we can do much with the return value */
> -		mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize,
> -							mmu_kernel_ssize);
> +	change_memory_range(start, end, step, newpp);

Looking at how change_memory_range is called, step is derived by:

	shift = mmu_psize_defs[mmu_linear_psize].shift;
	step = 1 << shift;

We probably therefore don't really need to pass step in to
change_memory_range. Having said that, I'm not sure it would really be that
much tidier to compute step in change_memory_range, especially since we
also need step for the other branch in hash__change_memory_range.

Beyond that it all looks reasonable to me!

I also checked that the loop operations made sense, I think they do - we
cover from start inclusive to end exclusive and the alignment is done
before we call into change_memory_range.

Regards,
Daniel

>  	return true;
>  }
> -- 
> 2.25.1

^ permalink raw reply

* Re: [PATCH] of: error: 'const struct kimage' has no member named 'arch'
From: Thiago Jung Bauermann @ 2021-02-19  1:13 UTC (permalink / raw)
  To: Lakshmi Ramasubramanian
  Cc: sashal, robh, sfr, gregkh, linuxppc-dev, linux-kernel, Mimi Zohar,
	takahiro.akashi, devicetree, james.morse, catalin.marinas, joe,
	linux-integrity, will, linux-arm-kernel
In-Reply-To: <8b8c0b70-c7ab-33f3-b66c-9ea03388497b@linux.microsoft.com>


Lakshmi Ramasubramanian <nramas@linux.microsoft.com> writes:

> On 2/18/21 4:07 PM, Mimi Zohar wrote:
>
> Hi Mimi,
>
>> On Thu, 2021-02-18 at 14:33 -0800, Lakshmi Ramasubramanian wrote:
>>> of_kexec_alloc_and_setup_fdt() defined in drivers/of/kexec.c builds
>>> a new device tree object that includes architecture specific data
>>> for kexec system call.  This should be defined only if the architecture
>>> being built defines kexec architecture structure "struct kimage_arch".
>>>
>>> Define a new boolean config OF_KEXEC that is enabled if
>>> CONFIG_KEXEC_FILE and CONFIG_OF_FLATTREE are enabled, and
>>> the architecture is arm64 or powerpc64.  Build drivers/of/kexec.c
>>> if CONFIG_OF_KEXEC is enabled.
>>>
>>> Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
>>> Fixes: 33488dc4d61f ("of: Add a common kexec FDT setup function")
>>> Reported-by: kernel test robot <lkp@intel.com>
>>> ---
>>>   drivers/of/Kconfig  | 6 ++++++
>>>   drivers/of/Makefile | 7 +------
>>>   2 files changed, 7 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig
>>> index 18450437d5d5..f2e8fa54862a 100644
>>> --- a/drivers/of/Kconfig
>>> +++ b/drivers/of/Kconfig
>>> @@ -100,4 +100,10 @@ config OF_DMA_DEFAULT_COHERENT
>>>   	# arches should select this if DMA is coherent by default for OF devices
>>>   	bool
>>>   +config OF_KEXEC
>>> +	bool
>>> +	depends on KEXEC_FILE
>>> +	depends on OF_FLATTREE
>>> +	default y if ARM64 || PPC64
>>> +
>>>   endif # OF
>>> diff --git a/drivers/of/Makefile b/drivers/of/Makefile
>>> index c13b982084a3..287579dd1695 100644
>>> --- a/drivers/of/Makefile
>>> +++ b/drivers/of/Makefile
>>> @@ -13,11 +13,6 @@ obj-$(CONFIG_OF_RESERVED_MEM) += of_reserved_mem.o
>>>   obj-$(CONFIG_OF_RESOLVE)  += resolver.o
>>>   obj-$(CONFIG_OF_OVERLAY) += overlay.o
>>>   obj-$(CONFIG_OF_NUMA) += of_numa.o
>>> -
>>> -ifdef CONFIG_KEXEC_FILE
>>> -ifdef CONFIG_OF_FLATTREE
>>> -obj-y	+= kexec.o
>>> -endif
>>> -endif
>>> +obj-$(CONFIG_OF_KEXEC) += kexec.o
>>>     obj-$(CONFIG_OF_UNITTEST) += unittest-data/
>> Is it possible to reuse CONFIG_HAVE_IMA_KEXEC here?
>> 
>
> For ppc64 CONFIG_HAVE_IMA_KEXEC is selected when CONFIG_KEXEC_FILE is enabled.
> So I don't see a problem in reusing CONFIG_HAVE_IMA_KEXEC for ppc.
>
> But for arm64, CONFIG_HAVE_IMA_KEXEC is enabled in the final patch in the patch
> set (the one for carrying forward IMA log across kexec for arm64). arm64 calls
> of_kexec_alloc_and_setup_fdt() prior to enabling CONFIG_HAVE_IMA_KEXEC and hence
> breaks the build for arm64.

One problem is that I believe that this patch won't placate the robot,
because IIUC it generates config files at random and this change still
allows hppa and s390 to enable CONFIG_OF_KEXEC.

Perhaps a new CONFIG_HAVE_KIMAGE_ARCH option? Not having that option
would still allow building kexec.o, but would be used inside kexec.c to
avoid accessing kimage.arch members.

-- 
Thiago Jung Bauermann
IBM Linux Technology Center

^ permalink raw reply

* Re: [PATCH] of: error: 'const struct kimage' has no member named 'arch'
From: Lakshmi Ramasubramanian @ 2021-02-19  0:57 UTC (permalink / raw)
  To: Mimi Zohar, bauerman, robh, takahiro.akashi, gregkh, will, joe,
	catalin.marinas, mpe, sfr
  Cc: sashal, devicetree, linux-kernel, james.morse, linux-integrity,
	linuxppc-dev, linux-arm-kernel
In-Reply-To: <c6490f6a126a2f10e3e3445b51ea552a26f896a9.camel@linux.ibm.com>

On 2/18/21 4:07 PM, Mimi Zohar wrote:

Hi Mimi,

> On Thu, 2021-02-18 at 14:33 -0800, Lakshmi Ramasubramanian wrote:
>> of_kexec_alloc_and_setup_fdt() defined in drivers/of/kexec.c builds
>> a new device tree object that includes architecture specific data
>> for kexec system call.  This should be defined only if the architecture
>> being built defines kexec architecture structure "struct kimage_arch".
>>
>> Define a new boolean config OF_KEXEC that is enabled if
>> CONFIG_KEXEC_FILE and CONFIG_OF_FLATTREE are enabled, and
>> the architecture is arm64 or powerpc64.  Build drivers/of/kexec.c
>> if CONFIG_OF_KEXEC is enabled.
>>
>> Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
>> Fixes: 33488dc4d61f ("of: Add a common kexec FDT setup function")
>> Reported-by: kernel test robot <lkp@intel.com>
>> ---
>>   drivers/of/Kconfig  | 6 ++++++
>>   drivers/of/Makefile | 7 +------
>>   2 files changed, 7 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig
>> index 18450437d5d5..f2e8fa54862a 100644
>> --- a/drivers/of/Kconfig
>> +++ b/drivers/of/Kconfig
>> @@ -100,4 +100,10 @@ config OF_DMA_DEFAULT_COHERENT
>>   	# arches should select this if DMA is coherent by default for OF devices
>>   	bool
>>   
>> +config OF_KEXEC
>> +	bool
>> +	depends on KEXEC_FILE
>> +	depends on OF_FLATTREE
>> +	default y if ARM64 || PPC64
>> +
>>   endif # OF
>> diff --git a/drivers/of/Makefile b/drivers/of/Makefile
>> index c13b982084a3..287579dd1695 100644
>> --- a/drivers/of/Makefile
>> +++ b/drivers/of/Makefile
>> @@ -13,11 +13,6 @@ obj-$(CONFIG_OF_RESERVED_MEM) += of_reserved_mem.o
>>   obj-$(CONFIG_OF_RESOLVE)  += resolver.o
>>   obj-$(CONFIG_OF_OVERLAY) += overlay.o
>>   obj-$(CONFIG_OF_NUMA) += of_numa.o
>> -
>> -ifdef CONFIG_KEXEC_FILE
>> -ifdef CONFIG_OF_FLATTREE
>> -obj-y	+= kexec.o
>> -endif
>> -endif
>> +obj-$(CONFIG_OF_KEXEC) += kexec.o
>>   
>>   obj-$(CONFIG_OF_UNITTEST) += unittest-data/
> 
> Is it possible to reuse CONFIG_HAVE_IMA_KEXEC here?
> 

For ppc64 CONFIG_HAVE_IMA_KEXEC is selected when CONFIG_KEXEC_FILE is 
enabled. So I don't see a problem in reusing CONFIG_HAVE_IMA_KEXEC for ppc.

But for arm64, CONFIG_HAVE_IMA_KEXEC is enabled in the final patch in 
the patch set (the one for carrying forward IMA log across kexec for 
arm64). arm64 calls of_kexec_alloc_and_setup_fdt() prior to enabling 
CONFIG_HAVE_IMA_KEXEC and hence breaks the build for arm64.

thanks,
  -lakshmi






^ permalink raw reply

* Re: [PATCH] of: error: 'const struct kimage' has no member named 'arch'
From: Mimi Zohar @ 2021-02-19  0:07 UTC (permalink / raw)
  To: Lakshmi Ramasubramanian, bauerman, robh, takahiro.akashi, gregkh,
	will, joe, catalin.marinas, mpe, sfr
  Cc: sashal, devicetree, linux-kernel, james.morse, linux-integrity,
	linuxppc-dev, linux-arm-kernel
In-Reply-To: <20210218223305.2044-1-nramas@linux.microsoft.com>

On Thu, 2021-02-18 at 14:33 -0800, Lakshmi Ramasubramanian wrote:
> of_kexec_alloc_and_setup_fdt() defined in drivers/of/kexec.c builds
> a new device tree object that includes architecture specific data
> for kexec system call.  This should be defined only if the architecture
> being built defines kexec architecture structure "struct kimage_arch".
> 
> Define a new boolean config OF_KEXEC that is enabled if
> CONFIG_KEXEC_FILE and CONFIG_OF_FLATTREE are enabled, and
> the architecture is arm64 or powerpc64.  Build drivers/of/kexec.c
> if CONFIG_OF_KEXEC is enabled.
> 
> Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
> Fixes: 33488dc4d61f ("of: Add a common kexec FDT setup function")
> Reported-by: kernel test robot <lkp@intel.com>
> ---
>  drivers/of/Kconfig  | 6 ++++++
>  drivers/of/Makefile | 7 +------
>  2 files changed, 7 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig
> index 18450437d5d5..f2e8fa54862a 100644
> --- a/drivers/of/Kconfig
> +++ b/drivers/of/Kconfig
> @@ -100,4 +100,10 @@ config OF_DMA_DEFAULT_COHERENT
>  	# arches should select this if DMA is coherent by default for OF devices
>  	bool
>  
> +config OF_KEXEC
> +	bool
> +	depends on KEXEC_FILE
> +	depends on OF_FLATTREE
> +	default y if ARM64 || PPC64
> +
>  endif # OF
> diff --git a/drivers/of/Makefile b/drivers/of/Makefile
> index c13b982084a3..287579dd1695 100644
> --- a/drivers/of/Makefile
> +++ b/drivers/of/Makefile
> @@ -13,11 +13,6 @@ obj-$(CONFIG_OF_RESERVED_MEM) += of_reserved_mem.o
>  obj-$(CONFIG_OF_RESOLVE)  += resolver.o
>  obj-$(CONFIG_OF_OVERLAY) += overlay.o
>  obj-$(CONFIG_OF_NUMA) += of_numa.o
> -
> -ifdef CONFIG_KEXEC_FILE
> -ifdef CONFIG_OF_FLATTREE
> -obj-y	+= kexec.o
> -endif
> -endif
> +obj-$(CONFIG_OF_KEXEC) += kexec.o
>  
>  obj-$(CONFIG_OF_UNITTEST) += unittest-data/

Is it possible to reuse CONFIG_HAVE_IMA_KEXEC here?

Mimi



^ permalink raw reply

* Re: linux-next: manual merge of the devicetree tree with the powerpc tree
From: Michael Ellerman @ 2021-02-18 23:28 UTC (permalink / raw)
  To: Rob Herring, Stephen Rothwell
  Cc: Lakshmi Ramasubramanian, Linux Next Mailing List, PowerPC,
	Linux Kernel Mailing List, Hari Bathini
In-Reply-To: <CAL_JsqJ9Ske4hkWn3uo8-nef29MQ1DkNdtE=gxbqj8CKrtQorg@mail.gmail.com>

Rob Herring <robherring2@gmail.com> writes:
> On Thu, Feb 18, 2021 at 5:34 AM Stephen Rothwell <sfr@canb.auug.org.au> wrote:
>> On Thu, 18 Feb 2021 21:44:37 +1100 Michael Ellerman <mpe@ellerman.id.au> wrote:
>> >
>> > I think it just needs this?
>> >
>> > diff --git a/arch/powerpc/kexec/elf_64.c b/arch/powerpc/kexec/elf_64.c
>> > index 87e34611f93d..0492ca6003f3 100644
>> > --- a/arch/powerpc/kexec/elf_64.c
>> > +++ b/arch/powerpc/kexec/elf_64.c
>> > @@ -104,7 +104,7 @@ static void *elf64_load(struct kimage *image, char *kernel_buf,
>> >
>> >       fdt = of_kexec_alloc_and_setup_fdt(image, initrd_load_addr,
>> >                                          initrd_len, cmdline,
>> > -                                        fdt_totalsize(initial_boot_params));
>> > +                                        kexec_fdt_totalsize_ppc64(image));
>> >       if (!fdt) {
>> >               pr_err("Error setting up the new device tree.\n");
>> >               ret = -EINVAL;
>> >
>>
>> I thought about that, but the last argument to
>> of_kexec_alloc_and_setup_fdt() is extra_fdt_size and the allocation
>> done is for this:
>>
>> fdt_size = fdt_totalsize(initial_boot_params) +
>>                    (cmdline ? strlen(cmdline) : 0) +
>>                    FDT_EXTRA_SPACE +
>>                    extra_fdt_size;
>>
>> and kexec_fdt_totalsize_ppc64() also includes
>> fdt_totalsize(initial_boot_params) so I was not sure.  Maybe
>> kexec_fdt_totalsize_ppc64() needs modification as well?
>
> You're both right. Michael's fix is sufficient for the merge. The only
> risk with a larger size is failing to allocate it, but we're talking
> only 10s of KB. Historically until the commit causing the conflict,
> PPC just used 2x fdt_totalsize(initial_boot_params). You could
> drop 'fdt_size = fdt_totalsize(initial_boot_params) + (2 *
> COMMAND_LINE_SIZE);' from kexec_fdt_totalsize_ppc64() as well, but
> then the function name is misleading.
>
> Lakshmi can send a follow-up patch to fine tune the size and rename
> kexec_fdt_totalsize_ppc64.

Sounds good.

cheers

^ permalink raw reply

* Re: [PATCH 2/6] powerpc/pseries: Add key to flags in pSeries_lpar_hpte_updateboltedpp()
From: Michael Ellerman @ 2021-02-18 23:25 UTC (permalink / raw)
  To: Daniel Axtens, linuxppc-dev; +Cc: aneesh.kumar
In-Reply-To: <87tuqca7vi.fsf@linkitivity.dja.id.au>

Daniel Axtens <dja@axtens.net> writes:
> Michael Ellerman <mpe@ellerman.id.au> writes:
>
>> The flags argument to plpar_pte_protect() (aka. H_PROTECT), includes
>> the key in bits 9-13, but currently we always set those bits to zero.
>>
>> In the past that hasn't been a problem because we always used key 0
>> for the kernel, and updateboltedpp() is only used for kernel mappings.
>>
>> However since commit d94b827e89dc ("powerpc/book3s64/kuap: Use Key 3
>> for kernel mapping with hash translation") we are now inadvertently
>> changing the key (to zero) when we call plpar_pte_protect().
>>
>> That hasn't broken anything because updateboltedpp() is only used for
>> STRICT_KERNEL_RWX, which is currently disabled on 64s due to other
>> bugs.
>>
>> But we want to fix that, so first we need to pass the key correctly to
>> plpar_pte_protect(). In the `newpp` value the low 3 bits of the key
>> are already in the correct spot, but the high 2 bits of the key need
>> to be shifted down.
>>
>> Fixes: d94b827e89dc ("powerpc/book3s64/kuap: Use Key 3 for kernel mapping with hash translation")
>> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
>> ---
>>  arch/powerpc/platforms/pseries/lpar.c | 4 +++-
>>  1 file changed, 3 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
>> index 764170fdb0f7..8bbbddff7226 100644
>> --- a/arch/powerpc/platforms/pseries/lpar.c
>> +++ b/arch/powerpc/platforms/pseries/lpar.c
>> @@ -976,11 +976,13 @@ static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp,
>>  	slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
>>  	BUG_ON(slot == -1);
>>  
>> -	flags = newpp & 7;
>> +	flags = newpp & (HPTE_R_PP | HPTE_R_N);
>>  	if (mmu_has_feature(MMU_FTR_KERNEL_RO))
>>  		/* Move pp0 into bit 8 (IBM 55) */
>>  		flags |= (newpp & HPTE_R_PP0) >> 55;
>>  
>> +	flags |= ((newpp & HPTE_R_KEY_HI) >> 48) | (newpp & HPTE_R_KEY_LO);
>> +
>
> I'm really confused about how these bits are getting packed into the
> flags parameter. It seems to match how they are unpacked in
> kvmppc_h_pr_protect, but I cannot figure out why they are packed in that
> order, and the LoPAPR doesn't seem especially illuminating on this topic
> - although I may have missed the relevant section.

Yeah I agree it's not very clearly specified.

The hcall we're using here is H_PROTECT, which is specified in section
14.5.4.1.6 of LoPAPR v1.1.

It takes a `flags` parameter, and the description for flags says:

 * flags: AVPN, pp0, pp1, pp2, key0-key4, n, and for the CMO
   option: CMO Option flags as defined in Table 189,


If you then go to the start of the parent section, 14.5.4.1, on page
405, it says:

Register Linkage (For hcall() tokens 0x04 - 0x18)
 * On Call
   * R3 function call token
   * R4 flags (see Table 178, “Page Frame Table Access flags field definition,” on page 401)


Then you have to go to section 14.5.3, and on page 394 there is a list
of hcalls and their tokens (table 176), and there you can see that
H_PROTECT == 0x18.

Finally you can look at table 178, on page 401, where it specifies the
layout of the bits for the key:

 Bit     Function
------------------
 50-54 | key0-key4


Those are big-endian bit numbers, converting to normal bit numbers you
get bits 9-13, or 0x3e00.

If you look at the kernel source we have:

#define HPTE_R_KEY_HI		ASM_CONST(0x3000000000000000)
#define HPTE_R_KEY_LO		ASM_CONST(0x0000000000000e00)

So the LO bits are already in the right place, and the HI bits just need
to be shifted down by 48.

Hope that makes it clearer :)

cheers

^ permalink raw reply

* [PATCH] of: error: 'const struct kimage' has no member named 'arch'
From: Lakshmi Ramasubramanian @ 2021-02-18 22:33 UTC (permalink / raw)
  To: zohar, bauerman, robh, takahiro.akashi, gregkh, will, joe,
	catalin.marinas, mpe, sfr
  Cc: sashal, devicetree, linux-kernel, james.morse, linux-integrity,
	linuxppc-dev, linux-arm-kernel

of_kexec_alloc_and_setup_fdt() defined in drivers/of/kexec.c builds
a new device tree object that includes architecture specific data
for kexec system call.  This should be defined only if the architecture
being built defines kexec architecture structure "struct kimage_arch".

Define a new boolean config OF_KEXEC that is enabled if
CONFIG_KEXEC_FILE and CONFIG_OF_FLATTREE are enabled, and
the architecture is arm64 or powerpc64.  Build drivers/of/kexec.c
if CONFIG_OF_KEXEC is enabled.

Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
Fixes: 33488dc4d61f ("of: Add a common kexec FDT setup function")
Reported-by: kernel test robot <lkp@intel.com>
---
 drivers/of/Kconfig  | 6 ++++++
 drivers/of/Makefile | 7 +------
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig
index 18450437d5d5..f2e8fa54862a 100644
--- a/drivers/of/Kconfig
+++ b/drivers/of/Kconfig
@@ -100,4 +100,10 @@ config OF_DMA_DEFAULT_COHERENT
 	# arches should select this if DMA is coherent by default for OF devices
 	bool
 
+config OF_KEXEC
+	bool
+	depends on KEXEC_FILE
+	depends on OF_FLATTREE
+	default y if ARM64 || PPC64
+
 endif # OF
diff --git a/drivers/of/Makefile b/drivers/of/Makefile
index c13b982084a3..287579dd1695 100644
--- a/drivers/of/Makefile
+++ b/drivers/of/Makefile
@@ -13,11 +13,6 @@ obj-$(CONFIG_OF_RESERVED_MEM) += of_reserved_mem.o
 obj-$(CONFIG_OF_RESOLVE)  += resolver.o
 obj-$(CONFIG_OF_OVERLAY) += overlay.o
 obj-$(CONFIG_OF_NUMA) += of_numa.o
-
-ifdef CONFIG_KEXEC_FILE
-ifdef CONFIG_OF_FLATTREE
-obj-y	+= kexec.o
-endif
-endif
+obj-$(CONFIG_OF_KEXEC) += kexec.o
 
 obj-$(CONFIG_OF_UNITTEST) += unittest-data/
-- 
2.30.0


^ permalink raw reply related

* Re: linux-next: manual merge of the devicetree tree with the powerpc tree
From: Stephen Rothwell @ 2021-02-18 20:31 UTC (permalink / raw)
  To: Rob Herring
  Cc: Linux Kernel Mailing List, Lakshmi Ramasubramanian,
	Linux Next Mailing List, PowerPC, Hari Bathini
In-Reply-To: <CAL_JsqJ9Ske4hkWn3uo8-nef29MQ1DkNdtE=gxbqj8CKrtQorg@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 2287 bytes --]

Hi all,

On Thu, 18 Feb 2021 07:52:52 -0600 Rob Herring <robherring2@gmail.com> wrote:
>
> On Thu, Feb 18, 2021 at 5:34 AM Stephen Rothwell <sfr@canb.auug.org.au> wrote:
> >
> > On Thu, 18 Feb 2021 21:44:37 +1100 Michael Ellerman <mpe@ellerman.id.au> wrote:  
> > >
> > > I think it just needs this?
> > >
> > > diff --git a/arch/powerpc/kexec/elf_64.c b/arch/powerpc/kexec/elf_64.c
> > > index 87e34611f93d..0492ca6003f3 100644
> > > --- a/arch/powerpc/kexec/elf_64.c
> > > +++ b/arch/powerpc/kexec/elf_64.c
> > > @@ -104,7 +104,7 @@ static void *elf64_load(struct kimage *image, char *kernel_buf,
> > >
> > >       fdt = of_kexec_alloc_and_setup_fdt(image, initrd_load_addr,
> > >                                          initrd_len, cmdline,
> > > -                                        fdt_totalsize(initial_boot_params));
> > > +                                        kexec_fdt_totalsize_ppc64(image));
> > >       if (!fdt) {
> > >               pr_err("Error setting up the new device tree.\n");
> > >               ret = -EINVAL;
> > >  
> >
> > I thought about that, but the last argument to
> > of_kexec_alloc_and_setup_fdt() is extra_fdt_size and the allocation
> > done is for this:
> >
> > fdt_size = fdt_totalsize(initial_boot_params) +
> >                    (cmdline ? strlen(cmdline) : 0) +
> >                    FDT_EXTRA_SPACE +
> >                    extra_fdt_size;
> >
> > and kexec_fdt_totalsize_ppc64() also includes
> > fdt_totalsize(initial_boot_params) so I was not sure.  Maybe
> > kexec_fdt_totalsize_ppc64() needs modification as well?  
> 
> You're both right. Michael's fix is sufficient for the merge. The only
> risk with a larger size is failing to allocate it, but we're talking
> only 10s of KB. Historically until the commit causing the conflict,
> PPC just used 2x fdt_totalsize(initial_boot_params). You could
> drop 'fdt_size = fdt_totalsize(initial_boot_params) + (2 *
> COMMAND_LINE_SIZE);' from kexec_fdt_totalsize_ppc64() as well, but
> then the function name is misleading.
> 
> Lakshmi can send a follow-up patch to fine tune the size and rename
> kexec_fdt_totalsize_ppc64.

OK, I have made Michael's suggested change to my resolution from today.

-- 
Cheers,
Stephen Rothwell

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 484 bytes --]

^ permalink raw reply

* Re: [PATCH] powerpc/4xx: Fix build errors from mfdcr()
From: Feng Tang @ 2021-02-18 13:04 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev
In-Reply-To: <20210218123058.748882-1-mpe@ellerman.id.au>

On Thu, Feb 18, 2021 at 11:30:58PM +1100, Michael Ellerman wrote:
> lkp reported a build error in fsp2.o:
> 
>   CC      arch/powerpc/platforms/44x/fsp2.o
>   {standard input}:577: Error: unsupported relocation against base
> 
> Which comes from:
> 
>   pr_err("GESR0: 0x%08x\n", mfdcr(base + PLB4OPB_GESR0));
> 
> Where our mfdcr() macro is stringifying "base + PLB4OPB_GESR0", and
> passing that to the assembler, which obviously doesn't work.
> 
> The mfdcr() macro already checks that the argument is constant using
> __builtin_constant_p(), and if not calls the out-of-line version of
> mfdcr(). But in this case GCC is smart enough to notice that "base +
> PLB4OPB_GESR0" will be constant, even though it's not something we can
> immediately stringify into a register number.
> 
> Segher pointed out that passing the register number to the inline asm
> as a constant would be better, and in fact it fixes the build error,
> presumably because it gives GCC a chance to resolve the value.
> 
> While we're at it, change mtdcr() similarly.
> 
> Reported-by: kernel test robot <lkp@intel.com>
> Suggested-by: Segher Boessenkool <segher@kernel.crashing.org>
> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

Acked-by: Feng Tang <feng.tang@intel.com>

Thanks!

> ---
>  arch/powerpc/include/asm/dcr-native.h | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/dcr-native.h b/arch/powerpc/include/asm/dcr-native.h
> index 7141ccea8c94..a92059964579 100644
> --- a/arch/powerpc/include/asm/dcr-native.h
> +++ b/arch/powerpc/include/asm/dcr-native.h
> @@ -53,8 +53,8 @@ static inline void mtdcrx(unsigned int reg, unsigned int val)
>  #define mfdcr(rn)						\
>  	({unsigned int rval;					\
>  	if (__builtin_constant_p(rn) && rn < 1024)		\
> -		asm volatile("mfdcr %0," __stringify(rn)	\
> -		              : "=r" (rval));			\
> +		asm volatile("mfdcr %0, %1" : "=r" (rval)	\
> +			      : "n" (rn));			\
>  	else if (likely(cpu_has_feature(CPU_FTR_INDEXED_DCR)))	\
>  		rval = mfdcrx(rn);				\
>  	else							\
> @@ -64,8 +64,8 @@ static inline void mtdcrx(unsigned int reg, unsigned int val)
>  #define mtdcr(rn, v)						\
>  do {								\
>  	if (__builtin_constant_p(rn) && rn < 1024)		\
> -		asm volatile("mtdcr " __stringify(rn) ",%0"	\
> -			      : : "r" (v)); 			\
> +		asm volatile("mtdcr %0, %1"			\
> +			      : : "n" (rn), "r" (v));		\
>  	else if (likely(cpu_has_feature(CPU_FTR_INDEXED_DCR)))	\
>  		mtdcrx(rn, v);					\
>  	else							\
> -- 
> 2.25.1

^ permalink raw reply

* Re: linux-next: manual merge of the devicetree tree with the powerpc tree
From: Rob Herring @ 2021-02-18 13:52 UTC (permalink / raw)
  To: Stephen Rothwell
  Cc: Linux Kernel Mailing List, Lakshmi Ramasubramanian,
	Linux Next Mailing List, PowerPC, Hari Bathini
In-Reply-To: <20210218223427.77109d83@canb.auug.org.au>

On Thu, Feb 18, 2021 at 5:34 AM Stephen Rothwell <sfr@canb.auug.org.au> wrote:
>
> Hi Michael,
>
> On Thu, 18 Feb 2021 21:44:37 +1100 Michael Ellerman <mpe@ellerman.id.au> wrote:
> >
> > I think it just needs this?
> >
> > diff --git a/arch/powerpc/kexec/elf_64.c b/arch/powerpc/kexec/elf_64.c
> > index 87e34611f93d..0492ca6003f3 100644
> > --- a/arch/powerpc/kexec/elf_64.c
> > +++ b/arch/powerpc/kexec/elf_64.c
> > @@ -104,7 +104,7 @@ static void *elf64_load(struct kimage *image, char *kernel_buf,
> >
> >       fdt = of_kexec_alloc_and_setup_fdt(image, initrd_load_addr,
> >                                          initrd_len, cmdline,
> > -                                        fdt_totalsize(initial_boot_params));
> > +                                        kexec_fdt_totalsize_ppc64(image));
> >       if (!fdt) {
> >               pr_err("Error setting up the new device tree.\n");
> >               ret = -EINVAL;
> >
>
> I thought about that, but the last argument to
> of_kexec_alloc_and_setup_fdt() is extra_fdt_size and the allocation
> done is for this:
>
> fdt_size = fdt_totalsize(initial_boot_params) +
>                    (cmdline ? strlen(cmdline) : 0) +
>                    FDT_EXTRA_SPACE +
>                    extra_fdt_size;
>
> and kexec_fdt_totalsize_ppc64() also includes
> fdt_totalsize(initial_boot_params) so I was not sure.  Maybe
> kexec_fdt_totalsize_ppc64() needs modification as well?

You're both right. Michael's fix is sufficient for the merge. The only
risk with a larger size is failing to allocate it, but we're talking
only 10s of KB. Historically until the commit causing the conflict,
PPC just used 2x fdt_totalsize(initial_boot_params). You could
drop 'fdt_size = fdt_totalsize(initial_boot_params) + (2 *
COMMAND_LINE_SIZE);' from kexec_fdt_totalsize_ppc64() as well, but
then the function name is misleading.

Lakshmi can send a follow-up patch to fine tune the size and rename
kexec_fdt_totalsize_ppc64.

Rob

^ permalink raw reply

* Re: [PATCH kernel] powerpc/iommu: Annotate nested lock for lockdep
From: Frederic Barrat @ 2021-02-18 12:59 UTC (permalink / raw)
  To: Alexey Kardashevskiy, linuxppc-dev; +Cc: kvm-ppc
In-Reply-To: <20210216032000.21642-1-aik@ozlabs.ru>



On 16/02/2021 04:20, Alexey Kardashevskiy wrote:
> The IOMMU table is divided into pools for concurrent mappings and each
> pool has a separate spinlock. When taking the ownership of an IOMMU group
> to pass through a device to a VM, we lock these spinlocks which triggers
> a false positive warning in lockdep (below).
> 
> This fixes it by annotating the large pool's spinlock as a nest lock.
> 
> ===
> WARNING: possible recursive locking detected
> 5.11.0-le_syzkaller_a+fstn1 #100 Not tainted
> --------------------------------------------
> qemu-system-ppc/4129 is trying to acquire lock:
> c0000000119bddb0 (&(p->lock)/1){....}-{2:2}, at: iommu_take_ownership+0xac/0x1e0
> 
> but task is already holding lock:
> c0000000119bdd30 (&(p->lock)/1){....}-{2:2}, at: iommu_take_ownership+0xac/0x1e0
> 
> other info that might help us debug this:
>   Possible unsafe locking scenario:
> 
>         CPU0
>         ----
>    lock(&(p->lock)/1);
>    lock(&(p->lock)/1);
> ===
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>   arch/powerpc/kernel/iommu.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index 557a09dd5b2f..2ee642a6731a 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -1089,7 +1089,7 @@ int iommu_take_ownership(struct iommu_table *tbl)
>   
>   	spin_lock_irqsave(&tbl->large_pool.lock, flags);
>   	for (i = 0; i < tbl->nr_pools; i++)
> -		spin_lock(&tbl->pools[i].lock);
> +		spin_lock_nest_lock(&tbl->pools[i].lock, &tbl->large_pool.lock);


We have the same pattern and therefore should have the same problem in 
iommu_release_ownership().

But as I understand, we're hacking our way around lockdep here, since 
conceptually, those locks are independent. I was wondering why it seems 
to fix it by worrying only about the large pool lock. That loop can take 
many locks (up to 4 with current config). However, if the dma window is 
less than 1GB, we would only have one, so it would make sense for 
lockdep to stop complaining. Is that what happened? In which case, this 
patch doesn't really fix it. Or I'm missing something :-)

   Fred



>   	iommu_table_release_pages(tbl);
>   
> 

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox