* [PATCH v2 37/43] powerpc/32: Return directly from power_save_ppc32_restore()
From: Christophe Leroy @ 2021-03-09 12:10 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1615291471.git.christophe.leroy@csgroup.eu>
transfer_to_handler_cont: is now just a blr.
Directly perform blr in power_save_ppc32_restore().
Also remove useless setting of r11 in e500 version of
power_save_ppc32_restore().
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/kernel/entry_32.S | 3 ---
arch/powerpc/kernel/idle_6xx.S | 2 +-
arch/powerpc/kernel/idle_e500.S | 10 +---------
3 files changed, 2 insertions(+), 13 deletions(-)
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 815a4ff1ba76..5cfa10816261 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -67,8 +67,6 @@ prepare_transfer_to_handler:
bt- 31-TLF_NAPPING,4f
bt- 31-TLF_SLEEPING,7f
#endif /* CONFIG_PPC_BOOK3S_32 || CONFIG_E500 */
- .globl transfer_to_handler_cont
-transfer_to_handler_cont:
blr
#if defined (CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500)
@@ -84,7 +82,6 @@ transfer_to_handler_cont:
b fast_exception_return
#endif
_ASM_NOKPROBE_SYMBOL(prepare_transfer_to_handler)
-_ASM_NOKPROBE_SYMBOL(transfer_to_handler_cont)
.globl transfer_to_syscall
transfer_to_syscall:
diff --git a/arch/powerpc/kernel/idle_6xx.S b/arch/powerpc/kernel/idle_6xx.S
index 153366e178c4..13cad9297d82 100644
--- a/arch/powerpc/kernel/idle_6xx.S
+++ b/arch/powerpc/kernel/idle_6xx.S
@@ -176,7 +176,7 @@ BEGIN_FTR_SECTION
lwz r9,nap_save_hid1@l(r9)
mtspr SPRN_HID1, r9
END_FTR_SECTION_IFSET(CPU_FTR_DUAL_PLL_750FX)
- b transfer_to_handler_cont
+ blr
_ASM_NOKPROBE_SYMBOL(power_save_ppc32_restore)
.data
diff --git a/arch/powerpc/kernel/idle_e500.S b/arch/powerpc/kernel/idle_e500.S
index 7795727e7f08..9e1bc4502c50 100644
--- a/arch/powerpc/kernel/idle_e500.S
+++ b/arch/powerpc/kernel/idle_e500.S
@@ -81,13 +81,5 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
_GLOBAL(power_save_ppc32_restore)
lwz r9,_LINK(r11) /* interrupted in e500_idle */
stw r9,_NIP(r11) /* make it do a blr */
-
-#ifdef CONFIG_SMP
- lwz r11,TASK_CPU(r2) /* get cpu number * 4 */
- slwi r11,r11,2
-#else
- li r11,0
-#endif
-
- b transfer_to_handler_cont
+ blr
_ASM_NOKPROBE_SYMBOL(power_save_ppc32_restore)
--
2.25.0
^ permalink raw reply related
* [PATCH v2 38/43] powerpc/32: Only use prepare_transfer_to_handler function on book3s/32 and e500
From: Christophe Leroy @ 2021-03-09 12:10 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1615291471.git.christophe.leroy@csgroup.eu>
Only book3s/32 and e500 have significative work to do in
prepare_transfer_to_handler.
Other 32 bit have nothing to do at all.
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/kernel/entry_32.S | 6 ++----
arch/powerpc/kernel/head_32.h | 2 ++
arch/powerpc/kernel/head_booke.h | 2 ++
3 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 5cfa10816261..9c333e6db5fa 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -48,6 +48,7 @@
*/
.align 12
+#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500)
.globl prepare_transfer_to_handler
prepare_transfer_to_handler:
andi. r0,r9,MSR_PR
@@ -61,15 +62,12 @@ prepare_transfer_to_handler:
/* if from kernel, check interrupted DOZE/NAP mode */
2:
kuap_save_and_lock r11, r12, r9, r5, r6
-#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500)
lwz r12,TI_LOCAL_FLAGS(r2)
mtcrf 0x01,r12
bt- 31-TLF_NAPPING,4f
bt- 31-TLF_SLEEPING,7f
-#endif /* CONFIG_PPC_BOOK3S_32 || CONFIG_E500 */
blr
-#if defined (CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500)
4: rlwinm r12,r12,0,~_TLF_NAPPING
stw r12,TI_LOCAL_FLAGS(r2)
b power_save_ppc32_restore
@@ -80,8 +78,8 @@ prepare_transfer_to_handler:
rlwinm r9,r9,0,~MSR_EE
stw r9,_MSR(r11)
b fast_exception_return
-#endif
_ASM_NOKPROBE_SYMBOL(prepare_transfer_to_handler)
+#endif /* CONFIG_PPC_BOOK3S_32 || CONFIG_E500 */
.globl transfer_to_syscall
transfer_to_syscall:
diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h
index 267479072495..ca303762d8cc 100644
--- a/arch/powerpc/kernel/head_32.h
+++ b/arch/powerpc/kernel/head_32.h
@@ -132,7 +132,9 @@ _ASM_NOKPROBE_SYMBOL(\name\()_virt)
.endm
.macro prepare_transfer_to_handler
+#ifdef CONFIG_PPC_BOOK3S_32
bl prepare_transfer_to_handler
+#endif
.endm
.macro SYSCALL_ENTRY trapno
diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h
index 4d583fbef0b6..a2565023d2d0 100644
--- a/arch/powerpc/kernel/head_booke.h
+++ b/arch/powerpc/kernel/head_booke.h
@@ -92,7 +92,9 @@ END_BTB_FLUSH_SECTION
.endm
.macro prepare_transfer_to_handler
+#ifdef CONFIG_E500
bl prepare_transfer_to_handler
+#endif
.endm
.macro SYSCALL_ENTRY trapno intno srr1
--
2.25.0
^ permalink raw reply related
* [PATCH v2 39/43] powerpc/32s: Move KUEP locking/unlocking in C
From: Christophe Leroy @ 2021-03-09 12:10 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1615291471.git.christophe.leroy@csgroup.eu>
This can be done in C, do it.
Unrolling the loop gains approx. 15% performance.
From now on, prepare_transfer_to_handler() is only for
interrupts from kernel.
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/include/asm/book3s/32/kup.h | 31 -------------------
arch/powerpc/include/asm/interrupt.h | 3 ++
arch/powerpc/include/asm/kup.h | 8 +++++
arch/powerpc/kernel/entry_32.S | 16 +---------
arch/powerpc/kernel/interrupt.c | 4 +++
arch/powerpc/mm/book3s32/Makefile | 1 +
arch/powerpc/mm/book3s32/kuep.c | 38 ++++++++++++++++++++++++
7 files changed, 55 insertions(+), 46 deletions(-)
create mode 100644 arch/powerpc/mm/book3s32/kuep.c
diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h
index 73bc5d2c431d..b97ea60f6fa3 100644
--- a/arch/powerpc/include/asm/book3s/32/kup.h
+++ b/arch/powerpc/include/asm/book3s/32/kup.h
@@ -7,37 +7,6 @@
#ifdef __ASSEMBLY__
-.macro kuep_update_sr gpr1, gpr2 /* NEVER use r0 as gpr2 due to addis */
-101: mtsrin \gpr1, \gpr2
- addi \gpr1, \gpr1, 0x111 /* next VSID */
- rlwinm \gpr1, \gpr1, 0, 0xf0ffffff /* clear VSID overflow */
- addis \gpr2, \gpr2, 0x1000 /* address of next segment */
- bdnz 101b
- isync
-.endm
-
-.macro kuep_lock gpr1, gpr2
-#ifdef CONFIG_PPC_KUEP
- li \gpr1, NUM_USER_SEGMENTS
- li \gpr2, 0
- mtctr \gpr1
- mfsrin \gpr1, \gpr2
- oris \gpr1, \gpr1, SR_NX@h /* set Nx */
- kuep_update_sr \gpr1, \gpr2
-#endif
-.endm
-
-.macro kuep_unlock gpr1, gpr2
-#ifdef CONFIG_PPC_KUEP
- li \gpr1, NUM_USER_SEGMENTS
- li \gpr2, 0
- mtctr \gpr1
- mfsrin \gpr1, \gpr2
- rlwinm \gpr1, \gpr1, 0, ~SR_NX /* Clear Nx */
- kuep_update_sr \gpr1, \gpr2
-#endif
-.endm
-
#ifdef CONFIG_PPC_KUAP
.macro kuap_update_sr gpr1, gpr2, gpr3 /* NEVER use r0 as gpr2 due to addis */
diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
index e6d71c2e3aa2..d4bfe94b4a68 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -34,6 +34,7 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs, struct interrup
trace_hardirqs_off();
if (user_mode(regs)) {
+ kuep_lock();
current->thread.regs = regs;
account_cpu_user_entry();
}
@@ -91,6 +92,8 @@ static inline void interrupt_exit_prepare(struct pt_regs *regs, struct interrupt
exception_exit(state->ctx_state);
#endif
+ if (user_mode(regs))
+ kuep_unlock();
/*
* Book3S exits to user via interrupt_exit_user_prepare(), which does
* context tracking, which is a cleaner way to handle PREEMPT=y
diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h
index 7ec21af49a45..25671f711ec2 100644
--- a/arch/powerpc/include/asm/kup.h
+++ b/arch/powerpc/include/asm/kup.h
@@ -55,6 +55,14 @@ void setup_kuep(bool disabled);
static inline void setup_kuep(bool disabled) { }
#endif /* CONFIG_PPC_KUEP */
+#if defined(CONFIG_PPC_KUEP) && defined(CONFIG_PPC_BOOK3S_32)
+void kuep_lock(void);
+void kuep_unlock(void);
+#else
+static inline void kuep_lock(void) { }
+static inline void kuep_unlock(void) { }
+#endif
+
#ifdef CONFIG_PPC_KUAP
void setup_kuap(bool disabled);
#else
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 9c333e6db5fa..850cb17a937f 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -53,14 +53,9 @@
prepare_transfer_to_handler:
andi. r0,r9,MSR_PR
addi r12, r2, THREAD
- beq 2f
-#ifdef CONFIG_PPC_BOOK3S_32
- kuep_lock r11, r12
-#endif
- blr
+ bnelr
/* if from kernel, check interrupted DOZE/NAP mode */
-2:
kuap_save_and_lock r11, r12, r9, r5, r6
lwz r12,TI_LOCAL_FLAGS(r2)
mtcrf 0x01,r12
@@ -84,9 +79,6 @@ _ASM_NOKPROBE_SYMBOL(prepare_transfer_to_handler)
.globl transfer_to_syscall
transfer_to_syscall:
SAVE_NVGPRS(r1)
-#ifdef CONFIG_PPC_BOOK3S_32
- kuep_lock r11, r12
-#endif
/* Calling convention has r9 = orig r0, r10 = regs */
addi r10,r1,STACK_FRAME_OVERHEAD
@@ -104,9 +96,6 @@ ret_from_syscall:
cmplwi cr0,r5,0
bne- 2f
#endif /* CONFIG_PPC_47x */
-#ifdef CONFIG_PPC_BOOK3S_32
- kuep_unlock r5, r7
-#endif
kuap_check r2, r4
lwz r4,_LINK(r1)
lwz r5,_CCR(r1)
@@ -287,9 +276,6 @@ interrupt_return:
bne- .Lrestore_nvgprs
.Lfast_user_interrupt_return:
-#ifdef CONFIG_PPC_BOOK3S_32
- kuep_unlock r10, r11
-#endif
kuap_check r2, r4
lwz r11,_NIP(r1)
lwz r12,_MSR(r1)
diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index 7082e8ee825e..727b7848c9cc 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -33,6 +33,8 @@ notrace long system_call_exception(long r3, long r4, long r5,
{
syscall_fn f;
+ kuep_lock();
+
regs->orig_gpr3 = r3;
if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
@@ -354,6 +356,8 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
*/
kuap_user_restore(regs);
#endif
+ kuep_unlock();
+
return ret;
}
diff --git a/arch/powerpc/mm/book3s32/Makefile b/arch/powerpc/mm/book3s32/Makefile
index 446d9de88ce4..7f0c8a78ba0c 100644
--- a/arch/powerpc/mm/book3s32/Makefile
+++ b/arch/powerpc/mm/book3s32/Makefile
@@ -9,3 +9,4 @@ endif
obj-y += mmu.o mmu_context.o
obj-$(CONFIG_PPC_BOOK3S_603) += nohash_low.o
obj-$(CONFIG_PPC_BOOK3S_604) += hash_low.o tlb.o
+obj-$(CONFIG_PPC_KUEP) += kuep.o
diff --git a/arch/powerpc/mm/book3s32/kuep.c b/arch/powerpc/mm/book3s32/kuep.c
new file mode 100644
index 000000000000..c70532568a28
--- /dev/null
+++ b/arch/powerpc/mm/book3s32/kuep.c
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <asm/reg.h>
+#include <asm/task_size_32.h>
+#include <asm/mmu.h>
+
+#define KUEP_UPDATE_TWO_USER_SEGMENTS(n) do { \
+ if (TASK_SIZE > ((n) << 28)) \
+ mtsr(val1, (n) << 28); \
+ if (TASK_SIZE > (((n) + 1) << 28)) \
+ mtsr(val2, ((n) + 1) << 28); \
+ val1 = (val1 + 0x222) & 0xf0ffffff; \
+ val2 = (val2 + 0x222) & 0xf0ffffff; \
+} while (0)
+
+static __always_inline void kuep_update(u32 val)
+{
+ int val1 = val;
+ int val2 = (val + 0x111) & 0xf0ffffff;
+
+ KUEP_UPDATE_TWO_USER_SEGMENTS(0);
+ KUEP_UPDATE_TWO_USER_SEGMENTS(2);
+ KUEP_UPDATE_TWO_USER_SEGMENTS(4);
+ KUEP_UPDATE_TWO_USER_SEGMENTS(6);
+ KUEP_UPDATE_TWO_USER_SEGMENTS(8);
+ KUEP_UPDATE_TWO_USER_SEGMENTS(10);
+ KUEP_UPDATE_TWO_USER_SEGMENTS(12);
+ KUEP_UPDATE_TWO_USER_SEGMENTS(14);
+}
+
+void kuep_lock(void)
+{
+ kuep_update(mfsr(0) | SR_NX);
+}
+
+void kuep_unlock(void)
+{
+ kuep_update(mfsr(0) & ~SR_NX);
+}
--
2.25.0
^ permalink raw reply related
* [PATCH v2 40/43] powerpc/64s: Make kuap_check_amr() and kuap_get_and_check_amr() generic
From: Christophe Leroy @ 2021-03-09 12:10 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1615291471.git.christophe.leroy@csgroup.eu>
In preparation of porting powerpc32 to C syscall entry/exit,
rename kuap_check_amr() and kuap_get_and_check_amr() as kuap_check()
and kuap_get_and_check(), and move in the generic asm/kup.h the stub
for when CONFIG_PPC_KUAP is not selected.
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/include/asm/book3s/64/kup.h | 24 ++----------------------
arch/powerpc/include/asm/kup.h | 10 +++++++++-
arch/powerpc/kernel/interrupt.c | 12 ++++++------
arch/powerpc/kernel/irq.c | 2 +-
4 files changed, 18 insertions(+), 30 deletions(-)
diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h
index 8bd905050896..d9b07e9998be 100644
--- a/arch/powerpc/include/asm/book3s/64/kup.h
+++ b/arch/powerpc/include/asm/book3s/64/kup.h
@@ -287,7 +287,7 @@ static inline void kuap_kernel_restore(struct pt_regs *regs,
*/
}
-static inline unsigned long kuap_get_and_check_amr(void)
+static inline unsigned long kuap_get_and_check(void)
{
if (mmu_has_feature(MMU_FTR_BOOK3S_KUAP)) {
unsigned long amr = mfspr(SPRN_AMR);
@@ -298,27 +298,7 @@ static inline unsigned long kuap_get_and_check_amr(void)
return 0;
}
-#else /* CONFIG_PPC_PKEY */
-
-static inline void kuap_user_restore(struct pt_regs *regs)
-{
-}
-
-static inline void kuap_kernel_restore(struct pt_regs *regs, unsigned long amr)
-{
-}
-
-static inline unsigned long kuap_get_and_check_amr(void)
-{
- return 0;
-}
-
-#endif /* CONFIG_PPC_PKEY */
-
-
-#ifdef CONFIG_PPC_KUAP
-
-static inline void kuap_check_amr(void)
+static inline void kuap_check(void)
{
if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG) && mmu_has_feature(MMU_FTR_BOOK3S_KUAP))
WARN_ON_ONCE(mfspr(SPRN_AMR) != AMR_KUAP_BLOCKED);
diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h
index 25671f711ec2..b7efa46b3109 100644
--- a/arch/powerpc/include/asm/kup.h
+++ b/arch/powerpc/include/asm/kup.h
@@ -74,7 +74,15 @@ bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write)
return false;
}
-static inline void kuap_check_amr(void) { }
+static inline void kuap_check(void) { }
+static inline void kuap_save_and_lock(struct pt_regs *regs) { }
+static inline void kuap_user_restore(struct pt_regs *regs) { }
+static inline void kuap_kernel_restore(struct pt_regs *regs, unsigned long amr) { }
+
+static inline unsigned long kuap_get_and_check(void)
+{
+ return 0;
+}
/*
* book3s/64/kup-radix.h defines these functions for the !KUAP case to flush
diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index 727b7848c9cc..40ed55064e54 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -76,7 +76,7 @@ notrace long system_call_exception(long r3, long r4, long r5,
} else
#endif
#ifdef CONFIG_PPC64
- kuap_check_amr();
+ kuap_check();
#endif
booke_restore_dbcr0();
@@ -254,7 +254,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
CT_WARN_ON(ct_state() == CONTEXT_USER);
#ifdef CONFIG_PPC64
- kuap_check_amr();
+ kuap_check();
#endif
regs->result = r3;
@@ -380,7 +380,7 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned
* AMR can only have been unlocked if we interrupted the kernel.
*/
#ifdef CONFIG_PPC64
- kuap_check_amr();
+ kuap_check();
#endif
local_irq_save(flags);
@@ -451,7 +451,7 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsign
unsigned long flags;
unsigned long ret = 0;
#ifdef CONFIG_PPC64
- unsigned long amr;
+ unsigned long kuap;
#endif
if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x) &&
@@ -467,7 +467,7 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsign
CT_WARN_ON(ct_state() == CONTEXT_USER);
#ifdef CONFIG_PPC64
- amr = kuap_get_and_check_amr();
+ kuap = kuap_get_and_check();
#endif
if (unlikely(current_thread_info()->flags & _TIF_EMULATE_STACK_STORE)) {
@@ -511,7 +511,7 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsign
* value from the check above.
*/
#ifdef CONFIG_PPC64
- kuap_kernel_restore(regs, amr);
+ kuap_kernel_restore(regs, kuap);
#endif
return ret;
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index d71fd10a1dd4..3b18d2b2c702 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -282,7 +282,7 @@ static inline void replay_soft_interrupts_irqrestore(void)
* and re-locking AMR but we shouldn't get here in the first place,
* hence the warning.
*/
- kuap_check_amr();
+ kuap_check();
if (kuap_state != AMR_KUAP_BLOCKED)
set_kuap(AMR_KUAP_BLOCKED);
--
2.25.0
^ permalink raw reply related
* [PATCH v2 42/43] powerpc/8xx: Create C version of kuap save/restore/check helpers
From: Christophe Leroy @ 2021-03-09 12:10 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1615291471.git.christophe.leroy@csgroup.eu>
In preparation of porting PPC32 to C syscall entry/exit,
create C version of kuap_save_and_lock() and kuap_user_restore() and
kuap_kernel_restore() and kuap_check() and kuap_get_and_check() on 8xx.
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/include/asm/nohash/32/kup-8xx.h | 31 ++++++++++++++++++++
1 file changed, 31 insertions(+)
diff --git a/arch/powerpc/include/asm/nohash/32/kup-8xx.h b/arch/powerpc/include/asm/nohash/32/kup-8xx.h
index 17a4a616436f..c74f5704bc47 100644
--- a/arch/powerpc/include/asm/nohash/32/kup-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/kup-8xx.h
@@ -34,6 +34,37 @@
#include <asm/reg.h>
+static inline void kuap_save_and_lock(struct pt_regs *regs)
+{
+ regs->kuap = mfspr(SPRN_MD_AP);
+ mtspr(SPRN_MD_AP, MD_APG_KUAP);
+}
+
+static inline void kuap_user_restore(struct pt_regs *regs)
+{
+}
+
+static inline void kuap_kernel_restore(struct pt_regs *regs, unsigned long kuap)
+{
+ mtspr(SPRN_MD_AP, regs->kuap);
+}
+
+static inline unsigned long kuap_get_and_check(void)
+{
+ unsigned long kuap = mfspr(SPRN_MD_AP);
+
+ if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG))
+ WARN_ON_ONCE(kuap >> 16 != MD_APG_KUAP >> 16);
+
+ return kuap;
+}
+
+static inline void kuap_check(void)
+{
+ if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG))
+ kuap_get_and_check();
+}
+
static inline void allow_user_access(void __user *to, const void __user *from,
unsigned long size, unsigned long dir)
{
--
2.25.0
^ permalink raw reply related
* [PATCH v2 41/43] powerpc/32s: Create C version of kuap save/restore/check helpers
From: Christophe Leroy @ 2021-03-09 12:10 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1615291471.git.christophe.leroy@csgroup.eu>
In preparation of porting PPC32 to C syscall entry/exit,
create C version of kuap_save_and_lock() and kuap_user_restore() and
kuap_kernel_restore() and kuap_check() and kuap_get_and_check()
on book3s/32.
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/include/asm/book3s/32/kup.h | 45 ++++++++++++++++++++++++
1 file changed, 45 insertions(+)
diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h
index b97ea60f6fa3..c9d6c28bcd10 100644
--- a/arch/powerpc/include/asm/book3s/32/kup.h
+++ b/arch/powerpc/include/asm/book3s/32/kup.h
@@ -72,6 +72,51 @@ static inline void kuap_update_sr(u32 sr, u32 addr, u32 end)
isync(); /* Context sync required after mtsr() */
}
+static inline void kuap_save_and_lock(struct pt_regs *regs)
+{
+ unsigned long kuap = current->thread.kuap;
+ u32 addr = kuap & 0xf0000000;
+ u32 end = kuap << 28;
+
+ regs->kuap = kuap;
+ if (unlikely(!kuap))
+ return;
+
+ current->thread.kuap = 0;
+ kuap_update_sr(mfsr(addr) | SR_KS, addr, end); /* Set Ks */
+}
+
+static inline void kuap_user_restore(struct pt_regs *regs)
+{
+}
+
+static inline void kuap_kernel_restore(struct pt_regs *regs, unsigned long kuap)
+{
+ u32 addr = regs->kuap & 0xf0000000;
+ u32 end = regs->kuap << 28;
+
+ current->thread.kuap = regs->kuap;
+
+ if (unlikely(regs->kuap == kuap))
+ return;
+
+ kuap_update_sr(mfsr(addr) & ~SR_KS, addr, end); /* Clear Ks */
+}
+
+static inline unsigned long kuap_get_and_check(void)
+{
+ unsigned long kuap = current->thread.kuap;
+
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_PPC_KUAP_DEBUG) && kuap != 0);
+
+ return kuap;
+}
+
+static inline void kuap_check(void)
+{
+ kuap_get_and_check();
+}
+
static __always_inline void allow_user_access(void __user *to, const void __user *from,
u32 size, unsigned long dir)
{
--
2.25.0
^ permalink raw reply related
* [PATCH v2 43/43] powerpc/32: Manage KUAP in C
From: Christophe Leroy @ 2021-03-09 12:10 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1615291471.git.christophe.leroy@csgroup.eu>
Move all KUAP management in C.
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/include/asm/book3s/32/kup.h | 50 +-------------------
arch/powerpc/include/asm/interrupt.h | 2 +
arch/powerpc/include/asm/kup.h | 9 ----
arch/powerpc/include/asm/nohash/32/kup-8xx.h | 25 +---------
arch/powerpc/kernel/entry_32.S | 6 ---
arch/powerpc/kernel/interrupt.c | 19 ++------
arch/powerpc/kernel/process.c | 3 ++
7 files changed, 11 insertions(+), 103 deletions(-)
diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h
index c9d6c28bcd10..27991e0d2cf9 100644
--- a/arch/powerpc/include/asm/book3s/32/kup.h
+++ b/arch/powerpc/include/asm/book3s/32/kup.h
@@ -5,55 +5,7 @@
#include <asm/bug.h>
#include <asm/book3s/32/mmu-hash.h>
-#ifdef __ASSEMBLY__
-
-#ifdef CONFIG_PPC_KUAP
-
-.macro kuap_update_sr gpr1, gpr2, gpr3 /* NEVER use r0 as gpr2 due to addis */
-101: mtsrin \gpr1, \gpr2
- addi \gpr1, \gpr1, 0x111 /* next VSID */
- rlwinm \gpr1, \gpr1, 0, 0xf0ffffff /* clear VSID overflow */
- addis \gpr2, \gpr2, 0x1000 /* address of next segment */
- cmplw \gpr2, \gpr3
- blt- 101b
- isync
-.endm
-
-.macro kuap_save_and_lock sp, thread, gpr1, gpr2, gpr3
- lwz \gpr2, KUAP(\thread)
- rlwinm. \gpr3, \gpr2, 28, 0xf0000000
- stw \gpr2, STACK_REGS_KUAP(\sp)
- beq+ 102f
- li \gpr1, 0
- stw \gpr1, KUAP(\thread)
- mfsrin \gpr1, \gpr2
- oris \gpr1, \gpr1, SR_KS@h /* set Ks */
- kuap_update_sr \gpr1, \gpr2, \gpr3
-102:
-.endm
-
-.macro kuap_restore sp, current, gpr1, gpr2, gpr3
- lwz \gpr2, STACK_REGS_KUAP(\sp)
- rlwinm. \gpr3, \gpr2, 28, 0xf0000000
- stw \gpr2, THREAD + KUAP(\current)
- beq+ 102f
- mfsrin \gpr1, \gpr2
- rlwinm \gpr1, \gpr1, 0, ~SR_KS /* Clear Ks */
- kuap_update_sr \gpr1, \gpr2, \gpr3
-102:
-.endm
-
-.macro kuap_check current, gpr
-#ifdef CONFIG_PPC_KUAP_DEBUG
- lwz \gpr, THREAD + KUAP(\current)
-999: twnei \gpr, 0
- EMIT_BUG_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE)
-#endif
-.endm
-
-#endif /* CONFIG_PPC_KUAP */
-
-#else /* !__ASSEMBLY__ */
+#ifndef __ASSEMBLY__
#ifdef CONFIG_PPC_KUAP
diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
index d4bfe94b4a68..b41cb4e014b2 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -37,6 +37,8 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs, struct interrup
kuep_lock();
current->thread.regs = regs;
account_cpu_user_entry();
+ } else {
+ kuap_save_and_lock(regs);
}
#endif
/*
diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h
index b7efa46b3109..5bbe8f28d26b 100644
--- a/arch/powerpc/include/asm/kup.h
+++ b/arch/powerpc/include/asm/kup.h
@@ -28,15 +28,6 @@
#ifdef __ASSEMBLY__
#ifndef CONFIG_PPC_KUAP
-.macro kuap_save_and_lock sp, thread, gpr1, gpr2, gpr3
-.endm
-
-.macro kuap_restore sp, current, gpr1, gpr2, gpr3
-.endm
-
-.macro kuap_check current, gpr
-.endm
-
.macro kuap_check_amr gpr1, gpr2
.endm
diff --git a/arch/powerpc/include/asm/nohash/32/kup-8xx.h b/arch/powerpc/include/asm/nohash/32/kup-8xx.h
index c74f5704bc47..fb294dbca102 100644
--- a/arch/powerpc/include/asm/nohash/32/kup-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/kup-8xx.h
@@ -7,30 +7,7 @@
#ifdef CONFIG_PPC_KUAP
-#ifdef __ASSEMBLY__
-
-.macro kuap_save_and_lock sp, thread, gpr1, gpr2, gpr3
- lis \gpr2, MD_APG_KUAP@h /* only APG0 and APG1 are used */
- mfspr \gpr1, SPRN_MD_AP
- mtspr SPRN_MD_AP, \gpr2
- stw \gpr1, STACK_REGS_KUAP(\sp)
-.endm
-
-.macro kuap_restore sp, current, gpr1, gpr2, gpr3
- lwz \gpr1, STACK_REGS_KUAP(\sp)
- mtspr SPRN_MD_AP, \gpr1
-.endm
-
-.macro kuap_check current, gpr
-#ifdef CONFIG_PPC_KUAP_DEBUG
- mfspr \gpr, SPRN_MD_AP
- rlwinm \gpr, \gpr, 16, 0xffff
-999: twnei \gpr, MD_APG_KUAP@h
- EMIT_BUG_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE)
-#endif
-.endm
-
-#else /* !__ASSEMBLY__ */
+#ifndef __ASSEMBLY__
#include <asm/reg.h>
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 850cb17a937f..f5ac021ff9ed 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -52,11 +52,9 @@
.globl prepare_transfer_to_handler
prepare_transfer_to_handler:
andi. r0,r9,MSR_PR
- addi r12, r2, THREAD
bnelr
/* if from kernel, check interrupted DOZE/NAP mode */
- kuap_save_and_lock r11, r12, r9, r5, r6
lwz r12,TI_LOCAL_FLAGS(r2)
mtcrf 0x01,r12
bt- 31-TLF_NAPPING,4f
@@ -96,7 +94,6 @@ ret_from_syscall:
cmplwi cr0,r5,0
bne- 2f
#endif /* CONFIG_PPC_47x */
- kuap_check r2, r4
lwz r4,_LINK(r1)
lwz r5,_CCR(r1)
mtlr r4
@@ -208,7 +205,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_SPE)
stw r10,_CCR(r1)
stw r1,KSP(r3) /* Set old stack pointer */
- kuap_check r2, r0
#ifdef CONFIG_SMP
/* We need a sync somewhere here to make sure that if the
* previous task gets rescheduled on another CPU, it sees all
@@ -276,7 +272,6 @@ interrupt_return:
bne- .Lrestore_nvgprs
.Lfast_user_interrupt_return:
- kuap_check r2, r4
lwz r11,_NIP(r1)
lwz r12,_MSR(r1)
mtspr SPRN_SRR0,r11
@@ -326,7 +321,6 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS)
.Lfast_kernel_interrupt_return:
cmpwi cr1,r3,0
- kuap_restore r1, r2, r3, r4, r5
lwz r11,_NIP(r1)
lwz r12,_MSR(r1)
mtspr SPRN_SRR0,r11
diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index 40ed55064e54..864cf47b629a 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -34,6 +34,9 @@ notrace long system_call_exception(long r3, long r4, long r5,
syscall_fn f;
kuep_lock();
+#ifdef CONFIG_PPC32
+ kuap_save_and_lock(regs);
+#endif
regs->orig_gpr3 = r3;
@@ -75,9 +78,7 @@ notrace long system_call_exception(long r3, long r4, long r5,
isync();
} else
#endif
-#ifdef CONFIG_PPC64
kuap_check();
-#endif
booke_restore_dbcr0();
@@ -253,9 +254,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
CT_WARN_ON(ct_state() == CONTEXT_USER);
-#ifdef CONFIG_PPC64
kuap_check();
-#endif
regs->result = r3;
@@ -350,7 +349,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
account_cpu_user_exit();
-#ifdef CONFIG_PPC_BOOK3S_64 /* BOOK3E and ppc32 not using this */
+#ifndef CONFIG_PPC_BOOK3E_64 /* BOOK3E not using this */
/*
* We do this at the end so that we do context switch with KERNEL AMR
*/
@@ -379,9 +378,7 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned
* We don't need to restore AMR on the way back to userspace for KUAP.
* AMR can only have been unlocked if we interrupted the kernel.
*/
-#ifdef CONFIG_PPC64
kuap_check();
-#endif
local_irq_save(flags);
@@ -438,9 +435,7 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned
/*
* We do this at the end so that we do context switch with KERNEL AMR
*/
-#ifdef CONFIG_PPC64
kuap_user_restore(regs);
-#endif
return ret;
}
@@ -450,9 +445,7 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsign
{
unsigned long flags;
unsigned long ret = 0;
-#ifdef CONFIG_PPC64
unsigned long kuap;
-#endif
if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x) &&
unlikely(!(regs->msr & MSR_RI)))
@@ -466,9 +459,7 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsign
if (TRAP(regs) != 0x700)
CT_WARN_ON(ct_state() == CONTEXT_USER);
-#ifdef CONFIG_PPC64
kuap = kuap_get_and_check();
-#endif
if (unlikely(current_thread_info()->flags & _TIF_EMULATE_STACK_STORE)) {
clear_bits(_TIF_EMULATE_STACK_STORE, ¤t_thread_info()->flags);
@@ -510,9 +501,7 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsign
* which would cause Read-After-Write stalls. Hence, we take the AMR
* value from the check above.
*/
-#ifdef CONFIG_PPC64
kuap_kernel_restore(regs, kuap);
-#endif
return ret;
}
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 5d5d64be2679..fd4da71d92d1 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1255,6 +1255,9 @@ struct task_struct *__switch_to(struct task_struct *prev,
*/
restore_sprs(old_thread, new_thread);
+#ifdef CONFIG_PPC32
+ kuap_check();
+#endif
last = _switch(old_thread, new_thread);
#ifdef CONFIG_PPC_BOOK3S_64
--
2.25.0
^ permalink raw reply related
* Re: [PATCH v4] powerpc/uprobes: Validation for prefixed instruction
From: Ravi Bangoria @ 2021-03-09 12:58 UTC (permalink / raw)
To: Naveen N. Rao, Michael Ellerman
Cc: Ravi Bangoria, jniethe5, oleg, rostedt, linux-kernel, paulus,
sandipan, naveen.n.rao, linuxppc-dev
In-Reply-To: <20210309112115.GG145@DESKTOP-TDPLP67.localdomain>
On 3/9/21 4:51 PM, Naveen N. Rao wrote:
> On 2021/03/09 08:54PM, Michael Ellerman wrote:
>> Ravi Bangoria <ravi.bangoria@linux.ibm.com> writes:
>>> As per ISA 3.1, prefixed instruction should not cross 64-byte
>>> boundary. So don't allow Uprobe on such prefixed instruction.
>>>
>>> There are two ways probed instruction is changed in mapped pages.
>>> First, when Uprobe is activated, it searches for all the relevant
>>> pages and replace instruction in them. In this case, if that probe
>>> is on the 64-byte unaligned prefixed instruction, error out
>>> directly. Second, when Uprobe is already active and user maps a
>>> relevant page via mmap(), instruction is replaced via mmap() code
>>> path. But because Uprobe is invalid, entire mmap() operation can
>>> not be stopped. In this case just print an error and continue.
>>>
>>> Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
>>> Acked-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
>>
>> Do we have a Fixes: tag for this?
>
> Since this is an additional check we are adding, I don't think we should
> add a Fixes: tag. Nothing is broken per-se -- we're just adding more
> checks to catch simple mistakes. Also, like Oleg pointed out, there are
> still many other ways for users to shoot themselves in the foot with
> uprobes and prefixed instructions, if they so desire.
>
> However, if you still think we should add a Fixes: tag, we can perhaps
> use the below commit since I didn't see any specific commit adding
> support for prefixed instructions for uprobes:
>
> Fixes: 650b55b707fdfa ("powerpc: Add prefixed instructions to
> instruction data type")
True. IMO, It doesn't really need any Fixes tag.
>
>>
>>> ---
>>> v3: https://lore.kernel.org/r/20210304050529.59391-1-ravi.bangoria@linux.ibm.com
>>> v3->v4:
>>> - CONFIG_PPC64 check was not required, remove it.
>>> - Use SZ_ macros instead of hardcoded numbers.
>>>
>>> arch/powerpc/kernel/uprobes.c | 7 +++++++
>>> 1 file changed, 7 insertions(+)
>>>
>>> diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c
>>> index e8a63713e655..4cbfff6e94a3 100644
>>> --- a/arch/powerpc/kernel/uprobes.c
>>> +++ b/arch/powerpc/kernel/uprobes.c
>>> @@ -41,6 +41,13 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe,
>>> if (addr & 0x03)
>>> return -EINVAL;
>>>
>>> + if (cpu_has_feature(CPU_FTR_ARCH_31) &&
>>> + ppc_inst_prefixed(auprobe->insn) &&
>>> + (addr & (SZ_64 - 4)) == SZ_64 - 4) {
>>> + pr_info_ratelimited("Cannot register a uprobe on 64 byte unaligned prefixed instruction\n");
>>> + return -EINVAL;
>>
>> I realise we already did the 0x03 check above, but I still think this
>> would be clearer simply as:
>>
>> (addr & 0x3f == 60)
>
> Indeed, I like the use of `60' there -- hex is overrated ;)
Sure. Will resend.
Ravi
^ permalink raw reply
* Re: [PATCH v2 8/8] powerpc/xive: Map one IPI interrupt per node
From: Greg Kurz @ 2021-03-09 13:23 UTC (permalink / raw)
To: Cédric Le Goater; +Cc: linuxppc-dev
In-Reply-To: <20210303174857.1760393-9-clg@kaod.org>
On Wed, 3 Mar 2021 18:48:57 +0100
Cédric Le Goater <clg@kaod.org> wrote:
> ipistorm [*] can be used to benchmark the raw interrupt rate of an
> interrupt controller by measuring the number of IPIs a system can
> sustain. When applied to the XIVE interrupt controller of POWER9 and
> POWER10 systems, a significant drop of the interrupt rate can be
> observed when crossing the second node boundary.
>
> This is due to the fact that a single IPI interrupt is used for all
> CPUs of the system. The structure is shared and the cache line updates
> impact greatly the traffic between nodes and the overall IPI
> performance.
>
> As a workaround, the impact can be reduced by deactivating the IRQ
> lockup detector ("noirqdebug") which does a lot of accounting in the
> Linux IRQ descriptor structure and is responsible for most of the
> performance penalty.
>
> As a fix, this proposal allocates an IPI interrupt per node, to be
> shared by all CPUs of that node. It solves the scaling issue, the IRQ
> lockup detector still has an impact but the XIVE interrupt rate scales
> linearly. It also improves the "noirqdebug" case as showed in the
> tables below.
>
> * P9 DD2.2 - 2s * 64 threads
>
> "noirqdebug"
> Mint/s Mint/s
> chips cpus IPI/sys IPI/chip IPI/chip IPI/sys
> --------------------------------------------------------------
> 1 0-15 4.984023 4.875405 4.996536 5.048892
> 0-31 10.879164 10.544040 10.757632 11.037859
> 0-47 15.345301 14.688764 14.926520 15.310053
> 0-63 17.064907 17.066812 17.613416 17.874511
> 2 0-79 11.768764 21.650749 22.689120 22.566508
> 0-95 10.616812 26.878789 28.434703 28.320324
> 0-111 10.151693 31.397803 31.771773 32.388122
> 0-127 9.948502 33.139336 34.875716 35.224548
>
> * P10 DD1 - 4s (not homogeneous) 352 threads
>
> "noirqdebug"
> Mint/s Mint/s
> chips cpus IPI/sys IPI/chip IPI/chip IPI/sys
> --------------------------------------------------------------
> 1 0-15 2.409402 2.364108 2.383303 2.395091
> 0-31 6.028325 6.046075 6.089999 6.073750
> 0-47 8.655178 8.644531 8.712830 8.724702
> 0-63 11.629652 11.735953 12.088203 12.055979
> 0-79 14.392321 14.729959 14.986701 14.973073
> 0-95 12.604158 13.004034 17.528748 17.568095
> 2 0-111 9.767753 13.719831 19.968606 20.024218
> 0-127 6.744566 16.418854 22.898066 22.995110
> 0-143 6.005699 19.174421 25.425622 25.417541
> 0-159 5.649719 21.938836 27.952662 28.059603
> 0-175 5.441410 24.109484 31.133915 31.127996
> 3 0-191 5.318341 24.405322 33.999221 33.775354
> 0-207 5.191382 26.449769 36.050161 35.867307
> 0-223 5.102790 29.356943 39.544135 39.508169
> 0-239 5.035295 31.933051 42.135075 42.071975
> 0-255 4.969209 34.477367 44.655395 44.757074
> 4 0-271 4.907652 35.887016 47.080545 47.318537
> 0-287 4.839581 38.076137 50.464307 50.636219
> 0-303 4.786031 40.881319 53.478684 53.310759
> 0-319 4.743750 43.448424 56.388102 55.973969
> 0-335 4.709936 45.623532 59.400930 58.926857
> 0-351 4.681413 45.646151 62.035804 61.830057
>
> [*] https://github.com/antonblanchard/ipistorm
>
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
> arch/powerpc/sysdev/xive/xive-internal.h | 2 --
> arch/powerpc/sysdev/xive/common.c | 39 ++++++++++++++++++------
> 2 files changed, 30 insertions(+), 11 deletions(-)
>
> diff --git a/arch/powerpc/sysdev/xive/xive-internal.h b/arch/powerpc/sysdev/xive/xive-internal.h
> index 9cf57c722faa..b3a456fdd3a5 100644
> --- a/arch/powerpc/sysdev/xive/xive-internal.h
> +++ b/arch/powerpc/sysdev/xive/xive-internal.h
> @@ -5,8 +5,6 @@
> #ifndef __XIVE_INTERNAL_H
> #define __XIVE_INTERNAL_H
>
> -#define XIVE_IPI_HW_IRQ 0 /* interrupt source # for IPIs */
> -
> /*
> * A "disabled" interrupt should never fire, to catch problems
> * we set its logical number to this
> diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
> index 8eefd152b947..c27f7bb0494b 100644
> --- a/arch/powerpc/sysdev/xive/common.c
> +++ b/arch/powerpc/sysdev/xive/common.c
> @@ -65,8 +65,16 @@ static struct irq_domain *xive_irq_domain;
> #ifdef CONFIG_SMP
> static struct irq_domain *xive_ipi_irq_domain;
>
> -/* The IPIs all use the same logical irq number */
> -static u32 xive_ipi_irq;
> +/* The IPIs use the same logical irq number when on the same chip */
> +static struct xive_ipi_desc {
> + unsigned int irq;
> + char name[8]; /* enough bytes to fit IPI-XXX */
So this assumes that the node number that node is <= 999 ? This
is certainly the case for now since CONFIG_NODES_SHIFT is 8
on ppc64 but starting with 10, you'd have truncated names.
What about deriving the size of name[] from CONFIG_NODES_SHIFT ?
Apart from that, LGTM. Probably not worth to respin just for
this.
I also could give a try in a KVM guest.
Topology passed to QEMU:
-smp 8,maxcpus=8,cores=2,threads=2,sockets=2 \
-numa node,nodeid=0,cpus=0-4 \
-numa node,nodeid=1,cpus=4-7
Topology observed in guest with lstopo :
Package L#0
NUMANode L#0 (P#0 30GB)
L1d L#0 (32KB) + L1i L#0 (32KB) + Core L#0
PU L#0 (P#0)
PU L#1 (P#1)
L1d L#1 (32KB) + L1i L#1 (32KB) + Core L#1
PU L#2 (P#2)
PU L#3 (P#3)
Package L#1
NUMANode L#1 (P#1 32GB)
L1d L#2 (32KB) + L1i L#2 (32KB) + Core L#2
PU L#4 (P#4)
PU L#5 (P#5)
L1d L#3 (32KB) + L1i L#3 (32KB) + Core L#3
PU L#6 (P#6)
PU L#7 (P#7)
Interrupts in guest:
$ cat /proc/interrupts
CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7
16: 1023 871 1042 749 0 0 0 0 XIVE-IPI 0 Edge IPI-0
17: 0 0 0 0 2123 1019 1263 1288 XIVE-IPI 1 Edge IPI-1
IPIs are mapped to the appropriate nodes, and the numbers indicate
that everything is working as expected.
Reviewed-and-tested-by: Greg Kurz <groug@kaod.org>
> +} *xive_ipis;
> +
> +static unsigned int xive_ipi_cpu_to_irq(unsigned int cpu)
> +{
> + return xive_ipis[cpu_to_node(cpu)].irq;
> +}
> #endif
>
> /* Xive state for each CPU */
> @@ -1106,25 +1114,36 @@ static const struct irq_domain_ops xive_ipi_irq_domain_ops = {
>
> static void __init xive_request_ipi(void)
> {
> - unsigned int virq;
> + unsigned int node;
>
> - xive_ipi_irq_domain = irq_domain_add_linear(NULL, 1,
> + xive_ipi_irq_domain = irq_domain_add_linear(NULL, nr_node_ids,
> &xive_ipi_irq_domain_ops, NULL);
> if (WARN_ON(xive_ipi_irq_domain == NULL))
> return;
>
> - /* Initialize it */
> - virq = irq_create_mapping(xive_ipi_irq_domain, XIVE_IPI_HW_IRQ);
> - xive_ipi_irq = virq;
> + xive_ipis = kcalloc(nr_node_ids, sizeof(*xive_ipis), GFP_KERNEL | __GFP_NOFAIL);
> + for_each_node(node) {
> + struct xive_ipi_desc *xid = &xive_ipis[node];
> + irq_hw_number_t node_ipi_hwirq = node;
> +
> + /*
> + * Map one IPI interrupt per node for all cpus of that node.
> + * Since the HW interrupt number doesn't have any meaning,
> + * simply use the node number.
> + */
> + xid->irq = irq_create_mapping(xive_ipi_irq_domain, node_ipi_hwirq);
> + snprintf(xid->name, sizeof(xid->name), "IPI-%d", node);
>
> - WARN_ON(request_irq(virq, xive_muxed_ipi_action,
> - IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL));
> + WARN_ON(request_irq(xid->irq, xive_muxed_ipi_action,
> + IRQF_PERCPU | IRQF_NO_THREAD, xid->name, NULL));
> + }
> }
>
> static int xive_setup_cpu_ipi(unsigned int cpu)
> {
> struct xive_cpu *xc;
> int rc;
> + unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu);
>
> pr_debug("Setting up IPI for CPU %d\n", cpu);
>
> @@ -1165,6 +1184,8 @@ static int xive_setup_cpu_ipi(unsigned int cpu)
>
> static void xive_cleanup_cpu_ipi(unsigned int cpu, struct xive_cpu *xc)
> {
> + unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu);
> +
> /* Disable the IPI and free the IRQ data */
>
> /* Already cleaned up ? */
^ permalink raw reply
* [PATCH 0/4] powerpc/perf: Export processor pipeline stage cycles information
From: Athira Rajeev @ 2021-03-09 14:03 UTC (permalink / raw)
To: linuxppc-dev, linux-kernel, linux-perf-users, mpe, acme, jolsa
Cc: kan.liang, ravi.bangoria, peterz, maddy, kjain
Performance Monitoring Unit (PMU) registers in powerpc exports
number of cycles elapsed between different stages in the pipeline.
Example, sampling registers in ISA v3.1.
This patchset implements kernel and perf tools support to expose
these pipeline stage cycles using the sample type PERF_SAMPLE_WEIGHT_TYPE.
Patch 1/4 adds kernel side support to store the cycle counter
values as part of 'var2_w' and 'var3_w' fields of perf_sample_weight
structure.
Patch 2/4 adds support to make the perf report column header
strings as dynamic.
Patch 3/4 adds powerpc support in perf tools for PERF_SAMPLE_WEIGHT_STRUCT
in sample type: PERF_SAMPLE_WEIGHT_TYPE.
Patch 4/4 adds support to present pipeline stage cycles as part of
mem-mode.
Sample output on powerpc:
# perf mem record ls
# perf mem report
# To display the perf.data header info, please use --header/--header-only options.
#
#
# Total Lost Samples: 0
#
# Samples: 11 of event 'cpu/mem-loads/'
# Total weight : 1332
# Sort order : local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked,local_ins_lat,stall_cyc
#
# Overhead Samples Local Weight Memory access Symbol Shared Object Data Symbol Data Object Snoop TLB access Locked Blocked Finish Cyc Dispatch Cyc
# ........ ............ ............ ........................ .................................. ................ ............................................. ..................... ............ ...................... ...... .......... ............. .............
#
44.14% 1 588 L1 hit [k] rcu_nmi_exit [kernel.vmlinux] [k] 0xc0000007ffdd21b0 [unknown] N/A N/A No N/A 7 5
22.22% 1 296 L1 hit [k] copypage_power7 [kernel.vmlinux] [k] 0xc0000000ff6a1780 [unknown] N/A N/A No N/A 293 3
6.98% 1 93 L1 hit [.] _dl_addr libc-2.31.so [.] 0x00007fff86fa5058 libc-2.31.so N/A N/A No N/A 7 1
6.61% 1 88 L2 hit [.] new_do_write libc-2.31.so [.] _IO_2_1_stdout_+0x0 libc-2.31.so N/A N/A No N/A 84 1
5.93% 1 79 L1 hit [k] printk_nmi_exit [kernel.vmlinux] [k] 0xc0000006085df6b0 [unknown] N/A N/A No N/A 7 1
4.05% 1 54 L2 hit [.] __alloc_dir libc-2.31.so [.] 0x00007fffdb70a640 [stack] N/A N/A No N/A 18 1
3.60% 1 48 L1 hit [.] _init ls [.] 0x000000016ca82118 [heap] N/A N/A No N/A 7 6
2.40% 1 32 L1 hit [k] desc_read [kernel.vmlinux] [k] _printk_rb_static_descs+0x1ea10 [kernel.vmlinux].data N/A N/A No N/A 7 1
1.65% 1 22 L2 hit [k] perf_iterate_ctx.constprop.139 [kernel.vmlinux] [k] 0xc00000064d79e8a8 [unknown] N/A N/A No N/A 16 1
1.58% 1 21 L1 hit [k] perf_event_interrupt [kernel.vmlinux] [k] 0xc0000006085df6b0 [unknown] N/A N/A No N/A 7 1
0.83% 1 11 L1 hit [k] perf_event_exec [kernel.vmlinux] [k] 0xc0000007ffdd3288 [unknown] N/A N/A No N/A 7 4
Athira Rajeev (4):
powerpc/perf: Expose processor pipeline stage cycles using
PERF_SAMPLE_WEIGHT_STRUCT
tools/perf: Add dynamic headers for perf report columns
tools/perf: Add powerpc support for PERF_SAMPLE_WEIGHT_STRUCT
tools/perf: Support pipeline stage cycles for powerpc
arch/powerpc/include/asm/perf_event_server.h | 2 +-
arch/powerpc/perf/core-book3s.c | 4 +--
arch/powerpc/perf/isa207-common.c | 29 ++++++++++++++++--
arch/powerpc/perf/isa207-common.h | 6 +++-
tools/perf/Documentation/perf-report.txt | 1 +
tools/perf/arch/powerpc/util/Build | 2 ++
tools/perf/arch/powerpc/util/event.c | 46 ++++++++++++++++++++++++++++
tools/perf/arch/powerpc/util/evsel.c | 8 +++++
tools/perf/util/event.h | 2 ++
tools/perf/util/hist.c | 11 +++++--
tools/perf/util/hist.h | 1 +
tools/perf/util/session.c | 4 ++-
tools/perf/util/sort.c | 41 +++++++++++++++++++++++--
tools/perf/util/sort.h | 2 ++
14 files changed, 146 insertions(+), 13 deletions(-)
create mode 100644 tools/perf/arch/powerpc/util/event.c
create mode 100644 tools/perf/arch/powerpc/util/evsel.c
--
1.8.3.1
^ permalink raw reply
* [PATCH 1/4] powerpc/perf: Expose processor pipeline stage cycles using PERF_SAMPLE_WEIGHT_STRUCT
From: Athira Rajeev @ 2021-03-09 14:03 UTC (permalink / raw)
To: linuxppc-dev, linux-kernel, linux-perf-users, mpe, acme, jolsa
Cc: kan.liang, ravi.bangoria, peterz, maddy, kjain
In-Reply-To: <1615298640-1529-1-git-send-email-atrajeev@linux.vnet.ibm.com>
Performance Monitoring Unit (PMU) registers in powerpc provides
information on cycles elapsed between different stages in the
pipeline. This can be used for application tuning. On ISA v3.1
platform, this information is exposed by sampling registers.
Patch adds kernel support to capture two of the cycle counters
as part of perf sample using the sample type:
PERF_SAMPLE_WEIGHT_STRUCT.
The power PMU function 'get_mem_weight' currently uses 64 bit weight
field of perf_sample_data to capture memory latency. But following the
introduction of PERF_SAMPLE_WEIGHT_TYPE, weight field could contain
64-bit or 32-bit value depending on the architexture support for
PERF_SAMPLE_WEIGHT_STRUCT. Patches uses WEIGHT_STRUCT to expose the
pipeline stage cycles info. Hence update the ppmu functions to work for
64-bit and 32-bit weight values.
If the sample type is PERF_SAMPLE_WEIGHT, use the 64-bit weight field.
if the sample type is PERF_SAMPLE_WEIGHT_STRUCT, memory subsystem
latency is stored in the low 32bits of perf_sample_weight structure.
Also for CPU_FTR_ARCH_31, capture the two cycle counter information in
two 16 bit fields of perf_sample_weight structure.
Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/perf_event_server.h | 2 +-
arch/powerpc/perf/core-book3s.c | 4 ++--
arch/powerpc/perf/isa207-common.c | 29 +++++++++++++++++++++++++---
arch/powerpc/perf/isa207-common.h | 6 +++++-
4 files changed, 34 insertions(+), 7 deletions(-)
diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h
index 00e7e671bb4b..112cf092d7b3 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -43,7 +43,7 @@ struct power_pmu {
u64 alt[]);
void (*get_mem_data_src)(union perf_mem_data_src *dsrc,
u32 flags, struct pt_regs *regs);
- void (*get_mem_weight)(u64 *weight);
+ void (*get_mem_weight)(u64 *weight, u64 type);
unsigned long group_constraint_mask;
unsigned long group_constraint_val;
u64 (*bhrb_filter_map)(u64 branch_sample_type);
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 6817331e22ff..57ff2494880c 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2206,9 +2206,9 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
ppmu->get_mem_data_src)
ppmu->get_mem_data_src(&data.data_src, ppmu->flags, regs);
- if (event->attr.sample_type & PERF_SAMPLE_WEIGHT &&
+ if (event->attr.sample_type & PERF_SAMPLE_WEIGHT_TYPE &&
ppmu->get_mem_weight)
- ppmu->get_mem_weight(&data.weight.full);
+ ppmu->get_mem_weight(&data.weight.full, event->attr.sample_type);
if (perf_event_overflow(event, &data, regs))
power_pmu_stop(event, 0);
diff --git a/arch/powerpc/perf/isa207-common.c b/arch/powerpc/perf/isa207-common.c
index e4f577da33d8..5dcbdbd54598 100644
--- a/arch/powerpc/perf/isa207-common.c
+++ b/arch/powerpc/perf/isa207-common.c
@@ -284,8 +284,10 @@ void isa207_get_mem_data_src(union perf_mem_data_src *dsrc, u32 flags,
}
}
-void isa207_get_mem_weight(u64 *weight)
+void isa207_get_mem_weight(u64 *weight, u64 type)
{
+ union perf_sample_weight *weight_fields;
+ u64 weight_lat;
u64 mmcra = mfspr(SPRN_MMCRA);
u64 exp = MMCRA_THR_CTR_EXP(mmcra);
u64 mantissa = MMCRA_THR_CTR_MANT(mmcra);
@@ -296,9 +298,30 @@ void isa207_get_mem_weight(u64 *weight)
mantissa = P10_MMCRA_THR_CTR_MANT(mmcra);
if (val == 0 || val == 7)
- *weight = 0;
+ weight_lat = 0;
else
- *weight = mantissa << (2 * exp);
+ weight_lat = mantissa << (2 * exp);
+
+ /*
+ * Use 64 bit weight field (full) if sample type is
+ * WEIGHT.
+ *
+ * if sample type is WEIGHT_STRUCT:
+ * - store memory latency in the lower 32 bits.
+ * - For ISA v3.1, use remaining two 16 bit fields of
+ * perf_sample_weight to store cycle counter values
+ * from sier2.
+ */
+ weight_fields = (union perf_sample_weight *)weight;
+ if (type & PERF_SAMPLE_WEIGHT)
+ weight_fields->full = weight_lat;
+ else {
+ weight_fields->var1_dw = (u32)weight_lat;
+ if (cpu_has_feature(CPU_FTR_ARCH_31)) {
+ weight_fields->var2_w = P10_SIER2_FINISH_CYC(mfspr(SPRN_SIER2));
+ weight_fields->var3_w = P10_SIER2_DISPATCH_CYC(mfspr(SPRN_SIER2));
+ }
+ }
}
int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp, u64 event_config1)
diff --git a/arch/powerpc/perf/isa207-common.h b/arch/powerpc/perf/isa207-common.h
index 1af0e8c97ac7..fc30d43c4d0c 100644
--- a/arch/powerpc/perf/isa207-common.h
+++ b/arch/powerpc/perf/isa207-common.h
@@ -265,6 +265,10 @@
#define ISA207_SIER_DATA_SRC_SHIFT 53
#define ISA207_SIER_DATA_SRC_MASK (0x7ull << ISA207_SIER_DATA_SRC_SHIFT)
+/* Bits in SIER2/SIER3 for Power10 */
+#define P10_SIER2_FINISH_CYC(sier2) (((sier2) >> (63 - 37)) & 0x7fful)
+#define P10_SIER2_DISPATCH_CYC(sier2) (((sier2) >> (63 - 13)) & 0x7fful)
+
#define P(a, b) PERF_MEM_S(a, b)
#define PH(a, b) (P(LVL, HIT) | P(a, b))
#define PM(a, b) (P(LVL, MISS) | P(a, b))
@@ -278,6 +282,6 @@ int isa207_get_alternatives(u64 event, u64 alt[], int size, unsigned int flags,
const unsigned int ev_alt[][MAX_ALT]);
void isa207_get_mem_data_src(union perf_mem_data_src *dsrc, u32 flags,
struct pt_regs *regs);
-void isa207_get_mem_weight(u64 *weight);
+void isa207_get_mem_weight(u64 *weight, u64 type);
#endif
--
1.8.3.1
^ permalink raw reply related
* [PATCH 2/4] tools/perf: Add dynamic headers for perf report columns
From: Athira Rajeev @ 2021-03-09 14:03 UTC (permalink / raw)
To: linuxppc-dev, linux-kernel, linux-perf-users, mpe, acme, jolsa
Cc: kan.liang, ravi.bangoria, peterz, maddy, kjain
In-Reply-To: <1615298640-1529-1-git-send-email-atrajeev@linux.vnet.ibm.com>
Currently the header string for different columns in perf report
is fixed. Some fields of perf sample could have different meaning
for different architectures than the meaning conveyed by the header
string. An example is the new field 'var2_w' of perf_sample_weight
structure. This is presently captured as 'Local INSTR Latency' in
perf mem report. But this could be used to denote a different latency
cycle in another architecture.
Introduce a weak function arch_perf_header_entry__add() to set
the arch specific header string for the fields which can contain dynamic
header. If the architecture do not have this function, fall back to the
default header string value.
Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
---
tools/perf/util/event.h | 1 +
tools/perf/util/sort.c | 19 ++++++++++++++++++-
2 files changed, 19 insertions(+), 1 deletion(-)
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index f603edbbbc6f..89b149e2e70a 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -427,5 +427,6 @@ void cpu_map_data__synthesize(struct perf_record_cpu_map_data *data, struct per
void arch_perf_parse_sample_weight(struct perf_sample *data, const __u64 *array, u64 type);
void arch_perf_synthesize_sample_weight(const struct perf_sample *data, __u64 *array, u64 type);
+const char *arch_perf_header_entry__add(const char *se_header);
#endif /* __PERF_RECORD_H */
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 0d5ad42812b9..741a6df29fa0 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -25,6 +25,7 @@
#include <traceevent/event-parse.h>
#include "mem-events.h"
#include "annotate.h"
+#include "event.h"
#include "time-utils.h"
#include "cgroup.h"
#include "machine.h"
@@ -45,6 +46,7 @@
regex_t ignore_callees_regex;
int have_ignore_callees = 0;
enum sort_mode sort__mode = SORT_MODE__NORMAL;
+const char *dynamic_headers[] = {"local_ins_lat"};
/*
* Replaces all occurrences of a char used with the:
@@ -1816,6 +1818,16 @@ struct sort_dimension {
int taken;
};
+const char * __weak arch_perf_header_entry__add(const char *se_header)
+{
+ return se_header;
+}
+
+static void sort_dimension_add_dynamic_header(struct sort_dimension *sd)
+{
+ sd->entry->se_header = arch_perf_header_entry__add(sd->entry->se_header);
+}
+
#define DIM(d, n, func) [d] = { .name = n, .entry = &(func) }
static struct sort_dimension common_sort_dimensions[] = {
@@ -2739,11 +2751,16 @@ int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
struct evlist *evlist,
int level)
{
- unsigned int i;
+ unsigned int i, j;
for (i = 0; i < ARRAY_SIZE(common_sort_dimensions); i++) {
struct sort_dimension *sd = &common_sort_dimensions[i];
+ for (j = 0; j < ARRAY_SIZE(dynamic_headers); j++) {
+ if (!strcmp(dynamic_headers[j], sd->name))
+ sort_dimension_add_dynamic_header(sd);
+ }
+
if (strncasecmp(tok, sd->name, strlen(tok)))
continue;
--
1.8.3.1
^ permalink raw reply related
* [PATCH 3/4] tools/perf: Add powerpc support for PERF_SAMPLE_WEIGHT_STRUCT
From: Athira Rajeev @ 2021-03-09 14:03 UTC (permalink / raw)
To: linuxppc-dev, linux-kernel, linux-perf-users, mpe, acme, jolsa
Cc: kan.liang, ravi.bangoria, peterz, maddy, kjain
In-Reply-To: <1615298640-1529-1-git-send-email-atrajeev@linux.vnet.ibm.com>
Add arch specific arch_evsel__set_sample_weight() to set the new
sample type for powerpc.
Add arch specific arch_perf_parse_sample_weight() to store the
sample->weight values depending on the sample type applied.
if the new sample type (PERF_SAMPLE_WEIGHT_STRUCT) is applied,
store only the lower 32 bits to sample->weight. If sample type
is 'PERF_SAMPLE_WEIGHT', store the full 64-bit to sample->weight.
Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
---
tools/perf/arch/powerpc/util/Build | 2 ++
tools/perf/arch/powerpc/util/event.c | 32 ++++++++++++++++++++++++++++++++
tools/perf/arch/powerpc/util/evsel.c | 8 ++++++++
3 files changed, 42 insertions(+)
create mode 100644 tools/perf/arch/powerpc/util/event.c
create mode 100644 tools/perf/arch/powerpc/util/evsel.c
diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build
index b7945e5a543b..8a79c4126e5b 100644
--- a/tools/perf/arch/powerpc/util/Build
+++ b/tools/perf/arch/powerpc/util/Build
@@ -4,6 +4,8 @@ perf-y += kvm-stat.o
perf-y += perf_regs.o
perf-y += mem-events.o
perf-y += sym-handling.o
+perf-y += evsel.o
+perf-y += event.o
perf-$(CONFIG_DWARF) += dwarf-regs.o
perf-$(CONFIG_DWARF) += skip-callchain-idx.o
diff --git a/tools/perf/arch/powerpc/util/event.c b/tools/perf/arch/powerpc/util/event.c
new file mode 100644
index 000000000000..f49d32c2c8ae
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/event.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/zalloc.h>
+
+#include "../../../util/event.h"
+#include "../../../util/synthetic-events.h"
+#include "../../../util/machine.h"
+#include "../../../util/tool.h"
+#include "../../../util/map.h"
+#include "../../../util/debug.h"
+
+void arch_perf_parse_sample_weight(struct perf_sample *data,
+ const __u64 *array, u64 type)
+{
+ union perf_sample_weight weight;
+
+ weight.full = *array;
+ if (type & PERF_SAMPLE_WEIGHT)
+ data->weight = weight.full;
+ else
+ data->weight = weight.var1_dw;
+}
+
+void arch_perf_synthesize_sample_weight(const struct perf_sample *data,
+ __u64 *array, u64 type)
+{
+ *array = data->weight;
+
+ if (type & PERF_SAMPLE_WEIGHT_STRUCT)
+ *array &= 0xffffffff;
+}
diff --git a/tools/perf/arch/powerpc/util/evsel.c b/tools/perf/arch/powerpc/util/evsel.c
new file mode 100644
index 000000000000..2f733cdc8dbb
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/evsel.c
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include "util/evsel.h"
+
+void arch_evsel__set_sample_weight(struct evsel *evsel)
+{
+ evsel__set_sample_bit(evsel, WEIGHT_STRUCT);
+}
--
1.8.3.1
^ permalink raw reply related
* [PATCH 4/4] tools/perf: Support pipeline stage cycles for powerpc
From: Athira Rajeev @ 2021-03-09 14:04 UTC (permalink / raw)
To: linuxppc-dev, linux-kernel, linux-perf-users, mpe, acme, jolsa
Cc: kan.liang, ravi.bangoria, peterz, maddy, kjain
In-Reply-To: <1615298640-1529-1-git-send-email-atrajeev@linux.vnet.ibm.com>
The pipeline stage cycles details can be recorded on powerpc from
the contents of Performance Monitor Unit (PMU) registers. On
ISA v3.1 platform, sampling registers exposes the cycles spent in
different pipeline stages. Patch adds perf tools support to present
two of the cycle counter information along with memory latency (weight).
Re-use the field 'ins_lat' for storing the first pipeline stage cycle.
This is stored in 'var2_w' field of 'perf_sample_weight'.
Add a new field 'p_stage_cyc' to store the second pipeline stage cycle
which is stored in 'var3_w' field of perf_sample_weight.
Add new sort function 'Pipeline Stage Cycle' and include this in
default_mem_sort_order[]. This new sort function may be used to denote
some other pipeline stage in another architecture. So add this to
list of sort entries that can have dynamic header string.
Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
---
tools/perf/Documentation/perf-report.txt | 1 +
tools/perf/arch/powerpc/util/event.c | 18 ++++++++++++++++--
tools/perf/util/event.h | 1 +
tools/perf/util/hist.c | 11 ++++++++---
tools/perf/util/hist.h | 1 +
tools/perf/util/session.c | 4 +++-
tools/perf/util/sort.c | 24 ++++++++++++++++++++++--
tools/perf/util/sort.h | 2 ++
8 files changed, 54 insertions(+), 8 deletions(-)
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index f546b5e9db05..9691d9c227ba 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -112,6 +112,7 @@ OPTIONS
- ins_lat: Instruction latency in core cycles. This is the global instruction
latency
- local_ins_lat: Local instruction latency version
+ - p_stage_cyc: Number of cycles spent in a pipeline stage.
By default, comm, dso and symbol keys are used.
(i.e. --sort comm,dso,symbol)
diff --git a/tools/perf/arch/powerpc/util/event.c b/tools/perf/arch/powerpc/util/event.c
index f49d32c2c8ae..b80fbee83b6e 100644
--- a/tools/perf/arch/powerpc/util/event.c
+++ b/tools/perf/arch/powerpc/util/event.c
@@ -18,8 +18,11 @@ void arch_perf_parse_sample_weight(struct perf_sample *data,
weight.full = *array;
if (type & PERF_SAMPLE_WEIGHT)
data->weight = weight.full;
- else
+ else {
data->weight = weight.var1_dw;
+ data->ins_lat = weight.var2_w;
+ data->p_stage_cyc = weight.var3_w;
+ }
}
void arch_perf_synthesize_sample_weight(const struct perf_sample *data,
@@ -27,6 +30,17 @@ void arch_perf_synthesize_sample_weight(const struct perf_sample *data,
{
*array = data->weight;
- if (type & PERF_SAMPLE_WEIGHT_STRUCT)
+ if (type & PERF_SAMPLE_WEIGHT_STRUCT) {
*array &= 0xffffffff;
+ *array |= ((u64)data->ins_lat << 32);
+ }
+}
+
+const char *arch_perf_header_entry__add(const char *se_header)
+{
+ if (!strcmp(se_header, "Local INSTR Latency"))
+ return "Finish Cyc";
+ else if (!strcmp(se_header, "Pipeline Stage Cycle"))
+ return "Dispatch Cyc";
+ return se_header;
}
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 89b149e2e70a..65f89e80916f 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -147,6 +147,7 @@ struct perf_sample {
u8 cpumode;
u16 misc;
u16 ins_lat;
+ u16 p_stage_cyc;
bool no_hw_idx; /* No hw_idx collected in branch_stack */
char insn[MAX_INSN];
void *raw_data;
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index c82f5fc26af8..9299ee535518 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -211,6 +211,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
hists__new_col_len(hists, HISTC_MEM_BLOCKED, 10);
hists__new_col_len(hists, HISTC_LOCAL_INS_LAT, 13);
hists__new_col_len(hists, HISTC_GLOBAL_INS_LAT, 13);
+ hists__new_col_len(hists, HISTC_P_STAGE_CYC, 13);
if (symbol_conf.nanosecs)
hists__new_col_len(hists, HISTC_TIME, 16);
else
@@ -289,13 +290,14 @@ static long hist_time(unsigned long htime)
}
static void he_stat__add_period(struct he_stat *he_stat, u64 period,
- u64 weight, u64 ins_lat)
+ u64 weight, u64 ins_lat, u64 p_stage_cyc)
{
he_stat->period += period;
he_stat->weight += weight;
he_stat->nr_events += 1;
he_stat->ins_lat += ins_lat;
+ he_stat->p_stage_cyc += p_stage_cyc;
}
static void he_stat__add_stat(struct he_stat *dest, struct he_stat *src)
@@ -308,6 +310,7 @@ static void he_stat__add_stat(struct he_stat *dest, struct he_stat *src)
dest->nr_events += src->nr_events;
dest->weight += src->weight;
dest->ins_lat += src->ins_lat;
+ dest->p_stage_cyc += src->p_stage_cyc;
}
static void he_stat__decay(struct he_stat *he_stat)
@@ -597,6 +600,7 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
u64 period = entry->stat.period;
u64 weight = entry->stat.weight;
u64 ins_lat = entry->stat.ins_lat;
+ u64 p_stage_cyc = entry->stat.p_stage_cyc;
bool leftmost = true;
p = &hists->entries_in->rb_root.rb_node;
@@ -615,11 +619,11 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
if (!cmp) {
if (sample_self) {
- he_stat__add_period(&he->stat, period, weight, ins_lat);
+ he_stat__add_period(&he->stat, period, weight, ins_lat, p_stage_cyc);
hist_entry__add_callchain_period(he, period);
}
if (symbol_conf.cumulate_callchain)
- he_stat__add_period(he->stat_acc, period, weight, ins_lat);
+ he_stat__add_period(he->stat_acc, period, weight, ins_lat, p_stage_cyc);
/*
* This mem info was allocated from sample__resolve_mem
@@ -731,6 +735,7 @@ static void hists__res_sample(struct hist_entry *he, struct perf_sample *sample)
.period = sample->period,
.weight = sample->weight,
.ins_lat = sample->ins_lat,
+ .p_stage_cyc = sample->p_stage_cyc,
},
.parent = sym_parent,
.filtered = symbol__parent_filter(sym_parent) | al->filtered,
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 3c537232294b..e2faa745c8d6 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -75,6 +75,7 @@ enum hist_column {
HISTC_MEM_BLOCKED,
HISTC_LOCAL_INS_LAT,
HISTC_GLOBAL_INS_LAT,
+ HISTC_P_STAGE_CYC,
HISTC_NR_COLS, /* Last entry */
};
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 859832a82496..a6fed96d783d 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1302,8 +1302,10 @@ static void dump_sample(struct evsel *evsel, union perf_event *event,
if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
printf("... weight: %" PRIu64 "", sample->weight);
- if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT)
+ if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) {
printf(",0x%"PRIx16"", sample->ins_lat);
+ printf(",0x%"PRIx16"", sample->p_stage_cyc);
+ }
printf("\n");
}
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 741a6df29fa0..cbb3899e7eca 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -37,7 +37,7 @@
const char *parent_pattern = default_parent_pattern;
const char *default_sort_order = "comm,dso,symbol";
const char default_branch_sort_order[] = "comm,dso_from,symbol_from,symbol_to,cycles";
-const char default_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked,local_ins_lat";
+const char default_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked,local_ins_lat,p_stage_cyc";
const char default_top_sort_order[] = "dso,symbol";
const char default_diff_sort_order[] = "dso,symbol";
const char default_tracepoint_sort_order[] = "trace";
@@ -46,7 +46,7 @@
regex_t ignore_callees_regex;
int have_ignore_callees = 0;
enum sort_mode sort__mode = SORT_MODE__NORMAL;
-const char *dynamic_headers[] = {"local_ins_lat"};
+const char *dynamic_headers[] = {"local_ins_lat", "p_stage_cyc"};
/*
* Replaces all occurrences of a char used with the:
@@ -1410,6 +1410,25 @@ struct sort_entry sort_global_ins_lat = {
.se_width_idx = HISTC_GLOBAL_INS_LAT,
};
+static int64_t
+sort__global_p_stage_cyc_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+ return left->stat.p_stage_cyc - right->stat.p_stage_cyc;
+}
+
+static int hist_entry__p_stage_cyc_snprintf(struct hist_entry *he, char *bf,
+ size_t size, unsigned int width)
+{
+ return repsep_snprintf(bf, size, "%-*u", width, he->stat.p_stage_cyc);
+}
+
+struct sort_entry sort_p_stage_cyc = {
+ .se_header = "Pipeline Stage Cycle",
+ .se_cmp = sort__global_p_stage_cyc_cmp,
+ .se_snprintf = hist_entry__p_stage_cyc_snprintf,
+ .se_width_idx = HISTC_P_STAGE_CYC,
+};
+
struct sort_entry sort_mem_daddr_sym = {
.se_header = "Data Symbol",
.se_cmp = sort__daddr_cmp,
@@ -1853,6 +1872,7 @@ static void sort_dimension_add_dynamic_header(struct sort_dimension *sd)
DIM(SORT_CODE_PAGE_SIZE, "code_page_size", sort_code_page_size),
DIM(SORT_LOCAL_INS_LAT, "local_ins_lat", sort_local_ins_lat),
DIM(SORT_GLOBAL_INS_LAT, "ins_lat", sort_global_ins_lat),
+ DIM(SORT_P_STAGE_CYC, "p_stage_cyc", sort_p_stage_cyc),
};
#undef DIM
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 63f67a3f3630..23b20cbbc846 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -51,6 +51,7 @@ struct he_stat {
u64 period_guest_us;
u64 weight;
u64 ins_lat;
+ u64 p_stage_cyc;
u32 nr_events;
};
@@ -234,6 +235,7 @@ enum sort_type {
SORT_CODE_PAGE_SIZE,
SORT_LOCAL_INS_LAT,
SORT_GLOBAL_INS_LAT,
+ SORT_P_STAGE_CYC,
/* branch stack specific sort keys */
__SORT_BRANCH_STACK,
--
1.8.3.1
^ permalink raw reply related
* Re: [PATCH v2 1/8] powerpc/xive: Use cpu_to_node() instead of ibm,chip-id property
From: Cédric Le Goater @ 2021-03-09 15:33 UTC (permalink / raw)
To: Greg Kurz; +Cc: Daniel Henrique Barboza, linuxppc-dev
In-Reply-To: <20210308181359.789c143b@bahia.lan>
On 3/8/21 6:13 PM, Greg Kurz wrote:
> On Wed, 3 Mar 2021 18:48:50 +0100
> Cédric Le Goater <clg@kaod.org> wrote:
>
>> The 'chip_id' field of the XIVE CPU structure is used to choose a
>> target for a source located on the same chip when possible. This field
>> is assigned on the PowerNV platform using the "ibm,chip-id" property
>> on pSeries under KVM when NUMA nodes are defined but it is undefined
>
> This sentence seems to have a syntax problem... like it is missing an
> 'and' before 'on pSeries'.
ah yes, or simply a comma.
>> under PowerVM. The XIVE source structure has a similar field
>> 'src_chip' which is only assigned on the PowerNV platform.
>>
>> cpu_to_node() returns a compatible value on all platforms, 0 being the
>> default node. It will also give us the opportunity to set the affinity
>> of a source on pSeries when we can localize them.
>>
>
> IIUC this relies on the fact that the NUMA node id is == to chip id
> on PowerNV, i.e. xc->chip_id which is passed to OPAL remain stable
> with this change.
Linux sets the NUMA node in numa_setup_cpu(). On pseries, the hcall
H_HOME_NODE_ASSOCIATIVITY returns the node id if I am correct (Daniel
in Cc:)
On PowerNV, Linux uses "ibm,associativity" property of the CPU to find
the node id. This value is built from the chip id in OPAL, so the
value returned by cpu_to_node(cpu) and the value of the "ibm,chip-id"
property are unlikely to be different.
cpu_to_node(cpu) is used in many places to allocate the structures
locally to the owning node. XIVE is not an exception (see below in the
same patch), it is better to be consistent and get the same information
(node id) using the same routine.
In Linux, "ibm,chip-id" is only used in low level PowerNV drivers :
LPC, XSCOM, RNG, VAS, NX. XIVE should be in that list also but skiboot
unifies the controllers of the system to only expose one the OS. This
is problematic and should be changed but it's another topic.
> On the other hand, you have the pSeries case under PowerVM that
> doesn't xc->chip_id, which isn't passed to any hcall AFAICT.
yes "ibm,chip-id" is an OPAL concept unfortunately and it has no meaning
under PAPR. xc->chip_id on pseries (PowerVM) will contains an invalid
chip id.
QEMU/KVM exposes "ibm,chip-id" but it's not used. (its value is not
always correct btw)
> It looks like the chip id is only used for localization purpose in
> this case, right ?
Yes and PAPR sources are not localized. So it's not used. MSI sources
could be if we rewrote the MSI driver.
> In this case, what about doing this change for pSeries only,
> somewhere in spapr.c ?
The IPI code is common to all platforms and all have the same issue.
I rather not.
Thanks,
C.
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>> arch/powerpc/sysdev/xive/common.c | 7 +------
>> 1 file changed, 1 insertion(+), 6 deletions(-)
>>
>> diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
>> index 595310e056f4..b8e456da28aa 100644
>> --- a/arch/powerpc/sysdev/xive/common.c
>> +++ b/arch/powerpc/sysdev/xive/common.c
>> @@ -1335,16 +1335,11 @@ static int xive_prepare_cpu(unsigned int cpu)
>>
>> xc = per_cpu(xive_cpu, cpu);
>> if (!xc) {
>> - struct device_node *np;
>> -
>> xc = kzalloc_node(sizeof(struct xive_cpu),
>> GFP_KERNEL, cpu_to_node(cpu));
>> if (!xc)
>> return -ENOMEM;
>> - np = of_get_cpu_node(cpu, NULL);
>> - if (np)
>> - xc->chip_id = of_get_ibm_chip_id(np);
>> - of_node_put(np);
>> + xc->chip_id = cpu_to_node(cpu);
>> xc->hw_ipi = XIVE_BAD_IRQ;
>>
>> per_cpu(xive_cpu, cpu) = xc;
>
^ permalink raw reply
* Re: [PATCH v2 4/8] powerpc/xive: Simplify xive_core_debug_show()
From: Cédric Le Goater @ 2021-03-09 15:39 UTC (permalink / raw)
To: Greg Kurz; +Cc: linuxppc-dev
In-Reply-To: <20210309104220.663840a2@bahia.lan>
On 3/9/21 10:42 AM, Greg Kurz wrote:
> On Tue, 9 Mar 2021 10:13:39 +0100
> Greg Kurz <groug@kaod.org> wrote:
>
>> On Mon, 8 Mar 2021 19:11:11 +0100
>> Cédric Le Goater <clg@kaod.org> wrote:
>>
>>> On 3/8/21 7:07 PM, Greg Kurz wrote:
>>>> On Wed, 3 Mar 2021 18:48:53 +0100
>>>> Cédric Le Goater <clg@kaod.org> wrote:
>>>>
>>>>> Now that the IPI interrupt has its own domain, the checks on the HW
>>>>> interrupt number XIVE_IPI_HW_IRQ and on the chip can be replaced by a
>>>>> check on the domain.
>>>>>
>>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>>>> ---
>>>>
>>>> Shouldn't this have the following tags ?
>>>>
>>>> Reported-by: kernel test robot <lkp@intel.com>
>>>> Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
>>>> Fixes: 930914b7d528 ("powerpc/xive: Add a debugfs file to dump internal XIVE state")
>>>
>>> The next patch has because it removes the useless check on irq_data.
>>>
>>
>> Ok I get it. This report isn't about an actual crash. Just a false
>> positive because of the not needed check in the caller.
>>
>
> Hrm... I meant because of the check in xive_debug_show_irq(). On the
> contrary, the check removed by this patch in xive_core_debug_show()
> was rather an explicit hint that xive_debug_show_irq() couldn't be
> called with d being NULL.
yes. irq_desc_get_irq_data() does not return a NULL value and
xive_debug_show_irq() is only called from the for_each_irq_desc()
loop.
C.
>
>>> C.
>>>
>>>>
>>>> Anyway,
>>>>
>>>> Reviewed-by: Greg Kurz <groug@kaod.org>
>>>>
>>>>> arch/powerpc/sysdev/xive/common.c | 18 ++++--------------
>>>>> 1 file changed, 4 insertions(+), 14 deletions(-)
>>>>>
>>>>> diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
>>>>> index 678680531d26..7581cb12bb53 100644
>>>>> --- a/arch/powerpc/sysdev/xive/common.c
>>>>> +++ b/arch/powerpc/sysdev/xive/common.c
>>>>> @@ -1579,17 +1579,14 @@ static void xive_debug_show_cpu(struct seq_file *m, int cpu)
>>>>> seq_puts(m, "\n");
>>>>> }
>>>>>
>>>>> -static void xive_debug_show_irq(struct seq_file *m, u32 hw_irq, struct irq_data *d)
>>>>> +static void xive_debug_show_irq(struct seq_file *m, struct irq_data *d)
>>>>> {
>>>>> - struct irq_chip *chip = irq_data_get_irq_chip(d);
>>>>> + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
>>>>> int rc;
>>>>> u32 target;
>>>>> u8 prio;
>>>>> u32 lirq;
>>>>>
>>>>> - if (!is_xive_irq(chip))
>>>>> - return;
>>>>> -
>>>>> rc = xive_ops->get_irq_config(hw_irq, &target, &prio, &lirq);
>>>>> if (rc) {
>>>>> seq_printf(m, "IRQ 0x%08x : no config rc=%d\n", hw_irq, rc);
>>>>> @@ -1627,16 +1624,9 @@ static int xive_core_debug_show(struct seq_file *m, void *private)
>>>>>
>>>>> for_each_irq_desc(i, desc) {
>>>>> struct irq_data *d = irq_desc_get_irq_data(desc);
>>>>> - unsigned int hw_irq;
>>>>> -
>>>>> - if (!d)
>>>>> - continue;
>>>>> -
>>>>> - hw_irq = (unsigned int)irqd_to_hwirq(d);
>>>>>
>>>>> - /* IPIs are special (HW number 0) */
>>>>> - if (hw_irq != XIVE_IPI_HW_IRQ)
>>>>> - xive_debug_show_irq(m, hw_irq, d);
>>>>> + if (d->domain == xive_irq_domain)
>>>>> + xive_debug_show_irq(m, d);
>>>>> }
>>>>> return 0;
>>>>> }
>>>>
>>>
>>
>
^ permalink raw reply
* Re: [PATCH 3/3] powerpc/qspinlock: Use generic smp_cond_load_relaxed
From: Davidlohr Bueso @ 2021-03-09 15:46 UTC (permalink / raw)
To: Michal Such�nek
Cc: Davidlohr Bueso, peterz, will, linux-kernel, npiggin, mingo,
paulus, longman, linuxppc-dev
In-Reply-To: <20210309093912.GW6564@kitsune.suse.cz>
On Tue, 09 Mar 2021, Michal Such�nek wrote:
>On Mon, Mar 08, 2021 at 05:59:50PM -0800, Davidlohr Bueso wrote:
>> 49a7d46a06c3 (powerpc: Implement smp_cond_load_relaxed()) added
>> busy-waiting pausing with a preferred SMT priority pattern, lowering
>> the priority (reducing decode cycles) during the whole loop slowpath.
>>
>> However, data shows that while this pattern works well with simple
> ^^^^^^^^^^^^^^^^^^^^^^
>> spinlocks, queued spinlocks benefit more being kept in medium priority,
>> with a cpu_relax() instead, being a low+medium combo on powerpc.
>...
>>
>> diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h
>> index aecfde829d5d..7ae29cfb06c0 100644
>> --- a/arch/powerpc/include/asm/barrier.h
>> +++ b/arch/powerpc/include/asm/barrier.h
>> @@ -80,22 +80,6 @@ do { \
>> ___p1; \
>> })
>>
>> -#ifdef CONFIG_PPC64
>Maybe it should be kept for the simple spinlock case then?
It is kept, note that simple spinlocks don't use smp_cond_load_relaxed,
but instead deal with the priorities in arch_spin_lock(), so it will
spin in low priority until it sees a chance to take the lock, where
it switches back to medium.
Thanks,
Davidlohr
^ permalink raw reply
* Re: [PATCH v2 7/8] powerpc/xive: Fix xmon command "dxi"
From: Cédric Le Goater @ 2021-03-09 15:49 UTC (permalink / raw)
To: Greg Kurz; +Cc: linuxppc-dev, kernel test robot, Dan Carpenter
In-Reply-To: <20210309112325.7b161cc7@bahia.lan>
On 3/9/21 11:23 AM, Greg Kurz wrote:
> On Wed, 3 Mar 2021 18:48:56 +0100
> Cédric Le Goater <clg@kaod.org> wrote:
>
>> When under xmon, the "dxi" command dumps the state of the XIVE
>> interrupts. If an interrupt number is specified, only the state of
>> the associated XIVE interrupt is dumped. This form of the command
>> lacks an irq_data parameter which is nevertheless used by
>> xmon_xive_get_irq_config(), leading to an xmon crash.
>>
>> Fix that by doing a lookup in the system IRQ mapping to query the IRQ
>> descriptor data. Invalid interrupt numbers, or not belonging to the
>> XIVE IRQ domain, OPAL event interrupt number for instance, should be
>> caught by the previous query done at the firmware level.
>>
>> Reported-by: kernel test robot <lkp@intel.com>
>> Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
>> Fixes: 97ef27507793 ("powerpc/xive: Fix xmon support on the PowerNV platform")
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>
> I've tested this in a KVM guest and it seems to do the job.
>
> 6:mon> dxi 1201
> IRQ 0x00001201 : target=0xfffffc00 prio=ff lirq=0x0 flags= LH PQ=-Q
>
> Bad HW irq numbers are filtered by the hypervisor:
>
> 6:mon> dxi bad
> [ 696.390577] xive: H_INT_GET_SOURCE_CONFIG lisn=2989 failed -55
> IRQ 0x00000bad : no config rc=-6
>
> Note that this also allows to show IPIs:
>
> 6:mon> dxi 0
> IRQ 0x00000000 : target=0x0 prio=06 lirq=0x10
>
> This is a bit inconsistent with output of the 0-argument form of "dxi",
It's an hidden feature ! :)
Yes. You can query at the FW level the configuration of any valid HW
interrupt number where as "dxi" without an argument only loops on the
XIVE IRQ domain which does not include the XIVE CPU IPIs which are
special. You should "dxa" for these.
> which filters them out for a reason that isn't obvious to me.
For historical reason. XIVE support for PowerNV was the first to reach
Linux. If you run the same xmon commands on a PowerNV machine (you could
use QEMU), the ouput is different. it has more low level details.
> No big deal though, this should be addressed in another patch anyway.
We could simplify the xmon helpers to be sync with the debugfs one
and the QEMU/KVM "info pic" command. I agree.
Thanks,
C.
> Reviewed-and-tested-by: Greg Kurz <groug@kaod.org>
>
>> arch/powerpc/sysdev/xive/common.c | 14 ++++++++++----
>> 1 file changed, 10 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
>> index f6b7b15bbb3a..8eefd152b947 100644
>> --- a/arch/powerpc/sysdev/xive/common.c
>> +++ b/arch/powerpc/sysdev/xive/common.c
>> @@ -255,17 +255,20 @@ notrace void xmon_xive_do_dump(int cpu)
>> xmon_printf("\n");
>> }
>>
>> +static struct irq_data *xive_get_irq_data(u32 hw_irq)
>> +{
>> + unsigned int irq = irq_find_mapping(xive_irq_domain, hw_irq);
>> +
>> + return irq ? irq_get_irq_data(irq) : NULL;
>> +}
>> +
>> int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d)
>> {
>> - struct irq_chip *chip = irq_data_get_irq_chip(d);
>> int rc;
>> u32 target;
>> u8 prio;
>> u32 lirq;
>>
>> - if (!is_xive_irq(chip))
>> - return -EINVAL;
>> -
>> rc = xive_ops->get_irq_config(hw_irq, &target, &prio, &lirq);
>> if (rc) {
>> xmon_printf("IRQ 0x%08x : no config rc=%d\n", hw_irq, rc);
>> @@ -275,6 +278,9 @@ int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d)
>> xmon_printf("IRQ 0x%08x : target=0x%x prio=%02x lirq=0x%x ",
>> hw_irq, target, prio, lirq);
>>
>> + if (!d)
>> + d = xive_get_irq_data(hw_irq);
>> +
>> if (d) {
>> struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
>> u64 val = xive_esb_read(xd, XIVE_ESB_GET);
>
^ permalink raw reply
* [PATCH 1/9] fs: rename alloc_anon_inode to alloc_anon_inode_sb
From: Christoph Hellwig @ 2021-03-09 15:53 UTC (permalink / raw)
To: Al Viro
Cc: Jason Gunthorpe, David Hildenbrand, VMware, Inc.,
Michael S. Tsirkin, linux-kernel, dri-devel, virtualization,
linux-mm, Minchan Kim, Alex Williamson, Nadav Amit, Daniel Vetter,
linux-fsdevel, Andrew Morton, linuxppc-dev, Nitin Gupta
In-Reply-To: <20210309155348.974875-1-hch@lst.de>
Rename alloc_inode to free the name for a new variant that does not
need boilerplate to create a super_block first.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
arch/powerpc/platforms/pseries/cmm.c | 2 +-
drivers/dma-buf/dma-buf.c | 2 +-
drivers/gpu/drm/drm_drv.c | 2 +-
drivers/misc/cxl/api.c | 2 +-
drivers/misc/vmw_balloon.c | 2 +-
drivers/scsi/cxlflash/ocxl_hw.c | 2 +-
drivers/virtio/virtio_balloon.c | 2 +-
fs/aio.c | 2 +-
fs/anon_inodes.c | 4 ++--
fs/libfs.c | 2 +-
include/linux/fs.h | 2 +-
kernel/resource.c | 2 +-
mm/z3fold.c | 2 +-
mm/zsmalloc.c | 2 +-
14 files changed, 15 insertions(+), 15 deletions(-)
diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c
index 45a3a3022a85c9..6d36b858b14df1 100644
--- a/arch/powerpc/platforms/pseries/cmm.c
+++ b/arch/powerpc/platforms/pseries/cmm.c
@@ -580,7 +580,7 @@ static int cmm_balloon_compaction_init(void)
return rc;
}
- b_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb);
+ b_dev_info.inode = alloc_anon_inode_sb(balloon_mnt->mnt_sb);
if (IS_ERR(b_dev_info.inode)) {
rc = PTR_ERR(b_dev_info.inode);
b_dev_info.inode = NULL;
diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index f264b70c383eb4..dedcc9483352dc 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -445,7 +445,7 @@ static inline int is_dma_buf_file(struct file *file)
static struct file *dma_buf_getfile(struct dma_buf *dmabuf, int flags)
{
struct file *file;
- struct inode *inode = alloc_anon_inode(dma_buf_mnt->mnt_sb);
+ struct inode *inode = alloc_anon_inode_sb(dma_buf_mnt->mnt_sb);
if (IS_ERR(inode))
return ERR_CAST(inode);
diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
index 20d22e41d7ce74..87e7214a8e3565 100644
--- a/drivers/gpu/drm/drm_drv.c
+++ b/drivers/gpu/drm/drm_drv.c
@@ -519,7 +519,7 @@ static struct inode *drm_fs_inode_new(void)
return ERR_PTR(r);
}
- inode = alloc_anon_inode(drm_fs_mnt->mnt_sb);
+ inode = alloc_anon_inode_sb(drm_fs_mnt->mnt_sb);
if (IS_ERR(inode))
simple_release_fs(&drm_fs_mnt, &drm_fs_cnt);
diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c
index b493de962153ba..2efbf6c98028ef 100644
--- a/drivers/misc/cxl/api.c
+++ b/drivers/misc/cxl/api.c
@@ -73,7 +73,7 @@ static struct file *cxl_getfile(const char *name,
goto err_module;
}
- inode = alloc_anon_inode(cxl_vfs_mount->mnt_sb);
+ inode = alloc_anon_inode_sb(cxl_vfs_mount->mnt_sb);
if (IS_ERR(inode)) {
file = ERR_CAST(inode);
goto err_fs;
diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c
index b837e7eba5f7dc..5d057a05ddbee8 100644
--- a/drivers/misc/vmw_balloon.c
+++ b/drivers/misc/vmw_balloon.c
@@ -1900,7 +1900,7 @@ static __init int vmballoon_compaction_init(struct vmballoon *b)
return PTR_ERR(vmballoon_mnt);
b->b_dev_info.migratepage = vmballoon_migratepage;
- b->b_dev_info.inode = alloc_anon_inode(vmballoon_mnt->mnt_sb);
+ b->b_dev_info.inode = alloc_anon_inode_sb(vmballoon_mnt->mnt_sb);
if (IS_ERR(b->b_dev_info.inode))
return PTR_ERR(b->b_dev_info.inode);
diff --git a/drivers/scsi/cxlflash/ocxl_hw.c b/drivers/scsi/cxlflash/ocxl_hw.c
index 244fc27215dc79..40184ed926b557 100644
--- a/drivers/scsi/cxlflash/ocxl_hw.c
+++ b/drivers/scsi/cxlflash/ocxl_hw.c
@@ -88,7 +88,7 @@ static struct file *ocxlflash_getfile(struct device *dev, const char *name,
goto err2;
}
- inode = alloc_anon_inode(ocxlflash_vfs_mount->mnt_sb);
+ inode = alloc_anon_inode_sb(ocxlflash_vfs_mount->mnt_sb);
if (IS_ERR(inode)) {
rc = PTR_ERR(inode);
dev_err(dev, "%s: alloc_anon_inode failed rc=%d\n",
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 8985fc2cea8615..cae76ee5bdd688 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -916,7 +916,7 @@ static int virtballoon_probe(struct virtio_device *vdev)
}
vb->vb_dev_info.migratepage = virtballoon_migratepage;
- vb->vb_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb);
+ vb->vb_dev_info.inode = alloc_anon_inode_sb(balloon_mnt->mnt_sb);
if (IS_ERR(vb->vb_dev_info.inode)) {
err = PTR_ERR(vb->vb_dev_info.inode);
goto out_kern_unmount;
diff --git a/fs/aio.c b/fs/aio.c
index 1f32da13d39ee6..d1c2aa7fd6de7c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -234,7 +234,7 @@ static const struct address_space_operations aio_ctx_aops;
static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
{
struct file *file;
- struct inode *inode = alloc_anon_inode(aio_mnt->mnt_sb);
+ struct inode *inode = alloc_anon_inode_sb(aio_mnt->mnt_sb);
if (IS_ERR(inode))
return ERR_CAST(inode);
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index a280156138ed89..4745fc37014332 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -63,7 +63,7 @@ static struct inode *anon_inode_make_secure_inode(
const struct qstr qname = QSTR_INIT(name, strlen(name));
int error;
- inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
+ inode = alloc_anon_inode_sb(anon_inode_mnt->mnt_sb);
if (IS_ERR(inode))
return inode;
inode->i_flags &= ~S_PRIVATE;
@@ -231,7 +231,7 @@ static int __init anon_inode_init(void)
if (IS_ERR(anon_inode_mnt))
panic("anon_inode_init() kernel mount failed (%ld)\n", PTR_ERR(anon_inode_mnt));
- anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
+ anon_inode_inode = alloc_anon_inode_sb(anon_inode_mnt->mnt_sb);
if (IS_ERR(anon_inode_inode))
panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(anon_inode_inode));
diff --git a/fs/libfs.c b/fs/libfs.c
index e2de5401abca5a..600bebc1cd847f 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1216,7 +1216,7 @@ static int anon_set_page_dirty(struct page *page)
return 0;
};
-struct inode *alloc_anon_inode(struct super_block *s)
+struct inode *alloc_anon_inode_sb(struct super_block *s)
{
static const struct address_space_operations anon_aops = {
.set_page_dirty = anon_set_page_dirty,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ec8f3ddf4a6aa8..52387368af3c00 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3286,7 +3286,7 @@ extern int simple_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
extern int always_delete_dentry(const struct dentry *);
-extern struct inode *alloc_anon_inode(struct super_block *);
+extern struct inode *alloc_anon_inode_sb(struct super_block *);
extern int simple_nosetlease(struct file *, long, struct file_lock **, void **);
extern const struct dentry_operations simple_dentry_operations;
diff --git a/kernel/resource.c b/kernel/resource.c
index 627e61b0c12418..0fd091a3f2fc66 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1863,7 +1863,7 @@ static int __init iomem_init_inode(void)
return rc;
}
- inode = alloc_anon_inode(iomem_vfs_mount->mnt_sb);
+ inode = alloc_anon_inode_sb(iomem_vfs_mount->mnt_sb);
if (IS_ERR(inode)) {
rc = PTR_ERR(inode);
pr_err("Cannot allocate inode for iomem: %d\n", rc);
diff --git a/mm/z3fold.c b/mm/z3fold.c
index b5dafa7e44e429..e7cd9298b221f5 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -376,7 +376,7 @@ static void z3fold_unmount(void)
static const struct address_space_operations z3fold_aops;
static int z3fold_register_migration(struct z3fold_pool *pool)
{
- pool->inode = alloc_anon_inode(z3fold_mnt->mnt_sb);
+ pool->inode = alloc_anon_inode_sb(z3fold_mnt->mnt_sb);
if (IS_ERR(pool->inode)) {
pool->inode = NULL;
return 1;
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 30c358b7202510..a6449a2ad861de 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -2086,7 +2086,7 @@ static const struct address_space_operations zsmalloc_aops = {
static int zs_register_migration(struct zs_pool *pool)
{
- pool->inode = alloc_anon_inode(zsmalloc_mnt->mnt_sb);
+ pool->inode = alloc_anon_inode_sb(zsmalloc_mnt->mnt_sb);
if (IS_ERR(pool->inode)) {
pool->inode = NULL;
return 1;
--
2.30.1
^ permalink raw reply related
* make alloc_anon_inode more useful
From: Christoph Hellwig @ 2021-03-09 15:53 UTC (permalink / raw)
To: Al Viro
Cc: Jason Gunthorpe, David Hildenbrand, VMware, Inc.,
Michael S. Tsirkin, linux-kernel, dri-devel, virtualization,
linux-mm, Minchan Kim, Alex Williamson, Nadav Amit, Daniel Vetter,
linux-fsdevel, Andrew Morton, linuxppc-dev, Nitin Gupta
Hi all,
this series first renames the existing alloc_anon_inode to
alloc_anon_inode_sb to clearly mark it as requiring a superblock.
It then adds a new alloc_anon_inode that works on the anon_inode
file system super block, thus removing tons of boilerplate code.
The few remainig callers of alloc_anon_inode_sb all use alloc_file_pseudo
later, but might also be ripe for some cleanup.
Diffstat:
arch/powerpc/platforms/pseries/cmm.c | 27 +-------------
drivers/dma-buf/dma-buf.c | 2 -
drivers/gpu/drm/drm_drv.c | 64 +----------------------------------
drivers/misc/cxl/api.c | 2 -
drivers/misc/vmw_balloon.c | 24 +------------
drivers/scsi/cxlflash/ocxl_hw.c | 2 -
drivers/virtio/virtio_balloon.c | 30 +---------------
fs/aio.c | 2 -
fs/anon_inodes.c | 15 +++++++-
fs/libfs.c | 2 -
include/linux/anon_inodes.h | 1
include/linux/fs.h | 2 -
kernel/resource.c | 30 ++--------------
mm/z3fold.c | 38 +-------------------
mm/zsmalloc.c | 48 +-------------------------
15 files changed, 39 insertions(+), 250 deletions(-)
^ permalink raw reply
* [PATCH 2/9] fs: add an argument-less alloc_anon_inode
From: Christoph Hellwig @ 2021-03-09 15:53 UTC (permalink / raw)
To: Al Viro
Cc: Jason Gunthorpe, David Hildenbrand, VMware, Inc.,
Michael S. Tsirkin, linux-kernel, dri-devel, virtualization,
linux-mm, Minchan Kim, Alex Williamson, Nadav Amit, Daniel Vetter,
linux-fsdevel, Andrew Morton, linuxppc-dev, Nitin Gupta
In-Reply-To: <20210309155348.974875-1-hch@lst.de>
Add a new alloc_anon_inode helper that allocates an inode on
the anon_inode file system.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/anon_inodes.c | 15 +++++++++++++--
include/linux/anon_inodes.h | 1 +
2 files changed, 14 insertions(+), 2 deletions(-)
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 4745fc37014332..b6a8ea71920bc3 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -63,7 +63,7 @@ static struct inode *anon_inode_make_secure_inode(
const struct qstr qname = QSTR_INIT(name, strlen(name));
int error;
- inode = alloc_anon_inode_sb(anon_inode_mnt->mnt_sb);
+ inode = alloc_anon_inode();
if (IS_ERR(inode))
return inode;
inode->i_flags &= ~S_PRIVATE;
@@ -225,13 +225,24 @@ int anon_inode_getfd_secure(const char *name, const struct file_operations *fops
}
EXPORT_SYMBOL_GPL(anon_inode_getfd_secure);
+/**
+ * alloc_anon_inode - create a new anonymous inode
+ *
+ * Create an inode on the anon_inode file system and return it.
+ */
+struct inode *alloc_anon_inode(void)
+{
+ return alloc_anon_inode_sb(anon_inode_mnt->mnt_sb);
+}
+EXPORT_SYMBOL_GPL(alloc_anon_inode);
+
static int __init anon_inode_init(void)
{
anon_inode_mnt = kern_mount(&anon_inode_fs_type);
if (IS_ERR(anon_inode_mnt))
panic("anon_inode_init() kernel mount failed (%ld)\n", PTR_ERR(anon_inode_mnt));
- anon_inode_inode = alloc_anon_inode_sb(anon_inode_mnt->mnt_sb);
+ anon_inode_inode = alloc_anon_inode();
if (IS_ERR(anon_inode_inode))
panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(anon_inode_inode));
diff --git a/include/linux/anon_inodes.h b/include/linux/anon_inodes.h
index 71881a2b6f7860..b5ae9a6eda9923 100644
--- a/include/linux/anon_inodes.h
+++ b/include/linux/anon_inodes.h
@@ -21,6 +21,7 @@ int anon_inode_getfd_secure(const char *name,
const struct file_operations *fops,
void *priv, int flags,
const struct inode *context_inode);
+struct inode *alloc_anon_inode(void);
#endif /* _LINUX_ANON_INODES_H */
--
2.30.1
^ permalink raw reply related
* [PATCH 3/9] powerpc/pseries: remove the ppc-cmm file system
From: Christoph Hellwig @ 2021-03-09 15:53 UTC (permalink / raw)
To: Al Viro
Cc: Jason Gunthorpe, David Hildenbrand, VMware, Inc.,
Michael S. Tsirkin, linux-kernel, dri-devel, virtualization,
linux-mm, Minchan Kim, Alex Williamson, Nadav Amit, Daniel Vetter,
linux-fsdevel, Andrew Morton, linuxppc-dev, Nitin Gupta
In-Reply-To: <20210309155348.974875-1-hch@lst.de>
Just use the generic anon_inode file system.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
arch/powerpc/platforms/pseries/cmm.c | 27 ++-------------------------
1 file changed, 2 insertions(+), 25 deletions(-)
diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c
index 6d36b858b14df1..9d07e6bea7126c 100644
--- a/arch/powerpc/platforms/pseries/cmm.c
+++ b/arch/powerpc/platforms/pseries/cmm.c
@@ -6,6 +6,7 @@
* Author(s): Brian King (brking@linux.vnet.ibm.com),
*/
+#include <linux/anon_inodes.h>
#include <linux/ctype.h>
#include <linux/delay.h>
#include <linux/errno.h>
@@ -502,19 +503,6 @@ static struct notifier_block cmm_mem_nb = {
};
#ifdef CONFIG_BALLOON_COMPACTION
-static struct vfsmount *balloon_mnt;
-
-static int cmm_init_fs_context(struct fs_context *fc)
-{
- return init_pseudo(fc, PPC_CMM_MAGIC) ? 0 : -ENOMEM;
-}
-
-static struct file_system_type balloon_fs = {
- .name = "ppc-cmm",
- .init_fs_context = cmm_init_fs_context,
- .kill_sb = kill_anon_super,
-};
-
static int cmm_migratepage(struct balloon_dev_info *b_dev_info,
struct page *newpage, struct page *page,
enum migrate_mode mode)
@@ -573,19 +561,10 @@ static int cmm_balloon_compaction_init(void)
balloon_devinfo_init(&b_dev_info);
b_dev_info.migratepage = cmm_migratepage;
- balloon_mnt = kern_mount(&balloon_fs);
- if (IS_ERR(balloon_mnt)) {
- rc = PTR_ERR(balloon_mnt);
- balloon_mnt = NULL;
- return rc;
- }
-
- b_dev_info.inode = alloc_anon_inode_sb(balloon_mnt->mnt_sb);
+ b_dev_info.inode = alloc_anon_inode();
if (IS_ERR(b_dev_info.inode)) {
rc = PTR_ERR(b_dev_info.inode);
b_dev_info.inode = NULL;
- kern_unmount(balloon_mnt);
- balloon_mnt = NULL;
return rc;
}
@@ -597,8 +576,6 @@ static void cmm_balloon_compaction_deinit(void)
if (b_dev_info.inode)
iput(b_dev_info.inode);
b_dev_info.inode = NULL;
- kern_unmount(balloon_mnt);
- balloon_mnt = NULL;
}
#else /* CONFIG_BALLOON_COMPACTION */
static int cmm_balloon_compaction_init(void)
--
2.30.1
^ permalink raw reply related
* [PATCH 4/9] drm: remove the drm file system
From: Christoph Hellwig @ 2021-03-09 15:53 UTC (permalink / raw)
To: Al Viro
Cc: Jason Gunthorpe, David Hildenbrand, VMware, Inc.,
Michael S. Tsirkin, linux-kernel, dri-devel, virtualization,
linux-mm, Minchan Kim, Alex Williamson, Nadav Amit, Daniel Vetter,
linux-fsdevel, Andrew Morton, linuxppc-dev, Nitin Gupta
In-Reply-To: <20210309155348.974875-1-hch@lst.de>
Just use the generic anon_inode file system.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
drivers/gpu/drm/drm_drv.c | 64 ++-------------------------------------
1 file changed, 3 insertions(+), 61 deletions(-)
diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
index 87e7214a8e3565..af293d76f979e5 100644
--- a/drivers/gpu/drm/drm_drv.c
+++ b/drivers/gpu/drm/drm_drv.c
@@ -26,6 +26,7 @@
* DEALINGS IN THE SOFTWARE.
*/
+#include <linux/anon_inodes.h>
#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/module.h>
@@ -475,65 +476,6 @@ void drm_dev_unplug(struct drm_device *dev)
}
EXPORT_SYMBOL(drm_dev_unplug);
-/*
- * DRM internal mount
- * We want to be able to allocate our own "struct address_space" to control
- * memory-mappings in VRAM (or stolen RAM, ...). However, core MM does not allow
- * stand-alone address_space objects, so we need an underlying inode. As there
- * is no way to allocate an independent inode easily, we need a fake internal
- * VFS mount-point.
- *
- * The drm_fs_inode_new() function allocates a new inode, drm_fs_inode_free()
- * frees it again. You are allowed to use iget() and iput() to get references to
- * the inode. But each drm_fs_inode_new() call must be paired with exactly one
- * drm_fs_inode_free() call (which does not have to be the last iput()).
- * We use drm_fs_inode_*() to manage our internal VFS mount-point and share it
- * between multiple inode-users. You could, technically, call
- * iget() + drm_fs_inode_free() directly after alloc and sometime later do an
- * iput(), but this way you'd end up with a new vfsmount for each inode.
- */
-
-static int drm_fs_cnt;
-static struct vfsmount *drm_fs_mnt;
-
-static int drm_fs_init_fs_context(struct fs_context *fc)
-{
- return init_pseudo(fc, 0x010203ff) ? 0 : -ENOMEM;
-}
-
-static struct file_system_type drm_fs_type = {
- .name = "drm",
- .owner = THIS_MODULE,
- .init_fs_context = drm_fs_init_fs_context,
- .kill_sb = kill_anon_super,
-};
-
-static struct inode *drm_fs_inode_new(void)
-{
- struct inode *inode;
- int r;
-
- r = simple_pin_fs(&drm_fs_type, &drm_fs_mnt, &drm_fs_cnt);
- if (r < 0) {
- DRM_ERROR("Cannot mount pseudo fs: %d\n", r);
- return ERR_PTR(r);
- }
-
- inode = alloc_anon_inode_sb(drm_fs_mnt->mnt_sb);
- if (IS_ERR(inode))
- simple_release_fs(&drm_fs_mnt, &drm_fs_cnt);
-
- return inode;
-}
-
-static void drm_fs_inode_free(struct inode *inode)
-{
- if (inode) {
- iput(inode);
- simple_release_fs(&drm_fs_mnt, &drm_fs_cnt);
- }
-}
-
/**
* DOC: component helper usage recommendations
*
@@ -563,7 +505,7 @@ static void drm_dev_init_release(struct drm_device *dev, void *res)
{
drm_legacy_ctxbitmap_cleanup(dev);
drm_legacy_remove_map_hash(dev);
- drm_fs_inode_free(dev->anon_inode);
+ iput(dev->anon_inode);
put_device(dev->dev);
/* Prevent use-after-free in drm_managed_release when debugging is
@@ -616,7 +558,7 @@ static int drm_dev_init(struct drm_device *dev,
if (ret)
return ret;
- dev->anon_inode = drm_fs_inode_new();
+ dev->anon_inode = alloc_anon_inode();
if (IS_ERR(dev->anon_inode)) {
ret = PTR_ERR(dev->anon_inode);
DRM_ERROR("Cannot allocate anonymous inode: %d\n", ret);
--
2.30.1
^ permalink raw reply related
* [PATCH 5/9] vmw_balloon: remove the balloon-vmware file system
From: Christoph Hellwig @ 2021-03-09 15:53 UTC (permalink / raw)
To: Al Viro
Cc: Jason Gunthorpe, David Hildenbrand, VMware, Inc.,
Michael S. Tsirkin, linux-kernel, dri-devel, virtualization,
linux-mm, Minchan Kim, Alex Williamson, Nadav Amit, Daniel Vetter,
linux-fsdevel, Andrew Morton, linuxppc-dev, Nitin Gupta
In-Reply-To: <20210309155348.974875-1-hch@lst.de>
Just use the generic anon_inode file system.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
drivers/misc/vmw_balloon.c | 24 ++----------------------
1 file changed, 2 insertions(+), 22 deletions(-)
diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c
index 5d057a05ddbee8..be4be32f858253 100644
--- a/drivers/misc/vmw_balloon.c
+++ b/drivers/misc/vmw_balloon.c
@@ -16,6 +16,7 @@
//#define DEBUG
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/anon_inodes.h>
#include <linux/types.h>
#include <linux/io.h>
#include <linux/kernel.h>
@@ -1735,20 +1736,6 @@ static inline void vmballoon_debugfs_exit(struct vmballoon *b)
#ifdef CONFIG_BALLOON_COMPACTION
-
-static int vmballoon_init_fs_context(struct fs_context *fc)
-{
- return init_pseudo(fc, BALLOON_VMW_MAGIC) ? 0 : -ENOMEM;
-}
-
-static struct file_system_type vmballoon_fs = {
- .name = "balloon-vmware",
- .init_fs_context = vmballoon_init_fs_context,
- .kill_sb = kill_anon_super,
-};
-
-static struct vfsmount *vmballoon_mnt;
-
/**
* vmballoon_migratepage() - migrates a balloon page.
* @b_dev_info: balloon device information descriptor.
@@ -1878,8 +1865,6 @@ static void vmballoon_compaction_deinit(struct vmballoon *b)
iput(b->b_dev_info.inode);
b->b_dev_info.inode = NULL;
- kern_unmount(vmballoon_mnt);
- vmballoon_mnt = NULL;
}
/**
@@ -1895,13 +1880,8 @@ static void vmballoon_compaction_deinit(struct vmballoon *b)
*/
static __init int vmballoon_compaction_init(struct vmballoon *b)
{
- vmballoon_mnt = kern_mount(&vmballoon_fs);
- if (IS_ERR(vmballoon_mnt))
- return PTR_ERR(vmballoon_mnt);
-
b->b_dev_info.migratepage = vmballoon_migratepage;
- b->b_dev_info.inode = alloc_anon_inode_sb(vmballoon_mnt->mnt_sb);
-
+ b->b_dev_info.inode = alloc_anon_inode();
if (IS_ERR(b->b_dev_info.inode))
return PTR_ERR(b->b_dev_info.inode);
--
2.30.1
^ permalink raw reply related
* [PATCH 6/9] virtio_balloon: remove the balloon-kvm file system
From: Christoph Hellwig @ 2021-03-09 15:53 UTC (permalink / raw)
To: Al Viro
Cc: Jason Gunthorpe, David Hildenbrand, VMware, Inc.,
Michael S. Tsirkin, linux-kernel, dri-devel, virtualization,
linux-mm, Minchan Kim, Alex Williamson, Nadav Amit, Daniel Vetter,
linux-fsdevel, Andrew Morton, linuxppc-dev, Nitin Gupta
In-Reply-To: <20210309155348.974875-1-hch@lst.de>
Just use the generic anon_inode file system.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
drivers/virtio/virtio_balloon.c | 30 +++---------------------------
1 file changed, 3 insertions(+), 27 deletions(-)
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index cae76ee5bdd688..1efb890cd3ff09 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -6,6 +6,7 @@
* Copyright 2008 Rusty Russell IBM Corporation
*/
+#include <linux/anon_inodes.h>
#include <linux/virtio.h>
#include <linux/virtio_balloon.h>
#include <linux/swap.h>
@@ -42,10 +43,6 @@
(1 << (VIRTIO_BALLOON_HINT_BLOCK_ORDER + PAGE_SHIFT))
#define VIRTIO_BALLOON_HINT_BLOCK_PAGES (1 << VIRTIO_BALLOON_HINT_BLOCK_ORDER)
-#ifdef CONFIG_BALLOON_COMPACTION
-static struct vfsmount *balloon_mnt;
-#endif
-
enum virtio_balloon_vq {
VIRTIO_BALLOON_VQ_INFLATE,
VIRTIO_BALLOON_VQ_DEFLATE,
@@ -805,18 +802,6 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info,
return MIGRATEPAGE_SUCCESS;
}
-
-static int balloon_init_fs_context(struct fs_context *fc)
-{
- return init_pseudo(fc, BALLOON_KVM_MAGIC) ? 0 : -ENOMEM;
-}
-
-static struct file_system_type balloon_fs = {
- .name = "balloon-kvm",
- .init_fs_context = balloon_init_fs_context,
- .kill_sb = kill_anon_super,
-};
-
#endif /* CONFIG_BALLOON_COMPACTION */
static unsigned long shrink_free_pages(struct virtio_balloon *vb,
@@ -909,17 +894,11 @@ static int virtballoon_probe(struct virtio_device *vdev)
goto out_free_vb;
#ifdef CONFIG_BALLOON_COMPACTION
- balloon_mnt = kern_mount(&balloon_fs);
- if (IS_ERR(balloon_mnt)) {
- err = PTR_ERR(balloon_mnt);
- goto out_del_vqs;
- }
-
vb->vb_dev_info.migratepage = virtballoon_migratepage;
- vb->vb_dev_info.inode = alloc_anon_inode_sb(balloon_mnt->mnt_sb);
+ vb->vb_dev_info.inode = alloc_anon_inode();
if (IS_ERR(vb->vb_dev_info.inode)) {
err = PTR_ERR(vb->vb_dev_info.inode);
- goto out_kern_unmount;
+ goto out_del_vqs;
}
vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops;
#endif
@@ -1016,8 +995,6 @@ static int virtballoon_probe(struct virtio_device *vdev)
out_iput:
#ifdef CONFIG_BALLOON_COMPACTION
iput(vb->vb_dev_info.inode);
-out_kern_unmount:
- kern_unmount(balloon_mnt);
out_del_vqs:
#endif
vdev->config->del_vqs(vdev);
@@ -1070,7 +1047,6 @@ static void virtballoon_remove(struct virtio_device *vdev)
if (vb->vb_dev_info.inode)
iput(vb->vb_dev_info.inode);
- kern_unmount(balloon_mnt);
#endif
kfree(vb);
}
--
2.30.1
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox