LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* Re: [RFC PATCH] powerpc/64s: remove POWER9 DD1 support
From: Michael Ellerman @ 2018-06-13  3:56 UTC (permalink / raw)
  To: Nicholas Piggin, linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20180610133027.16819-1-npiggin@gmail.com>

Nicholas Piggin <npiggin@gmail.com> writes:

> POWER9 DD1 was never a product. It is no longer supported by upstream
> firmware, and it is not effectively supported in Linux due to lack of
> testing.
>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>

Fine by me.

cheers

> ---
>  arch/powerpc/include/asm/book3s/64/hugetlb.h  | 15 +-----
>  arch/powerpc/include/asm/book3s/64/pgtable.h  |  5 +-
>  arch/powerpc/include/asm/book3s/64/radix.h    | 35 ++-----------
>  .../include/asm/book3s/64/tlbflush-radix.h    |  2 -
>  arch/powerpc/include/asm/cputable.h           | 13 ++---
>  arch/powerpc/include/asm/paca.h               |  5 --
>  arch/powerpc/kernel/asm-offsets.c             |  1 -
>  arch/powerpc/kernel/cputable.c                | 19 -------
>  arch/powerpc/kernel/dt_cpu_ftrs.c             |  4 +-
>  arch/powerpc/kernel/exceptions-64s.S          |  4 +-
>  arch/powerpc/kernel/idle_book3s.S             | 50 -------------------
>  arch/powerpc/kernel/process.c                 | 10 +---
>  arch/powerpc/kvm/book3s_64_mmu_radix.c        | 15 +-----
>  arch/powerpc/kvm/book3s_hv.c                  | 10 ----
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S       | 16 +-----
>  arch/powerpc/kvm/book3s_xive_template.c       | 14 +-----
>  arch/powerpc/mm/hash_utils_64.c               |  5 --
>  arch/powerpc/mm/hugetlbpage.c                 |  8 ++-
>  arch/powerpc/mm/mmu_context_book3s64.c        | 12 +----
>  arch/powerpc/mm/pgtable-radix.c               | 31 +-----------
>  arch/powerpc/mm/tlb-radix.c                   | 18 -------
>  arch/powerpc/perf/core-book3s.c               | 33 ------------
>  arch/powerpc/perf/isa207-common.c             | 12 ++---
>  arch/powerpc/perf/isa207-common.h             |  5 --
>  arch/powerpc/perf/power9-pmu.c                | 37 +-------------
>  arch/powerpc/platforms/powernv/idle.c         | 28 -----------
>  arch/powerpc/platforms/powernv/smp.c          | 27 ++--------
>  arch/powerpc/xmon/xmon.c                      |  1 -
>  drivers/misc/cxl/cxl.h                        |  8 ---
>  drivers/misc/cxl/cxllib.c                     |  4 --
>  drivers/misc/cxl/pci.c                        | 41 ++++++---------
>  31 files changed, 51 insertions(+), 437 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h
> index c459f937d484..8000aa4990d2 100644
> --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h
> +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h
> @@ -36,20 +36,7 @@ static inline int hstate_get_psize(struct hstate *hstate)
>  static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
>  				       struct page *page, int writable)
>  {
> -	unsigned long page_shift;
> -
> -	if (!cpu_has_feature(CPU_FTR_POWER9_DD1))
> -		return entry;
> -
> -	page_shift = huge_page_shift(hstate_vma(vma));
> -	/*
> -	 * We don't support 1G hugetlb pages yet.
> -	 */
> -	VM_WARN_ON(page_shift == mmu_psize_defs[MMU_PAGE_1G].shift);
> -	if (page_shift == mmu_psize_defs[MMU_PAGE_2M].shift)
> -		return __pte(pte_val(entry) | R_PAGE_LARGE);
> -	else
> -		return entry;
> +	return entry;
>  }
>=20=20
>  #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/=
include/asm/book3s/64/pgtable.h
> index 63cee159022b..d334e6b9a46d 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -474,9 +474,8 @@ static inline pte_t ptep_get_and_clear_full(struct mm=
_struct *mm,
>  {
>  	if (full && radix_enabled()) {
>  		/*
> -		 * Let's skip the DD1 style pte update here. We know that
> -		 * this is a full mm pte clear and hence can be sure there is
> -		 * no parallel set_pte.
> +		 * We know that this is a full mm pte clear and
> +		 * hence can be sure there is no parallel set_pte.
>  		 */
>  		return radix__ptep_get_and_clear_full(mm, addr, ptep, full);
>  	}
> diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/in=
clude/asm/book3s/64/radix.h
> index ef9f96742ce1..3ab3f7aef022 100644
> --- a/arch/powerpc/include/asm/book3s/64/radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/radix.h
> @@ -12,12 +12,6 @@
>  #include <asm/book3s/64/radix-4k.h>
>  #endif
>=20=20
> -/*
> - * For P9 DD1 only, we need to track whether the pte's huge.
> - */
> -#define R_PAGE_LARGE	_RPAGE_RSV1
> -
> -
>  #ifndef __ASSEMBLY__
>  #include <asm/book3s/64/tlbflush-radix.h>
>  #include <asm/cpu_has_feature.h>
> @@ -154,20 +148,7 @@ static inline unsigned long radix__pte_update(struct=
 mm_struct *mm,
>  {
>  	unsigned long old_pte;
>=20=20
> -	if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
> -
> -		unsigned long new_pte;
> -
> -		old_pte =3D __radix_pte_update(ptep, ~0ul, 0);
> -		/*
> -		 * new value of pte
> -		 */
> -		new_pte =3D (old_pte | set) & ~clr;
> -		radix__flush_tlb_pte_p9_dd1(old_pte, mm, addr);
> -		if (new_pte)
> -			__radix_pte_update(ptep, 0, new_pte);
> -	} else
> -		old_pte =3D __radix_pte_update(ptep, clr, set);
> +	old_pte =3D __radix_pte_update(ptep, clr, set);
>  	if (!huge)
>  		assert_pte_locked(mm, addr);
>=20=20
> @@ -253,8 +234,6 @@ static inline int radix__pmd_trans_huge(pmd_t pmd)
>=20=20
>  static inline pmd_t radix__pmd_mkhuge(pmd_t pmd)
>  {
> -	if (cpu_has_feature(CPU_FTR_POWER9_DD1))
> -		return __pmd(pmd_val(pmd) | _PAGE_PTE | R_PAGE_LARGE);
>  	return __pmd(pmd_val(pmd) | _PAGE_PTE);
>  }
>=20=20
> @@ -285,18 +264,14 @@ static inline unsigned long radix__get_tree_size(vo=
id)
>  	unsigned long rts_field;
>  	/*
>  	 * We support 52 bits, hence:
> -	 *  DD1    52-28 = 24, 0b11000
> -	 *  Others 52-31 = 21, 0b10101
> +	 * bits 52 - 31 = 21, 0b10101
>  	 * RTS encoding details
>  	 * bits 0 - 3 of rts -> bits 6 - 8 unsigned long
>  	 * bits 4 - 5 of rts -> bits 62 - 63 of unsigned long
>  	 */
> -	if (cpu_has_feature(CPU_FTR_POWER9_DD1))
> -		rts_field = (0x3UL << 61);
> -	else {
> -		rts_field = (0x5UL << 5); /* 6 - 8 bits */
> -		rts_field |= (0x2UL << 61);
> -	}
> +	rts_field = (0x5UL << 5); /* 6 - 8 bits */
> +	rts_field |= (0x2UL << 61);
> +
>  	return rts_field;
>  }
>=20=20
> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/p=
owerpc/include/asm/book3s/64/tlbflush-radix.h
> index ef5c3f2994c9..1154a6dc6d26 100644
> --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> @@ -48,8 +48,6 @@ extern void radix__flush_tlb_page_psize(struct mm_struc=
t *mm, unsigned long vmad
>  extern void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long a=
ddr);
>  extern void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigne=
d long addr);
>  extern void radix__flush_tlb_all(void);
> -extern void radix__flush_tlb_pte_p9_dd1(unsigned long old_pte, struct mm=
_struct *mm,
> -					unsigned long address);
>=20=20
>  extern void radix__flush_tlb_lpid_page(unsigned int lpid,
>  					unsigned long addr,
> diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/a=
sm/cputable.h
> index 9c0a3083571b..f980f91cad8a 100644
> --- a/arch/powerpc/include/asm/cputable.h
> +++ b/arch/powerpc/include/asm/cputable.h
> @@ -210,7 +210,6 @@ static inline void cpu_feature_keys_init(void) { }
>  #define CPU_FTR_DAWR			LONG_ASM_CONST(0x0000008000000000)
>  #define CPU_FTR_DABRX			LONG_ASM_CONST(0x0000010000000000)
>  #define CPU_FTR_PMAO_BUG		LONG_ASM_CONST(0x0000020000000000)
> -#define CPU_FTR_POWER9_DD1		LONG_ASM_CONST(0x0000040000000000)
>  #define CPU_FTR_POWER9_DD2_1		LONG_ASM_CONST(0x0000080000000000)
>  #define CPU_FTR_P9_TM_HV_ASSIST		LONG_ASM_CONST(0x0000100000000000)
>  #define CPU_FTR_P9_TM_XER_SO_BUG	LONG_ASM_CONST(0x0000200000000000)
> @@ -464,8 +463,6 @@ static inline void cpu_feature_keys_init(void) { }
>  	    CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_ARCH_207S | \
>  	    CPU_FTR_TM_COMP | CPU_FTR_ARCH_300 | CPU_FTR_PKEY | \
>  	    CPU_FTR_P9_TLBIE_BUG | CPU_FTR_P9_TIDR)
> -#define CPU_FTRS_POWER9_DD1 ((CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD1) & \
> -			     (~CPU_FTR_SAO))
>  #define CPU_FTRS_POWER9_DD2_0 CPU_FTRS_POWER9
>  #define CPU_FTRS_POWER9_DD2_1 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1)
>  #define CPU_FTRS_POWER9_DD2_2 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1 | \
> @@ -489,16 +486,14 @@ static inline void cpu_feature_keys_init(void) { }
>  #define CPU_FTRS_POSSIBLE	\
>  	    (CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | CPU_FTRS_POWER8 | \
>  	     CPU_FTRS_POWER8_DD1 | CPU_FTR_ALTIVEC_COMP | CPU_FTR_VSX_COMP | \
> -	     CPU_FTRS_POWER9 | CPU_FTRS_POWER9_DD1 | CPU_FTRS_POWER9_DD2_1 | \
> -	     CPU_FTRS_POWER9_DD2_2)
> +	     CPU_FTRS_POWER9 | CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2)
>  #else
>  #define CPU_FTRS_POSSIBLE	\
>  	    (CPU_FTRS_PPC970 | CPU_FTRS_POWER5 | \
>  	     CPU_FTRS_POWER6 | CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | \
>  	     CPU_FTRS_POWER8 | CPU_FTRS_POWER8_DD1 | CPU_FTRS_CELL | \
>  	     CPU_FTRS_PA6T | CPU_FTR_VSX_COMP | CPU_FTR_ALTIVEC_COMP | \
> -	     CPU_FTRS_POWER9 | CPU_FTRS_POWER9_DD1 | CPU_FTRS_POWER9_DD2_1 | \
> -	     CPU_FTRS_POWER9_DD2_2)
> +	     CPU_FTRS_POWER9 | CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2)
>  #endif /* CONFIG_CPU_LITTLE_ENDIAN */
>  #endif
>  #else
> @@ -567,7 +562,7 @@ enum {
>  #define CPU_FTRS_ALWAYS \
>  	    (CPU_FTRS_POSSIBLE & ~CPU_FTR_HVMODE & CPU_FTRS_POWER7 & \
>  	     CPU_FTRS_POWER8E & CPU_FTRS_POWER8 & CPU_FTRS_POWER8_DD1 & \
> -	     CPU_FTRS_POWER9 & CPU_FTRS_POWER9_DD1 & CPU_FTRS_POWER9_DD2_1 & \
> +	     CPU_FTRS_POWER9 & CPU_FTRS_POWER9_DD2_1 & \
>  	     CPU_FTRS_DT_CPU_BASE)
>  #else
>  #define CPU_FTRS_ALWAYS		\
> @@ -575,7 +570,7 @@ enum {
>  	     CPU_FTRS_POWER6 & CPU_FTRS_POWER7 & CPU_FTRS_CELL & \
>  	     CPU_FTRS_PA6T & CPU_FTRS_POWER8 & CPU_FTRS_POWER8E & \
>  	     CPU_FTRS_POWER8_DD1 & ~CPU_FTR_HVMODE & CPU_FTRS_POSSIBLE & \
> -	     CPU_FTRS_POWER9 & CPU_FTRS_POWER9_DD1 & CPU_FTRS_POWER9_DD2_1 & \
> +	     CPU_FTRS_POWER9 & CPU_FTRS_POWER9_DD2_1 & \
>  	     CPU_FTRS_DT_CPU_BASE)
>  #endif /* CONFIG_CPU_LITTLE_ENDIAN */
>  #endif
> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/p=
aca.h
> index 6d34bd71139d..4e9cede5a7e7 100644
> --- a/arch/powerpc/include/asm/paca.h
> +++ b/arch/powerpc/include/asm/paca.h
> @@ -187,11 +187,6 @@ struct paca_struct {
>  	u8 subcore_sibling_mask;
>  	/* Flag to request this thread not to stop */
>  	atomic_t dont_stop;
> -	/*
> -	 * Pointer to an array which contains pointer
> -	 * to the sibling threads' paca.
> -	 */
> -	struct paca_struct **thread_sibling_pacas;
>  	/* The PSSCR value that the kernel requested before going to stop */
>  	u64 requested_psscr;
>=20=20
> diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-=
offsets.c
> index 9fc9e0977009..e329c71a60dd 100644
> --- a/arch/powerpc/kernel/asm-offsets.c
> +++ b/arch/powerpc/kernel/asm-offsets.c
> @@ -766,7 +766,6 @@ int main(void)
>  	OFFSET(PACA_THREAD_IDLE_STATE, paca_struct, thread_idle_state);
>  	OFFSET(PACA_THREAD_MASK, paca_struct, thread_mask);
>  	OFFSET(PACA_SUBCORE_SIBLING_MASK, paca_struct, subcore_sibling_mask);
> -	OFFSET(PACA_SIBLING_PACA_PTRS, paca_struct, thread_sibling_pacas);
>  	OFFSET(PACA_REQ_PSSCR, paca_struct, requested_psscr);
>  	OFFSET(PACA_DONT_STOP, paca_struct, dont_stop);
>  #define STOP_SPR(x, f)	OFFSET(x, paca_struct, stop_sprs.f)
> diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputabl=
e.c
> index c8fc9691f8c7..bc75a2908a7e 100644
> --- a/arch/powerpc/kernel/cputable.c
> +++ b/arch/powerpc/kernel/cputable.c
> @@ -485,25 +485,6 @@ static struct cpu_spec __initdata cpu_specs[] =3D {
>  		.machine_check_early	=3D __machine_check_early_realmode_p8,
>  		.platform		=3D "power8",
>  	},
> -	{	/* Power9 DD1*/
> -		.pvr_mask		=3D 0xffffff00,
> -		.pvr_value		=3D 0x004e0100,
> -		.cpu_name		=3D "POWER9 (raw)",
> -		.cpu_features		=3D CPU_FTRS_POWER9_DD1,
> -		.cpu_user_features	=3D COMMON_USER_POWER9,
> -		.cpu_user_features2	=3D COMMON_USER2_POWER9,
> -		.mmu_features		=3D MMU_FTRS_POWER9,
> -		.icache_bsize		=3D 128,
> -		.dcache_bsize		=3D 128,
> -		.num_pmcs		=3D 6,
> -		.pmc_type		=3D PPC_PMC_IBM,
> -		.oprofile_cpu_type	=3D "ppc64/power9",
> -		.oprofile_type		=3D PPC_OPROFILE_INVALID,
> -		.cpu_setup		=3D __setup_cpu_power9,
> -		.cpu_restore		=3D __restore_cpu_power9,
> -		.machine_check_early	=3D __machine_check_early_realmode_p9,
> -		.platform		=3D "power9",
> -	},
>  	{	/* Power9 DD2.0 */
>  		.pvr_mask		=3D 0xffffefff,
>  		.pvr_value		=3D 0x004e0200,
> diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_c=
pu_ftrs.c
> index 4be1c0de9406..98c373a4c1cf 100644
> --- a/arch/powerpc/kernel/dt_cpu_ftrs.c
> +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
> @@ -701,9 +701,7 @@ static __init void cpufeatures_cpu_quirks(void)
>  	/*
>  	 * Not all quirks can be derived from the cpufeatures device tree.
>  	 */
> -	if ((version & 0xffffff00) =3D=3D 0x004e0100)
> -		cur_cpu_spec->cpu_features |=3D CPU_FTR_POWER9_DD1;
> -	else if ((version & 0xffffefff) =3D=3D 0x004e0200)
> +	if ((version & 0xffffefff) =3D=3D 0x004e0200)
>  		; /* DD2.0 has no feature flag */
>  	else if ((version & 0xffffefff) =3D=3D 0x004e0201)
>  		cur_cpu_spec->cpu_features |=3D CPU_FTR_POWER9_DD2_1;
> diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/e=
xceptions-64s.S
> index 285c6465324a..76a14702cb9c 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -276,9 +276,7 @@ BEGIN_FTR_SECTION
>  	 *
>  	 * This interrupt can wake directly from idle. If that is the case,
>  	 * the machine check is handled then the idle wakeup code is called
> -	 * to restore state. In that case, the POWER9 DD1 idle PACA workaround
> -	 * is not applied in the early machine check code, which will cause
> -	 * bugs.
> +	 * to restore state.
>  	 */
>  	mr	r11,r1			/* Save r1 */
>  	lhz	r10,PACA_IN_MCE(r13)
> diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle=
_book3s.S
> index e734f6e45abc..d85d5515a091 100644
> --- a/arch/powerpc/kernel/idle_book3s.S
> +++ b/arch/powerpc/kernel/idle_book3s.S
> @@ -466,43 +466,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
>  	blr		/* return 0 for wakeup cause / SRR1 value */
>  #endif
>=20=20
> -/*
> - * On waking up from stop 0,1,2 with ESL=1 on POWER9 DD1,
> - * HSPRG0 will be set to the HSPRG0 value of one of the
> - * threads in this core. Thus the value we have in r13
> - * may not be this thread's paca pointer.
> - *
> - * Fortunately, the TIR remains invariant. Since this thread's
> - * paca pointer is recorded in all its sibling's paca, we can
> - * correctly recover this thread's paca pointer if we
> - * know the index of this thread in the core.
> - *
> - * This index can be obtained from the TIR.
> - *
> - * i.e, thread's position in the core = TIR.
> - * If this value is i, then this thread's paca is
> - * paca->thread_sibling_pacas[i].
> - */
> -power9_dd1_recover_paca:
> -	mfspr	r4, SPRN_TIR
> -	/*
> -	 * Since each entry in thread_sibling_pacas is 8 bytes
> -	 * we need to left-shift by 3 bits. Thus r4 = i * 8
> -	 */
> -	sldi	r4, r4, 3
> -	/* Get &paca->thread_sibling_pacas[0] in r5 */
> -	ld	r5, PACA_SIBLING_PACA_PTRS(r13)
> -	/* Load paca->thread_sibling_pacas[i] into r13 */
> -	ldx	r13, r4, r5
> -	SET_PACA(r13)
> -	/*
> -	 * Indicate that we have lost NVGPR state
> -	 * which needs to be restored from the stack.
> -	 */
> -	li	r3, 1
> -	stb	r3,PACA_NAPSTATELOST(r13)
> -	blr
> -
>  /*
>   * Called from machine check handler for powersave wakeups.
>   * Low level machine check processing has already been done. Now just
> @@ -537,9 +500,6 @@ pnv_powersave_wakeup:
>  	ld	r2, PACATOC(r13)
>=20=20
>  BEGIN_FTR_SECTION
> -BEGIN_FTR_SECTION_NESTED(70)
> -	bl	power9_dd1_recover_paca
> -END_FTR_SECTION_NESTED_IFSET(CPU_FTR_POWER9_DD1, 70)
>  	bl	pnv_restore_hyp_resource_arch300
>  FTR_SECTION_ELSE
>  	bl	pnv_restore_hyp_resource_arch207
> @@ -602,22 +562,12 @@ END_FTR_SECTION_IFCLR(CPU_FTR_POWER9_DD2_1)
>  	LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state)
>  	ld	r4,ADDROFF(pnv_first_deep_stop_state)(r5)
>=20=20
> -BEGIN_FTR_SECTION_NESTED(71)
> -	/*
> -	 * Assume that we are waking up from the state
> -	 * same as the Requested Level (RL) in the PSSCR
> -	 * which are Bits 60-63
> -	 */
> -	ld	r5,PACA_REQ_PSSCR(r13)
> -	rldicl  r5,r5,0,60
> -FTR_SECTION_ELSE_NESTED(71)
>  	/*
>  	 * 0-3 bits correspond to Power-Saving Level Status
>  	 * which indicates the idle state we are waking up from
>  	 */
>  	mfspr	r5, SPRN_PSSCR
>  	rldicl  r5,r5,4,60
> -ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_POWER9_DD1, 71)
>  	li	r0, 0		/* clear requested_psscr to say we're awake */
>  	std	r0, PACA_REQ_PSSCR(r13)
>  	cmpd	cr4,r5,r4
> diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
> index 9ef4aea9fffe..27f0caee55ea 100644
> --- a/arch/powerpc/kernel/process.c
> +++ b/arch/powerpc/kernel/process.c
> @@ -1250,17 +1250,9 @@ struct task_struct *__switch_to(struct task_struct=
 *prev,
>  		 * mappings. If the new process has the foreign real address
>  		 * mappings, we must issue a cp_abort to clear any state and
>  		 * prevent snooping, corruption or a covert channel.
> -		 *
> -		 * DD1 allows paste into normal system memory so we do an
> -		 * unpaired copy, rather than cp_abort, to clear the buffer,
> -		 * since cp_abort is quite expensive.
>  		 */
> -		if (current_thread_info()->task->thread.used_vas) {
> +		if (current_thread_info()->task->thread.used_vas)
>  			asm volatile(PPC_CP_ABORT);
> -		} else if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
> -			asm volatile(PPC_COPY(%0, %1)
> -					: : "r"(dummy_copy_buffer), "r"(0));
> -		}
>  	}
>  #endif /* CONFIG_PPC_BOOK3S_64 */
>=20=20
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/bo=
ok3s_64_mmu_radix.c
> index 481da8f93fa4..0aa40b7d6000 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -66,10 +66,7 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_=
t eaddr,
>  	bits =3D root & RPDS_MASK;
>  	root =3D root & RPDB_MASK;
>=20=20
> -	/* P9 DD1 interprets RTS (radix tree size) differently */
>  	offset =3D rts + 31;
> -	if (cpu_has_feature(CPU_FTR_POWER9_DD1))
> -		offset -=3D 3;
>=20=20
>  	/* current implementations only support 52-bit space */
>  	if (offset !=3D 52)
> @@ -180,17 +177,7 @@ unsigned long kvmppc_radix_update_pte(struct kvm *kv=
m, pte_t *ptep,
>  				      unsigned long clr, unsigned long set,
>  				      unsigned long addr, unsigned int shift)
>  {
> -	unsigned long old =3D 0;
> -
> -	if (!(clr & _PAGE_PRESENT) && cpu_has_feature(CPU_FTR_POWER9_DD1) &&
> -	    pte_present(*ptep)) {
> -		/* have to invalidate it first */
> -		old =3D __radix_pte_update(ptep, _PAGE_PRESENT, 0);
> -		kvmppc_radix_tlbie_page(kvm, addr, shift);
> -		set |=3D _PAGE_PRESENT;
> -		old &=3D _PAGE_PRESENT;
> -	}
> -	return __radix_pte_update(ptep, clr, set) | old;
> +	return __radix_pte_update(ptep, clr, set);
>  }
>=20=20
>  void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index cb6d2313b19f..ca17a9e7f759 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -1664,14 +1664,6 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *=
vcpu, u64 id,
>  		r =3D set_vpa(vcpu, &vcpu->arch.dtl, addr, len);
>  		break;
>  	case KVM_REG_PPC_TB_OFFSET:
> -		/*
> -		 * POWER9 DD1 has an erratum where writing TBU40 causes
> -		 * the timebase to lose ticks.  So we don't let the
> -		 * timebase offset be changed on P9 DD1.  (It is
> -		 * initialized to zero.)
> -		 */
> -		if (cpu_has_feature(CPU_FTR_POWER9_DD1))
> -			break;
>  		/* round up to multiple of 2^24 */
>  		vcpu->arch.vcore->tb_offset =3D
>  			ALIGN(set_reg_val(id, *val), 1UL << 24);
> @@ -1989,8 +1981,6 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(=
struct kvm *kvm,
>  	/*
>  	 * Set the default HFSCR for the guest from the host value.
>  	 * This value is only used on POWER9.
> -	 * On POWER9 DD1, TM doesn't work, so we make sure to
> -	 * prevent the guest from using it.
>  	 * On POWER9, we want to virtualize the doorbell facility, so we
>  	 * turn off the HFSCR bit, which causes those instructions to trap.
>  	 */
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/b=
ook3s_hv_rmhandlers.S
> index b97d261d3b89..fe22b40c356d 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -916,9 +916,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_DAWR)
>  	mtspr	SPRN_BESCR, r6
>  	mtspr	SPRN_PID, r7
>  	mtspr	SPRN_WORT, r8
> -BEGIN_FTR_SECTION
> -	PPC_INVALIDATE_ERAT
> -END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1)
>  BEGIN_FTR_SECTION
>  	/* POWER8-only registers */
>  	ld	r5, VCPU_TCSCR(r4)
> @@ -1909,7 +1906,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
>  	ld	r5, VCPU_KVM(r9)
>  	lbz	r0, KVM_RADIX(r5)
>  	cmpwi	cr2, r0, 0
> -	beq	cr2, 4f
> +	beq	cr2, 2f
>=20=20
>  	/*
>  	 * Radix: do eieio; tlbsync; ptesync sequence in case we
> @@ -1949,11 +1946,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
>  	bdnz	1b
>  	ptesync
>=20=20
> -2:	/* Flush the ERAT on radix P9 DD1 guest exit */
> -BEGIN_FTR_SECTION
> -	PPC_INVALIDATE_ERAT
> -END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1)
> -4:
> +2:
>  #endif /* CONFIG_PPC_RADIX_MMU */
>=20=20
>  	/*
> @@ -3533,11 +3526,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
>  	mtspr	SPRN_CIABR, r0
>  	mtspr	SPRN_DAWRX, r0
>=20=20
> -	/* Flush the ERAT on radix P9 DD1 guest exit */
> -BEGIN_FTR_SECTION
> -	PPC_INVALIDATE_ERAT
> -END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1)
> -
>  BEGIN_MMU_FTR_SECTION
>  	b	4f
>  END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
> diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/b=
ook3s_xive_template.c
> index 99c3620b40d9..487f1f6650cc 100644
> --- a/arch/powerpc/kvm/book3s_xive_template.c
> +++ b/arch/powerpc/kvm/book3s_xive_template.c
> @@ -25,18 +25,6 @@ static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive=
_vcpu *xc)
>  	 */
>  	eieio();
>=20=20
> -	/*
> -	 * DD1 bug workaround: If PIPR is less favored than CPPR
> -	 * ignore the interrupt or we might incorrectly lose an IPB
> -	 * bit.
> -	 */
> -	if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
> -		__be64 qw1 =3D __x_readq(__x_tima + TM_QW1_OS);
> -		u8 pipr =3D be64_to_cpu(qw1) & 0xff;
> -		if (pipr >=3D xc->hw_cppr)
> -			return;
> -	}
> -
>  	/* Perform the acknowledge OS to register cycle. */
>  	ack =3D be16_to_cpu(__x_readw(__x_tima + TM_SPC_ACK_OS_REG));
>=20=20
> @@ -105,7 +93,7 @@ static void GLUE(X_PFX,source_eoi)(u32 hw_irq, struct =
xive_irq_data *xd)
>  		 *
>  		 * For LSIs, using the HW EOI cycle works around a problem
>  		 * on P9 DD1 PHBs where the other ESB accesses don't work
> -		 * properly.
> +		 * properly. XXX: can this be removed?
>  		 */
>  		if (xd->flags & XIVE_IRQ_FLAG_LSI)
>  			__x_readq(__x_eoi_page(xd) + XIVE_ESB_LOAD_EOI);
> diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils=
_64.c
> index 8318716e5075..141ba68d63f3 100644
> --- a/arch/powerpc/mm/hash_utils_64.c
> +++ b/arch/powerpc/mm/hash_utils_64.c
> @@ -845,8 +845,6 @@ static void __init hash_init_partition_table(phys_add=
r_t hash_table,
>  	htab_size =3D  __ilog2(htab_size) - 18;
>  	mmu_partition_table_set_entry(0, hash_table | htab_size, 0);
>  	pr_info("Partition table %p\n", partition_tb);
> -	if (cpu_has_feature(CPU_FTR_POWER9_DD1))
> -		update_hid_for_hash();
>  }
>=20=20
>  static void __init htab_initialize(void)
> @@ -1077,9 +1075,6 @@ void hash__early_init_mmu_secondary(void)
>  	/* Initialize hash table for that CPU */
>  	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
>=20=20
> -		if (cpu_has_feature(CPU_FTR_POWER9_DD1))
> -			update_hid_for_hash();
> -
>  		if (!cpu_has_feature(CPU_FTR_ARCH_300))
>  			mtspr(SPRN_SDR1, _SDR1);
>  		else
> diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
> index 7c5f479c5c00..3e91acef24b2 100644
> --- a/arch/powerpc/mm/hugetlbpage.c
> +++ b/arch/powerpc/mm/hugetlbpage.c
> @@ -620,14 +620,12 @@ static int __init add_huge_page_size(unsigned long =
long size)
>  	 * firmware we only add hugetlb support for page sizes that can be
>  	 * supported by linux page table layout.
>  	 * For now we have
> -	 * Radix: 2M
> +	 * Radix: 2M and 1G
>  	 * Hash: 16M and 16G
>  	 */
>  	if (radix_enabled()) {
> -		if (mmu_psize !=3D MMU_PAGE_2M) {
> -			if (cpu_has_feature(CPU_FTR_POWER9_DD1) ||
> -			    (mmu_psize !=3D MMU_PAGE_1G))
> -				return -EINVAL;
> +		if (mmu_psize !=3D MMU_PAGE_2M && mmu_psize !=3D MMU_PAGE_1G) {
> +			return -EINVAL;
>  		}
>  	} else {
>  		if (mmu_psize !=3D MMU_PAGE_16M && mmu_psize !=3D MMU_PAGE_16G)
> diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu=
_context_book3s64.c
> index f3d4b4a0e561..39e9ef0eb78b 100644
> --- a/arch/powerpc/mm/mmu_context_book3s64.c
> +++ b/arch/powerpc/mm/mmu_context_book3s64.c
> @@ -273,15 +273,7 @@ void arch_exit_mmap(struct mm_struct *mm)
>  #ifdef CONFIG_PPC_RADIX_MMU
>  void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct =
*next)
>  {
> -
> -	if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
> -		isync();
> -		mtspr(SPRN_PID, next->context.id);
> -		isync();
> -		asm volatile(PPC_INVALIDATE_ERAT : : :"memory");
> -	} else {
> -		mtspr(SPRN_PID, next->context.id);
> -		isync();
> -	}
> +	mtspr(SPRN_PID, next->context.id);
> +	isync();
>  }
>  #endif
> diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-ra=
dix.c
> index 96f68c5aa1f5..efe549f707b3 100644
> --- a/arch/powerpc/mm/pgtable-radix.c
> +++ b/arch/powerpc/mm/pgtable-radix.c
> @@ -226,16 +226,6 @@ void radix__mark_rodata_ro(void)
>  {
>  	unsigned long start, end;
>=20=20
> -	/*
> -	 * mark_rodata_ro() will mark itself as !writable at some point.
> -	 * Due to DD1 workaround in radix__pte_update(), we'll end up with
> -	 * an invalid pte and the system will crash quite severly.
> -	 */
> -	if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
> -		pr_warn("Warning: Unable to mark rodata read only on P9 DD1\n");
> -		return;
> -	}
> -
>  	start =3D (unsigned long)_stext;
>  	end =3D (unsigned long)__init_begin;
>=20=20
> @@ -576,22 +566,12 @@ static void radix_init_amor(void)
>=20=20
>  static void radix_init_iamr(void)
>  {
> -	unsigned long iamr;
> -
> -	/*
> -	 * The IAMR should set to 0 on DD1.
> -	 */
> -	if (cpu_has_feature(CPU_FTR_POWER9_DD1))
> -		iamr =3D 0;
> -	else
> -		iamr =3D (1ul << 62);
> -
>  	/*
>  	 * Radix always uses key0 of the IAMR to determine if an access is
>  	 * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
>  	 * fetch.
>  	 */
> -	mtspr(SPRN_IAMR, iamr);
> +	mtspr(SPRN_IAMR, (1ul << 62));
>  }
>=20=20
>  void __init radix__early_init_mmu(void)
> @@ -644,8 +624,6 @@ void __init radix__early_init_mmu(void)
>=20=20
>  	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
>  		radix_init_native();
> -		if (cpu_has_feature(CPU_FTR_POWER9_DD1))
> -			update_hid_for_radix();
>  		lpcr =3D mfspr(SPRN_LPCR);
>  		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
>  		radix_init_partition_table();
> @@ -671,10 +649,6 @@ void radix__early_init_mmu_secondary(void)
>  	 * update partition table control register and UPRT
>  	 */
>  	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
> -
> -		if (cpu_has_feature(CPU_FTR_POWER9_DD1))
> -			update_hid_for_radix();
> -
>  		lpcr =3D mfspr(SPRN_LPCR);
>  		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
>=20=20
> @@ -1095,8 +1069,7 @@ void radix__ptep_set_access_flags(struct vm_area_st=
ruct *vma, pte_t *ptep,
>  	 * To avoid NMMU hang while relaxing access, we need mark
>  	 * the pte invalid in between.
>  	 */
> -	if (cpu_has_feature(CPU_FTR_POWER9_DD1) ||
> -	    atomic_read(&mm->context.copros) > 0) {
> +	if (atomic_read(&mm->context.copros) > 0) {
>  		unsigned long old_pte, new_pte;
>=20=20
>  		old_pte =3D __radix_pte_update(ptep, ~0, 0);
> diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
> index 67a6e86d3e7e..902767b8a9c1 100644
> --- a/arch/powerpc/mm/tlb-radix.c
> +++ b/arch/powerpc/mm/tlb-radix.c
> @@ -994,24 +994,6 @@ void radix__flush_tlb_all(void)
>  	asm volatile("eieio; tlbsync; ptesync": : :"memory");
>  }
>=20=20
> -void radix__flush_tlb_pte_p9_dd1(unsigned long old_pte, struct mm_struct=
 *mm,
> -				 unsigned long address)
> -{
> -	/*
> -	 * We track page size in pte only for DD1, So we can
> -	 * call this only on DD1.
> -	 */
> -	if (!cpu_has_feature(CPU_FTR_POWER9_DD1)) {
> -		VM_WARN_ON(1);
> -		return;
> -	}
> -
> -	if (old_pte & R_PAGE_LARGE)
> -		radix__flush_tlb_page_psize(mm, address, MMU_PAGE_2M);
> -	else
> -		radix__flush_tlb_page_psize(mm, address, mmu_virtual_psize);
> -}
> -
>  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
>  extern void radix_kvm_prefetch_workaround(struct mm_struct *mm)
>  {
> diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-boo=
k3s.c
> index 3f66fcf8ad99..19a0cf44744d 100644
> --- a/arch/powerpc/perf/core-book3s.c
> +++ b/arch/powerpc/perf/core-book3s.c
> @@ -128,10 +128,6 @@ static inline void power_pmu_bhrb_disable(struct per=
f_event *event) {}
>  static void power_pmu_sched_task(struct perf_event_context *ctx, bool sc=
hed_in) {}
>  static inline void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw) {}
>  static void pmao_restore_workaround(bool ebb) { }
> -static bool use_ic(u64 event)
> -{
> -	return false;
> -}
>  #endif /* CONFIG_PPC32 */
>=20=20
>  static bool regs_use_siar(struct pt_regs *regs)
> @@ -714,14 +710,6 @@ static void pmao_restore_workaround(bool ebb)
>  	mtspr(SPRN_PMC6, pmcs[5]);
>  }
>=20=20
> -static bool use_ic(u64 event)
> -{
> -	if (cpu_has_feature(CPU_FTR_POWER9_DD1) &&
> -			(event =3D=3D 0x200f2 || event =3D=3D 0x300f2))
> -		return true;
> -
> -	return false;
> -}
>  #endif /* CONFIG_PPC64 */
>=20=20
>  static void perf_event_interrupt(struct pt_regs *regs);
> @@ -1056,13 +1044,6 @@ static void power_pmu_read(struct perf_event *even=
t)
>=20=20
>  	if (is_ebb_event(event)) {
>  		val =3D read_pmc(event->hw.idx);
> -		if (use_ic(event->attr.config)) {
> -			val =3D mfspr(SPRN_IC);
> -			if (val > cpuhw->ic_init)
> -				val =3D val - cpuhw->ic_init;
> -			else
> -				val =3D val + (0 - cpuhw->ic_init);
> -		}
>  		local64_set(&event->hw.prev_count, val);
>  		return;
>  	}
> @@ -1076,13 +1057,6 @@ static void power_pmu_read(struct perf_event *even=
t)
>  		prev =3D local64_read(&event->hw.prev_count);
>  		barrier();
>  		val =3D read_pmc(event->hw.idx);
> -		if (use_ic(event->attr.config)) {
> -			val =3D mfspr(SPRN_IC);
> -			if (val > cpuhw->ic_init)
> -				val =3D val - cpuhw->ic_init;
> -			else
> -				val =3D val + (0 - cpuhw->ic_init);
> -		}
>  		delta =3D check_and_compute_delta(prev, val);
>  		if (!delta)
>  			return;
> @@ -1535,13 +1509,6 @@ static int power_pmu_add(struct perf_event *event,=
 int ef_flags)
>  					event->attr.branch_sample_type);
>  	}
>=20=20
> -	/*
> -	 * Workaround for POWER9 DD1 to use the Instruction Counter
> -	 * register value for instruction counting
> -	 */
> -	if (use_ic(event->attr.config))
> -		cpuhw->ic_init =3D mfspr(SPRN_IC);
> -
>  	perf_pmu_enable(event->pmu);
>  	local_irq_restore(flags);
>  	return ret;
> diff --git a/arch/powerpc/perf/isa207-common.c b/arch/powerpc/perf/isa207=
-common.c
> index 2efee3f196f5..177de814286f 100644
> --- a/arch/powerpc/perf/isa207-common.c
> +++ b/arch/powerpc/perf/isa207-common.c
> @@ -59,7 +59,7 @@ static bool is_event_valid(u64 event)
>  {
>  	u64 valid_mask =3D EVENT_VALID_MASK;
>=20=20
> -	if (cpu_has_feature(CPU_FTR_ARCH_300) && !cpu_has_feature(CPU_FTR_POWER9_DD1))
> +	if (cpu_has_feature(CPU_FTR_ARCH_300))
>  		valid_mask = p9_EVENT_VALID_MASK;
>=20=20
>  	return !(event & ~valid_mask);
> @@ -86,8 +86,6 @@ static void mmcra_sdar_mode(u64 event, unsigned long *m=
mcra)
>  	 * Incase of Power9:
>  	 * Marked event: MMCRA[SDAR_MODE] will be set to 0b00 ('No Updates'),
>  	 *               or if group already have any marked events.
> -	 * Non-Marked events (for DD1):
> -	 *	MMCRA[SDAR_MODE] will be set to 0b01
>  	 * For rest
>  	 *	MMCRA[SDAR_MODE] will be set from event code.
>  	 *      If sdar_mode from event is zero, default to 0b01. Hardware
> @@ -96,7 +94,7 @@ static void mmcra_sdar_mode(u64 event, unsigned long *m=
mcra)
>  	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
>  		if (is_event_marked(event) || (*mmcra & MMCRA_SAMPLE_ENABLE))
>  			*mmcra &=3D MMCRA_SDAR_MODE_NO_UPDATES;
> -		else if (!cpu_has_feature(CPU_FTR_POWER9_DD1) && p9_SDAR_MODE(event))
> +		else if (p9_SDAR_MODE(event))
>  			*mmcra |=3D  p9_SDAR_MODE(event) << MMCRA_SDAR_MODE_SHIFT;
>  		else
>  			*mmcra |=3D MMCRA_SDAR_MODE_DCACHE;
> @@ -106,7 +104,7 @@ static void mmcra_sdar_mode(u64 event, unsigned long =
*mmcra)
>=20=20
>  static u64 thresh_cmp_val(u64 value)
>  {
> -	if (cpu_has_feature(CPU_FTR_ARCH_300) && !cpu_has_feature(CPU_FTR_POWER=
9_DD1))
> +	if (cpu_has_feature(CPU_FTR_ARCH_300))
>  		return value << p9_MMCRA_THR_CMP_SHIFT;
>=20=20
>  	return value << MMCRA_THR_CMP_SHIFT;
> @@ -114,7 +112,7 @@ static u64 thresh_cmp_val(u64 value)
>=20=20
>  static unsigned long combine_from_event(u64 event)
>  {
> -	if (cpu_has_feature(CPU_FTR_ARCH_300) && !cpu_has_feature(CPU_FTR_POWER=
9_DD1))
> +	if (cpu_has_feature(CPU_FTR_ARCH_300))
>  		return p9_EVENT_COMBINE(event);
>=20=20
>  	return EVENT_COMBINE(event);
> @@ -122,7 +120,7 @@ static unsigned long combine_from_event(u64 event)
>=20=20
>  static unsigned long combine_shift(unsigned long pmc)
>  {
> -	if (cpu_has_feature(CPU_FTR_ARCH_300) && !cpu_has_feature(CPU_FTR_POWER=
9_DD1))
> +	if (cpu_has_feature(CPU_FTR_ARCH_300))
>  		return p9_MMCR1_COMBINE_SHIFT(pmc);
>=20=20
>  	return MMCR1_COMBINE_SHIFT(pmc);
> diff --git a/arch/powerpc/perf/isa207-common.h b/arch/powerpc/perf/isa207=
-common.h
> index 6a0b586c935a..0028f4b9490d 100644
> --- a/arch/powerpc/perf/isa207-common.h
> +++ b/arch/powerpc/perf/isa207-common.h
> @@ -158,11 +158,6 @@
>  	CNST_PMC_VAL(1) | CNST_PMC_VAL(2) | CNST_PMC_VAL(3) | \
>  	CNST_PMC_VAL(4) | CNST_PMC_VAL(5) | CNST_PMC_VAL(6) | CNST_NC_VAL
>=20=20
> -/*
> - * Lets restrict use of PMC5 for instruction counting.
> - */
> -#define P9_DD1_TEST_ADDER	(ISA207_TEST_ADDER | CNST_PMC_VAL(5))
> -
>  /* Bits in MMCR1 for PowerISA v2.07 */
>  #define MMCR1_UNIT_SHIFT(pmc)		(60 - (4 * ((pmc) - 1)))
>  #define MMCR1_COMBINE_SHIFT(pmc)	(35 - ((pmc) - 1))
> diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pm=
u.c
> index 2ca0b33b4efb..56124dd1a82c 100644
> --- a/arch/powerpc/perf/power9-pmu.c
> +++ b/arch/powerpc/perf/power9-pmu.c
> @@ -439,25 +439,6 @@ static int power9_cache_events[C(MAX)][C(OP_MAX)][C(=
RESULT_MAX)] =3D {
>=20=20
>  #undef C
>=20=20
> -static struct power_pmu power9_isa207_pmu =3D {
> -	.name			=3D "POWER9",
> -	.n_counter		=3D MAX_PMU_COUNTERS,
> -	.add_fields		=3D ISA207_ADD_FIELDS,
> -	.test_adder		=3D P9_DD1_TEST_ADDER,
> -	.compute_mmcr		=3D isa207_compute_mmcr,
> -	.config_bhrb		=3D power9_config_bhrb,
> -	.bhrb_filter_map	=3D power9_bhrb_filter_map,
> -	.get_constraint		=3D isa207_get_constraint,
> -	.get_alternatives	=3D power9_get_alternatives,
> -	.disable_pmc		=3D isa207_disable_pmc,
> -	.flags			=3D PPMU_NO_SIAR | PPMU_ARCH_207S,
> -	.n_generic		=3D ARRAY_SIZE(power9_generic_events_dd1),
> -	.generic_events		=3D power9_generic_events_dd1,
> -	.cache_events		=3D &power9_cache_events,
> -	.attr_groups		=3D power9_isa207_pmu_attr_groups,
> -	.bhrb_nr		=3D 32,
> -};
> -
>  static struct power_pmu power9_pmu =3D {
>  	.name			=3D "POWER9",
>  	.n_counter		=3D MAX_PMU_COUNTERS,
> @@ -500,23 +481,7 @@ static int __init init_power9_pmu(void)
>  		}
>  	}
>=20=20
> -	if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
> -		/*
> -		 * Since PM_INST_CMPL may not provide right counts in all
> -		 * sampling scenarios in power9 DD1, instead use PM_INST_DISP.
> -		 */
> -		EVENT_VAR(PM_INST_CMPL, _g).id =3D PM_INST_DISP;
> -		/*
> -		 * Power9 DD1 should use PM_BR_CMPL_ALT event code for
> -		 * "branches" to provide correct counter value.
> -		 */
> -		EVENT_VAR(PM_BR_CMPL, _g).id =3D PM_BR_CMPL_ALT;
> -		EVENT_VAR(PM_BR_CMPL, _c).id =3D PM_BR_CMPL_ALT;
> -		rc =3D register_power_pmu(&power9_isa207_pmu);
> -	} else {
> -		rc =3D register_power_pmu(&power9_pmu);
> -	}
> -
> +	rc =3D register_power_pmu(&power9_pmu);
>  	if (rc)
>  		return rc;
>=20=20
> diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platfor=
ms/powernv/idle.c
> index 1c5d0675b43c..12f13acee1f6 100644
> --- a/arch/powerpc/platforms/powernv/idle.c
> +++ b/arch/powerpc/platforms/powernv/idle.c
> @@ -177,11 +177,6 @@ static void pnv_alloc_idle_core_states(void)
>  			paca_ptrs[cpu]->core_idle_state_ptr =3D core_idle_state;
>  			paca_ptrs[cpu]->thread_idle_state =3D PNV_THREAD_RUNNING;
>  			paca_ptrs[cpu]->thread_mask =3D 1 << j;
> -			if (!cpu_has_feature(CPU_FTR_POWER9_DD1))
> -				continue;
> -			paca_ptrs[cpu]->thread_sibling_pacas =3D
> -				kmalloc_node(paca_ptr_array_size,
> -					     GFP_KERNEL, node);
>  		}
>  	}
>=20=20
> @@ -805,29 +800,6 @@ static int __init pnv_init_idle_states(void)
>=20=20
>  	pnv_alloc_idle_core_states();
>=20=20
> -	/*
> -	 * For each CPU, record its PACA address in each of it's
> -	 * sibling thread's PACA at the slot corresponding to this
> -	 * CPU's index in the core.
> -	 */
> -	if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
> -		int cpu;
> -
> -		pr_info("powernv: idle: Saving PACA pointers of all CPUs in their thre=
ad sibling PACA\n");
> -		for_each_present_cpu(cpu) {
> -			int base_cpu =3D cpu_first_thread_sibling(cpu);
> -			int idx =3D cpu_thread_in_core(cpu);
> -			int i;
> -
> -			for (i =3D 0; i < threads_per_core; i++) {
> -				int j =3D base_cpu + i;
> -
> -				paca_ptrs[j]->thread_sibling_pacas[idx] =3D
> -					paca_ptrs[cpu];
> -			}
> -		}
> -	}
> -
>  	if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED)
>  		ppc_md.power_save =3D power7_idle;
>=20=20
> diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platform=
s/powernv/smp.c
> index b80909957792..0d354e19ef92 100644
> --- a/arch/powerpc/platforms/powernv/smp.c
> +++ b/arch/powerpc/platforms/powernv/smp.c
> @@ -283,23 +283,6 @@ static void pnv_cause_ipi(int cpu)
>  	ic_cause_ipi(cpu);
>  }
>=20=20
> -static void pnv_p9_dd1_cause_ipi(int cpu)
> -{
> -	int this_cpu =3D get_cpu();
> -
> -	/*
> -	 * POWER9 DD1 has a global addressed msgsnd, but for now we restrict
> -	 * IPIs to same core, because it requires additional synchronization
> -	 * for inter-core doorbells which we do not implement.
> -	 */
> -	if (cpumask_test_cpu(cpu, cpu_sibling_mask(this_cpu)))
> -		doorbell_global_ipi(cpu);
> -	else
> -		ic_cause_ipi(cpu);
> -
> -	put_cpu();
> -}
> -
>  static void __init pnv_smp_probe(void)
>  {
>  	if (xive_enabled())
> @@ -311,14 +294,10 @@ static void __init pnv_smp_probe(void)
>  		ic_cause_ipi =3D smp_ops->cause_ipi;
>  		WARN_ON(!ic_cause_ipi);
>=20=20
> -		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
> -			if (cpu_has_feature(CPU_FTR_POWER9_DD1))
> -				smp_ops->cause_ipi =3D pnv_p9_dd1_cause_ipi;
> -			else
> -				smp_ops->cause_ipi =3D doorbell_global_ipi;
> -		} else {
> +		if (cpu_has_feature(CPU_FTR_ARCH_300))
> +			smp_ops->cause_ipi =3D doorbell_global_ipi;
> +		else
>  			smp_ops->cause_ipi =3D pnv_cause_ipi;
> -		}
>  	}
>  }
>=20=20
> diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
> index 47166ad2a669..21119cfe8474 100644
> --- a/arch/powerpc/xmon/xmon.c
> +++ b/arch/powerpc/xmon/xmon.c
> @@ -2429,7 +2429,6 @@ static void dump_one_paca(int cpu)
>  	DUMP(p, thread_idle_state, "%#-*x");
>  	DUMP(p, thread_mask, "%#-*x");
>  	DUMP(p, subcore_sibling_mask, "%#-*x");
> -	DUMP(p, thread_sibling_pacas, "%-*px");
>  	DUMP(p, requested_psscr, "%#-*llx");
>  	DUMP(p, stop_sprs.pid, "%#-*llx");
>  	DUMP(p, stop_sprs.ldbar, "%#-*llx");
> diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
> index 918d4fb742d1..505f973e13f3 100644
> --- a/drivers/misc/cxl/cxl.h
> +++ b/drivers/misc/cxl/cxl.h
> @@ -865,14 +865,6 @@ static inline bool cxl_is_power9(void)
>  	return false;
>  }
>=20=20
> -static inline bool cxl_is_power9_dd1(void)
> -{
> -	if ((pvr_version_is(PVR_POWER9)) &&
> -	    cpu_has_feature(CPU_FTR_POWER9_DD1))
> -		return true;
> -	return false;
> -}
> -
>  ssize_t cxl_pci_afu_read_err_buffer(struct cxl_afu *afu, char *buf,
>  				loff_t off, size_t count);
>=20=20
> diff --git a/drivers/misc/cxl/cxllib.c b/drivers/misc/cxl/cxllib.c
> index 0bc7c31cf739..5a3f91255258 100644
> --- a/drivers/misc/cxl/cxllib.c
> +++ b/drivers/misc/cxl/cxllib.c
> @@ -102,10 +102,6 @@ int cxllib_get_xsl_config(struct pci_dev *dev, struc=
t cxllib_xsl_config *cfg)
>  	rc =3D cxl_get_xsl9_dsnctl(dev, capp_unit_id, &cfg->dsnctl);
>  	if (rc)
>  		return rc;
> -	if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
> -		/* workaround for DD1 - nbwind =3D capiind */
> -		cfg->dsnctl |=3D ((u64)0x02 << (63-47));
> -	}
>=20=20
>  	cfg->version  =3D CXL_XSL_CONFIG_CURRENT_VERSION;
>  	cfg->log_bar_size =3D CXL_CAPI_WINDOW_LOG_SIZE;
> diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
> index 429d6de1dde7..2af0d4c47b76 100644
> --- a/drivers/misc/cxl/pci.c
> +++ b/drivers/misc/cxl/pci.c
> @@ -465,23 +465,21 @@ int cxl_get_xsl9_dsnctl(struct pci_dev *dev, u64 ca=
pp_unit_id, u64 *reg)
>  	/* nMMU_ID Defaults to: b=E2=80=99000001001=E2=80=99*/
>  	xsl_dsnctl |=3D ((u64)0x09 << (63-28));
>=20=20
> -	if (!(cxl_is_power9_dd1())) {
> -		/*
> -		 * Used to identify CAPI packets which should be sorted into
> -		 * the Non-Blocking queues by the PHB. This field should match
> -		 * the PHB PBL_NBW_CMPM register
> -		 * nbwind=3D0x03, bits [57:58], must include capi indicator.
> -		 * Not supported on P9 DD1.
> -		 */
> -		xsl_dsnctl |=3D (nbwind << (63-55));
> +	/*
> +	 * Used to identify CAPI packets which should be sorted into
> +	 * the Non-Blocking queues by the PHB. This field should match
> +	 * the PHB PBL_NBW_CMPM register
> +	 * nbwind=3D0x03, bits [57:58], must include capi indicator.
> +	 * Not supported on P9 DD1.
> +	 */
> +	xsl_dsnctl |=3D (nbwind << (63-55));
>=20=20
> -		/*
> -		 * Upper 16b address bits of ASB_Notify messages sent to the
> -		 * system. Need to match the PHB=E2=80=99s ASN Compare/Mask Register.
> -		 * Not supported on P9 DD1.
> -		 */
> -		xsl_dsnctl |=3D asnind;
> -	}
> +	/*
> +	 * Upper 16b address bits of ASB_Notify messages sent to the
> +	 * system. Need to match the PHB=E2=80=99s ASN Compare/Mask Register.
> +	 * Not supported on P9 DD1.
> +	 */
> +	xsl_dsnctl |=3D asnind;
>=20=20
>  	*reg =3D xsl_dsnctl;
>  	return 0;
> @@ -539,15 +537,8 @@ static int init_implementation_adapter_regs_psl9(str=
uct cxl *adapter,
>  	/* Snoop machines */
>  	cxl_p1_write(adapter, CXL_PSL9_APCDEDALLOC, 0x800F000200000000ULL);
>=20=20
> -	if (cxl_is_power9_dd1()) {
> -		/* Disabling deadlock counter CAR */
> -		cxl_p1_write(adapter, CXL_PSL9_GP_CT, 0x0020000000000001ULL);
> -		/* Enable NORST */
> -		cxl_p1_write(adapter, CXL_PSL9_DEBUG, 0x8000000000000000ULL);
> -	} else {
> -		/* Enable NORST and DD2 features */
> -		cxl_p1_write(adapter, CXL_PSL9_DEBUG, 0xC000000000000000ULL);
> -	}
> +	/* Enable NORST and DD2 features */
> +	cxl_p1_write(adapter, CXL_PSL9_DEBUG, 0xC000000000000000ULL);
>=20=20
>  	/*
>  	 * Check if PSL has data-cache. We need to flush adapter datacache
> --=20
> 2.17.0

^ permalink raw reply

* Re: [v3 PATCH 4/5] powerpc/pseries: Dump and flush SLB contents on SLB MCE errors.
From: Mahesh Jagannath Salgaonkar @ 2018-06-13  3:45 UTC (permalink / raw)
  To: Michael Ellerman, linuxppc-dev
  Cc: Laurent Dufour, Aneesh Kumar K.V, Nicholas Piggin
In-Reply-To: <87lgbkt8ts.fsf@concordia.ellerman.id.au>

On 06/12/2018 07:17 PM, Michael Ellerman wrote:
> Mahesh J Salgaonkar <mahesh@linux.vnet.ibm.com> writes:
>> diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
>> index 2edc673be137..e56759d92356 100644
>> --- a/arch/powerpc/platforms/pseries/ras.c
>> +++ b/arch/powerpc/platforms/pseries/ras.c
>> @@ -422,6 +422,31 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
>>  	return 0; /* need to perform reset */
>>  }
>>  
>> +static int mce_handle_error(struct rtas_error_log *errp)
>> +{
>> +	struct pseries_errorlog *pseries_log;
>> +	struct pseries_mc_errorlog *mce_log;
>> +	int disposition = rtas_error_disposition(errp);
>> +	uint8_t error_type;
>> +
>> +	pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
>> +	if (pseries_log == NULL)
>> +		goto out;
>> +
>> +	mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
>> +	error_type = rtas_mc_error_type(mce_log);
>> +
>> +	if ((disposition == RTAS_DISP_NOT_RECOVERED) &&
>> +			(error_type == PSERIES_MC_ERROR_TYPE_SLB)) {
>> +		slb_dump_contents();
>> +		slb_flush_and_rebolt();
> 
> Aren't we back in virtual mode here?
> 
> Don't we need to do the flush in real mode before turning the MMU back
> on. Otherwise we'll just take another multi-hit?

Yeah for duplicate entries for kernel segment "0xc00", we will end up
with another multi-hit. For other segments we won't. I think I need to
move the fetching of rtas error log and handling part into real mode to
avoid a loop, and do only printing part in virtual mode.

> 
> cheers
> 

^ permalink raw reply

* Re: [v3 PATCH 4/5] powerpc/pseries: Dump and flush SLB contents on SLB MCE errors.
From: Aneesh Kumar K.V @ 2018-06-13  2:38 UTC (permalink / raw)
  To: Michael Ellerman, Mahesh J Salgaonkar, linuxppc-dev
  Cc: Aneesh Kumar K.V, Laurent Dufour, Nicholas Piggin
In-Reply-To: <87lgbkt8ts.fsf@concordia.ellerman.id.au>

On 06/12/2018 07:17 PM, Michael Ellerman wrote:
> Mahesh J Salgaonkar <mahesh@linux.vnet.ibm.com> writes:
>> diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
>> index 2edc673be137..e56759d92356 100644
>> --- a/arch/powerpc/platforms/pseries/ras.c
>> +++ b/arch/powerpc/platforms/pseries/ras.c
>> @@ -422,6 +422,31 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
>>   	return 0; /* need to perform reset */
>>   }
>>   
>> +static int mce_handle_error(struct rtas_error_log *errp)
>> +{
>> +	struct pseries_errorlog *pseries_log;
>> +	struct pseries_mc_errorlog *mce_log;
>> +	int disposition = rtas_error_disposition(errp);
>> +	uint8_t error_type;
>> +
>> +	pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
>> +	if (pseries_log == NULL)
>> +		goto out;
>> +
>> +	mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
>> +	error_type = rtas_mc_error_type(mce_log);
>> +
>> +	if ((disposition == RTAS_DISP_NOT_RECOVERED) &&
>> +			(error_type == PSERIES_MC_ERROR_TYPE_SLB)) {
>> +		slb_dump_contents();
>> +		slb_flush_and_rebolt();
> 
> Aren't we back in virtual mode here?
> 
> Don't we need to do the flush in real mode before turning the MMU back
> on. Otherwise we'll just take another multi-hit?
> 

slb_flush_and_rebolt does slbia, which keeps slb index 0. So kernel code 
should not get another slb miss. We also make sure we don't touch stack 
in slb_flush_and_rebolt(). So we flush everything and put vmalloc and 
stack back. That should be ok with MMU on?

-aneesh

^ permalink raw reply

* Re: [RFC PATCH 2/5] powerpc: Flush checkpointed gpr state for 32-bit processes in ptrace
From: Michael Ellerman @ 2018-06-13  2:19 UTC (permalink / raw)
  To: Pedro Franco de Carvalho, linuxppc-dev
In-Reply-To: <20180607152534.29427-3-pedromfc@linux.vnet.ibm.com>

Pedro Franco de Carvalho <pedromfc@linux.vnet.ibm.com> writes:

> Currently ptrace doesn't flush the register state when the
> checkpointed GPRs of a 32-bit thread are accessed. This can cause core
> dumps to have stale data in the checkpointed GPR note.
> ---
>  arch/powerpc/kernel/ptrace.c | 20 ++++++++++++++++++++
>  1 file changed, 20 insertions(+)
>
> diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
> index 6618570c6d56..be8ca03a0bd5 100644
> --- a/arch/powerpc/kernel/ptrace.c
> +++ b/arch/powerpc/kernel/ptrace.c
> @@ -2124,6 +2124,16 @@ static int tm_cgpr32_get(struct task_struct *target,
>  		     unsigned int pos, unsigned int count,
>  		     void *kbuf, void __user *ubuf)
>  {
> +	if (!cpu_has_feature(CPU_FTR_TM))
> +		return -ENODEV;
> +
> +	if (!MSR_TM_ACTIVE(target->thread.regs->msr))
> +		return -ENODATA;
> +
> +	flush_tmregs_to_thread(target);
> +	flush_fp_to_thread(target);
> +	flush_altivec_to_thread(target);

I think we already have 8 (!) copies of this logic in ptrace.c.

And you add two more, seems like it should be in a helper function.

Can you add a helper that does it and use that helper in these two
functions? Then, if you can, send me another patch that converts all the
other uses to use the new helper.

cheers

> @@ -2133,6 +2143,16 @@ static int tm_cgpr32_set(struct task_struct *target,
>  		     unsigned int pos, unsigned int count,
>  		     const void *kbuf, const void __user *ubuf)
>  {
> +	if (!cpu_has_feature(CPU_FTR_TM))
> +		return -ENODEV;
> +
> +	if (!MSR_TM_ACTIVE(target->thread.regs->msr))
> +		return -ENODATA;
> +
> +	flush_tmregs_to_thread(target);
> +	flush_fp_to_thread(target);
> +	flush_altivec_to_thread(target);
> +
>  	return gpr32_set_common(target, regset, pos, count, kbuf, ubuf,
>  			&target->thread.ckpt_regs.gpr[0]);
>  }
> -- 
> 2.13.6

^ permalink raw reply

* Re: [RFC PATCH 1/5] powerpc: Fix inverted active predicate for setting the EBB regset
From: Michael Ellerman @ 2018-06-13  2:15 UTC (permalink / raw)
  To: Pedro Franco de Carvalho, linuxppc-dev, Anshuman Khandual
In-Reply-To: <20180607152534.29427-2-pedromfc@linux.vnet.ibm.com>

Pedro Franco de Carvalho <pedromfc@linux.vnet.ibm.com> writes:

> Currently, the ebb_set function for writing to the EBB regset returns
> ENODATA when ebb is active in the thread, and copies in the data when
> it is inactive. This patch inverts the condition so that it matches
> ebb_get and ebb_active.
> ---
>  arch/powerpc/kernel/ptrace.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

Hi Pedro,

Thanks for looking into this, how did you detect this? Do you have a
test case?

I don't think Anshuman wrote it this way on purpose, but added him to Cc
in case he remembers.

But I don't think this fix is necessarily right. If we are setting the
EBB regs via ptrace then it doesn't matter if they were previously in
use or not, we should just set them. What *does* matter is that at the
end of the function we set used_ebb to true, because otherwise the
values we have set will not actually be used when the process is
rescheduled.

cheers

> diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
> index d23cf632edf0..6618570c6d56 100644
> --- a/arch/powerpc/kernel/ptrace.c
> +++ b/arch/powerpc/kernel/ptrace.c
> @@ -1701,7 +1701,7 @@ static int ebb_set(struct task_struct *target,
>  	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
>  		return -ENODEV;
>  
> -	if (target->thread.used_ebb)
> +	if (!target->thread.used_ebb)
>  		return -ENODATA;
>  
>  	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
> -- 
> 2.13.6

^ permalink raw reply

* Re: UBSAN: Undefined behaviour in ../include/linux/percpu_counter.h:137:13
From: Michael Ellerman @ 2018-06-13  1:42 UTC (permalink / raw)
  To: Mathieu Malaterre, linuxppc-dev
In-Reply-To: <CA+7wUszLF3R3QCgBw2jwLtgLw2C=QnH1jHz6_RUDDouMEmfr9A@mail.gmail.com>

Mathieu Malaterre <malat@debian.org> writes:

> Hi there,
>
> I have a reproducible UBSAN appearing in dmesg after a while on my G4
> (*). Could anyone suggest a way to diagnose the actual root issue here
> (or is it just a false positive) ?

It looks like a real overflow, I guess the question is why are we seeing it.

The first thing to work out would be what exactly is overflowing.

Is it in here?

	cfqg_stats_update_completion(cfqq->cfqg, rq->start_time_ns,
				     rq->io_start_time_ns, rq->cmd_flags);


If so that would suggest something is taking multiple hours to complete,
which seems unlikely. Is time going backward?

cheers

> (*)
> [41877.514338] ================================================================================
> [41877.514364] UBSAN: Undefined behaviour in
> ../include/linux/percpu_counter.h:137:13
> [41877.514373] signed integer overflow:
> [41877.514378] 9223352809007201260 + 41997676517838 cannot be
> represented in type 'long long int'
> [41877.514389] CPU: 0 PID: 0 Comm: swapper Not tainted 4.17.0+ #54
> [41877.514394] Call Trace:
> [41877.514411] [dffedd30] [c047a5f8] ubsan_epilogue+0x18/0x4c (unreliable)
> [41877.514422] [dffedd40] [c047af98] handle_overflow+0xbc/0xdc
> [41877.514437] [dffeddc0] [c043aaa8] cfq_completed_request+0x560/0x1234
> [41877.514446] [dffede40] [c03f595c] __blk_put_request+0xb0/0x2dc
> [41877.514460] [dffede80] [c05aa41c] scsi_end_request+0x19c/0x344
> [41877.514469] [dffedeb0] [c05abba0] scsi_io_completion+0x4b4/0x854
> [41877.514482] [dffedf10] [c040604c] blk_done_softirq+0xe4/0x1e0
> [41877.514496] [dffedf60] [c07eef84] __do_softirq+0x16c/0x5f0
> [41877.514508] [dffedfd0] [c0065160] irq_exit+0x110/0x1a8
> [41877.514520] [dffedff0] [c001646c] call_do_irq+0x24/0x3c
> [41877.514533] [c0ce5e80] [c0009a2c] do_IRQ+0x98/0x1a0
> [41877.514541] [c0ce5eb0] [c001b93c] ret_from_except+0x0/0x14
> [41877.514549] --- interrupt: 501 at arch_cpu_idle+0x30/0x78
>                    LR = arch_cpu_idle+0x30/0x78
> [41877.514558] [c0ce5f70] [c0ce4000] 0xc0ce4000 (unreliable)
> [41877.514570] [c0ce5f80] [c00a3928] do_idle+0xc4/0x158
> [41877.514577] [c0ce5fb0] [c00a3b74] cpu_startup_entry+0x24/0x28
> [41877.514585] [c0ce5fc0] [c0988820] start_kernel+0x47c/0x490
> [41877.514592] [c0ce5ff0] [00003444] 0x3444
> [41877.514597] ================================================================================
> [41886.390210] ================================================================================
> [41886.390236] UBSAN: Undefined behaviour in
> ../include/linux/percpu_counter.h:137:13
> [41886.390245] signed integer overflow:
> [41886.390250] 9223366156262940402 + 42006563339289 cannot be
> represented in type 'long long int'
> [41886.390260] CPU: 0 PID: 0 Comm: swapper Not tainted 4.17.0+ #54
> [41886.390265] Call Trace:
> [41886.390282] [dffedd30] [c047a5f8] ubsan_epilogue+0x18/0x4c (unreliable)
> [41886.390293] [dffedd40] [c047af98] handle_overflow+0xbc/0xdc
> [41886.390309] [dffeddc0] [c043a8c4] cfq_completed_request+0x37c/0x1234
> [41886.390317] [dffede40] [c03f595c] __blk_put_request+0xb0/0x2dc
> [41886.390331] [dffede80] [c05aa41c] scsi_end_request+0x19c/0x344
> [41886.390340] [dffedeb0] [c05abba0] scsi_io_completion+0x4b4/0x854
> [41886.390353] [dffedf10] [c040604c] blk_done_softirq+0xe4/0x1e0
> [41886.390367] [dffedf60] [c07eef84] __do_softirq+0x16c/0x5f0
> [41886.390379] [dffedfd0] [c0065160] irq_exit+0x110/0x1a8
> [41886.390391] [dffedff0] [c001646c] call_do_irq+0x24/0x3c
> [41886.390404] [c0ce5e80] [c0009a2c] do_IRQ+0x98/0x1a0
> [41886.390411] [c0ce5eb0] [c001b93c] ret_from_except+0x0/0x14
> [41886.390420] --- interrupt: 501 at arch_cpu_idle+0x30/0x78
>                    LR = arch_cpu_idle+0x30/0x78
> [41886.390429] [c0ce5f70] [c0ce4000] 0xc0ce4000 (unreliable)
> [41886.390441] [c0ce5f80] [c00a3928] do_idle+0xc4/0x158
> [41886.390449] [c0ce5fb0] [c00a3b74] cpu_startup_entry+0x24/0x28
> [41886.390457] [c0ce5fc0] [c0988820] start_kernel+0x47c/0x490
> [41886.390463] [c0ce5ff0] [00003444] 0x3444
> [41886.390468] ================================================================================

^ permalink raw reply

* Re: [RFC PATCH 3/3] powerpc/64s/radix: optimise TLB flush with precise TLB ranges in mmu_gather
From: Linus Torvalds @ 2018-06-13  1:10 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-mm, ppc-dev, linux-arch, Aneesh Kumar K. V, Minchan Kim,
	Mel Gorman, Nadav Amit, Andrew Morton
In-Reply-To: <20180613101241.004fd64e@roar.ozlabs.ibm.com>

On Tue, Jun 12, 2018 at 5:12 PM Nicholas Piggin <npiggin@gmail.com> wrote:
> >
> > And in _theory_, maybe you could have just used "invlpg" with a
> > targeted address instead. In fact, I think a single invlpg invalidates
> > _all_ caches for the associated MM, but don't quote me on that.

Confirmed. The SDM says

 "INVLPG also invalidates all entries in all paging-structure caches
  associated with the current PCID, regardless of the linear addresses
  to which they correspond"

so if x86 wants to do this "separate invalidation for page directory
entries", then it would want to

 (a) remove the __tlb_adjust_range() operation entirely from
pud_free_tlb() and friends

 (b) instead just have a single field for "invalidate_tlb_caches",
which could be a boolean, or could just be one of the addresses

and then the logic would be that IFF no other tlb invalidate is done
due to an actual page range, then we look at that
invalidate_tlb_caches field, and do a single INVLPG instead.

I still am not sure if this would actually make a difference in
practice, but I guess it does mean that x86 could at least participate
in some kind of scheme where we have architecture-specific actions for
those page directory entries.

And we could make the default behavior - if no architecture-specific
tlb page directory invalidation function exists - be the current
"__tlb_adjust_range()" case. So the default would be to not change
behavior, and architectures could opt in to something like this.

            Linus

^ permalink raw reply

* [RFC PATCH 22/23] watchdog/hardlockup/hpet: Only enable the HPET watchdog via a boot parameter
From: Ricardo Neri @ 2018-06-13  0:57 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H. Peter Anvin
  Cc: Andi Kleen, Ashok Raj, Borislav Petkov, Tony Luck,
	Ravi V. Shankar, x86, sparclinux, linuxppc-dev, linux-kernel,
	Ricardo Neri, Jacob Pan, Rafael J. Wysocki, Don Zickus,
	Nicholas Piggin, Michael Ellerman, Frederic Weisbecker,
	Alexei Starovoitov, Babu Moger, Mathieu Desnoyers,
	Masami Hiramatsu, Peter Zijlstra, Andrew Morton,
	Philippe Ombredanne, Colin Ian King, Byungchul Park,
	Paul E. McKenney, Luis R. Rodriguez, Waiman Long, Josh Poimboeuf,
	Randy Dunlap, Davidlohr Bueso, Christoffer Dall, Marc Zyngier,
	Kai-Heng Feng, Konrad Rzeszutek Wilk, David Rientjes, iommu
In-Reply-To: <1528851463-21140-1-git-send-email-ricardo.neri-calderon@linux.intel.com>

Keep the HPET-based hardlockup detector disabled unless explicitly enabled
via a command line argument. If such a parameter is not given, the hardlockup
detector will fall back to the perf-based implementation.

The function hardlockup_panic_setup() is updated to return 0 in order to
allow __setup functions of specific hardlockup detectors (in this case
hardlockup_detector_hpet_setup()) to inspect the nmi_watchdog boot
parameter.

Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Jacob Pan <jacob.jun.pan@intel.com>
Cc: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Babu Moger <babu.moger@oracle.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: "Luis R. Rodriguez" <mcgrof@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Christoffer Dall <cdall@linaro.org>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Kai-Heng Feng <kai.heng.feng@canonical.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Ravi V. Shankar" <ravi.v.shankar@intel.com>
Cc: x86@kernel.org
Cc: iommu@lists.linux-foundation.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
--
checkpatch gives the following warning:

CHECK: __setup appears un-documented -- check Documentation/admin-guide/kernel-parameters.rst
+__setup("nmi_watchdog=", hardlockup_detector_hpet_setup);

This is a false-positive as the option nmi_watchdog is already
documented. The option is re-evaluated in this file as well.
---
 Documentation/admin-guide/kernel-parameters.txt |  5 ++++-
 kernel/watchdog.c                               |  2 +-
 kernel/watchdog_hld_hpet.c                      | 13 +++++++++++++
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index f2040d4..a8833c7 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2577,7 +2577,7 @@
 			Format: [state][,regs][,debounce][,die]
 
 	nmi_watchdog=	[KNL,BUGS=X86] Debugging features for SMP kernels
-			Format: [panic,][nopanic,][num]
+			Format: [panic,][nopanic,][num,][hpet]
 			Valid num: 0 or 1
 			0 - turn hardlockup detector in nmi_watchdog off
 			1 - turn hardlockup detector in nmi_watchdog on
@@ -2587,6 +2587,9 @@
 			please see 'nowatchdog'.
 			This is useful when you use a panic=... timeout and
 			need the box quickly up again.
+			When hpet is specified, the NMI watchdog will be driven
+			by an HPET timer, if available in the system. Otherwise,
+			the perf-based implementation will be used.
 
 			These settings can be accessed at runtime via
 			the nmi_watchdog and hardlockup_panic sysctls.
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index b94bbe3..b5ce6e4 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -84,7 +84,7 @@ static int __init hardlockup_panic_setup(char *str)
 		nmi_watchdog_user_enabled = 0;
 	else if (!strncmp(str, "1", 1))
 		nmi_watchdog_user_enabled = 1;
-	return 1;
+	return 0;
 }
 __setup("nmi_watchdog=", hardlockup_panic_setup);
 
diff --git a/kernel/watchdog_hld_hpet.c b/kernel/watchdog_hld_hpet.c
index ebb820d..12e5937 100644
--- a/kernel/watchdog_hld_hpet.c
+++ b/kernel/watchdog_hld_hpet.c
@@ -17,6 +17,7 @@
 #define pr_fmt(fmt) "NMI hpet watchdog: " fmt
 
 static struct hpet_hld_data *hld_data;
+static bool hardlockup_use_hpet;
 
 /**
  * get_count() - Get the current count of the HPET timer
@@ -488,6 +489,15 @@ static void hardlockup_detector_hpet_stop(void)
 	spin_unlock(&hld_data->lock);
 }
 
+static int __init hardlockup_detector_hpet_setup(char *str)
+{
+	if (strstr(str, "hpet"))
+		hardlockup_use_hpet = true;
+
+	return 0;
+}
+__setup("nmi_watchdog=", hardlockup_detector_hpet_setup);
+
 /**
  * hardlockup_detector_hpet_init() - Initialize the hardlockup detector
  *
@@ -502,6 +512,9 @@ static int __init hardlockup_detector_hpet_init(void)
 {
 	int ret;
 
+	if (!hardlockup_use_hpet)
+		return -EINVAL;
+
 	if (!is_hpet_enabled())
 		return -ENODEV;
 
-- 
2.7.4

^ permalink raw reply related

* [RFC PATCH 23/23] watchdog/hardlockup: Activate the HPET-based lockup detector
From: Ricardo Neri @ 2018-06-13  0:57 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H. Peter Anvin
  Cc: Andi Kleen, Ashok Raj, Borislav Petkov, Tony Luck,
	Ravi V. Shankar, x86, sparclinux, linuxppc-dev, linux-kernel,
	Ricardo Neri, Jacob Pan, Rafael J. Wysocki, Don Zickus,
	Nicholas Piggin, Michael Ellerman, Frederic Weisbecker,
	Alexei Starovoitov, Babu Moger, Mathieu Desnoyers,
	Masami Hiramatsu, Peter Zijlstra, Andrew Morton,
	Philippe Ombredanne, Colin Ian King, Byungchul Park,
	Paul E. McKenney, Luis R. Rodriguez, Waiman Long, Josh Poimboeuf,
	Randy Dunlap, Davidlohr Bueso, Christoffer Dall, Marc Zyngier,
	Kai-Heng Feng, Konrad Rzeszutek Wilk, David Rientjes, iommu
In-Reply-To: <1528851463-21140-1-git-send-email-ricardo.neri-calderon@linux.intel.com>

Now that the implementation of the HPET-based hardlockup detector is
complete, enable it. It will be used only if it can be initialized
successfully. Otherwise, the perf-based detector will be used.

Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Jacob Pan <jacob.jun.pan@intel.com>
Cc: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Babu Moger <babu.moger@oracle.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: "Luis R. Rodriguez" <mcgrof@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Christoffer Dall <cdall@linaro.org>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Kai-Heng Feng <kai.heng.feng@canonical.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Ravi V. Shankar" <ravi.v.shankar@intel.com>
Cc: x86@kernel.org
Cc: iommu@lists.linux-foundation.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
---
 kernel/watchdog.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index b5ce6e4..e2cc6c0 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -149,6 +149,21 @@ int __weak __init watchdog_nmi_probe(void)
 {
 	int ret = -ENODEV;
 
+	/*
+	 * Try first with the HPET hardlockup detector. It will only
+	 * succeed if selected at build time and the nmi_watchdog
+	 * command-line parameter is configured. This ensures that the
+	 * perf-based detector is used by default, if selected at
+	 * build time.
+	 */
+	if (IS_ENABLED(CONFIG_HARDLOCKUP_DETECTOR_HPET))
+		ret = hardlockup_detector_hpet_ops.init();
+
+	if (!ret) {
+		nmi_wd_ops = &hardlockup_detector_hpet_ops;
+		return ret;
+	}
+
 	if (IS_ENABLED(CONFIG_HARDLOCKUP_DETECTOR_PERF))
 		ret = hardlockup_detector_perf_ops.init();
 
-- 
2.7.4

^ permalink raw reply related

* [RFC PATCH 21/23] watchdog/hardlockup/hpet: Adjust timer expiration on the number of monitored CPUs
From: Ricardo Neri @ 2018-06-13  0:57 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H. Peter Anvin
  Cc: Andi Kleen, Ashok Raj, Borislav Petkov, Tony Luck,
	Ravi V. Shankar, x86, sparclinux, linuxppc-dev, linux-kernel,
	Ricardo Neri, Jacob Pan, Rafael J. Wysocki, Don Zickus,
	Nicholas Piggin, Michael Ellerman, Frederic Weisbecker,
	Alexei Starovoitov, Babu Moger, Mathieu Desnoyers,
	Masami Hiramatsu, Peter Zijlstra, Andrew Morton,
	Philippe Ombredanne, Colin Ian King, Byungchul Park,
	Paul E. McKenney, Luis R. Rodriguez, Waiman Long, Josh Poimboeuf,
	Randy Dunlap, Davidlohr Bueso, Christoffer Dall, Marc Zyngier,
	Kai-Heng Feng, Konrad Rzeszutek Wilk, David Rientjes, iommu
In-Reply-To: <1528851463-21140-1-git-send-email-ricardo.neri-calderon@linux.intel.com>

Each CPU should be monitored for hardlockups every watchdog_thresh seconds.
Since all the CPUs in the system are monitored by the same timer and the
timer interrupt is rotated among the monitored CPUs, the timer must expire
every watchdog_thresh/N seconds; where N is the number of monitored CPUs.

A new member is added to struct hpet_hld_data to determine the per-CPU
ticks per second. This quantity is used to program the comparator of the
timer.

The ticks-per-CPU quantity is updated every time the number of monitored
CPUs changes, i.e., when the watchdog is enabled or disabled for a specific
CPU.

Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Jacob Pan <jacob.jun.pan@intel.com>
Cc: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Babu Moger <babu.moger@oracle.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: "Luis R. Rodriguez" <mcgrof@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Christoffer Dall <cdall@linaro.org>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Kai-Heng Feng <kai.heng.feng@canonical.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Ravi V. Shankar" <ravi.v.shankar@intel.com>
Cc: x86@kernel.org
Cc: iommu@lists.linux-foundation.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
---
 arch/x86/include/asm/hpet.h |  1 +
 kernel/watchdog_hld_hpet.c  | 41 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 6ace2d1..e67818d 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -124,6 +124,7 @@ struct hpet_hld_data {
 	u32		irq;
 	u32		flags;
 	u64		ticks_per_second;
+	u64		ticks_per_cpu;
 	struct cpumask	monitored_mask;
 	spinlock_t	lock; /* serialized access to monitored_mask */
 };
diff --git a/kernel/watchdog_hld_hpet.c b/kernel/watchdog_hld_hpet.c
index c40acfd..ebb820d 100644
--- a/kernel/watchdog_hld_hpet.c
+++ b/kernel/watchdog_hld_hpet.c
@@ -65,11 +65,21 @@ static void kick_timer(struct hpet_hld_data *hdata)
 	 * are able to update the comparator before the counter reaches such new
 	 * value.
 	 *
+	 * The timer must monitor each CPU every watchdog_thresh seconds. Hence
+	 * the timer expiration must be:
+	 *
+	 *    watchdog_thresh/N
+	 *
+	 * where N is the number of monitored CPUs.
+	 *
+	 * in order to monitor all the online CPUs. ticks_per_cpu gives the
+	 * number of ticks needed to meet the condition above.
+	 *
 	 * Let it wrap around if needed.
 	 */
 	count = get_count();
 
-	new_compare = count + watchdog_thresh * hdata->ticks_per_second;
+	new_compare = count + watchdog_thresh * hdata->ticks_per_cpu;
 
 	set_comparator(hdata, new_compare);
 }
@@ -160,6 +170,33 @@ static bool is_hpet_wdt_interrupt(struct hpet_hld_data *hdata)
 }
 
 /**
+ * update_ticks_per_cpu() - Update the number of HPET ticks per CPU
+ * @hdata:	struct with the timer's ticks-per-second and CPU mask
+ *
+ * From the overall ticks-per-second of the timer, compute the number of ticks
+ * after which the timer should expire to monitor each CPU every
+ * watchdog_thresh seconds. The ticks-per-cpu quantity is computed using the
+ * number of CPUs that the watchdog currently monitors.
+ *
+ * Returns:
+ *
+ * None
+ *
+ */
+static void update_ticks_per_cpu(struct hpet_hld_data *hdata)
+{
+	unsigned int num_cpus = cpumask_weight(&hdata->monitored_mask);
+	unsigned long long temp = hdata->ticks_per_second;
+
+	/* Only update if there are monitored CPUs. */
+	if (!num_cpus)
+		return;
+
+	do_div(temp, num_cpus);
+	hdata->ticks_per_cpu = temp;
+}
+
+/**
  * hardlockup_detector_irq_handler() - Interrupt handler
  * @irq:	Interrupt number
  * @data:	Data associated with the interrupt
@@ -390,6 +427,7 @@ static void hardlockup_detector_hpet_enable(void)
 	spin_lock(&hld_data->lock);
 
 	cpumask_set_cpu(cpu, &hld_data->monitored_mask);
+	update_ticks_per_cpu(hld_data);
 
 	/*
 	 * If this is the first CPU to be monitored, set everything in motion:
@@ -425,6 +463,7 @@ static void hardlockup_detector_hpet_disable(void)
 	spin_lock(&hld_data->lock);
 
 	cpumask_clear_cpu(smp_processor_id(), &hld_data->monitored_mask);
+	update_ticks_per_cpu(hld_data);
 
 	/* Only disable the timer if there are no more CPUs to monitor. */
 	if (!cpumask_weight(&hld_data->monitored_mask))
-- 
2.7.4

^ permalink raw reply related

* [RFC PATCH 20/23] watchdog/hardlockup/hpet: Rotate interrupt among all monitored CPUs
From: Ricardo Neri @ 2018-06-13  0:57 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H. Peter Anvin
  Cc: Andi Kleen, Ashok Raj, Borislav Petkov, Tony Luck,
	Ravi V. Shankar, x86, sparclinux, linuxppc-dev, linux-kernel,
	Ricardo Neri, Jacob Pan, Rafael J. Wysocki, Don Zickus,
	Nicholas Piggin, Michael Ellerman, Frederic Weisbecker,
	Alexei Starovoitov, Babu Moger, Mathieu Desnoyers,
	Masami Hiramatsu, Peter Zijlstra, Andrew Morton,
	Philippe Ombredanne, Colin Ian King, Byungchul Park,
	Paul E. McKenney, Luis R. Rodriguez, Waiman Long, Josh Poimboeuf,
	Randy Dunlap, Davidlohr Bueso, Christoffer Dall, Marc Zyngier,
	Kai-Heng Feng, Konrad Rzeszutek Wilk, David Rientjes, iommu
In-Reply-To: <1528851463-21140-1-git-send-email-ricardo.neri-calderon@linux.intel.com>

In order to detect hardlockups in all the monitored CPUs, move the
interrupt to the next monitored CPU when handling the NMI interrupt; wrap
around when reaching the highest CPU in the mask. This rotation is achieved
by setting the affinity mask to only contain the next CPU to monitor.

In order to prevent our interrupt from being reassigned to another CPU,
flag it as IRQF_NOBALANCING.

The cpumask monitored_mask keeps track of the CPUs that the watchdog
should monitor. This structure is updated when the NMI watchdog is
enabled or disabled in a specific CPU. As this mask can change
concurrently as CPUs are put online or offline and the watchdog is
disabled or enabled, a lock is required to protect the monitored_mask.

Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Jacob Pan <jacob.jun.pan@intel.com>
Cc: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Babu Moger <babu.moger@oracle.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: "Luis R. Rodriguez" <mcgrof@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Christoffer Dall <cdall@linaro.org>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Kai-Heng Feng <kai.heng.feng@canonical.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Ravi V. Shankar" <ravi.v.shankar@intel.com>
Cc: x86@kernel.org
Cc: iommu@lists.linux-foundation.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
---
 kernel/watchdog_hld_hpet.c | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/kernel/watchdog_hld_hpet.c b/kernel/watchdog_hld_hpet.c
index 857e051..c40acfd 100644
--- a/kernel/watchdog_hld_hpet.c
+++ b/kernel/watchdog_hld_hpet.c
@@ -10,6 +10,7 @@
 #include <linux/nmi.h>
 #include <linux/hpet.h>
 #include <asm/hpet.h>
+#include <asm/cpumask.h>
 #include <asm/irq_remapping.h>
 
 #undef pr_fmt
@@ -199,8 +200,8 @@ static irqreturn_t hardlockup_detector_irq_handler(int irq, void *data)
  * @regs:	Register values as seen when the NMI was asserted
  *
  * When an NMI is issued, look for hardlockups. If the timer is not periodic,
- * kick it. The interrupt is always handled when if delivered via the
- * Front-Side Bus.
+ * kick it. Move the interrupt to the next monitored CPU. The interrupt is
+ * always handled if delivered via the Front-Side Bus.
  *
  * Returns:
  *
@@ -211,7 +212,7 @@ static int hardlockup_detector_nmi_handler(unsigned int val,
 					   struct pt_regs *regs)
 {
 	struct hpet_hld_data *hdata = hld_data;
-	unsigned int use_fsb;
+	unsigned int use_fsb, cpu;
 
 	/*
 	 * If FSB delivery mode is used, the timer interrupt is programmed as
@@ -222,8 +223,27 @@ static int hardlockup_detector_nmi_handler(unsigned int val,
 	if (!use_fsb && !is_hpet_wdt_interrupt(hdata))
 		return NMI_DONE;
 
+	/* There are no CPUs to monitor. */
+	if (!cpumask_weight(&hdata->monitored_mask))
+		return NMI_HANDLED;
+
 	inspect_for_hardlockups(regs);
 
+	/*
+	 * Target a new CPU. Keep trying until we find a monitored CPU. CPUs
+	 * are added to and removed from this mask at cpu_up() and cpu_down(),
+	 * respectively. Thus, the interrupt should be able to be moved to
+	 * the next monitored CPU.
+	 */
+	spin_lock(&hld_data->lock);
+	for_each_cpu_wrap(cpu, &hdata->monitored_mask, smp_processor_id() + 1) {
+		if (!irq_set_affinity(hld_data->irq, cpumask_of(cpu)))
+			break;
+		pr_err("Could not assign interrupt to CPU %d. Trying with next present CPU.\n",
+		       cpu);
+	}
+	spin_unlock(&hld_data->lock);
+
 	if (!(hdata->flags & HPET_DEV_PERI_CAP))
 		kick_timer(hdata);
 
@@ -336,7 +356,7 @@ static int setup_hpet_irq(struct hpet_hld_data *hdata)
 	 * Request an interrupt to activate the irq in all the needed domains.
 	 */
 	ret = request_irq(hwirq, hardlockup_detector_irq_handler,
-			  IRQF_TIMER | IRQF_DELIVER_AS_NMI,
+			  IRQF_TIMER | IRQF_DELIVER_AS_NMI | IRQF_NOBALANCING,
 			  "hpet_hld", hdata);
 	if (ret)
 		unregister_nmi_handler(NMI_LOCAL, "hpet_hld");
-- 
2.7.4

^ permalink raw reply related

* [RFC PATCH 17/23] watchdog/hardlockup/hpet: Convert the timer's interrupt to NMI
From: Ricardo Neri @ 2018-06-13  0:57 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H. Peter Anvin
  Cc: Andi Kleen, Ashok Raj, Borislav Petkov, Tony Luck,
	Ravi V. Shankar, x86, sparclinux, linuxppc-dev, linux-kernel,
	Ricardo Neri, Jacob Pan, Rafael J. Wysocki, Don Zickus,
	Nicholas Piggin, Michael Ellerman, Frederic Weisbecker,
	Alexei Starovoitov, Babu Moger, Mathieu Desnoyers,
	Masami Hiramatsu, Peter Zijlstra, Andrew Morton,
	Philippe Ombredanne, Colin Ian King, Byungchul Park,
	Paul E. McKenney, Luis R. Rodriguez, Waiman Long, Josh Poimboeuf,
	Randy Dunlap, Davidlohr Bueso, Christoffer Dall, Marc Zyngier,
	Kai-Heng Feng, Konrad Rzeszutek Wilk, David Rientjes, iommu
In-Reply-To: <1528851463-21140-1-git-send-email-ricardo.neri-calderon@linux.intel.com>

In order to detect hardlockups, it is necessary to have the ability to
receive interrupts even when maskable interrupts are disabled: a
non-maskable interrupt is required. Add the flag IRQF_DELIVER_AS_NMI to the
arguments of request_irq() for this purpose.

Note that the timer, when programmed to deliver interrupts via the IO APIC,
is programmed as level-triggered. This is to have an indication that the
NMI comes from the HPET timer, as indicated in the General Status Interrupt
Register. However, NMIs are always edge-triggered, thus a GSI edge-
triggered interrupt is now requested.

An NMI handler is also implemented. The handler looks for hardlockups and
kicks the timer.

Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Jacob Pan <jacob.jun.pan@intel.com>
Cc: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Babu Moger <babu.moger@oracle.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: "Luis R. Rodriguez" <mcgrof@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Christoffer Dall <cdall@linaro.org>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Kai-Heng Feng <kai.heng.feng@canonical.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Ravi V. Shankar" <ravi.v.shankar@intel.com>
Cc: x86@kernel.org
Cc: iommu@lists.linux-foundation.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
---
 arch/x86/kernel/hpet.c     |  2 +-
 kernel/watchdog_hld_hpet.c | 55 +++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index fda6e19..5ca1953 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -205,7 +205,7 @@ int hpet_hardlockup_detector_assign_legacy_irq(struct hpet_hld_data *hdata)
 			break;
 		}
 
-		gsi = acpi_register_gsi(NULL, hwirq, ACPI_LEVEL_SENSITIVE,
+		gsi = acpi_register_gsi(NULL, hwirq, ACPI_EDGE_SENSITIVE,
 					ACPI_ACTIVE_LOW);
 		if (gsi > 0)
 			break;
diff --git a/kernel/watchdog_hld_hpet.c b/kernel/watchdog_hld_hpet.c
index 8fa4e55..3bedffa 100644
--- a/kernel/watchdog_hld_hpet.c
+++ b/kernel/watchdog_hld_hpet.c
@@ -10,6 +10,7 @@
 #include <linux/nmi.h>
 #include <linux/hpet.h>
 #include <asm/hpet.h>
+#include <asm/irq_remapping.h>
 
 #undef pr_fmt
 #define pr_fmt(fmt) "NMI hpet watchdog: " fmt
@@ -183,6 +184,8 @@ static irqreturn_t hardlockup_detector_irq_handler(int irq, void *data)
 	if (!(hdata->flags & HPET_DEV_PERI_CAP))
 		kick_timer(hdata);
 
+	pr_err("This interrupt should not have happened. Ensure delivery mode is NMI.\n");
+
 	/* Acknowledge interrupt if in level-triggered mode */
 	if (!use_fsb)
 		hpet_writel(BIT(hdata->num), HPET_STATUS);
@@ -191,6 +194,47 @@ static irqreturn_t hardlockup_detector_irq_handler(int irq, void *data)
 }
 
 /**
+ * hardlockup_detector_nmi_handler() - NMI Interrupt handler
+ * @val:	Attribute associated with the NMI. Not used.
+ * @regs:	Register values as seen when the NMI was asserted
+ *
+ * When an NMI is issued, look for hardlockups. If the timer is not periodic,
+ * kick it. The interrupt is always handled when if delivered via the
+ * Front-Side Bus.
+ *
+ * Returns:
+ *
+ * NMI_DONE if the HPET timer did not cause the interrupt. NMI_HANDLED
+ * otherwise.
+ */
+static int hardlockup_detector_nmi_handler(unsigned int val,
+					   struct pt_regs *regs)
+{
+	struct hpet_hld_data *hdata = hld_data;
+	unsigned int use_fsb;
+
+	/*
+	 * If FSB delivery mode is used, the timer interrupt is programmed as
+	 * edge-triggered and there is no need to check the ISR register.
+	 */
+	use_fsb = hdata->flags & HPET_DEV_FSB_CAP;
+
+	if (!use_fsb && !is_hpet_wdt_interrupt(hdata))
+		return NMI_DONE;
+
+	inspect_for_hardlockups(regs);
+
+	if (!(hdata->flags & HPET_DEV_PERI_CAP))
+		kick_timer(hdata);
+
+	/* Acknowledge interrupt if in level-triggered mode */
+	if (!use_fsb)
+		hpet_writel(BIT(hdata->num), HPET_STATUS);
+
+	return NMI_HANDLED;
+}
+
+/**
  * setup_irq_msi_mode() - Configure the timer to deliver an MSI interrupt
  * @data:	Data associated with the instance of the HPET timer to configure
  *
@@ -282,11 +326,20 @@ static int setup_hpet_irq(struct hpet_hld_data *hdata)
 	if (ret)
 		return ret;
 
+	/* Register the NMI handler, which will be the actual handler we use. */
+	ret = register_nmi_handler(NMI_LOCAL, hardlockup_detector_nmi_handler,
+				   0, "hpet_hld");
+	if (ret)
+		return ret;
+
 	/*
 	 * Request an interrupt to activate the irq in all the needed domains.
 	 */
 	ret = request_irq(hwirq, hardlockup_detector_irq_handler,
-			  IRQF_TIMER, "hpet_hld", hdata);
+			  IRQF_TIMER | IRQF_DELIVER_AS_NMI,
+			  "hpet_hld", hdata);
+	if (ret)
+		unregister_nmi_handler(NMI_LOCAL, "hpet_hld");
 
 	return ret;
 }
-- 
2.7.4

^ permalink raw reply related

* [RFC PATCH 19/23] watchdog/hardlockup: Make arch_touch_nmi_watchdog() to hpet-based implementation
From: Ricardo Neri @ 2018-06-13  0:57 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H. Peter Anvin
  Cc: Andi Kleen, Ashok Raj, Borislav Petkov, Tony Luck,
	Ravi V. Shankar, x86, sparclinux, linuxppc-dev, linux-kernel,
	Ricardo Neri, Jacob Pan, Rafael J. Wysocki, Don Zickus,
	Nicholas Piggin, Michael Ellerman, Frederic Weisbecker,
	Alexei Starovoitov, Babu Moger, David S. Miller,
	Benjamin Herrenschmidt, Paul Mackerras, Mathieu Desnoyers,
	Masami Hiramatsu, Peter Zijlstra, Andrew Morton,
	Philippe Ombredanne, Colin Ian King, Byungchul Park,
	Paul E. McKenney, Luis R. Rodriguez, Waiman Long, Josh Poimboeuf,
	Randy Dunlap, Davidlohr Bueso, Christoffer Dall, Marc Zyngier,
	Kai-Heng Feng, Konrad Rzeszutek Wilk, David Rientjes, iommu
In-Reply-To: <1528851463-21140-1-git-send-email-ricardo.neri-calderon@linux.intel.com>

CPU architectures that have an NMI watchdog use arch_touch_nmi_watchdog()
to briefly ignore the hardlockup detector. If the architecture does not
have an NMI watchdog, one can be constructed using a source of non-
maskable interrupts. In this case, arch_touch_nmi_watchdog() is common
to any underlying hardware resource used to drive the detector and needs
to be available to other kernel subsystems if hardware different from perf
drives the detector.

There exist perf-based and HPET-based implementations. Make it available
to the latter.

For clarity, wrap this function in a separate preprocessor conditional
from functions which are truly specific to the perf-based implementation.

Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Jacob Pan <jacob.jun.pan@intel.com>
Cc: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Babu Moger <babu.moger@oracle.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: "Luis R. Rodriguez" <mcgrof@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Christoffer Dall <cdall@linaro.org>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Kai-Heng Feng <kai.heng.feng@canonical.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Ravi V. Shankar" <ravi.v.shankar@intel.com>
Cc: x86@kernel.org
Cc: sparclinux@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: iommu@lists.linux-foundation.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
---
 include/linux/nmi.h | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 23e20d2..8b6b814 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -89,16 +89,22 @@ static inline void hardlockup_detector_disable(void) {}
 # define NMI_WATCHDOG_SYSCTL_PERM	0444
 #endif
 
-#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
+#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF) || \
+    defined(CONFIG_HARDLOCKUP_DETECTOR_HPET)
 extern void arch_touch_nmi_watchdog(void);
+#else
+# if !defined(CONFIG_HAVE_NMI_WATCHDOG)
+static inline void arch_touch_nmi_watchdog(void) {}
+# endif
+#endif
+
+#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
 extern void hardlockup_detector_perf_stop(void);
 extern void hardlockup_detector_perf_restart(void);
 #else
 static inline void hardlockup_detector_perf_stop(void) { }
 static inline void hardlockup_detector_perf_restart(void) { }
-# if !defined(CONFIG_HAVE_NMI_WATCHDOG)
-static inline void arch_touch_nmi_watchdog(void) {}
-# endif
+
 #endif
 
 /**
-- 
2.7.4

^ permalink raw reply related

* [RFC PATCH 18/23] watchdog/hardlockup/hpet: Add the NMI watchdog operations
From: Ricardo Neri @ 2018-06-13  0:57 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H. Peter Anvin
  Cc: Andi Kleen, Ashok Raj, Borislav Petkov, Tony Luck,
	Ravi V. Shankar, x86, sparclinux, linuxppc-dev, linux-kernel,
	Ricardo Neri, Jacob Pan, Rafael J. Wysocki, Don Zickus,
	Nicholas Piggin, Michael Ellerman, Frederic Weisbecker,
	Alexei Starovoitov, Babu Moger, Mathieu Desnoyers,
	Masami Hiramatsu, Peter Zijlstra, Andrew Morton,
	Philippe Ombredanne, Colin Ian King, Byungchul Park,
	Paul E. McKenney, Luis R. Rodriguez, Waiman Long, Josh Poimboeuf,
	Randy Dunlap, Davidlohr Bueso, Christoffer Dall, Marc Zyngier,
	Kai-Heng Feng, Konrad Rzeszutek Wilk, David Rientjes, iommu
In-Reply-To: <1528851463-21140-1-git-send-email-ricardo.neri-calderon@linux.intel.com>

Implement the enable, disable and stop operations of the HPET-based NMI
watchdog. Given that a single timer is used to monitor all the CPUs in
the system, it is necessary to define a cpumask that keeps track of the
CPUs that can be monitored. This cpumask is protected with a spin lock.

As individual CPUs are put online and offline, this cpumask is updated.
CPUs are unconditionally cleared from the mask when going offline. When
going online, the CPU is set in the mask only if it is one of the CPUs allowed
to be monitored by the watchdog.

It is not necessary to implement a start function. The NMI watchdog will
be enabled when there is at least one CPU to monitor.

The disable function clears the CPU mask and disables the timer.

Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Jacob Pan <jacob.jun.pan@intel.com>
Cc: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Babu Moger <babu.moger@oracle.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: "Luis R. Rodriguez" <mcgrof@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Christoffer Dall <cdall@linaro.org>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Kai-Heng Feng <kai.heng.feng@canonical.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Ravi V. Shankar" <ravi.v.shankar@intel.com>
Cc: x86@kernel.org
Cc: iommu@lists.linux-foundation.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
---
 arch/x86/include/asm/hpet.h |  2 +
 include/linux/nmi.h         |  1 +
 kernel/watchdog_hld_hpet.c  | 98 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 101 insertions(+)

diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 33309b7..6ace2d1 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -124,6 +124,8 @@ struct hpet_hld_data {
 	u32		irq;
 	u32		flags;
 	u64		ticks_per_second;
+	struct cpumask	monitored_mask;
+	spinlock_t	lock; /* serialized access to monitored_mask */
 };
 
 extern struct hpet_hld_data *hpet_hardlockup_detector_assign_timer(void);
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index e608762..23e20d2 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -129,6 +129,7 @@ struct nmi_watchdog_ops {
 };
 
 extern struct nmi_watchdog_ops hardlockup_detector_perf_ops;
+extern struct nmi_watchdog_ops hardlockup_detector_hpet_ops;
 
 void watchdog_nmi_stop(void);
 void watchdog_nmi_start(void);
diff --git a/kernel/watchdog_hld_hpet.c b/kernel/watchdog_hld_hpet.c
index 3bedffa..857e051 100644
--- a/kernel/watchdog_hld_hpet.c
+++ b/kernel/watchdog_hld_hpet.c
@@ -345,6 +345,91 @@ static int setup_hpet_irq(struct hpet_hld_data *hdata)
 }
 
 /**
+ * hardlockup_detector_hpet_enable() - Enable the hardlockup detector
+ *
+ * The hardlockup detector is enabled for the CPU that executes the
+ * function. It is only enabled if such CPU is allowed to be monitored
+ * by the lockup detector.
+ *
+ * Returns:
+ *
+ * None
+ *
+ */
+static void hardlockup_detector_hpet_enable(void)
+{
+	struct cpumask *allowed = watchdog_get_allowed_cpumask();
+	unsigned int cpu = smp_processor_id();
+
+	if (!hld_data)
+		return;
+
+	if (!cpumask_test_cpu(cpu, allowed))
+		return;
+
+	spin_lock(&hld_data->lock);
+
+	cpumask_set_cpu(cpu, &hld_data->monitored_mask);
+
+	/*
+	 * If this is the first CPU to be monitored, set everything in motion:
+	 * move the interrupt to this CPU, kick and enable the timer.
+	 */
+	if (cpumask_weight(&hld_data->monitored_mask) == 1) {
+		if (irq_set_affinity(hld_data->irq, cpumask_of(cpu))) {
+			spin_unlock(&hld_data->lock);
+			pr_err("Unable to enable on CPU %d.!\n", cpu);
+			return;
+		}
+
+		kick_timer(hld_data);
+		enable(hld_data);
+	}
+
+	spin_unlock(&hld_data->lock);
+}
+
+/**
+ * hardlockup_detector_hpet_disable() - Disable the hardlockup detector
+ *
+ * The hardlockup detector is disabled for the CPU that executes the
+ * function.
+ *
+ * None
+ */
+static void hardlockup_detector_hpet_disable(void)
+{
+	if (!hld_data)
+		return;
+
+	spin_lock(&hld_data->lock);
+
+	cpumask_clear_cpu(smp_processor_id(), &hld_data->monitored_mask);
+
+	/* Only disable the timer if there are no more CPUs to monitor. */
+	if (!cpumask_weight(&hld_data->monitored_mask))
+		disable(hld_data);
+
+	spin_unlock(&hld_data->lock);
+}
+
+/**
+ * hardlockup_detector_hpet_stop() - Stop the NMI watchdog on all CPUs
+ *
+ * Returns:
+ *
+ * None
+ */
+static void hardlockup_detector_hpet_stop(void)
+{
+	disable(hld_data);
+
+	spin_lock(&hld_data->lock);
+	cpumask_clear(&hld_data->monitored_mask);
+	spin_unlock(&hld_data->lock);
+}
+
+/**
  * hardlockup_detector_hpet_init() - Initialize the hardlockup detector
  *
  * Only initialize and configure the detector if an HPET is available on the
@@ -383,5 +468,18 @@ static int __init hardlockup_detector_hpet_init(void)
 	 */
 	disable(hld_data);
 
+	spin_lock_init(&hld_data->lock);
+
+	spin_lock(&hld_data->lock);
+	cpumask_clear(&hld_data->monitored_mask);
+	spin_unlock(&hld_data->lock);
+
 	return 0;
 }
+
+struct nmi_watchdog_ops hardlockup_detector_hpet_ops = {
+	.init		= hardlockup_detector_hpet_init,
+	.enable		= hardlockup_detector_hpet_enable,
+	.disable	= hardlockup_detector_hpet_disable,
+	.stop		= hardlockup_detector_hpet_stop
+};
-- 
2.7.4

^ permalink raw reply related

* [RFC PATCH 16/23] watchdog/hardlockup: Add an HPET-based hardlockup detector
From: Ricardo Neri @ 2018-06-13  0:57 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H. Peter Anvin
  Cc: Andi Kleen, Ashok Raj, Borislav Petkov, Tony Luck,
	Ravi V. Shankar, x86, sparclinux, linuxppc-dev, linux-kernel,
	Ricardo Neri, Jacob Pan, Rafael J. Wysocki, Don Zickus,
	Nicholas Piggin, Michael Ellerman, Frederic Weisbecker,
	Alexei Starovoitov, Babu Moger, Mathieu Desnoyers,
	Masami Hiramatsu, Peter Zijlstra, Andrew Morton,
	Philippe Ombredanne, Colin Ian King, Byungchul Park,
	Paul E. McKenney, Luis R. Rodriguez, Waiman Long, Josh Poimboeuf,
	Randy Dunlap, Davidlohr Bueso, Christoffer Dall, Marc Zyngier,
	Kai-Heng Feng, Konrad Rzeszutek Wilk, David Rientjes, iommu
In-Reply-To: <1528851463-21140-1-git-send-email-ricardo.neri-calderon@linux.intel.com>

This is the initial implementation of a hardlockup detector driven by an
HPET timer. This initial implementation includes functions to control
the timer via its registers. It also requests such timer, installs
a minimal interrupt handler and performs the initial configuration of
the timer.

The detector is not functional at this stage. Subsequent changesets will
populate the NMI watchdog operations and register it with the lockup
detector.

This detector depends on HPET_TIMER since platform code performs the
initialization of the timer and maps its registers to memory. It depends
on HPET to compute the ticks per second of the timer.

Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Jacob Pan <jacob.jun.pan@intel.com>
Cc: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Babu Moger <babu.moger@oracle.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: "Luis R. Rodriguez" <mcgrof@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Christoffer Dall <cdall@linaro.org>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Kai-Heng Feng <kai.heng.feng@canonical.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Ravi V. Shankar" <ravi.v.shankar@intel.com>
Cc: x86@kernel.org
Cc: iommu@lists.linux-foundation.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
---
 kernel/Makefile            |   1 +
 kernel/watchdog_hld_hpet.c | 334 +++++++++++++++++++++++++++++++++++++++++++++
 lib/Kconfig.debug          |  10 ++
 3 files changed, 345 insertions(+)
 create mode 100644 kernel/watchdog_hld_hpet.c

diff --git a/kernel/Makefile b/kernel/Makefile
index 0a0d86d..73c79b2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -86,6 +86,7 @@ obj-$(CONFIG_KGDB) += debug/
 obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
 obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o watchdog_hld_perf.o
+obj-$(CONFIG_HARDLOCKUP_DETECTOR_HPET) += watchdog_hld.o watchdog_hld_hpet.o
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
diff --git a/kernel/watchdog_hld_hpet.c b/kernel/watchdog_hld_hpet.c
new file mode 100644
index 0000000..8fa4e55
--- /dev/null
+++ b/kernel/watchdog_hld_hpet.c
@@ -0,0 +1,334 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A hardlockup detector driven by an HPET timer.
+ *
+ * Copyright (C) Intel Corporation 2018
+ */
+
+#define pr_fmt(fmt) "NMI hpet watchdog: " fmt
+
+#include <linux/nmi.h>
+#include <linux/hpet.h>
+#include <asm/hpet.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "NMI hpet watchdog: " fmt
+
+static struct hpet_hld_data *hld_data;
+
+/**
+ * get_count() - Get the current count of the HPET timer
+ *
+ * Returns:
+ *
+ * Value of the main counter of the HPET timer
+ */
+static inline unsigned long get_count(void)
+{
+	return hpet_readq(HPET_COUNTER);
+}
+
+/**
+ * set_comparator() - Update the comparator in an HPET timer instance
+ * @hdata:	A data structure with the timer instance to update
+ * @cmp:	The value to write in the comparator register
+ *
+ * Returns:
+ *
+ * None
+ */
+static inline void set_comparator(struct hpet_hld_data *hdata,
+				  unsigned long cmp)
+{
+	hpet_writeq(cmp, HPET_Tn_CMP(hdata->num));
+}
+
+/**
+ * kick_timer() - Reprogram timer to expire in the future
+ * @hdata:	A data structure with the timer instance to update
+ *
+ * Reprogram the timer to expire within watchdog_thresh seconds in the future.
+ *
+ * Returns:
+ *
+ * None
+ */
+static void kick_timer(struct hpet_hld_data *hdata)
+{
+	unsigned long new_compare, count;
+
+	/*
+	 * Update the comparator in increments of watchdog_thresh seconds
+	 * relative to the current count. Since watchdog_thresh is given in
+	 * seconds, we are able to update the comparator before the counter
+	 * reaches such new value.
+	 *
+	 * Let it wrap around if needed.
+	 */
+	count = get_count();
+
+	new_compare = count + watchdog_thresh * hdata->ticks_per_second;
+
+	set_comparator(hdata, new_compare);
+}
+
+/**
+ * disable() - Disable an HPET timer instance
+ * @hdata:	A data structure with the timer instance to disable
+ *
+ * Returns:
+ *
+ * None
+ */
+static void disable(struct hpet_hld_data *hdata)
+{
+	unsigned int v;
+
+	v = hpet_readl(HPET_Tn_CFG(hdata->num));
+	v &= ~HPET_TN_ENABLE;
+	hpet_writel(v, HPET_Tn_CFG(hdata->num));
+}
+
+/**
+ * enable() - Enable an HPET timer instance
+ * @hdata:	A data structure with the timer instance to enable
+ *
+ * Returns:
+ *
+ * None
+ */
+static void enable(struct hpet_hld_data *hdata)
+{
+	unsigned long v;
+
+	/* Clear any previously active interrupt. */
+	hpet_writel(BIT(hdata->num), HPET_STATUS);
+
+	v = hpet_readl(HPET_Tn_CFG(hdata->num));
+	v |= HPET_TN_ENABLE;
+	hpet_writel(v, HPET_Tn_CFG(hdata->num));
+}
+
+/**
+ * set_periodic() - Set an HPET timer instance in periodic mode
+ * @hdata:	A data structure with the timer instance to configure
+ *
+ * If the timer supports periodic mode, configure it in such mode.
+ * Returns:
+ *
+ * None
+ */
+static void set_periodic(struct hpet_hld_data *hdata)
+{
+	unsigned long v;
+
+	v = hpet_readl(HPET_Tn_CFG(hdata->num));
+	if (v & HPET_TN_PERIODIC_CAP) {
+		v |= HPET_TN_PERIODIC;
+		hpet_writel(v, HPET_Tn_CFG(hdata->num));
+		hdata->flags |= HPET_DEV_PERI_CAP;
+	}
+}
+
+/**
+ * is_hpet_wdt_interrupt() - Determine if an HPET timer caused interrupt
+ * @hdata:	A data structure with the timer instance to check
+ *
+ * To be used when the timer is programmed in level-triggered mode, determine
+ * if an instance of an HPET timer indicates that it asserted an interrupt by
+ * checking the status register.
+ *
+ * Returns:
+ *
+ * True if a level-triggered timer asserted an interrupt. False otherwise.
+ */
+static bool is_hpet_wdt_interrupt(struct hpet_hld_data *hdata)
+{
+	unsigned long this_isr;
+	unsigned int lvl_trig;
+
+	this_isr = hpet_readl(HPET_STATUS) & BIT(hdata->num);
+
+	lvl_trig = hpet_readl(HPET_Tn_CFG(hdata->num)) & HPET_TN_LEVEL;
+
+	if (lvl_trig && this_isr)
+		return true;
+
+	return false;
+}
+
+/**
+ * hardlockup_detector_irq_handler() - Interrupt handler
+ * @irq:	Interrupt number
+ * @data:	Data associated with the interrupt
+ *
+ * A simple interrupt handler. Simply kick the timer and acknowledge the
+ * interrupt.
+ *
+ * Returns:
+ *
+ * IRQ_NONE if the HPET timer did not cause the interrupt. IRQ_HANDLED
+ * otherwise.
+ */
+static irqreturn_t hardlockup_detector_irq_handler(int irq, void *data)
+{
+	struct hpet_hld_data *hdata = data;
+	unsigned int use_fsb;
+
+	use_fsb = hdata->flags & HPET_DEV_FSB_CAP;
+
+	if (!use_fsb && !is_hpet_wdt_interrupt(hdata))
+		return IRQ_NONE;
+
+	if (!(hdata->flags & HPET_DEV_PERI_CAP))
+		kick_timer(hdata);
+
+	/* Acknowledge interrupt if in level-triggered mode */
+	if (!use_fsb)
+		hpet_writel(BIT(hdata->num), HPET_STATUS);
+
+	return IRQ_HANDLED;
+}
+
+/**
+ * setup_irq_msi_mode() - Configure the timer to deliver an MSI interrupt
+ * @hdata:	Data associated with the instance of the HPET timer to configure
+ *
+ * Configure an instance of the HPET timer to deliver interrupts via the Front-
+ * Side Bus.
+ *
+ * Returns:
+ *
+ * 0 on success. An error code if configuration was unsuccessful.
+ */
+static int setup_irq_msi_mode(struct hpet_hld_data *hdata)
+{
+	unsigned int v;
+
+	v = hpet_readl(HPET_Tn_CFG(hdata->num));
+
+	/*
+	 * If FSB interrupt delivery is used, configure as edge-triggered
+	 * interrupt. We are certain the interrupt comes from the HPET timer as
+	 * we receive the MSI message.
+	 *
+	 * Also, the FSB delivery mode and the FSB route are configured when the
+	 * interrupt is unmasked.
+	 */
+	v &= ~HPET_TN_LEVEL;
+
+	hpet_writel(v, HPET_Tn_CFG(hdata->num));
+
+	return 0;
+}
+
+/**
+ * setup_irq_legacy_mode() - Configure the timer to deliver a pin interrupt
+ * @hdata:	Data associated with the instance of the HPET timer to configure
+ *
+ * Configure an instance of the HPET timer to deliver interrupts via a pin of
+ * the IO APIC.
+ *
+ * Returns:
+ *
+ * 0 on success. An error code if configuration was unsuccessful.
+ */
+static int setup_irq_legacy_mode(struct hpet_hld_data *hdata)
+{
+	int hwirq = hdata->irq;
+	unsigned long v;
+
+	v = hpet_readl(HPET_Tn_CFG(hdata->num));
+
+	v |= hwirq << HPET_TN_ROUTE_SHIFT;
+	hpet_writel(v, HPET_Tn_CFG(hdata->num));
+
+	/*
+	 * If IO APIC interrupt delivery is used, configure as level-triggered.
+	 * In this way, the ISR register can be used to determine if this HPET
+	 * timer caused the interrupt at the IO APIC pin.
+	 */
+	v |= HPET_TN_LEVEL;
+
+	/* Disable Front-Side Bus delivery. */
+	v &= ~HPET_TN_FSB;
+
+	hpet_writel(v, HPET_Tn_CFG(hdata->num));
+
+	return 0;
+}
+
+/**
+ * setup_hpet_irq() - Configure the interrupt delivery of an HPET timer
+ * @hdata:	Data associated with the instance of the HPET timer to configure
+ *
+ * Configure the interrupt parameters of an HPET timer. If supported, configure
+ * interrupts to be delivered via the Front-Side Bus. Also, install an interrupt
+ * handler.
+ *
+ * Returns:
+ *
+ * 0 on success. An error code if configuration was unsuccessful.
+ */
+static int setup_hpet_irq(struct hpet_hld_data *hdata)
+{
+	int hwirq = hdata->irq, ret;
+
+	if (hdata->flags & HPET_DEV_FSB_CAP)
+		ret = setup_irq_msi_mode(hdata);
+	else
+		ret = setup_irq_legacy_mode(hdata);
+
+	if (ret)
+		return ret;
+
+	/*
+	 * Request an interrupt to activate the irq in all the needed domains.
+	 */
+	ret = request_irq(hwirq, hardlockup_detector_irq_handler,
+			  IRQF_TIMER, "hpet_hld", hdata);
+
+	return ret;
+}
+
+/**
+ * hardlockup_detector_hpet_init() - Initialize the hardlockup detector
+ *
+ * Only initialize and configure the detector if an HPET is available on the
+ * system.
+ *
+ * Returns:
+ *
+ * 0 on success. An error code if initialization was unsuccessful.
+ */
+static int __init hardlockup_detector_hpet_init(void)
+{
+	int ret;
+
+	if (!is_hpet_enabled())
+		return -ENODEV;
+
+	hld_data = hpet_hardlockup_detector_assign_timer();
+	if (!hld_data)
+		return -ENODEV;
+
+	/* Disable before configuring. */
+	disable(hld_data);
+
+	set_periodic(hld_data);
+
+	/* Set timer for the first time relative to the current count. */
+	kick_timer(hld_data);
+
+	ret = setup_hpet_irq(hld_data);
+	if (ret)
+		return -ENODEV;
+
+	/*
+	 * Timer might have been enabled when the interrupt was unmasked.
+	 * Enabling the timer should only be done via the .enable operation.
+	 */
+	disable(hld_data);
+
+	return 0;
+}
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index c40c7b7..6e79833 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -828,6 +828,16 @@ config HARDLOCKUP_DETECTOR_PERF
 	bool
 	select SOFTLOCKUP_DETECTOR
 
+config HARDLOCKUP_DETECTOR_HPET
+	bool "Use HPET Timer for Hard Lockup Detection"
+	select SOFTLOCKUP_DETECTOR
+	select HARDLOCKUP_DETECTOR
+	depends on HPET_TIMER && HPET
+	help
+	  Say y to enable a hardlockup detector that is driven by a High-Precision
+	  Event Timer. In addition to selecting this option, the nmi_watchdog
+	  command-line parameter must be set. See Documentation/admin-guide/kernel-parameters.rst
+
 #
 # Enables a timestamp based low pass filter to compensate for perf based
 # hard lockup detection which runs too fast due to turbo modes.
-- 
2.7.4

^ permalink raw reply related

* [RFC PATCH 15/23] kernel/watchdog: Add a function to obtain the watchdog_allowed_mask
From: Ricardo Neri @ 2018-06-13  0:57 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H. Peter Anvin
  Cc: Andi Kleen, Ashok Raj, Borislav Petkov, Tony Luck,
	Ravi V. Shankar, x86, sparclinux, linuxppc-dev, linux-kernel,
	Ricardo Neri, Jacob Pan, Rafael J. Wysocki, Don Zickus,
	Nicholas Piggin, Michael Ellerman, Frederic Weisbecker,
	Alexei Starovoitov, Babu Moger, Paul Mackerras, Mathieu Desnoyers,
	Masami Hiramatsu, Peter Zijlstra, Andrew Morton,
	Philippe Ombredanne, Colin Ian King, Byungchul Park,
	Paul E. McKenney, Luis R. Rodriguez, Waiman Long, Josh Poimboeuf,
	Randy Dunlap, Davidlohr Bueso, Christoffer Dall, Marc Zyngier,
	Kai-Heng Feng, Konrad Rzeszutek Wilk, David Rientjes,
	David S. Miller, Benjamin Herrenschmidt, iommu
In-Reply-To: <1528851463-21140-1-git-send-email-ricardo.neri-calderon@linux.intel.com>

Implementations of NMI watchdogs that use a single piece of hardware to
monitor all the CPUs in the system (as opposed to per-CPU implementations
such as perf) need to know which CPUs the watchdog is allowed to monitor.
In this manner, non-maskable interrupts are directed only to the monitored
CPUs.

Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Jacob Pan <jacob.jun.pan@intel.com>
Cc: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Babu Moger <babu.moger@oracle.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: "Luis R. Rodriguez" <mcgrof@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Christoffer Dall <cdall@linaro.org>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Kai-Heng Feng <kai.heng.feng@canonical.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Ravi V. Shankar" <ravi.v.shankar@intel.com>
Cc: x86@kernel.org
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: iommu@lists.linux-foundation.org
Cc: sparclinux@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
---
 include/linux/nmi.h | 1 +
 kernel/watchdog.c   | 7 ++++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index e61b441..e608762 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -77,6 +77,7 @@ static inline void reset_hung_task_detector(void) { }
 
 #if defined(CONFIG_HARDLOCKUP_DETECTOR)
 extern void hardlockup_detector_disable(void);
+extern struct cpumask *watchdog_get_allowed_cpumask(void);
 extern unsigned int hardlockup_panic;
 #else
 static inline void hardlockup_detector_disable(void) {}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 5057376..b94bbe3 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -50,7 +50,7 @@ int __read_mostly nmi_watchdog_available;
 
 static struct nmi_watchdog_ops *nmi_wd_ops;
 
-struct cpumask watchdog_allowed_mask __read_mostly;
+static struct cpumask watchdog_allowed_mask __read_mostly;
 
 struct cpumask watchdog_cpumask __read_mostly;
 unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
@@ -98,6 +98,11 @@ static int __init hardlockup_all_cpu_backtrace_setup(char *str)
 }
 __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
 # endif /* CONFIG_SMP */
+
+struct cpumask *watchdog_get_allowed_cpumask(void)
+{
+	return &watchdog_allowed_mask;
+}
 #endif /* CONFIG_HARDLOCKUP_DETECTOR */
 
 /*
-- 
2.7.4

^ permalink raw reply related

* [RFC PATCH 12/23] kernel/watchdog: Introduce a struct for NMI watchdog operations
From: Ricardo Neri @ 2018-06-13  0:57 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H. Peter Anvin
  Cc: Andi Kleen, Ashok Raj, Borislav Petkov, Tony Luck,
	Ravi V. Shankar, x86, sparclinux, linuxppc-dev, linux-kernel,
	Ricardo Neri, Jacob Pan, Don Zickus, Nicholas Piggin,
	Michael Ellerman, Frederic Weisbecker, Babu Moger,
	David S. Miller, Benjamin Herrenschmidt, Paul Mackerras,
	Mathieu Desnoyers, Masami Hiramatsu, Peter Zijlstra,
	Andrew Morton, Philippe Ombredanne, Colin Ian King,
	Luis R. Rodriguez, iommu
In-Reply-To: <1528851463-21140-1-git-send-email-ricardo.neri-calderon@linux.intel.com>

Instead of exposing individual functions for the operations of the NMI
watchdog, define a common interface that can be used across multiple
implementations.

The struct nmi_watchdog_ops is defined for such operations. These initial
definitions include the enable, disable, start, stop, and cleanup
operations.

Only a single NMI watchdog can be used in the system. The operations of
this NMI watchdog are accessed via the new variable nmi_wd_ops. This
variable is set to point to the operations of the first NMI watchdog that
initializes successfully. At this moment, the only available NMI watchdog
is the perf-based hardlockup detector, but more implementations can be
added in the future.

While introducing this new struct for the NMI watchdog operations, convert
the perf-based NMI watchdog to use these operations.

The functions hardlockup_detector_perf_restart() and
hardlockup_detector_perf_stop() are special. They are not regular watchdog
operations; they are used to work around hardware bugs. Thus, they are not
used for the start and stop operations. Furthermore, the perf-based NMI
watchdog does not need to implement such operations. They are intended to
globally start and stop the NMI watchdog; the perf-based NMI
watchdog is implemented on a per-CPU basis.

Currently, when perf-based hardlockup detector is not selected at build
time, a dummy hardlockup_detector_perf_init() is used. The return value
of this function depends on CONFIG_HAVE_NMI_WATCHDOG. This behavior is
preserved by defining the set of NMI watchdog operations
hardlockup_detector_noop. These dummy operations are used when no
hardlockup detector is selected or when the selected one fails to initialize.

Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Jacob Pan <jacob.jun.pan@intel.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Babu Moger <babu.moger@oracle.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: "Luis R. Rodriguez" <mcgrof@kernel.org>
Cc: "Ravi V. Shankar" <ravi.v.shankar@intel.com>
Cc: x86@kernel.org
Cc: sparclinux@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: iommu@lists.linux-foundation.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
---
 include/linux/nmi.h   | 39 +++++++++++++++++++++++++++----------
 kernel/watchdog.c     | 54 +++++++++++++++++++++++++++++++++++++++++++++------
 kernel/watchdog_hld.c | 16 +++++++++++----
 3 files changed, 89 insertions(+), 20 deletions(-)

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index b8d868d..d3f5d55f 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -92,24 +92,43 @@ static inline void hardlockup_detector_disable(void) {}
 extern void arch_touch_nmi_watchdog(void);
 extern void hardlockup_detector_perf_stop(void);
 extern void hardlockup_detector_perf_restart(void);
-extern void hardlockup_detector_perf_disable(void);
-extern void hardlockup_detector_perf_enable(void);
-extern void hardlockup_detector_perf_cleanup(void);
-extern int hardlockup_detector_perf_init(void);
 #else
 static inline void hardlockup_detector_perf_stop(void) { }
 static inline void hardlockup_detector_perf_restart(void) { }
-static inline void hardlockup_detector_perf_disable(void) { }
-static inline void hardlockup_detector_perf_enable(void) { }
-static inline void hardlockup_detector_perf_cleanup(void) { }
 # if !defined(CONFIG_HAVE_NMI_WATCHDOG)
-static inline int hardlockup_detector_perf_init(void) { return -ENODEV; }
 static inline void arch_touch_nmi_watchdog(void) {}
-# else
-static inline int hardlockup_detector_perf_init(void) { return 0; }
 # endif
 #endif
 
+/**
+ * struct nmi_watchdog_ops - Operations performed by NMI watchdogs
+ * @init:		Initialize and configure the hardware resources of the
+ *			NMI watchdog.
+ * @enable:		Enable (i.e., monitor for hardlockups) the NMI watchdog
+ *			in the CPU in which the function is executed.
+ * @disable:		Disable (i.e., do not monitor for hardlockups) the NMI
+ *			watchdog in the CPU in which the function is executed.
+ * @start:		Start the NMI watchdog in all CPUs. Used after the
+ *			parameters of the watchdog are updated. Optional if
+ *			such updates do not impact the NMI watchdog.
+ * @stop:		Stop the NMI watchdog in all CPUs. Used before the
+ *			parameters of the watchdog are updated. Optional if
+ *			such updates do not impact the NMI watchdog.
+ * @cleanup:		Cleanup unneeded data structures of the NMI watchdog.
+ *			Used after updating the parameters of the watchdog.
+ *			Optional if no cleanup is needed.
+ */
+struct nmi_watchdog_ops {
+	int	(*init)(void);
+	void	(*enable)(void);
+	void	(*disable)(void);
+	void	(*start)(void);
+	void	(*stop)(void);
+	void	(*cleanup)(void);
+};
+
+extern struct nmi_watchdog_ops hardlockup_detector_perf_ops;
+
 void watchdog_nmi_stop(void);
 void watchdog_nmi_start(void);
 int watchdog_nmi_probe(void);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 576d180..5057376 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -48,6 +48,8 @@ int __read_mostly soft_watchdog_user_enabled = 1;
 int __read_mostly watchdog_thresh = 10;
 int __read_mostly nmi_watchdog_available;
 
+static struct nmi_watchdog_ops *nmi_wd_ops;
+
 struct cpumask watchdog_allowed_mask __read_mostly;
 
 struct cpumask watchdog_cpumask __read_mostly;
@@ -99,6 +101,23 @@ __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
 #endif /* CONFIG_HARDLOCKUP_DETECTOR */
 
 /*
+ * Define a non-existent hard lockup detector. It will be used only if
+ * no actual hardlockup detector was selected at build time.
+ */
+static inline int noop_hardlockup_detector_init(void)
+{
+	/* If arch has an NMI watchdog, pretend to initialize it. */
+	if (IS_ENABLED(CONFIG_HAVE_NMI_WATCHDOG))
+		return 0;
+	else
+		return -ENODEV;
+}
+
+static struct nmi_watchdog_ops hardlockup_detector_noop = {
+	.init = noop_hardlockup_detector_init,
+};
+
+/*
  * These functions can be overridden if an architecture implements its
  * own hardlockup detector.
  *
@@ -108,19 +127,33 @@ __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
  */
 int __weak watchdog_nmi_enable(unsigned int cpu)
 {
-	hardlockup_detector_perf_enable();
+	if (nmi_wd_ops && nmi_wd_ops->enable)
+		nmi_wd_ops->enable();
+
 	return 0;
 }
 
 void __weak watchdog_nmi_disable(unsigned int cpu)
 {
-	hardlockup_detector_perf_disable();
+	if (nmi_wd_ops && nmi_wd_ops->disable)
+		nmi_wd_ops->disable();
 }
 
 /* Return 0, if a NMI watchdog is available. Error code otherwise */
 int __weak __init watchdog_nmi_probe(void)
 {
-	return hardlockup_detector_perf_init();
+	int ret = -ENODEV;
+
+	if (IS_ENABLED(CONFIG_HARDLOCKUP_DETECTOR_PERF))
+		ret = hardlockup_detector_perf_ops.init();
+
+	if (!ret) {
+		nmi_wd_ops = &hardlockup_detector_perf_ops;
+		return ret;
+	}
+
+	nmi_wd_ops = &hardlockup_detector_noop;
+	return nmi_wd_ops->init();
 }
 
 /**
@@ -131,7 +164,11 @@ int __weak __init watchdog_nmi_probe(void)
  * update_variables();
  * watchdog_nmi_start();
  */
-void __weak watchdog_nmi_stop(void) { }
+void __weak watchdog_nmi_stop(void)
+{
+	if (nmi_wd_ops && nmi_wd_ops->stop)
+		nmi_wd_ops->stop();
+}
 
 /**
  * watchdog_nmi_start - Start the watchdog after reconfiguration
@@ -144,7 +181,11 @@ void __weak watchdog_nmi_stop(void) { }
  * - watchdog_thresh
  * - watchdog_cpumask
  */
-void __weak watchdog_nmi_start(void) { }
+void __weak watchdog_nmi_start(void)
+{
+	if (nmi_wd_ops && nmi_wd_ops->start)
+		nmi_wd_ops->start();
+}
 
 /**
  * lockup_detector_update_enable - Update the sysctl enable bit
@@ -627,7 +668,8 @@ static inline void lockup_detector_setup(void)
 static void __lockup_detector_cleanup(void)
 {
 	lockdep_assert_held(&watchdog_mutex);
-	hardlockup_detector_perf_cleanup();
+	if (nmi_wd_ops && nmi_wd_ops->cleanup)
+		nmi_wd_ops->cleanup();
 }
 
 /**
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index e449a23..036cb0a 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -186,7 +186,7 @@ static int hardlockup_detector_event_create(void)
 /**
  * hardlockup_detector_perf_enable - Enable the local event
  */
-void hardlockup_detector_perf_enable(void)
+static void hardlockup_detector_perf_enable(void)
 {
 	if (hardlockup_detector_event_create())
 		return;
@@ -201,7 +201,7 @@ void hardlockup_detector_perf_enable(void)
 /**
  * hardlockup_detector_perf_disable - Disable the local event
  */
-void hardlockup_detector_perf_disable(void)
+static void hardlockup_detector_perf_disable(void)
 {
 	struct perf_event *event = this_cpu_read(watchdog_ev);
 
@@ -219,7 +219,7 @@ void hardlockup_detector_perf_disable(void)
  *
  * Called from lockup_detector_cleanup(). Serialized by the caller.
  */
-void hardlockup_detector_perf_cleanup(void)
+static void hardlockup_detector_perf_cleanup(void)
 {
 	int cpu;
 
@@ -281,7 +281,7 @@ void __init hardlockup_detector_perf_restart(void)
 /**
  * hardlockup_detector_perf_init - Probe whether NMI event is available at all
  */
-int __init hardlockup_detector_perf_init(void)
+static int __init hardlockup_detector_perf_init(void)
 {
 	int ret = hardlockup_detector_event_create();
 
@@ -291,5 +291,13 @@ int __init hardlockup_detector_perf_init(void)
 		perf_event_release_kernel(this_cpu_read(watchdog_ev));
 		this_cpu_write(watchdog_ev, NULL);
 	}
+
 	return ret;
 }
+
+struct nmi_watchdog_ops hardlockup_detector_perf_ops = {
+	.init		= hardlockup_detector_perf_init,
+	.enable		= hardlockup_detector_perf_enable,
+	.disable	= hardlockup_detector_perf_disable,
+	.cleanup	= hardlockup_detector_perf_cleanup,
+};
-- 
2.7.4

^ permalink raw reply related

* [RFC PATCH 14/23] watchdog/hardlockup: Decouple the hardlockup detector from perf
From: Ricardo Neri @ 2018-06-13  0:57 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H. Peter Anvin
  Cc: Andi Kleen, Ashok Raj, Borislav Petkov, Tony Luck,
	Ravi V. Shankar, x86, sparclinux, linuxppc-dev, linux-kernel,
	Ricardo Neri, Jacob Pan, Don Zickus, Nicholas Piggin,
	Michael Ellerman, Frederic Weisbecker, Babu Moger,
	David S. Miller, Benjamin Herrenschmidt, Paul Mackerras,
	Mathieu Desnoyers, Masami Hiramatsu, Peter Zijlstra,
	Andrew Morton, Philippe Ombredanne, Colin Ian King,
	Luis R. Rodriguez, iommu
In-Reply-To: <1528851463-21140-1-git-send-email-ricardo.neri-calderon@linux.intel.com>

The current default implementation of the hardlockup detector assumes that
it is implemented using perf events. However, the hardlockup detector can
be driven by other sources of non-maskable interrupts (e.g., a properly
configured timer).

Put in a separate file all the code that is specific to perf: create and
manage events, stop and start the detector. This perf-specific code is put
in the new file watchdog_hld_perf.c

The generic code used to monitor the timers' thresholds, check
timestamps and detect hardlockups remains in watchdog_hld.c.

Functions and variables are simply relocated to a new file. No functional
changes were made.

Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Jacob Pan <jacob.jun.pan@intel.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Babu Moger <babu.moger@oracle.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: "Luis R. Rodriguez" <mcgrof@kernel.org>
Cc: "Ravi V. Shankar" <ravi.v.shankar@intel.com>
Cc: x86@kernel.org
Cc: sparclinux@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: iommu@lists.linux-foundation.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
---
 kernel/Makefile            |   2 +-
 kernel/watchdog_hld.c      | 162 ----------------------------------------
 kernel/watchdog_hld_perf.c | 182 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 183 insertions(+), 163 deletions(-)
 create mode 100644 kernel/watchdog_hld_perf.c

diff --git a/kernel/Makefile b/kernel/Makefile
index f85ae5d..0a0d86d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -85,7 +85,7 @@ obj-$(CONFIG_FAIL_FUNCTION) += fail_function.o
 obj-$(CONFIG_KGDB) += debug/
 obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
-obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o
+obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o watchdog_hld_perf.o
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 28a00c3..96615a2 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -22,12 +22,8 @@
 
 static DEFINE_PER_CPU(bool, hard_watchdog_warn);
 static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
-static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
-static DEFINE_PER_CPU(struct perf_event *, dead_event);
-static struct cpumask dead_events_mask;
 
 static unsigned long hardlockup_allcpu_dumped;
-static atomic_t watchdog_cpus = ATOMIC_INIT(0);
 
 void arch_touch_nmi_watchdog(void)
 {
@@ -98,14 +94,6 @@ static inline bool watchdog_check_timestamp(void)
 }
 #endif
 
-static struct perf_event_attr wd_hw_attr = {
-	.type		= PERF_TYPE_HARDWARE,
-	.config		= PERF_COUNT_HW_CPU_CYCLES,
-	.size		= sizeof(struct perf_event_attr),
-	.pinned		= 1,
-	.disabled	= 1,
-};
-
 void inspect_for_hardlockups(struct pt_regs *regs)
 {
 	if (__this_cpu_read(watchdog_nmi_touch) == true) {
@@ -155,153 +143,3 @@ void inspect_for_hardlockups(struct pt_regs *regs)
 	__this_cpu_write(hard_watchdog_warn, false);
 	return;
 }
-
-/* Callback function for perf event subsystem */
-static void watchdog_overflow_callback(struct perf_event *event,
-				       struct perf_sample_data *data,
-				       struct pt_regs *regs)
-{
-	/* Ensure the watchdog never gets throttled */
-	event->hw.interrupts = 0;
-	inspect_for_hardlockups(regs);
-}
-
-static int hardlockup_detector_event_create(void)
-{
-	unsigned int cpu = smp_processor_id();
-	struct perf_event_attr *wd_attr;
-	struct perf_event *evt;
-
-	wd_attr = &wd_hw_attr;
-	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
-
-	/* Try to register using hardware perf events */
-	evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
-					       watchdog_overflow_callback, NULL);
-	if (IS_ERR(evt)) {
-		pr_info("Perf event create on CPU %d failed with %ld\n", cpu,
-			PTR_ERR(evt));
-		return PTR_ERR(evt);
-	}
-	this_cpu_write(watchdog_ev, evt);
-	return 0;
-}
-
-/**
- * hardlockup_detector_perf_enable - Enable the local event
- */
-static void hardlockup_detector_perf_enable(void)
-{
-	if (hardlockup_detector_event_create())
-		return;
-
-	/* use original value for check */
-	if (!atomic_fetch_inc(&watchdog_cpus))
-		pr_info("Enabled. Permanently consumes one hw-PMU counter.\n");
-
-	perf_event_enable(this_cpu_read(watchdog_ev));
-}
-
-/**
- * hardlockup_detector_perf_disable - Disable the local event
- */
-static void hardlockup_detector_perf_disable(void)
-{
-	struct perf_event *event = this_cpu_read(watchdog_ev);
-
-	if (event) {
-		perf_event_disable(event);
-		this_cpu_write(watchdog_ev, NULL);
-		this_cpu_write(dead_event, event);
-		cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
-		atomic_dec(&watchdog_cpus);
-	}
-}
-
-/**
- * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them
- *
- * Called from lockup_detector_cleanup(). Serialized by the caller.
- */
-static void hardlockup_detector_perf_cleanup(void)
-{
-	int cpu;
-
-	for_each_cpu(cpu, &dead_events_mask) {
-		struct perf_event *event = per_cpu(dead_event, cpu);
-
-		/*
-		 * Required because for_each_cpu() reports  unconditionally
-		 * CPU0 as set on UP kernels. Sigh.
-		 */
-		if (event)
-			perf_event_release_kernel(event);
-		per_cpu(dead_event, cpu) = NULL;
-	}
-	cpumask_clear(&dead_events_mask);
-}
-
-/**
- * hardlockup_detector_perf_stop - Globally stop watchdog events
- *
- * Special interface for x86 to handle the perf HT bug.
- */
-void __init hardlockup_detector_perf_stop(void)
-{
-	int cpu;
-
-	lockdep_assert_cpus_held();
-
-	for_each_online_cpu(cpu) {
-		struct perf_event *event = per_cpu(watchdog_ev, cpu);
-
-		if (event)
-			perf_event_disable(event);
-	}
-}
-
-/**
- * hardlockup_detector_perf_restart - Globally restart watchdog events
- *
- * Special interface for x86 to handle the perf HT bug.
- */
-void __init hardlockup_detector_perf_restart(void)
-{
-	int cpu;
-
-	lockdep_assert_cpus_held();
-
-	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
-		return;
-
-	for_each_online_cpu(cpu) {
-		struct perf_event *event = per_cpu(watchdog_ev, cpu);
-
-		if (event)
-			perf_event_enable(event);
-	}
-}
-
-/**
- * hardlockup_detector_perf_init - Probe whether NMI event is available at all
- */
-static int __init hardlockup_detector_perf_init(void)
-{
-	int ret = hardlockup_detector_event_create();
-
-	if (ret) {
-		pr_info("Perf NMI watchdog permanently disabled\n");
-	} else {
-		perf_event_release_kernel(this_cpu_read(watchdog_ev));
-		this_cpu_write(watchdog_ev, NULL);
-	}
-
-	return ret;
-}
-
-struct nmi_watchdog_ops hardlockup_detector_perf_ops = {
-	.init		= hardlockup_detector_perf_init,
-	.enable		= hardlockup_detector_perf_enable,
-	.disable	= hardlockup_detector_perf_disable,
-	.cleanup	= hardlockup_detector_perf_cleanup,
-};
diff --git a/kernel/watchdog_hld_perf.c b/kernel/watchdog_hld_perf.c
new file mode 100644
index 0000000..abc8edc
--- /dev/null
+++ b/kernel/watchdog_hld_perf.c
@@ -0,0 +1,182 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Detect hard lockups on a system
+ *
+ * started by Ricardo Neri, Copyright (C) 2018 Intel Corporation.
+ *
+ * Note: All of this code comes from the previous perf-specific hardlockup
+ * detector.
+ */
+
+#define pr_fmt(fmt) "NMI perf watchdog: " fmt
+
+#include <linux/nmi.h>
+#include <linux/atomic.h>
+#include <linux/module.h>
+#include <linux/sched/debug.h>
+#include <linux/perf_event.h>
+#include <asm/irq_regs.h>
+
+static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
+static DEFINE_PER_CPU(struct perf_event *, dead_event);
+static struct cpumask dead_events_mask;
+
+static atomic_t watchdog_cpus = ATOMIC_INIT(0);
+
+static struct perf_event_attr wd_hw_attr = {
+	.type		= PERF_TYPE_HARDWARE,
+	.config		= PERF_COUNT_HW_CPU_CYCLES,
+	.size		= sizeof(struct perf_event_attr),
+	.pinned		= 1,
+	.disabled	= 1,
+};
+
+/* Callback function for perf event subsystem */
+static void watchdog_overflow_callback(struct perf_event *event,
+				       struct perf_sample_data *data,
+				       struct pt_regs *regs)
+{
+	/* Ensure the watchdog never gets throttled */
+	event->hw.interrupts = 0;
+	inspect_for_hardlockups(regs);
+}
+
+static int hardlockup_detector_event_create(void)
+{
+	unsigned int cpu = smp_processor_id();
+	struct perf_event_attr *wd_attr;
+	struct perf_event *evt;
+
+	wd_attr = &wd_hw_attr;
+	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
+
+	/* Try to register using hardware perf events */
+	evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
+					       watchdog_overflow_callback, NULL);
+	if (IS_ERR(evt)) {
+		pr_info("Perf event create on CPU %d failed with %ld\n", cpu,
+			PTR_ERR(evt));
+		return PTR_ERR(evt);
+	}
+	this_cpu_write(watchdog_ev, evt);
+	return 0;
+}
+
+/**
+ * hardlockup_detector_perf_enable - Enable the local event
+ */
+static void hardlockup_detector_perf_enable(void)
+{
+	if (hardlockup_detector_event_create())
+		return;
+
+	/* use original value for check */
+	if (!atomic_fetch_inc(&watchdog_cpus))
+		pr_info("Enabled. Permanently consumes one hw-PMU counter.\n");
+
+	perf_event_enable(this_cpu_read(watchdog_ev));
+}
+
+/**
+ * hardlockup_detector_perf_disable - Disable the local event
+ */
+static void hardlockup_detector_perf_disable(void)
+{
+	struct perf_event *event = this_cpu_read(watchdog_ev);
+
+	if (event) {
+		perf_event_disable(event);
+		this_cpu_write(watchdog_ev, NULL);
+		this_cpu_write(dead_event, event);
+		cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
+		atomic_dec(&watchdog_cpus);
+	}
+}
+
+/**
+ * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them
+ *
+ * Called from lockup_detector_cleanup(). Serialized by the caller.
+ */
+static void hardlockup_detector_perf_cleanup(void)
+{
+	int cpu;
+
+	for_each_cpu(cpu, &dead_events_mask) {
+		struct perf_event *event = per_cpu(dead_event, cpu);
+
+		/*
+		 * Required because for_each_cpu() reports  unconditionally
+		 * CPU0 as set on UP kernels. Sigh.
+		 */
+		if (event)
+			perf_event_release_kernel(event);
+		per_cpu(dead_event, cpu) = NULL;
+	}
+	cpumask_clear(&dead_events_mask);
+}
+
+/**
+ * hardlockup_detector_perf_stop - Globally stop watchdog events
+ *
+ * Special interface for x86 to handle the perf HT bug.
+ */
+void __init hardlockup_detector_perf_stop(void)
+{
+	int cpu;
+
+	lockdep_assert_cpus_held();
+
+	for_each_online_cpu(cpu) {
+		struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+		if (event)
+			perf_event_disable(event);
+	}
+}
+
+/**
+ * hardlockup_detector_perf_restart - Globally restart watchdog events
+ *
+ * Special interface for x86 to handle the perf HT bug.
+ */
+void __init hardlockup_detector_perf_restart(void)
+{
+	int cpu;
+
+	lockdep_assert_cpus_held();
+
+	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+		return;
+
+	for_each_online_cpu(cpu) {
+		struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+		if (event)
+			perf_event_enable(event);
+	}
+}
+
+/**
+ * hardlockup_detector_perf_init - Probe whether NMI event is available at all
+ */
+static int __init hardlockup_detector_perf_init(void)
+{
+	int ret = hardlockup_detector_event_create();
+
+	if (ret) {
+		pr_info("Perf NMI watchdog permanently disabled\n");
+	} else {
+		perf_event_release_kernel(this_cpu_read(watchdog_ev));
+		this_cpu_write(watchdog_ev, NULL);
+	}
+
+	return ret;
+}
+
+struct nmi_watchdog_ops hardlockup_detector_perf_ops = {
+	.init		= hardlockup_detector_perf_init,
+	.enable		= hardlockup_detector_perf_enable,
+	.disable	= hardlockup_detector_perf_disable,
+	.cleanup	= hardlockup_detector_perf_cleanup,
+};
-- 
2.7.4

^ permalink raw reply related

* [RFC PATCH 13/23] watchdog/hardlockup: Define a generic function to detect hardlockups
From: Ricardo Neri @ 2018-06-13  0:57 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H. Peter Anvin
  Cc: Andi Kleen, Ashok Raj, Borislav Petkov, Tony Luck,
	Ravi V. Shankar, x86, sparclinux, linuxppc-dev, linux-kernel,
	Ricardo Neri, Jacob Pan, Don Zickus, Nicholas Piggin,
	Michael Ellerman, Frederic Weisbecker, Babu Moger,
	David S. Miller, Benjamin Herrenschmidt, Paul Mackerras,
	Mathieu Desnoyers, Masami Hiramatsu, Peter Zijlstra,
	Andrew Morton, Philippe Ombredanne, Colin Ian King,
	Luis R. Rodriguez, iommu
In-Reply-To: <1528851463-21140-1-git-send-email-ricardo.neri-calderon@linux.intel.com>

The procedure to detect hardlockups is independent of the underlying
mechanism that generated the non-maskable interrupt used to drive the
detector. Thus, it can be put in a separate, generic function. In this
manner, it can be invoked by various implementations of the NMI watchdog.

For this purpose, move the bulk of watchdog_overflow_callback() to the
new function inspect_for_hardlockups(). This function can then be called
from the applicable NMI handlers.

Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Jacob Pan <jacob.jun.pan@intel.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Babu Moger <babu.moger@oracle.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: "Luis R. Rodriguez" <mcgrof@kernel.org>
Cc: "Ravi V. Shankar" <ravi.v.shankar@intel.com>
Cc: x86@kernel.org
Cc: sparclinux@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: iommu@lists.linux-foundation.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
---
 include/linux/nmi.h   |  1 +
 kernel/watchdog_hld.c | 18 +++++++++++-------
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index d3f5d55f..e61b441 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -223,6 +223,7 @@ extern int proc_watchdog_thresh(struct ctl_table *, int ,
 				void __user *, size_t *, loff_t *);
 extern int proc_watchdog_cpumask(struct ctl_table *, int,
 				 void __user *, size_t *, loff_t *);
+void inspect_for_hardlockups(struct pt_regs *regs);
 
 #ifdef CONFIG_HAVE_ACPI_APEI_NMI
 #include <asm/nmi.h>
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 036cb0a..28a00c3 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -106,14 +106,8 @@ static struct perf_event_attr wd_hw_attr = {
 	.disabled	= 1,
 };
 
-/* Callback function for perf event subsystem */
-static void watchdog_overflow_callback(struct perf_event *event,
-				       struct perf_sample_data *data,
-				       struct pt_regs *regs)
+void inspect_for_hardlockups(struct pt_regs *regs)
 {
-	/* Ensure the watchdog never gets throttled */
-	event->hw.interrupts = 0;
-
 	if (__this_cpu_read(watchdog_nmi_touch) == true) {
 		__this_cpu_write(watchdog_nmi_touch, false);
 		return;
@@ -162,6 +156,16 @@ static void watchdog_overflow_callback(struct perf_event *event,
 	return;
 }
 
+/* Callback function for perf event subsystem */
+static void watchdog_overflow_callback(struct perf_event *event,
+				       struct perf_sample_data *data,
+				       struct pt_regs *regs)
+{
+	/* Ensure the watchdog never gets throttled */
+	event->hw.interrupts = 0;
+	inspect_for_hardlockups(regs);
+}
+
 static int hardlockup_detector_event_create(void)
 {
 	unsigned int cpu = smp_processor_id();
-- 
2.7.4

^ permalink raw reply related

* [RFC PATCH 10/23] x86/hpet: Relocate flag definitions to a header file
From: Ricardo Neri @ 2018-06-13  0:57 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H. Peter Anvin
  Cc: Andi Kleen, Ashok Raj, Borislav Petkov, Tony Luck,
	Ravi V. Shankar, x86, sparclinux, linuxppc-dev, linux-kernel,
	Ricardo Neri, Jacob Pan, Clemens Ladisch, Arnd Bergmann,
	Philippe Ombredanne, Kate Stewart, Rafael J. Wysocki, iommu
In-Reply-To: <1528851463-21140-1-git-send-email-ricardo.neri-calderon@linux.intel.com>

Users of HPET timers (such as the hardlockup detector) need the definitions
of these flags to interpret the configuration of a timer as passed by
platform code.

Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Jacob Pan <jacob.jun.pan@intel.com>
Cc: Clemens Ladisch <clemens@ladisch.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Kate Stewart <kstewart@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Cc: "Ravi V. Shankar" <ravi.v.shankar@intel.com>
Cc: x86@kernel.org
Cc: iommu@lists.linux-foundation.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
---
 arch/x86/include/asm/hpet.h | 6 ++++++
 arch/x86/kernel/hpet.c      | 6 ------
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 3266796..9fd112a 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -64,6 +64,12 @@
 /* Timer used for the hardlockup detector */
 #define HPET_WD_TIMER_NR 2
 
+#define HPET_DEV_USED_BIT		2
+#define HPET_DEV_USED			(1 << HPET_DEV_USED_BIT)
+#define HPET_DEV_VALID			0x8
+#define HPET_DEV_FSB_CAP		0x1000
+#define HPET_DEV_PERI_CAP		0x2000
+
 /* hpet memory map physical address */
 extern unsigned long hpet_address;
 extern unsigned long force_hpet_address;
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index b03faee..99d4972 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -24,12 +24,6 @@
    NSEC = 10^-9 */
 #define FSEC_PER_NSEC			1000000L
 
-#define HPET_DEV_USED_BIT		2
-#define HPET_DEV_USED			(1 << HPET_DEV_USED_BIT)
-#define HPET_DEV_VALID			0x8
-#define HPET_DEV_FSB_CAP		0x1000
-#define HPET_DEV_PERI_CAP		0x2000
-
 #define HPET_MIN_CYCLES			128
 #define HPET_MIN_PROG_DELTA		(HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1))
 
-- 
2.7.4

^ permalink raw reply related

* [RFC PATCH 11/23] x86/hpet: Configure the timer used by the hardlockup detector
From: Ricardo Neri @ 2018-06-13  0:57 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H. Peter Anvin
  Cc: Andi Kleen, Ashok Raj, Borislav Petkov, Tony Luck,
	Ravi V. Shankar, x86, sparclinux, linuxppc-dev, linux-kernel,
	Ricardo Neri, Jacob Pan, Clemens Ladisch, Arnd Bergmann,
	Philippe Ombredanne, Kate Stewart, Rafael J. Wysocki, iommu
In-Reply-To: <1528851463-21140-1-git-send-email-ricardo.neri-calderon@linux.intel.com>

Implement the initial configuration of the timer to be used by the
hardlockup detector. The main focus of this configuration is to provide an
interrupt for the timer.

Two types of interrupt can be assigned to the timer. First, attempt to
assign a message-signaled interrupt. This requires creating the HPET MSI
domain, but only if it was not already created for the HPET timers used as
event timers. The data structures needed to allocate the MSI interrupt in
the domain are also created.

If message-signaled interrupts cannot be used, assign a legacy IO APIC
interrupt via the ACPI Global System Interrupts.

The resulting interrupt configuration, along with the timer instance, and
frequency are then made available to the hardlockup detector in a struct
via the new function hpet_hardlockup_detector_assign_timer().

Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Jacob Pan <jacob.jun.pan@intel.com>
Cc: Clemens Ladisch <clemens@ladisch.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Kate Stewart <kstewart@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Cc: "Ravi V. Shankar" <ravi.v.shankar@intel.com>
Cc: x86@kernel.org
Cc: iommu@lists.linux-foundation.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
---
 arch/x86/include/asm/hpet.h |  16 +++++++
 arch/x86/kernel/hpet.c      | 112 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 127 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 9fd112a..33309b7 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -118,6 +118,22 @@ extern void hpet_unregister_irq_handler(rtc_irq_handler handler);
 
 #endif /* CONFIG_HPET_EMULATE_RTC */
 
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_HPET
+struct hpet_hld_data {
+	u32		num;
+	u32		irq;
+	u32		flags;
+	u64		ticks_per_second;
+};
+
+extern struct hpet_hld_data *hpet_hardlockup_detector_assign_timer(void);
+#else
+static inline struct hpet_hld_data *hpet_hardlockup_detector_assign_timer(void)
+{
+	return NULL;
+}
+#endif /* CONFIG_HARDLOCKUP_DETECTOR_HPET */
+
 #else /* CONFIG_HPET_TIMER */
 
 static inline int hpet_enable(void) { return 0; }
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 99d4972..fda6e19 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -5,6 +5,7 @@
 #include <linux/delay.h>
 #include <linux/errno.h>
 #include <linux/i8253.h>
+#include <linux/acpi.h>
 #include <linux/slab.h>
 #include <linux/hpet.h>
 #include <linux/init.h>
@@ -36,6 +37,7 @@ bool					hpet_msi_disable;
 
 #ifdef CONFIG_PCI_MSI
 static unsigned int			hpet_num_timers;
+static struct irq_domain		*hpet_domain;
 #endif
 static void __iomem			*hpet_virt_address;
 
@@ -177,6 +179,115 @@ do {								\
 		_hpet_print_config(__func__, __LINE__);	\
 } while (0)
 
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_HPET
+static
+int hpet_hardlockup_detector_assign_legacy_irq(struct hpet_hld_data *hdata)
+{
+	unsigned long v;
+	int gsi, hwirq;
+
+	/* Obtain interrupt pins that can be used by this timer. */
+	v = hpet_readq(HPET_Tn_CFG(HPET_WD_TIMER_NR));
+	v = (v & Tn_INT_ROUTE_CAP_MASK) >> Tn_INT_ROUTE_CAP_SHIFT;
+
+	/*
+	 * In PIC mode, skip IRQ0-4, IRQ6-9, IRQ12-15 which are always used by
+	 * legacy devices. In IO APIC mode, we skip all the legacy IRQs.
+	 */
+	if (acpi_irq_model == ACPI_IRQ_MODEL_PIC)
+		v &= ~0xf3df;
+	else
+		v &= ~0xffff;
+
+	for_each_set_bit(hwirq, &v, HPET_MAX_IRQ) {
+		if (hwirq >= NR_IRQS) {
+			hwirq = HPET_MAX_IRQ;
+			break;
+		}
+
+		gsi = acpi_register_gsi(NULL, hwirq, ACPI_LEVEL_SENSITIVE,
+					ACPI_ACTIVE_LOW);
+		if (gsi > 0)
+			break;
+	}
+
+	if (hwirq >= HPET_MAX_IRQ)
+		return -ENODEV;
+
+	hdata->irq = hwirq;
+	return 0;
+}
+
+static int hpet_hardlockup_detector_assign_msi_irq(struct hpet_hld_data *hdata)
+{
+	struct hpet_dev *hdev;
+	int hwirq;
+
+	if (hpet_msi_disable)
+		return -ENODEV;
+
+	hdev = kzalloc(sizeof(*hdev), GFP_KERNEL);
+	if (!hdev)
+		return -ENOMEM;
+
+	hdev->flags |= HPET_DEV_FSB_CAP;
+	hdev->num = hdata->num;
+	sprintf(hdev->name, "hpet_hld");
+
+	/* Domain may exist if CPU does not have Always-Running APIC Timers. */
+	if (!hpet_domain) {
+		hpet_domain = hpet_create_irq_domain(hpet_blockid);
+		if (!hpet_domain)
+			return -EPERM;
+	}
+
+	hwirq = hpet_assign_irq(hpet_domain, hdev, hdev->num);
+	if (hwirq <= 0) {
+		kfree(hdev);
+		return -ENODEV;
+	}
+
+	hdata->irq = hwirq;
+	hdata->flags |= HPET_DEV_FSB_CAP;
+
+	hdev->irq = hwirq;
+
+	return 0;
+}
+
+struct hpet_hld_data *hpet_hardlockup_detector_assign_timer(void)
+{
+	struct hpet_hld_data *hdata;
+	int ret = -ENODEV;
+	unsigned int cfg;
+
+	hdata = kzalloc(sizeof(*hdata), GFP_KERNEL);
+	if (!hdata)
+		return NULL;
+
+	hdata->num = HPET_WD_TIMER_NR;
+
+	cfg = hpet_readl(HPET_Tn_CFG(HPET_WD_TIMER_NR));
+
+	hdata->ticks_per_second = hpet_get_ticks_per_sec(hpet_readq(HPET_ID));
+
+	/* Try first an MSI interrupt or fallback to IO APIC. */
+	if (cfg & HPET_TN_FSB_CAP)
+		ret = hpet_hardlockup_detector_assign_msi_irq(hdata);
+
+	if (!ret)
+		return hdata;
+
+	ret = hpet_hardlockup_detector_assign_legacy_irq(hdata);
+	if (ret) {
+		kfree(hdata);
+		return NULL;
+	}
+
+	return hdata;
+}
+#endif /* CONFIG_HARDLOCKUP_DETECTOR_HPET */
+
 /*
  * When the hpet driver (/dev/hpet) is enabled, we need to reserve
  * timer 0 and timer 1 in case of RTC emulation. Timer 2 is reserved in case
@@ -450,7 +561,6 @@ static struct clock_event_device hpet_clockevent = {
 
 static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
 static struct hpet_dev	*hpet_devs;
-static struct irq_domain *hpet_domain;
 
 void hpet_msi_unmask(struct irq_data *data)
 {
-- 
2.7.4

^ permalink raw reply related

* [RFC PATCH 08/23] x86/hpet: Calculate ticks-per-second in a separate function
From: Ricardo Neri @ 2018-06-13  0:57 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H. Peter Anvin
  Cc: Andi Kleen, Ashok Raj, Borislav Petkov, Tony Luck,
	Ravi V. Shankar, x86, sparclinux, linuxppc-dev, linux-kernel,
	Ricardo Neri, Jacob Pan, Clemens Ladisch, Arnd Bergmann,
	Philippe Ombredanne, Kate Stewart, Rafael J. Wysocki, iommu
In-Reply-To: <1528851463-21140-1-git-send-email-ricardo.neri-calderon@linux.intel.com>

It is easier to compute the expiration times of an HPET timer by using
its frequency (i.e., the number of times it ticks in a second) than its
period, as given in the capabilities register.

In addition to the HPET char driver, the HPET-based hardlockup detector
will also need to know the timer's frequency. Thus, create a common
function that both can use.

Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Jacob Pan <jacob.jun.pan@intel.com>
Cc: Clemens Ladisch <clemens@ladisch.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Kate Stewart <kstewart@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Cc: "Ravi V. Shankar" <ravi.v.shankar@intel.com>
Cc: x86@kernel.org
Cc: iommu@lists.linux-foundation.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
---
 drivers/char/hpet.c  | 31 +++++++++++++++++++++++++------
 include/linux/hpet.h |  1 +
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c
index be426eb..1c9584a 100644
--- a/drivers/char/hpet.c
+++ b/drivers/char/hpet.c
@@ -838,6 +838,29 @@ static unsigned long hpet_calibrate(struct hpets *hpetp)
 	return ret;
 }
 
+u64 hpet_get_ticks_per_sec(u64 hpet_caps)
+{
+	u64 ticks_per_sec, period;
+
+	period = (hpet_caps & HPET_COUNTER_CLK_PERIOD_MASK) >>
+		 HPET_COUNTER_CLK_PERIOD_SHIFT; /* fs, 10^-15 */
+
+	/*
+	 * The frequency is the reciprocal of the period. The period is given in
+	 * femtoseconds per tick. Thus, prepare a dividend to obtain the
+	 * frequency in ticks per second.
+	 */
+
+	/* 10^15 femtoseconds per second */
+	ticks_per_sec = 1000000000000000uLL;
+	ticks_per_sec += period >> 1; /* round */
+
+	/* The quotient is put in the dividend. We drop the remainder. */
+	do_div(ticks_per_sec, period);
+
+	return ticks_per_sec;
+}
+
 int hpet_alloc(struct hpet_data *hdp)
 {
 	u64 cap, mcfg;
@@ -847,7 +870,6 @@ int hpet_alloc(struct hpet_data *hdp)
 	size_t siz;
 	struct hpet __iomem *hpet;
 	static struct hpets *last;
-	unsigned long period;
 	unsigned long long temp;
 	u32 remainder;
 
@@ -883,6 +905,8 @@ int hpet_alloc(struct hpet_data *hdp)
 
 	cap = readq(&hpet->hpet_cap);
 
+	temp = hpet_get_ticks_per_sec(cap);
+
 	ntimer = ((cap & HPET_NUM_TIM_CAP_MASK) >> HPET_NUM_TIM_CAP_SHIFT) + 1;
 
 	if (hpetp->hp_ntimer != ntimer) {
@@ -899,11 +923,6 @@ int hpet_alloc(struct hpet_data *hdp)
 
 	last = hpetp;
 
-	period = (cap & HPET_COUNTER_CLK_PERIOD_MASK) >>
-		HPET_COUNTER_CLK_PERIOD_SHIFT; /* fs, 10^-15 */
-	temp = 1000000000000000uLL; /* 10^15 femtoseconds per second */
-	temp += period >> 1; /* round */
-	do_div(temp, period);
 	hpetp->hp_tick_freq = temp; /* ticks per second */
 
 	printk(KERN_INFO "hpet%d: at MMIO 0x%lx, IRQ%s",
diff --git a/include/linux/hpet.h b/include/linux/hpet.h
index 8604564..e7b36bcf4 100644
--- a/include/linux/hpet.h
+++ b/include/linux/hpet.h
@@ -107,5 +107,6 @@ static inline void hpet_reserve_timer(struct hpet_data *hd, int timer)
 }
 
 int hpet_alloc(struct hpet_data *);
+u64 hpet_get_ticks_per_sec(u64 hpet_caps);
 
 #endif				/* !__HPET__ */
-- 
2.7.4

^ permalink raw reply related

* [RFC PATCH 09/23] x86/hpet: Reserve timer for the HPET hardlockup detector
From: Ricardo Neri @ 2018-06-13  0:57 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H. Peter Anvin
  Cc: Andi Kleen, Ashok Raj, Borislav Petkov, Tony Luck,
	Ravi V. Shankar, x86, sparclinux, linuxppc-dev, linux-kernel,
	Ricardo Neri, Jacob Pan, Clemens Ladisch, Arnd Bergmann,
	Philippe Ombredanne, Kate Stewart, Rafael J. Wysocki, iommu
In-Reply-To: <1528851463-21140-1-git-send-email-ricardo.neri-calderon@linux.intel.com>

HPET timer 2 will be used to drive the HPET-based hardlockup detector.
Reserve this timer to ensure it cannot be used by user space programs or
clock events.

When looking for MSI-capable timers for clock events, skip timer 2 if
the HPET hardlockup detector is selected.

Also, do not assign an IO APIC pin to timer 2 of the HPET. A subsequent
changeset will handle the interrupt setup of the timer used for the
hardlockup detector.

Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Jacob Pan <jacob.jun.pan@intel.com>
Cc: Clemens Ladisch <clemens@ladisch.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Kate Stewart <kstewart@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Cc: "Ravi V. Shankar" <ravi.v.shankar@intel.com>
Cc: x86@kernel.org
Cc: iommu@lists.linux-foundation.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
---
 arch/x86/include/asm/hpet.h |  3 +++
 arch/x86/kernel/hpet.c      | 19 ++++++++++++++++---
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 9e0afde..3266796 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -61,6 +61,9 @@
  */
 #define HPET_MIN_PERIOD		100000UL
 
+/* Timer used for the hardlockup detector */
+#define HPET_WD_TIMER_NR 2
+
 /* hpet memory map physical address */
 extern unsigned long hpet_address;
 extern unsigned long force_hpet_address;
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 3fa1d3f..b03faee 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -185,7 +185,8 @@ do {								\
 
 /*
  * When the hpet driver (/dev/hpet) is enabled, we need to reserve
- * timer 0 and timer 1 in case of RTC emulation.
+ * timer 0 and timer 1 in case of RTC emulation. Timer 2 is reserved in case
+ * the HPET-based hardlockup detector is used.
  */
 #ifdef CONFIG_HPET
 
@@ -195,7 +196,7 @@ static void hpet_reserve_platform_timers(unsigned int id)
 {
 	struct hpet __iomem *hpet = hpet_virt_address;
 	struct hpet_timer __iomem *timer = &hpet->hpet_timers[2];
-	unsigned int nrtimers, i;
+	unsigned int nrtimers, i, start_timer;
 	struct hpet_data hd;
 
 	nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
@@ -210,6 +211,13 @@ static void hpet_reserve_platform_timers(unsigned int id)
 	hpet_reserve_timer(&hd, 1);
 #endif
 
+	if (IS_ENABLED(CONFIG_HARDLOCKUP_DETECTOR_HPET)) {
+		hpet_reserve_timer(&hd, HPET_WD_TIMER_NR);
+		start_timer = HPET_WD_TIMER_NR + 1;
+	} else {
+		start_timer = HPET_WD_TIMER_NR;
+	}
+
 	/*
 	 * NOTE that hd_irq[] reflects IOAPIC input pins (LEGACY_8254
 	 * is wrong for i8259!) not the output IRQ.  Many BIOS writers
@@ -218,7 +226,7 @@ static void hpet_reserve_platform_timers(unsigned int id)
 	hd.hd_irq[0] = HPET_LEGACY_8254;
 	hd.hd_irq[1] = HPET_LEGACY_RTC;
 
-	for (i = 2; i < nrtimers; timer++, i++) {
+	for (i = start_timer; i < nrtimers; timer++, i++) {
 		hd.hd_irq[i] = (readl(&timer->hpet_config) &
 			Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT;
 	}
@@ -630,6 +638,11 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
 		struct hpet_dev *hdev = &hpet_devs[num_timers_used];
 		unsigned int cfg = hpet_readl(HPET_Tn_CFG(i));
 
+		/* Do not use timer reserved for the HPET watchdog. */
+		if (IS_ENABLED(CONFIG_HARDLOCKUP_DETECTOR_HPET) &&
+		    i == HPET_WD_TIMER_NR)
+			continue;
+
 		/* Only consider HPET timer with MSI support */
 		if (!(cfg & HPET_TN_FSB_CAP))
 			continue;
-- 
2.7.4

^ permalink raw reply related

* [RFC PATCH 07/23] x86/hpet: Expose more functions to read and write registers
From: Ricardo Neri @ 2018-06-13  0:57 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H. Peter Anvin
  Cc: Andi Kleen, Ashok Raj, Borislav Petkov, Tony Luck,
	Ravi V. Shankar, x86, sparclinux, linuxppc-dev, linux-kernel,
	Ricardo Neri, Jacob Pan, Philippe Ombredanne, Kate Stewart,
	Rafael J. Wysocki, iommu
In-Reply-To: <1528851463-21140-1-git-send-email-ricardo.neri-calderon@linux.intel.com>

Some of the registers in the HPET hardware have a width of 64 bits. 64-bit
access functions are needed mostly to read the counter and write the
comparator in a single read or write. Also, 64-bit accesses can be used to
read parameters located in the higher bits of some registers (such as
the timer period and the IO APIC pins that can be asserted by the timer)
without the need of masking and shifting the register values.

64-bit read and write functions are added. These functions, along with the
existing hpet_writel(), are exposed via the HPET header to be used by other
kernel subsystems.

Thus far, the only consumer of these functions will be the HPET-based
hardlockup detector, which will only be available in 64-bit builds. Thus,
the 64-bit access functions are wrapped in CONFIG_X86_64.

Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Jacob Pan <jacob.jun.pan@intel.com>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Kate Stewart <kstewart@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Cc: "Ravi V. Shankar" <ravi.v.shankar@intel.com>
Cc: x86@kernel.org
Cc: iommu@lists.linux-foundation.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
---
 arch/x86/include/asm/hpet.h | 10 ++++++++++
 arch/x86/kernel/hpet.c      | 12 +++++++++++-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 67385d5..9e0afde 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -72,6 +72,11 @@ extern int is_hpet_enabled(void);
 extern int hpet_enable(void);
 extern void hpet_disable(void);
 extern unsigned int hpet_readl(unsigned int a);
+extern void hpet_writel(unsigned int d, unsigned int a);
+#ifdef CONFIG_X86_64
+extern unsigned long hpet_readq(unsigned int a);
+extern void hpet_writeq(unsigned long d, unsigned int a);
+#endif
 extern void force_hpet_resume(void);
 
 struct irq_data;
@@ -109,6 +114,11 @@ extern void hpet_unregister_irq_handler(rtc_irq_handler handler);
 static inline int hpet_enable(void) { return 0; }
 static inline int is_hpet_enabled(void) { return 0; }
 #define hpet_readl(a) 0
+#define hpet_writel(d, a)
+#ifdef CONFIG_X86_64
+#define hpet_readq(a) 0
+#define hpet_writeq(d, a)
+#endif
 #define default_setup_hpet_msi	NULL
 
 #endif
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 8ce4212..3fa1d3f 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -64,12 +64,22 @@ inline unsigned int hpet_readl(unsigned int a)
 	return readl(hpet_virt_address + a);
 }
 
-static inline void hpet_writel(unsigned int d, unsigned int a)
+inline void hpet_writel(unsigned int d, unsigned int a)
 {
 	writel(d, hpet_virt_address + a);
 }
 
 #ifdef CONFIG_X86_64
+inline unsigned long hpet_readq(unsigned int a)
+{
+	return readq(hpet_virt_address + a);
+}
+
+inline void hpet_writeq(unsigned long d, unsigned int a)
+{
+	writeq(d, hpet_virt_address + a);
+}
+
 #include <asm/pgtable.h>
 #endif
 
-- 
2.7.4

^ permalink raw reply related

* [RFC PATCH 06/23] x86/ioapic: Add support for IRQCHIP_CAN_DELIVER_AS_NMI with interrupt remapping
From: Ricardo Neri @ 2018-06-13  0:57 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H. Peter Anvin
  Cc: Andi Kleen, Ashok Raj, Borislav Petkov, Tony Luck,
	Ravi V. Shankar, x86, sparclinux, linuxppc-dev, linux-kernel,
	Ricardo Neri, Jacob Pan, Juergen Gross, Baoquan He,
	Eric W. Biederman, Dou Liyang, Jan Kiszka, iommu
In-Reply-To: <1528851463-21140-1-git-send-email-ricardo.neri-calderon@linux.intel.com>

Even though there is a delivery mode field at the entries of an IO APIC's
redirection table, the documentation of the majority of the IO APICs
explicitly states that interrupt delivery as non-maskable is not supported.

However, when using an IO APIC in combination with the Intel VT-d interrupt
remapping functionality, the delivery of the interrupt to the CPU is
handled by the remapping hardware. In such a case, the interrupt can be
delivered as non-maskable.

Thus, add the IRQCHIP_CAN_DELIVER_AS_NMI flag only when used in combination
with interrupt remapping.

Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Jacob Pan <jacob.jun.pan@intel.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Dou Liyang <douly.fnst@cn.fujitsu.com>
Cc: Jan Kiszka <jan.kiszka@siemens.com>
Cc: "Ravi V. Shankar" <ravi.v.shankar@intel.com>
Cc: x86@kernel.org
Cc: iommu@lists.linux-foundation.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
---
 arch/x86/kernel/apic/io_apic.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 10a20f8..39de91b 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1911,7 +1911,8 @@ static struct irq_chip ioapic_ir_chip __read_mostly = {
 	.irq_eoi		= ioapic_ir_ack_level,
 	.irq_set_affinity	= ioapic_set_affinity,
 	.irq_retrigger		= irq_chip_retrigger_hierarchy,
-	.flags			= IRQCHIP_SKIP_SET_WAKE,
+	.flags			= IRQCHIP_SKIP_SET_WAKE |
+				  IRQCHIP_CAN_DELIVER_AS_NMI,
 };
 
 static inline void init_IO_APIC_traps(void)
-- 
2.7.4

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox