LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 3/3] mm: introduce ARCH_HAS_PTE_DEVMAP
From: Robin Murphy @ 2019-04-29 17:22 UTC (permalink / raw)
  To: linux-mm
  Cc: Ira Weiny, Anshuman Khandual, x86, linux-kernel,
	Oliver O'Halloran, akpm, linuxppc-dev, Dan Williams
In-Reply-To: <cover.1556555457.git.robin.murphy@arm.com>

ARCH_HAS_ZONE_DEVICE is somewhat meaningless in itself, and combined
with the long-out-of-date comment can lead to the impression than an
architecture may just enable it (since __add_pages() now "comprehends
device memory" for itself) and expect things to work.

In practice, however, ZONE_DEVICE users have little chance of
functioning correctly without __HAVE_ARCH_PTE_DEVMAP, so let's clean
that up the same way as ARCH_HAS_PTE_SPECIAL and make it the proper
dependency so the real situation is clearer.

Cc: x86@kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: Michael Ellerman <mpe@ellerman.id.au>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Acked-by: Oliver O'Halloran <oohall@gmail.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---

v2: Add review tags.

 arch/powerpc/Kconfig                         | 2 +-
 arch/powerpc/include/asm/book3s/64/pgtable.h | 1 -
 arch/x86/Kconfig                             | 2 +-
 arch/x86/include/asm/pgtable.h               | 4 ++--
 arch/x86/include/asm/pgtable_types.h         | 1 -
 include/linux/mm.h                           | 4 ++--
 include/linux/pfn_t.h                        | 4 ++--
 mm/Kconfig                                   | 5 ++---
 mm/gup.c                                     | 2 +-
 9 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 5e3d0853c31d..77e1993bba80 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -135,6 +135,7 @@ config PPC
 	select ARCH_HAS_MMIOWB			if PPC64
 	select ARCH_HAS_PHYS_TO_DMA
 	select ARCH_HAS_PMEM_API                if PPC64
+	select ARCH_HAS_PTE_DEVMAP		if PPC_BOOK3S_64
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_MEMBARRIER_CALLBACKS
 	select ARCH_HAS_SCALED_CPUTIME		if VIRT_CPU_ACCOUNTING_NATIVE && PPC64
@@ -142,7 +143,6 @@ config PPC
 	select ARCH_HAS_TICK_BROADCAST		if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAS_UACCESS_FLUSHCACHE	if PPC64
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
-	select ARCH_HAS_ZONE_DEVICE		if PPC_BOOK3S_64
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select ARCH_MIGHT_HAVE_PC_PARPORT
 	select ARCH_MIGHT_HAVE_PC_SERIO
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 581f91be9dd4..02c22ac8f387 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -90,7 +90,6 @@
 #define _PAGE_SOFT_DIRTY	_RPAGE_SW3 /* software: software dirty tracking */
 #define _PAGE_SPECIAL		_RPAGE_SW2 /* software: special page */
 #define _PAGE_DEVMAP		_RPAGE_SW1 /* software: ZONE_DEVICE page */
-#define __HAVE_ARCH_PTE_DEVMAP
 
 /*
  * Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5ad92419be19..ffd50f27f395 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -60,6 +60,7 @@ config X86
 	select ARCH_HAS_KCOV			if X86_64
 	select ARCH_HAS_MEMBARRIER_SYNC_CORE
 	select ARCH_HAS_PMEM_API		if X86_64
+	select ARCH_HAS_PTE_DEVMAP		if X86_64
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_REFCOUNT
 	select ARCH_HAS_UACCESS_FLUSHCACHE	if X86_64
@@ -69,7 +70,6 @@ config X86
 	select ARCH_HAS_STRICT_MODULE_RWX
 	select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
-	select ARCH_HAS_ZONE_DEVICE		if X86_64
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select ARCH_MIGHT_HAVE_ACPI_PDC		if ACPI
 	select ARCH_MIGHT_HAVE_PC_PARPORT
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 2779ace16d23..89a1f6fd48bf 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -254,7 +254,7 @@ static inline int has_transparent_hugepage(void)
 	return boot_cpu_has(X86_FEATURE_PSE);
 }
 
-#ifdef __HAVE_ARCH_PTE_DEVMAP
+#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
 static inline int pmd_devmap(pmd_t pmd)
 {
 	return !!(pmd_val(pmd) & _PAGE_DEVMAP);
@@ -715,7 +715,7 @@ static inline int pte_present(pte_t a)
 	return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
 }
 
-#ifdef __HAVE_ARCH_PTE_DEVMAP
+#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
 static inline int pte_devmap(pte_t a)
 {
 	return (pte_flags(a) & _PAGE_DEVMAP) == _PAGE_DEVMAP;
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index d6ff0bbdb394..b5e49e6bac63 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -103,7 +103,6 @@
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 #define _PAGE_NX	(_AT(pteval_t, 1) << _PAGE_BIT_NX)
 #define _PAGE_DEVMAP	(_AT(u64, 1) << _PAGE_BIT_DEVMAP)
-#define __HAVE_ARCH_PTE_DEVMAP
 #else
 #define _PAGE_NX	(_AT(pteval_t, 0))
 #define _PAGE_DEVMAP	(_AT(pteval_t, 0))
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d76dfb7ac617..fe05c94f23e9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -504,7 +504,7 @@ struct inode;
 #define page_private(page)		((page)->private)
 #define set_page_private(page, v)	((page)->private = (v))
 
-#if !defined(__HAVE_ARCH_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
+#if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
 static inline int pmd_devmap(pmd_t pmd)
 {
 	return 0;
@@ -1698,7 +1698,7 @@ static inline void sync_mm_rss(struct mm_struct *mm)
 }
 #endif
 
-#ifndef __HAVE_ARCH_PTE_DEVMAP
+#ifndef CONFIG_ARCH_HAS_PTE_DEVMAP
 static inline int pte_devmap(pte_t pte)
 {
 	return 0;
diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h
index 7bb77850c65a..de8bc66b10a4 100644
--- a/include/linux/pfn_t.h
+++ b/include/linux/pfn_t.h
@@ -104,7 +104,7 @@ static inline pud_t pfn_t_pud(pfn_t pfn, pgprot_t pgprot)
 #endif
 #endif
 
-#ifdef __HAVE_ARCH_PTE_DEVMAP
+#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
 static inline bool pfn_t_devmap(pfn_t pfn)
 {
 	const u64 flags = PFN_DEV|PFN_MAP;
@@ -122,7 +122,7 @@ pmd_t pmd_mkdevmap(pmd_t pmd);
 	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
 pud_t pud_mkdevmap(pud_t pud);
 #endif
-#endif /* __HAVE_ARCH_PTE_DEVMAP */
+#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */
 
 #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
 static inline bool pfn_t_special(pfn_t pfn)
diff --git a/mm/Kconfig b/mm/Kconfig
index 25c71eb8a7db..fcb7ab08e294 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -655,8 +655,7 @@ config IDLE_PAGE_TRACKING
 	  See Documentation/admin-guide/mm/idle_page_tracking.rst for
 	  more details.
 
-# arch_add_memory() comprehends device memory
-config ARCH_HAS_ZONE_DEVICE
+config ARCH_HAS_PTE_DEVMAP
 	bool
 
 config ZONE_DEVICE
@@ -664,7 +663,7 @@ config ZONE_DEVICE
 	depends on MEMORY_HOTPLUG
 	depends on MEMORY_HOTREMOVE
 	depends on SPARSEMEM_VMEMMAP
-	depends on ARCH_HAS_ZONE_DEVICE
+	depends on ARCH_HAS_PTE_DEVMAP
 	select XARRAY_MULTI
 
 	help
diff --git a/mm/gup.c b/mm/gup.c
index f84e22685aaa..72a5c7d1e1a7 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1623,7 +1623,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 }
 #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
 
-#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
 static int __gup_device_huge(unsigned long pfn, unsigned long addr,
 		unsigned long end, struct page **pages, int *nr)
 {
-- 
2.21.0.dirty


^ permalink raw reply related

* Re: [PATCH 12/10] powerpc: unbreak DYNAMIC_DEBUG=y build with clang
From: Nick Desaulniers @ 2019-04-29 17:34 UTC (permalink / raw)
  To: Rasmus Villemoes
  Cc: Arnd Bergmann, LKML, Jason Baron, Nathan Chancellor, linuxppc-dev,
	Andrew Morton
In-Reply-To: <20190426190603.5982-2-linux@rasmusvillemoes.dk>

On Fri, Apr 26, 2019 at 12:06 PM Rasmus Villemoes
<linux@rasmusvillemoes.dk> wrote:
>
> Current versions of clang does not like the %c modifier in inline
> assembly for targets other than x86, so any DYNAMIC_DEBUG=y build
> fails on ppc64. A fix is likely to land in 9.0 (see
> https://github.com/ClangBuiltLinux/linux/issues/456), but unbreak the
> build for older versions.
>
> Fixes: powerpc: select DYNAMIC_DEBUG_RELATIVE_POINTERS for PPC64
> Reported-by: Nathan Chancellor <natechancellor@gmail.com>
> Reported-by: Arnd Bergmann <arnd@arndb.de>
> Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>

Thanks for fixing the build.
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>

> ---
> Andrew, please apply and/or fold into 10/10.
>
>  arch/powerpc/Kconfig | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 6821c8ae1d62..8511137ab963 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -155,7 +155,7 @@ config PPC
>         select BUILDTIME_EXTABLE_SORT
>         select CLONE_BACKWARDS
>         select DCACHE_WORD_ACCESS               if PPC64 && CPU_LITTLE_ENDIAN
> -       select DYNAMIC_DEBUG_RELATIVE_POINTERS  if PPC64
> +       select DYNAMIC_DEBUG_RELATIVE_POINTERS  if PPC64 && (CC_IS_GCC || CLANG_VERSION >= 90000)
>         select DYNAMIC_FTRACE                   if FUNCTION_TRACER
>         select EDAC_ATOMIC_SCRUB
>         select EDAC_SUPPORT
> --
> 2.20.1
>


-- 
Thanks,
~Nick Desaulniers

^ permalink raw reply

* [PATCH v3 1/3] powernv/mce: reduce mce console logs to lesser lines.
From: Mahesh Salgaonkar @ 2019-04-29 18:15 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin

Also add cpu number while displaying mce log. This will help cleaner logs
when mce hits on multiple cpus simultaneously.

before the changes the mce o/p was:

[  127.223515] Severe Machine check interrupt [Recovered]
[  127.223530]   NIP [d00000000ba80280]: insert_slb_entry.constprop.0+0x278/0x2c0 [mcetest_slb]
[  127.223539]   Initiator: CPU
[  127.223544]   Error type: SLB [Multihit]
[  127.223550]     Effective address: d00000000ba80280

After this patch series changes the mce o/p will be:

[  471.959843] MCE: CPU80: machine check (Warning) Host SLB Multihit [Recovered]
[  471.959870] MCE: CPU80: NIP: [d00000000b550280] insert_slb_entry.constprop.0+0x278/0x2c0 [mcetest_slb]
[  471.959892] MCE: CPU80: Probable software error (some chance of hardware cause)

UE in host application:

[ 1001.831517] MCE: CPU48: machine check (Severe) Host UE Load/Store DAR: 00007fffc6079a80 paddr: 0000000f8e260000 [Not recovered]
[ 1001.831518] MCE: CPU48: PID: 4584 Comm: find NIP: [0000000010023368]
[ 1001.831519] MCE: CPU48: Hardware error

and for MCE in Guest:

[ 1289.447571] MCE: CPU80: machine check (Warning) Guest SLB Multihit DAR: 000001001b6e0320 [Recovered]
[ 1289.447615] MCE: CPU80: PID: 24765 Comm: qemu-system-ppc Guest NIP: [00007fffa309dc60]
[ 1289.447634] MCE: CPU80: Probable software error (some chance of hardware cause)

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
CHange in v3:
- Print physical address if available.

Change in v2:
- Address comments from Michael.
---
 arch/powerpc/include/asm/mce.h |    2 -
 arch/powerpc/kernel/mce.c      |   89 ++++++++++++++++++++++------------------
 2 files changed, 49 insertions(+), 42 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 17996bc9382b..8d0b1c24c636 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -116,7 +116,7 @@ struct machine_check_event {
 	enum MCE_Initiator	initiator:8;	/* 0x03 */
 	enum MCE_ErrorType	error_type:8;	/* 0x04 */
 	enum MCE_Disposition	disposition:8;	/* 0x05 */
-	uint8_t			reserved_1[2];	/* 0x06 */
+	uint16_t		cpu;		/* 0x06 */
 	uint64_t		gpr3;		/* 0x08 */
 	uint64_t		srr0;		/* 0x10 */
 	uint64_t		srr1;		/* 0x18 */
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index b5fec1f9751a..25a8b20cbbdc 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -112,6 +112,7 @@ void save_mce_event(struct pt_regs *regs, long handled,
 	mce->srr1 = regs->msr;
 	mce->gpr3 = regs->gpr[3];
 	mce->in_use = 1;
+	mce->cpu = get_paca()->paca_index;
 
 	/* Mark it recovered if we have handled it and MSR(RI=1). */
 	if (handled && (regs->msr & MSR_RI))
@@ -310,7 +311,11 @@ static void machine_check_process_queued_event(struct irq_work *work)
 void machine_check_print_event_info(struct machine_check_event *evt,
 				    bool user_mode, bool in_guest)
 {
-	const char *level, *sevstr, *subtype;
+	const char *level, *sevstr, *subtype, *err_type;
+	uint64_t ea = 0, pa = 0;
+	int n = 0;
+	char dar_str[50];
+	char pa_str[50];
 	static const char *mc_ue_types[] = {
 		"Indeterminate",
 		"Instruction fetch",
@@ -384,101 +389,103 @@ void machine_check_print_event_info(struct machine_check_event *evt,
 		break;
 	}
 
-	printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
-	       evt->disposition == MCE_DISPOSITION_RECOVERED ?
-	       "Recovered" : "Not recovered");
-
-	if (in_guest) {
-		printk("%s  Guest NIP: %016llx\n", level, evt->srr0);
-	} else if (user_mode) {
-		printk("%s  NIP: [%016llx] PID: %d Comm: %s\n", level,
-			evt->srr0, current->pid, current->comm);
-	} else {
-		printk("%s  NIP [%016llx]: %pS\n", level, evt->srr0,
-		       (void *)evt->srr0);
-	}
-
-	printk("%s  Initiator: %s\n", level,
-	       evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown");
 	switch (evt->error_type) {
 	case MCE_ERROR_TYPE_UE:
+		err_type = "UE";
 		subtype = evt->u.ue_error.ue_error_type <
 			ARRAY_SIZE(mc_ue_types) ?
 			mc_ue_types[evt->u.ue_error.ue_error_type]
 			: "Unknown";
-		printk("%s  Error type: UE [%s]\n", level, subtype);
 		if (evt->u.ue_error.effective_address_provided)
-			printk("%s    Effective address: %016llx\n",
-			       level, evt->u.ue_error.effective_address);
+			ea = evt->u.ue_error.effective_address;
 		if (evt->u.ue_error.physical_address_provided)
-			printk("%s    Physical address:  %016llx\n",
-			       level, evt->u.ue_error.physical_address);
+			pa = evt->u.ue_error.physical_address;
 		break;
 	case MCE_ERROR_TYPE_SLB:
+		err_type = "SLB";
 		subtype = evt->u.slb_error.slb_error_type <
 			ARRAY_SIZE(mc_slb_types) ?
 			mc_slb_types[evt->u.slb_error.slb_error_type]
 			: "Unknown";
-		printk("%s  Error type: SLB [%s]\n", level, subtype);
 		if (evt->u.slb_error.effective_address_provided)
-			printk("%s    Effective address: %016llx\n",
-			       level, evt->u.slb_error.effective_address);
+			ea = evt->u.slb_error.effective_address;
 		break;
 	case MCE_ERROR_TYPE_ERAT:
+		err_type = "ERAT";
 		subtype = evt->u.erat_error.erat_error_type <
 			ARRAY_SIZE(mc_erat_types) ?
 			mc_erat_types[evt->u.erat_error.erat_error_type]
 			: "Unknown";
-		printk("%s  Error type: ERAT [%s]\n", level, subtype);
 		if (evt->u.erat_error.effective_address_provided)
-			printk("%s    Effective address: %016llx\n",
-			       level, evt->u.erat_error.effective_address);
+			ea = evt->u.erat_error.effective_address;
 		break;
 	case MCE_ERROR_TYPE_TLB:
+		err_type = "TLB";
 		subtype = evt->u.tlb_error.tlb_error_type <
 			ARRAY_SIZE(mc_tlb_types) ?
 			mc_tlb_types[evt->u.tlb_error.tlb_error_type]
 			: "Unknown";
-		printk("%s  Error type: TLB [%s]\n", level, subtype);
 		if (evt->u.tlb_error.effective_address_provided)
-			printk("%s    Effective address: %016llx\n",
-			       level, evt->u.tlb_error.effective_address);
+			ea = evt->u.tlb_error.effective_address;
 		break;
 	case MCE_ERROR_TYPE_USER:
+		err_type = "User";
 		subtype = evt->u.user_error.user_error_type <
 			ARRAY_SIZE(mc_user_types) ?
 			mc_user_types[evt->u.user_error.user_error_type]
 			: "Unknown";
-		printk("%s  Error type: User [%s]\n", level, subtype);
 		if (evt->u.user_error.effective_address_provided)
-			printk("%s    Effective address: %016llx\n",
-			       level, evt->u.user_error.effective_address);
+			ea = evt->u.user_error.effective_address;
 		break;
 	case MCE_ERROR_TYPE_RA:
+		err_type = "Real address";
 		subtype = evt->u.ra_error.ra_error_type <
 			ARRAY_SIZE(mc_ra_types) ?
 			mc_ra_types[evt->u.ra_error.ra_error_type]
 			: "Unknown";
-		printk("%s  Error type: Real address [%s]\n", level, subtype);
 		if (evt->u.ra_error.effective_address_provided)
-			printk("%s    Effective address: %016llx\n",
-			       level, evt->u.ra_error.effective_address);
+			ea = evt->u.ra_error.effective_address;
 		break;
 	case MCE_ERROR_TYPE_LINK:
+		err_type = "Link";
 		subtype = evt->u.link_error.link_error_type <
 			ARRAY_SIZE(mc_link_types) ?
 			mc_link_types[evt->u.link_error.link_error_type]
 			: "Unknown";
-		printk("%s  Error type: Link [%s]\n", level, subtype);
 		if (evt->u.link_error.effective_address_provided)
-			printk("%s    Effective address: %016llx\n",
-			       level, evt->u.link_error.effective_address);
+			ea = evt->u.link_error.effective_address;
 		break;
 	default:
 	case MCE_ERROR_TYPE_UNKNOWN:
-		printk("%s  Error type: Unknown\n", level);
+		err_type = "Unknown";
+		subtype = "";
 		break;
 	}
+
+	dar_str[0] = pa_str[0] = '\0';
+	if (ea && evt->srr0 != ea) {
+		/* Load/Store address */
+		n = sprintf(dar_str, "DAR: %016llx ", ea);
+		if (pa)
+			sprintf(dar_str + n, "paddr: %016llx ", pa);
+	} else if (pa) {
+		sprintf(pa_str, " paddr: %016llx", pa);
+	}
+
+	printk("%sMCE: CPU%d: machine check (%s) %s %s %s %s[%s]\n",
+		level, evt->cpu, sevstr, in_guest ? "Guest" : "Host",
+		err_type, subtype, dar_str,
+		evt->disposition == MCE_DISPOSITION_RECOVERED ?
+		"Recovered" : "Not recovered");
+
+	if (in_guest || user_mode) {
+		printk("%sMCE: CPU%d: PID: %d Comm: %s %sNIP: [%016llx]%s\n",
+			level, evt->cpu, current->pid, current->comm,
+			in_guest ? "Guest " : "", evt->srr0, pa_str);
+	} else {
+		printk("%sMCE: CPU%d: NIP: [%016llx] %pS%s\n",
+			level, evt->cpu, evt->srr0, (void *)evt->srr0, pa_str);
+	}
 }
 EXPORT_SYMBOL_GPL(machine_check_print_event_info);
 


^ permalink raw reply related

* [PATCH v3 2/3] powernv/mce: Print correct severity for mce error.
From: Mahesh Salgaonkar @ 2019-04-29 18:15 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <155656174872.20636.6539465047019566013.stgit@jupiter>

Currently all machine check errors are printed as severe errors which isn't
correct. Print soft errors as warning instead of severe errors.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
change in v2:
- Use kernel types i.e. u8, u64 etc.
- Define sync_error as bool.
---
 arch/powerpc/include/asm/mce.h        |   86 ++++++++++----------
 arch/powerpc/kernel/mce.c             |    5 +
 arch/powerpc/kernel/mce_power.c       |  144 +++++++++++++++++----------------
 arch/powerpc/platforms/powernv/opal.c |    2 
 4 files changed, 123 insertions(+), 114 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 8d0b1c24c636..b1f4bf863c95 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -31,7 +31,7 @@ enum MCE_Version {
 enum MCE_Severity {
 	MCE_SEV_NO_ERROR = 0,
 	MCE_SEV_WARNING = 1,
-	MCE_SEV_ERROR_SYNC = 2,
+	MCE_SEV_SEVERE = 2,
 	MCE_SEV_FATAL = 3,
 };
 
@@ -110,73 +110,74 @@ enum MCE_LinkErrorType {
 };
 
 struct machine_check_event {
-	enum MCE_Version	version:8;	/* 0x00 */
-	uint8_t			in_use;		/* 0x01 */
-	enum MCE_Severity	severity:8;	/* 0x02 */
-	enum MCE_Initiator	initiator:8;	/* 0x03 */
-	enum MCE_ErrorType	error_type:8;	/* 0x04 */
-	enum MCE_Disposition	disposition:8;	/* 0x05 */
-	uint16_t		cpu;		/* 0x06 */
-	uint64_t		gpr3;		/* 0x08 */
-	uint64_t		srr0;		/* 0x10 */
-	uint64_t		srr1;		/* 0x18 */
-	union {					/* 0x20 */
+	enum MCE_Version	version:8;
+	u8			in_use;
+	enum MCE_Severity	severity:8;
+	enum MCE_Initiator	initiator:8;
+	enum MCE_ErrorType	error_type:8;
+	enum MCE_Disposition	disposition:8;
+	bool			sync_error;
+	u16			cpu;
+	u64			gpr3;
+	u64			srr0;
+	u64			srr1;
+	union {
 		struct {
 			enum MCE_UeErrorType ue_error_type:8;
-			uint8_t		effective_address_provided;
-			uint8_t		physical_address_provided;
-			uint8_t		reserved_1[5];
-			uint64_t	effective_address;
-			uint64_t	physical_address;
-			uint8_t		reserved_2[8];
+			u8		effective_address_provided;
+			u8		physical_address_provided;
+			u8		reserved_1[5];
+			u64		effective_address;
+			u64		physical_address;
+			u8		reserved_2[8];
 		} ue_error;
 
 		struct {
 			enum MCE_SlbErrorType slb_error_type:8;
-			uint8_t		effective_address_provided;
-			uint8_t		reserved_1[6];
-			uint64_t	effective_address;
-			uint8_t		reserved_2[16];
+			u8		effective_address_provided;
+			u8		reserved_1[6];
+			u64		effective_address;
+			u8		reserved_2[16];
 		} slb_error;
 
 		struct {
 			enum MCE_EratErrorType erat_error_type:8;
-			uint8_t		effective_address_provided;
-			uint8_t		reserved_1[6];
-			uint64_t	effective_address;
-			uint8_t		reserved_2[16];
+			u8		effective_address_provided;
+			u8		reserved_1[6];
+			u64		effective_address;
+			u8		reserved_2[16];
 		} erat_error;
 
 		struct {
 			enum MCE_TlbErrorType tlb_error_type:8;
-			uint8_t		effective_address_provided;
-			uint8_t		reserved_1[6];
-			uint64_t	effective_address;
-			uint8_t		reserved_2[16];
+			u8		effective_address_provided;
+			u8		reserved_1[6];
+			u64		effective_address;
+			u8		reserved_2[16];
 		} tlb_error;
 
 		struct {
 			enum MCE_UserErrorType user_error_type:8;
-			uint8_t		effective_address_provided;
-			uint8_t		reserved_1[6];
-			uint64_t	effective_address;
-			uint8_t		reserved_2[16];
+			u8		effective_address_provided;
+			u8		reserved_1[6];
+			u64		effective_address;
+			u8		reserved_2[16];
 		} user_error;
 
 		struct {
 			enum MCE_RaErrorType ra_error_type:8;
-			uint8_t		effective_address_provided;
-			uint8_t		reserved_1[6];
-			uint64_t	effective_address;
-			uint8_t		reserved_2[16];
+			u8		effective_address_provided;
+			u8		reserved_1[6];
+			u64		effective_address;
+			u8		reserved_2[16];
 		} ra_error;
 
 		struct {
 			enum MCE_LinkErrorType link_error_type:8;
-			uint8_t		effective_address_provided;
-			uint8_t		reserved_1[6];
-			uint64_t	effective_address;
-			uint8_t		reserved_2[16];
+			u8		effective_address_provided;
+			u8		reserved_1[6];
+			u64		effective_address;
+			u8		reserved_2[16];
 		} link_error;
 	} u;
 };
@@ -194,6 +195,7 @@ struct mce_error_info {
 	} u;
 	enum MCE_Severity	severity:8;
 	enum MCE_Initiator	initiator:8;
+	bool			sync_error;
 };
 
 #define MAX_MC_EVT	100
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 25a8b20cbbdc..71d245a387ab 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -122,6 +122,7 @@ void save_mce_event(struct pt_regs *regs, long handled,
 
 	mce->initiator = mce_err->initiator;
 	mce->severity = mce_err->severity;
+	mce->sync_error = mce_err->sync_error;
 
 	/*
 	 * Populate the mce error_type and type-specific error_type.
@@ -376,9 +377,9 @@ void machine_check_print_event_info(struct machine_check_event *evt,
 		break;
 	case MCE_SEV_WARNING:
 		level = KERN_WARNING;
-		sevstr = "";
+		sevstr = "Warning";
 		break;
-	case MCE_SEV_ERROR_SYNC:
+	case MCE_SEV_SEVERE:
 		level = KERN_ERR;
 		sevstr = "Severe";
 		break;
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 6b800eec31f2..606af87a4dda 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -133,106 +133,107 @@ struct mce_ierror_table {
 	unsigned int error_subtype;
 	unsigned int initiator;
 	unsigned int severity;
+	bool sync_error;
 };
 
 static const struct mce_ierror_table mce_p7_ierror_table[] = {
 { 0x00000000001c0000, 0x0000000000040000, true,
   MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000001c0000, 0x0000000000080000, true,
   MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000001c0000, 0x00000000000c0000, true,
   MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,  MCE_SEV_WARNING, true },
 { 0x00000000001c0000, 0x0000000000100000, true,
   MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_INDETERMINATE, /* BOTH */
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,  MCE_SEV_WARNING, true },
 { 0x00000000001c0000, 0x0000000000140000, true,
   MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,  MCE_SEV_WARNING, true },
 { 0x00000000001c0000, 0x0000000000180000, true,
   MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000001c0000, 0x00000000001c0000, true,
   MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
-{ 0, 0, 0, 0, 0, 0 } };
+  MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
+{ 0, 0, 0, 0, 0, 0, 0 } };
 
 static const struct mce_ierror_table mce_p8_ierror_table[] = {
 { 0x00000000081c0000, 0x0000000000040000, true,
   MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000081c0000, 0x0000000000080000, true,
   MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000081c0000, 0x00000000000c0000, true,
   MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,  MCE_SEV_WARNING, true },
 { 0x00000000081c0000, 0x0000000000100000, true,
   MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,  MCE_SEV_WARNING, true },
 { 0x00000000081c0000, 0x0000000000140000, true,
   MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,  MCE_SEV_WARNING, true },
 { 0x00000000081c0000, 0x0000000000180000, true,
   MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000081c0000, 0x00000000001c0000, true,
   MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000081c0000, 0x0000000008000000, true,
   MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000081c0000, 0x0000000008040000, true,
   MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
-{ 0, 0, 0, 0, 0, 0 } };
+  MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
+{ 0, 0, 0, 0, 0, 0, 0 } };
 
 static const struct mce_ierror_table mce_p9_ierror_table[] = {
 { 0x00000000081c0000, 0x0000000000040000, true,
   MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000081c0000, 0x0000000000080000, true,
   MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000081c0000, 0x00000000000c0000, true,
   MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,  MCE_SEV_WARNING, true },
 { 0x00000000081c0000, 0x0000000000100000, true,
   MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,  MCE_SEV_WARNING, true },
 { 0x00000000081c0000, 0x0000000000140000, true,
   MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,  MCE_SEV_WARNING, true },
 { 0x00000000081c0000, 0x0000000000180000, true,
   MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000081c0000, 0x00000000001c0000, true,
   MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_IFETCH_FOREIGN,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000081c0000, 0x0000000008000000, true,
   MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000081c0000, 0x0000000008040000, true,
   MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000081c0000, 0x00000000080c0000, true,
   MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_IFETCH,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000081c0000, 0x0000000008100000, true,
   MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000081c0000, 0x0000000008140000, false,
   MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_STORE,
-  MCE_INITIATOR_CPU,  MCE_SEV_FATAL, }, /* ASYNC is fatal */
+  MCE_INITIATOR_CPU,  MCE_SEV_FATAL, false }, /* ASYNC is fatal */
 { 0x00000000081c0000, 0x0000000008180000, false,
   MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_STORE_TIMEOUT,
-  MCE_INITIATOR_CPU,  MCE_SEV_FATAL, }, /* ASYNC is fatal */
+  MCE_INITIATOR_CPU,  MCE_SEV_FATAL, false }, /* ASYNC is fatal */
 { 0x00000000081c0000, 0x00000000081c0000, true,
   MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
-{ 0, 0, 0, 0, 0, 0 } };
+  MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
+{ 0, 0, 0, 0, 0, 0, 0 } };
 
 struct mce_derror_table {
 	unsigned long dsisr_value;
@@ -241,103 +242,104 @@ struct mce_derror_table {
 	unsigned int error_subtype;
 	unsigned int initiator;
 	unsigned int severity;
+	bool sync_error;
 };
 
 static const struct mce_derror_table mce_p7_derror_table[] = {
 { 0x00008000, false,
   MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_LOAD_STORE,
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00004000, true,
   MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00000800, true,
   MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT,
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,   MCE_SEV_WARNING, true },
 { 0x00000400, true,
   MCE_ERROR_TYPE_TLB,  MCE_TLB_ERROR_MULTIHIT,
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,   MCE_SEV_WARNING, true },
 { 0x00000080, true,
   MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_MULTIHIT,	/* Before PARITY */
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,   MCE_SEV_WARNING, true },
 { 0x00000100, true,
   MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_PARITY,
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00000040, true,
   MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_INDETERMINATE, /* BOTH */
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
-{ 0, false, 0, 0, 0, 0 } };
+  MCE_INITIATOR_CPU,   MCE_SEV_WARNING, true },
+{ 0, false, 0, 0, 0, 0, 0 } };
 
 static const struct mce_derror_table mce_p8_derror_table[] = {
 { 0x00008000, false,
   MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_LOAD_STORE,
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00004000, true,
   MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00002000, true,
   MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT,
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00001000, true,
   MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT,
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00000800, true,
   MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT,
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,   MCE_SEV_WARNING, true },
 { 0x00000400, true,
   MCE_ERROR_TYPE_TLB,  MCE_TLB_ERROR_MULTIHIT,
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,   MCE_SEV_WARNING, true },
 { 0x00000200, true,
   MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, /* SECONDARY ERAT */
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,   MCE_SEV_WARNING, true },
 { 0x00000080, true,
   MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_MULTIHIT,	/* Before PARITY */
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,   MCE_SEV_WARNING, true },
 { 0x00000100, true,
   MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_PARITY,
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
-{ 0, false, 0, 0, 0, 0 } };
+  MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
+{ 0, false, 0, 0, 0, 0, 0 } };
 
 static const struct mce_derror_table mce_p9_derror_table[] = {
 { 0x00008000, false,
   MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_LOAD_STORE,
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00004000, true,
   MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00002000, true,
   MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT,
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00001000, true,
   MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT,
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00000800, true,
   MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT,
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,   MCE_SEV_WARNING, true },
 { 0x00000400, true,
   MCE_ERROR_TYPE_TLB,  MCE_TLB_ERROR_MULTIHIT,
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,   MCE_SEV_WARNING, true },
 { 0x00000200, false,
   MCE_ERROR_TYPE_USER, MCE_USER_ERROR_TLBIE,
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,   MCE_SEV_WARNING, true },
 { 0x00000080, true,
   MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_MULTIHIT,	/* Before PARITY */
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,   MCE_SEV_WARNING, true },
 { 0x00000100, true,
   MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_PARITY,
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00000040, true,
   MCE_ERROR_TYPE_RA,   MCE_RA_ERROR_LOAD,
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00000020, false,
   MCE_ERROR_TYPE_RA,   MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00000010, false,
   MCE_ERROR_TYPE_RA,   MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN,
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+  MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00000008, false,
   MCE_ERROR_TYPE_RA,   MCE_RA_ERROR_LOAD_STORE_FOREIGN,
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
-{ 0, false, 0, 0, 0, 0 } };
+  MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
+{ 0, false, 0, 0, 0, 0, 0 } };
 
 static int mce_find_instr_ea_and_pfn(struct pt_regs *regs, uint64_t *addr,
 					uint64_t *phys_addr)
@@ -427,11 +429,12 @@ static int mce_handle_ierror(struct pt_regs *regs,
 			mce_err->u.link_error_type = table[i].error_subtype;
 			break;
 		}
+		mce_err->sync_error = table[i].sync_error;
 		mce_err->severity = table[i].severity;
 		mce_err->initiator = table[i].initiator;
 		if (table[i].nip_valid) {
 			*addr = regs->nip;
-			if (mce_err->severity == MCE_SEV_ERROR_SYNC &&
+			if (mce_err->sync_error &&
 				table[i].error_type == MCE_ERROR_TYPE_UE) {
 				unsigned long pfn;
 
@@ -448,8 +451,9 @@ static int mce_handle_ierror(struct pt_regs *regs,
 	}
 
 	mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN;
-	mce_err->severity = MCE_SEV_ERROR_SYNC;
+	mce_err->severity = MCE_SEV_SEVERE;
 	mce_err->initiator = MCE_INITIATOR_CPU;
+	mce_err->sync_error = true;
 
 	return 0;
 }
@@ -519,11 +523,12 @@ static int mce_handle_derror(struct pt_regs *regs,
 			mce_err->u.link_error_type = table[i].error_subtype;
 			break;
 		}
+		mce_err->sync_error = table[i].sync_error;
 		mce_err->severity = table[i].severity;
 		mce_err->initiator = table[i].initiator;
 		if (table[i].dar_valid)
 			*addr = regs->dar;
-		else if (mce_err->severity == MCE_SEV_ERROR_SYNC &&
+		else if (mce_err->sync_error &&
 				table[i].error_type == MCE_ERROR_TYPE_UE) {
 			/*
 			 * We do a maximum of 4 nested MCE calls, see
@@ -539,8 +544,9 @@ static int mce_handle_derror(struct pt_regs *regs,
 		return handled;
 
 	mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN;
-	mce_err->severity = MCE_SEV_ERROR_SYNC;
+	mce_err->severity = MCE_SEV_SEVERE;
 	mce_err->initiator = MCE_INITIATOR_CPU;
+	mce_err->sync_error = true;
 
 	return 0;
 }
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 77552e525e33..f2b063b027f0 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -505,7 +505,7 @@ static int opal_recover_mce(struct pt_regs *regs,
 		recovered = 0;
 	}
 
-	if (!recovered && evt->severity == MCE_SEV_ERROR_SYNC) {
+	if (!recovered && evt->sync_error) {
 		/*
 		 * Try to kill processes if we get a synchronous machine check
 		 * (e.g., one caused by execution of this instruction). This


^ permalink raw reply related

* [PATCH v3 3/3] powernv/mce: print additional information about mce error.
From: Mahesh Salgaonkar @ 2019-04-29 18:16 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <155656174872.20636.6539465047019566013.stgit@jupiter>

Print more information about mce error whether it is an hardware or
software error.

Some of the mce errors can be easily categorized as hardware or software
errors e.g. UEs are due to hardware error, where as error triggered due to
invalid usage of tlbie is a pure software bug. But not all the mce errors
can be easily categorize into either software or hardware. There are errors
like multihit errors which are usually result of a software bug, but in
some rare cases a hardware failure can cause a multihit error. In past, we
have seen case where after replacing faulty chip, multihit errors stopped
occurring. Same with parity errors, which are usually due to faulty hardware
but there are chances where multihit can also cause an parity error. Such
errors are difficult to determine what really caused it. Hence this patch
classifies mce errors into following four categorize:
	1. Hardware error:
		UE and Link timeout failure errors.
	2. Probable hardware error (some chance of software cause)
		SLB/ERAT/TLB Parity errors.
	3. Software error
		Invalid tlbie form.
	4. Probable software error (some chance of hardware cause)
		SLB/ERAT/TLB Multihit errors.

Sample o/p:

[ 1289.447571] MCE: CPU80: machine check (Warning) Guest SLB Multihit DAR: 000001001b6e0320 [Recovered]
[ 1289.447615] MCE: CPU80: PID: 24765 Comm: qemu-system-ppc Guest NIP: [00007fffa309dc60]
[ 1289.447634] MCE: CPU80: Probable Software error (some chance of hardware cause)

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
Change in v2:
- Rephrase the wording for error class as suggested by Michael.
---
 arch/powerpc/include/asm/mce.h  |   10 ++++
 arch/powerpc/kernel/mce.c       |   12 ++++
 arch/powerpc/kernel/mce_power.c |  107 +++++++++++++++++++++++----------------
 3 files changed, 86 insertions(+), 43 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index b1f4bf863c95..8741f4c21a1a 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -56,6 +56,14 @@ enum MCE_ErrorType {
 	MCE_ERROR_TYPE_LINK = 7,
 };
 
+enum MCE_ErrorClass {
+	MCE_ECLASS_UNKNOWN = 0,
+	MCE_ECLASS_HARDWARE,
+	MCE_ECLASS_HARD_INDETERMINATE,
+	MCE_ECLASS_SOFTWARE,
+	MCE_ECLASS_SOFT_INDETERMINATE,
+};
+
 enum MCE_UeErrorType {
 	MCE_UE_ERROR_INDETERMINATE = 0,
 	MCE_UE_ERROR_IFETCH = 1,
@@ -115,6 +123,7 @@ struct machine_check_event {
 	enum MCE_Severity	severity:8;
 	enum MCE_Initiator	initiator:8;
 	enum MCE_ErrorType	error_type:8;
+	enum MCE_ErrorClass	error_class:8;
 	enum MCE_Disposition	disposition:8;
 	bool			sync_error;
 	u16			cpu;
@@ -195,6 +204,7 @@ struct mce_error_info {
 	} u;
 	enum MCE_Severity	severity:8;
 	enum MCE_Initiator	initiator:8;
+	enum MCE_ErrorClass	error_class:8;
 	bool			sync_error;
 };
 
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 71d245a387ab..4581377cfc98 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -123,6 +123,7 @@ void save_mce_event(struct pt_regs *regs, long handled,
 	mce->initiator = mce_err->initiator;
 	mce->severity = mce_err->severity;
 	mce->sync_error = mce_err->sync_error;
+	mce->error_class = mce_err->error_class;
 
 	/*
 	 * Populate the mce error_type and type-specific error_type.
@@ -363,6 +364,13 @@ void machine_check_print_event_info(struct machine_check_event *evt,
 		"Store (timeout)",
 		"Page table walk Load/Store (timeout)",
 	};
+	static const char *mc_error_class[] = {
+		"Unknown",
+		"Hardware error",
+		"Probable Hardware error (some chance of software cause)",
+		"Software error",
+		"Probable Software error (some chance of hardware cause)",
+	};
 
 	/* Print things out */
 	if (evt->version != MCE_V1) {
@@ -487,6 +495,10 @@ void machine_check_print_event_info(struct machine_check_event *evt,
 		printk("%sMCE: CPU%d: NIP: [%016llx] %pS%s\n",
 			level, evt->cpu, evt->srr0, (void *)evt->srr0, pa_str);
 	}
+
+	subtype = evt->error_class < ARRAY_SIZE(mc_error_class) ?
+		mc_error_class[evt->error_class] : "Unknown";
+	printk("%sMCE: CPU%d: %s\n", level, evt->cpu, subtype);
 }
 EXPORT_SYMBOL_GPL(machine_check_print_event_info);
 
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 606af87a4dda..3658af85e48a 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -131,6 +131,7 @@ struct mce_ierror_table {
 	bool nip_valid; /* nip is a valid indicator of faulting address */
 	unsigned int error_type;
 	unsigned int error_subtype;
+	unsigned int error_class;
 	unsigned int initiator;
 	unsigned int severity;
 	bool sync_error;
@@ -138,99 +139,103 @@ struct mce_ierror_table {
 
 static const struct mce_ierror_table mce_p7_ierror_table[] = {
 { 0x00000000001c0000, 0x0000000000040000, true,
-  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH, MCE_ECLASS_HARDWARE,
   MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000001c0000, 0x0000000000080000, true,
-  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, MCE_ECLASS_HARD_INDETERMINATE,
   MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000001c0000, 0x00000000000c0000, true,
-  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE,
   MCE_INITIATOR_CPU,  MCE_SEV_WARNING, true },
 { 0x00000000001c0000, 0x0000000000100000, true,
   MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_INDETERMINATE, /* BOTH */
+  MCE_ECLASS_SOFT_INDETERMINATE,
   MCE_INITIATOR_CPU,  MCE_SEV_WARNING, true },
 { 0x00000000001c0000, 0x0000000000140000, true,
-  MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
+  MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE,
   MCE_INITIATOR_CPU,  MCE_SEV_WARNING, true },
 { 0x00000000001c0000, 0x0000000000180000, true,
-  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, MCE_ECLASS_HARDWARE,
   MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000001c0000, 0x00000000001c0000, true,
-  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH, MCE_ECLASS_HARDWARE,
   MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0, 0, 0, 0, 0, 0, 0 } };
 
 static const struct mce_ierror_table mce_p8_ierror_table[] = {
 { 0x00000000081c0000, 0x0000000000040000, true,
-  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH, MCE_ECLASS_HARDWARE,
   MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000081c0000, 0x0000000000080000, true,
-  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, MCE_ECLASS_HARD_INDETERMINATE,
   MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000081c0000, 0x00000000000c0000, true,
-  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE,
   MCE_INITIATOR_CPU,  MCE_SEV_WARNING, true },
 { 0x00000000081c0000, 0x0000000000100000, true,
-  MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT,
+  MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE,
   MCE_INITIATOR_CPU,  MCE_SEV_WARNING, true },
 { 0x00000000081c0000, 0x0000000000140000, true,
-  MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
+  MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE,
   MCE_INITIATOR_CPU,  MCE_SEV_WARNING, true },
 { 0x00000000081c0000, 0x0000000000180000, true,
   MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH,
+  MCE_ECLASS_HARDWARE,
   MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000081c0000, 0x00000000001c0000, true,
-  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH, MCE_ECLASS_HARDWARE,
   MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000081c0000, 0x0000000008000000, true,
-  MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT,
+  MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_IFETCH_TIMEOUT, MCE_ECLASS_HARDWARE,
   MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000081c0000, 0x0000000008040000, true,
   MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT,
+  MCE_ECLASS_HARDWARE,
   MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0, 0, 0, 0, 0, 0, 0 } };
 
 static const struct mce_ierror_table mce_p9_ierror_table[] = {
 { 0x00000000081c0000, 0x0000000000040000, true,
-  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH, MCE_ECLASS_HARDWARE,
   MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000081c0000, 0x0000000000080000, true,
-  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, MCE_ECLASS_HARD_INDETERMINATE,
   MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000081c0000, 0x00000000000c0000, true,
-  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE,
   MCE_INITIATOR_CPU,  MCE_SEV_WARNING, true },
 { 0x00000000081c0000, 0x0000000000100000, true,
-  MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT,
+  MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE,
   MCE_INITIATOR_CPU,  MCE_SEV_WARNING, true },
 { 0x00000000081c0000, 0x0000000000140000, true,
-  MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
+  MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE,
   MCE_INITIATOR_CPU,  MCE_SEV_WARNING, true },
 { 0x00000000081c0000, 0x0000000000180000, true,
-  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, MCE_ECLASS_HARDWARE,
   MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000081c0000, 0x00000000001c0000, true,
-  MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_IFETCH_FOREIGN,
+  MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_IFETCH_FOREIGN, MCE_ECLASS_SOFTWARE,
   MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000081c0000, 0x0000000008000000, true,
-  MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT,
+  MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_IFETCH_TIMEOUT, MCE_ECLASS_HARDWARE,
   MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000081c0000, 0x0000000008040000, true,
   MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT,
+  MCE_ECLASS_HARDWARE,
   MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000081c0000, 0x00000000080c0000, true,
-  MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_IFETCH,
+  MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_IFETCH, MCE_ECLASS_SOFTWARE,
   MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000081c0000, 0x0000000008100000, true,
-  MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH,
+  MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH, MCE_ECLASS_SOFTWARE,
   MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0x00000000081c0000, 0x0000000008140000, false,
-  MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_STORE,
+  MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_STORE, MCE_ECLASS_HARDWARE,
   MCE_INITIATOR_CPU,  MCE_SEV_FATAL, false }, /* ASYNC is fatal */
 { 0x00000000081c0000, 0x0000000008180000, false,
   MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_STORE_TIMEOUT,
   MCE_INITIATOR_CPU,  MCE_SEV_FATAL, false }, /* ASYNC is fatal */
-{ 0x00000000081c0000, 0x00000000081c0000, true,
+{ 0x00000000081c0000, 0x00000000081c0000, true, MCE_ECLASS_HARDWARE,
   MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN,
   MCE_INITIATOR_CPU,  MCE_SEV_SEVERE, true },
 { 0, 0, 0, 0, 0, 0, 0 } };
@@ -240,6 +245,7 @@ struct mce_derror_table {
 	bool dar_valid; /* dar is a valid indicator of faulting address */
 	unsigned int error_type;
 	unsigned int error_subtype;
+	unsigned int error_class;
 	unsigned int initiator;
 	unsigned int severity;
 	bool sync_error;
@@ -247,97 +253,108 @@ struct mce_derror_table {
 
 static const struct mce_derror_table mce_p7_derror_table[] = {
 { 0x00008000, false,
-  MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_LOAD_STORE,
+  MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_LOAD_STORE, MCE_ECLASS_HARDWARE,
   MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00004000, true,
   MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
+  MCE_ECLASS_HARDWARE,
   MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00000800, true,
-  MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT,
+  MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE,
   MCE_INITIATOR_CPU,   MCE_SEV_WARNING, true },
 { 0x00000400, true,
-  MCE_ERROR_TYPE_TLB,  MCE_TLB_ERROR_MULTIHIT,
+  MCE_ERROR_TYPE_TLB,  MCE_TLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE,
   MCE_INITIATOR_CPU,   MCE_SEV_WARNING, true },
 { 0x00000080, true,
-  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_MULTIHIT,	/* Before PARITY */
+  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE,
   MCE_INITIATOR_CPU,   MCE_SEV_WARNING, true },
 { 0x00000100, true,
-  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_PARITY,
+  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_PARITY, MCE_ECLASS_HARD_INDETERMINATE,
   MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00000040, true,
   MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_INDETERMINATE, /* BOTH */
+  MCE_ECLASS_HARD_INDETERMINATE,
   MCE_INITIATOR_CPU,   MCE_SEV_WARNING, true },
 { 0, false, 0, 0, 0, 0, 0 } };
 
 static const struct mce_derror_table mce_p8_derror_table[] = {
 { 0x00008000, false,
-  MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_LOAD_STORE,
+  MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_LOAD_STORE, MCE_ECLASS_HARDWARE,
   MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00004000, true,
   MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
+  MCE_ECLASS_HARDWARE,
   MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00002000, true,
-  MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT,
+  MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT, MCE_ECLASS_HARDWARE,
   MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00001000, true,
   MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT,
+  MCE_ECLASS_HARDWARE,
   MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00000800, true,
-  MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT,
+  MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE,
   MCE_INITIATOR_CPU,   MCE_SEV_WARNING, true },
 { 0x00000400, true,
-  MCE_ERROR_TYPE_TLB,  MCE_TLB_ERROR_MULTIHIT,
+  MCE_ERROR_TYPE_TLB,  MCE_TLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE,
   MCE_INITIATOR_CPU,   MCE_SEV_WARNING, true },
 { 0x00000200, true,
   MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, /* SECONDARY ERAT */
+  MCE_ECLASS_SOFT_INDETERMINATE,
   MCE_INITIATOR_CPU,   MCE_SEV_WARNING, true },
 { 0x00000080, true,
   MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_MULTIHIT,	/* Before PARITY */
+  MCE_ECLASS_SOFT_INDETERMINATE,
   MCE_INITIATOR_CPU,   MCE_SEV_WARNING, true },
 { 0x00000100, true,
-  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_PARITY,
+  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_PARITY, MCE_ECLASS_HARD_INDETERMINATE,
   MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0, false, 0, 0, 0, 0, 0 } };
 
 static const struct mce_derror_table mce_p9_derror_table[] = {
 { 0x00008000, false,
-  MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_LOAD_STORE,
+  MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_LOAD_STORE, MCE_ECLASS_HARDWARE,
   MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00004000, true,
   MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
+  MCE_ECLASS_HARDWARE,
   MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00002000, true,
-  MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT,
+  MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT, MCE_ECLASS_HARDWARE,
   MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00001000, true,
   MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT,
+  MCE_ECLASS_HARDWARE,
   MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00000800, true,
-  MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT,
+  MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE,
   MCE_INITIATOR_CPU,   MCE_SEV_WARNING, true },
 { 0x00000400, true,
-  MCE_ERROR_TYPE_TLB,  MCE_TLB_ERROR_MULTIHIT,
+  MCE_ERROR_TYPE_TLB,  MCE_TLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE,
   MCE_INITIATOR_CPU,   MCE_SEV_WARNING, true },
 { 0x00000200, false,
-  MCE_ERROR_TYPE_USER, MCE_USER_ERROR_TLBIE,
+  MCE_ERROR_TYPE_USER, MCE_USER_ERROR_TLBIE, MCE_ECLASS_SOFTWARE,
   MCE_INITIATOR_CPU,   MCE_SEV_WARNING, true },
 { 0x00000080, true,
   MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_MULTIHIT,	/* Before PARITY */
+  MCE_ECLASS_SOFT_INDETERMINATE,
   MCE_INITIATOR_CPU,   MCE_SEV_WARNING, true },
 { 0x00000100, true,
-  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_PARITY,
+  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_PARITY, MCE_ECLASS_HARD_INDETERMINATE,
   MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00000040, true,
-  MCE_ERROR_TYPE_RA,   MCE_RA_ERROR_LOAD,
+  MCE_ERROR_TYPE_RA,   MCE_RA_ERROR_LOAD, MCE_ECLASS_HARDWARE,
   MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00000020, false,
   MCE_ERROR_TYPE_RA,   MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
+  MCE_ECLASS_HARDWARE,
   MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00000010, false,
   MCE_ERROR_TYPE_RA,   MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN,
+  MCE_ECLASS_HARDWARE,
   MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0x00000008, false,
-  MCE_ERROR_TYPE_RA,   MCE_RA_ERROR_LOAD_STORE_FOREIGN,
+  MCE_ERROR_TYPE_RA,   MCE_RA_ERROR_LOAD_STORE_FOREIGN, MCE_ECLASS_HARDWARE,
   MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0, false, 0, 0, 0, 0, 0 } };
 
@@ -406,6 +423,7 @@ static int mce_handle_ierror(struct pt_regs *regs,
 
 		/* now fill in mce_error_info */
 		mce_err->error_type = table[i].error_type;
+		mce_err->error_class = table[i].error_class;
 		switch (table[i].error_type) {
 		case MCE_ERROR_TYPE_UE:
 			mce_err->u.ue_error_type = table[i].error_subtype;
@@ -451,6 +469,7 @@ static int mce_handle_ierror(struct pt_regs *regs,
 	}
 
 	mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN;
+	mce_err->error_class = MCE_ECLASS_UNKNOWN;
 	mce_err->severity = MCE_SEV_SEVERE;
 	mce_err->initiator = MCE_INITIATOR_CPU;
 	mce_err->sync_error = true;
@@ -500,6 +519,7 @@ static int mce_handle_derror(struct pt_regs *regs,
 
 		/* now fill in mce_error_info */
 		mce_err->error_type = table[i].error_type;
+		mce_err->error_class = table[i].error_class;
 		switch (table[i].error_type) {
 		case MCE_ERROR_TYPE_UE:
 			mce_err->u.ue_error_type = table[i].error_subtype;
@@ -544,6 +564,7 @@ static int mce_handle_derror(struct pt_regs *regs,
 		return handled;
 
 	mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN;
+	mce_err->error_class = MCE_ECLASS_UNKNOWN;
 	mce_err->severity = MCE_SEV_SEVERE;
 	mce_err->initiator = MCE_INITIATOR_CPU;
 	mce_err->sync_error = true;


^ permalink raw reply related

* Re: [PATCH 13/41] drivers: tty: serial: uartlite: fill mapsize and use it
From: Enrico Weigelt, metux IT consult @ 2019-04-29 18:26 UTC (permalink / raw)
  To: Peter Korsgaard, Enrico Weigelt, metux IT consult
  Cc: lorenzo.pieralisi, linux-ia64, macro, andrew, gregkh,
	slemieux.tyco, liviu.dudau, linux-kernel, andriy.shevchenko,
	linux-mips, linux, matthias.bgg, khilman, linux-serial,
	sudeep.holla, sparclinux, jacmet, linux-amlogic, vz, linuxppc-dev,
	davem
In-Reply-To: <87muk8rg82.fsf@dell.be.48ers.dk>

On 29.04.19 17:19, Peter Korsgaard wrote:
>>>>>> "Enrico" == Enrico Weigelt, metux IT consult <info@metux.net> writes:
> 
>  > Fill the struct uart_port->mapsize field and use it, insteaf of
> 
> s/insteaf/instead/

Fixed.

>  > hardcoded values in many places. This makes the code layout a bit
>  > more consistent and easily allows using generic helpers for the
>  > io memory handling.
> 
>  > Candidates for such helpers could be eg. the request+ioremap and
>  > iounmap+release combinations.
> 
>  > Signed-off-by: Enrico Weigelt <info@metux.net>
> 
> Acked-by: Peter Korsgaard <peter@korsgaard.com>

thanks for review.


--mtx

-- 
Enrico Weigelt, metux IT consult
Free software and Linux embedded engineering
info@metux.net -- +49-151-27565287

^ permalink raw reply

* [PATCH] powerpc: Fix kobject memleak
From: Tobin C. Harding @ 2019-04-30  1:09 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: Greg Kroah-Hartman, linuxppc-dev, Tobin C. Harding, linux-kernel

Currently error return from kobject_init_and_add() is not followed by a
call to kobject_put().  This means there is a memory leak.

Add call to kobject_put() in error path of kobject_init_and_add().

Signed-off-by: Tobin C. Harding <tobin@kernel.org>
---

Untested, did not even build.  Bad Tobin, no biscuit.

No call to kobject_uevent()?  I'll be back to check usage of
kobject_create_and_add() later, then I intend to check all the uevent
stuff, feel free to leave this for me to check later if you don't know
the reasoning straight off the top of your head.

thanks,
Tobin.

arch/powerpc/kernel/cacheinfo.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
index 53102764fd2f..f2ed3ef4b129 100644
--- a/arch/powerpc/kernel/cacheinfo.c
+++ b/arch/powerpc/kernel/cacheinfo.c
@@ -759,23 +759,22 @@ static void cacheinfo_create_index_dir(struct cache *cache, int index,
 
 	index_dir = kzalloc(sizeof(*index_dir), GFP_KERNEL);
 	if (!index_dir)
-		goto err;
+		return;
 
 	index_dir->cache = cache;
 
 	rc = kobject_init_and_add(&index_dir->kobj, &cache_index_type,
 				  cache_dir->kobj, "index%d", index);
-	if (rc)
-		goto err;
+	if (rc) {
+		kobject_put(&index_dir->kobj);
+		kfree(index_dir);
+		return;
+	}
 
 	index_dir->next = cache_dir->index;
 	cache_dir->index = index_dir;
 
 	cacheinfo_create_index_opt_attrs(index_dir);
-
-	return;
-err:
-	kfree(index_dir);
 }
 
 static void cacheinfo_sysfs_populate(unsigned int cpu_id,
-- 
2.21.0


^ permalink raw reply related

* Re: [PATCH v2 0/5] Allow CPU0 to be nohz full
From: Nicholas Piggin @ 2019-04-30  2:46 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-arch, Frederic Weisbecker, Rafael J . Wysocki, linux-kernel,
	Ingo Molnar, Thomas Gleixner, linuxppc-dev
In-Reply-To: <20190425120427.GS4038@hirez.programming.kicks-ass.net>

Peter Zijlstra's on April 25, 2019 10:04 pm:
> On Thu, Apr 11, 2019 at 01:34:43PM +1000, Nicholas Piggin wrote:
>> Since last time, I added a compile time option to opt-out of this
>> if the platform does not support suspend on non-zero, and tried to
>> improve legibility of changelogs and explain the justification
>> better.
>> 
>> I have been testing this on powerpc/pseries and it seems to work
>> fine (the firmware call to suspend can be called on any CPU and
>> resumes where it left off), but not included here because the
>> code has some bitrot unrelated to this series which I hacked to
>> fix. I will discuss it and either send an acked patch to go with
>> this series if it is small, or fix it in powerpc tree.
>> 
> 
> Rafael, Frederic, any comments?
> 

Sorry to ping again, I guess people are probably busy after vacation.
Any chance we could get this in next merge window? Peter are you okay
with the config option as it is, then we can look at adapting it to
what x86 needs as a follow up (e.g., allow nohz CPU0 for
cpu0_hotpluggable case)?

Thanks,
Nick


^ permalink raw reply

* RE: Machine Check Exceptions
From: Rodriguez Quesada, Pablo @ 2019-04-30  0:49 UTC (permalink / raw)
  To: Christophe Leroy; +Cc: linuxppc-dev@lists.ozlabs.org
In-Reply-To: <49f28ecd-b2c2-aa9f-8183-f6c945824292@c-s.fr>

Hi Christophe,

I will try with the NMI approach. Just one question: 

>The driver I wrote that you are referring to is to handle the watchdog in e300 powerpc core. This is because the e300 gererates a machine check on expiry >when the watchdog is configured in NMI mode.

By NMI mode do you mean enabling the CONFIG_HAVE_NMI_WATCHDOG symbol on the .config?


Thank you,
Pablo

-----Original Message-----
From: Christophe Leroy <christophe.leroy@c-s.fr> 
Sent: Monday, April 29, 2019 12:10 AM
To: Rodriguez Quesada, Pablo <Pablo.Rodriguez-Quesada@windriver.com>
Cc: linuxppc-dev@lists.ozlabs.org
Subject: Re: Machine Check Exceptions

Hi Pablo,

I know nothing about mce-inject.

The driver I wrote that you are referring to is to handle the watchdog in e300 powerpc core. This is because the e300 gererates a machine check on expiry when the watchdog is configured in NMI mode.

Therefore, triggering the machine check is rather easy in that case, I just have to stop refreshing the watchdog.

Christophe

Le 25/04/2019 à 17:35, Rodriguez Quesada, Pablo a écrit :
> Hi Christopher,
> 
> I am trying to create a custom handler for MCE, but I don't have a way 
> to trigger it (Or don't know how)
> 
> Therefore I looked at the mce-inject tool(**) and compile it for PPC but the path "/sys/devices/system/machine check/machinecheck0" doesn't exist unlike Intel architecture and the app fails. I want to fix it.
> 
> My question is: Do you know if there is a way to inject an MCE on PowerPC? Or can you guide me on how to start my research about it. I find it very difficult to search for PPC documentation. That is why I am writing to you, to get any tips for developing a mce-inject for PPC. And of course, I would be more than glad to contribute to the OpenSource community.
> 
> Also, how did you test your handler? Do you have an easy mechanism to trigger the exceptions?
> 
> Thank you,
> Pablo
> 
> (**) https://github.com/andikleen/mce-inject/
> 
> 
> 
> -----Original Message-----
> From: Christophe Leroy <christophe.leroy@c-s.fr>
> Sent: Wednesday, April 24, 2019 11:07 PM
> To: Rodriguez Quesada, Pablo <Pablo.Rodriguez-Quesada@windriver.com>
> Subject: Re: Machine Check Exceptions
> 
> Hi Pablo,
> 
> No problem
> 
> Christophe
> 
> 
> Le 24/04/2019 à 21:33, Rodriguez Quesada, Pablo a écrit :
>> Hi Cristopher,
>>
>> Hope you are doing good!
>>
>> I am learning about Machine Check Exceptions on PPC, and I found this 
>> commit of yours:
>>
>> https://github.com/torvalds/linux/commit/0deae39cec6dab3a66794f3e9e83
>> c
>> a4dc30080f1
>>
>> Would you mind if I ask you a couple of questions?
>>
>> Thank you,
>>
>> WR logo signiture <http://www.windriver.com/>
>>
>> 	
>>
>> *Pablo Rodriguez*, Intern, Wind River Costa Rica
>>

^ permalink raw reply

* Re: [PATCH v4 1/7] ocxl: Split pci.c
From: Andrew Donnellan @ 2019-04-30  4:20 UTC (permalink / raw)
  To: Alastair D'Silva, alastair
  Cc: Frederic Barrat, Greg Kroah-Hartman, linuxppc-dev, linux-kernel,
	Arnd Bergmann
In-Reply-To: <20190327053137.15173-2-alastair@au1.ibm.com>

On 27/3/19 4:31 pm, Alastair D'Silva wrote:
> From: Alastair D'Silva <alastair@d-silva.org>
> 
> In preparation for making core code available for external drivers,
> move the core code out of pci.c and into core.c
> 
> Signed-off-by: Alastair D'Silva <alastair@d-silva.org>

There doesn't seem to be much left in pci.c, is there?

Acked-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>

> ---
>   drivers/misc/ocxl/Makefile        |   1 +
>   drivers/misc/ocxl/core.c          | 517 +++++++++++++++++++++++++++++
>   drivers/misc/ocxl/ocxl_internal.h |   5 +
>   drivers/misc/ocxl/pci.c           | 519 +-----------------------------
>   4 files changed, 524 insertions(+), 518 deletions(-)
>   create mode 100644 drivers/misc/ocxl/core.c
> 
> diff --git a/drivers/misc/ocxl/Makefile b/drivers/misc/ocxl/Makefile
> index 5229dcda8297..bc4e39bfda7b 100644
> --- a/drivers/misc/ocxl/Makefile
> +++ b/drivers/misc/ocxl/Makefile
> @@ -3,6 +3,7 @@ ccflags-$(CONFIG_PPC_WERROR)	+= -Werror
>   
>   ocxl-y				+= main.o pci.o config.o file.o pasid.o
>   ocxl-y				+= link.o context.o afu_irq.o sysfs.o trace.o
> +ocxl-y				+= core.o
>   obj-$(CONFIG_OCXL)		+= ocxl.o
>   
>   # For tracepoints to include our trace.h from tracepoint infrastructure:
> diff --git a/drivers/misc/ocxl/core.c b/drivers/misc/ocxl/core.c
> new file mode 100644
> index 000000000000..1a4411b72d35
> --- /dev/null
> +++ b/drivers/misc/ocxl/core.c
> @@ -0,0 +1,517 @@
> +// SPDX-License-Identifier: GPL-2.0+
> +// Copyright 2019 IBM Corp.
> +#include <linux/idr.h>
> +#include "ocxl_internal.h"
> +
> +static struct ocxl_fn *ocxl_fn_get(struct ocxl_fn *fn)
> +{
> +	return (get_device(&fn->dev) == NULL) ? NULL : fn;
> +}
> +
> +static void ocxl_fn_put(struct ocxl_fn *fn)
> +{
> +	put_device(&fn->dev);
> +}
> +
> +struct ocxl_afu *ocxl_afu_get(struct ocxl_afu *afu)
> +{
> +	return (get_device(&afu->dev) == NULL) ? NULL : afu;
> +}
> +
> +void ocxl_afu_put(struct ocxl_afu *afu)
> +{
> +	put_device(&afu->dev);
> +}
> +
> +static struct ocxl_afu *alloc_afu(struct ocxl_fn *fn)
> +{
> +	struct ocxl_afu *afu;
> +
> +	afu = kzalloc(sizeof(struct ocxl_afu), GFP_KERNEL);
> +	if (!afu)
> +		return NULL;
> +
> +	mutex_init(&afu->contexts_lock);
> +	mutex_init(&afu->afu_control_lock);
> +	idr_init(&afu->contexts_idr);
> +	afu->fn = fn;
> +	ocxl_fn_get(fn);
> +	return afu;
> +}
> +
> +static void free_afu(struct ocxl_afu *afu)
> +{
> +	idr_destroy(&afu->contexts_idr);
> +	ocxl_fn_put(afu->fn);
> +	kfree(afu);
> +}
> +
> +static void free_afu_dev(struct device *dev)
> +{
> +	struct ocxl_afu *afu = to_ocxl_afu(dev);
> +
> +	ocxl_unregister_afu(afu);
> +	free_afu(afu);
> +}
> +
> +static int set_afu_device(struct ocxl_afu *afu, const char *location)
> +{
> +	struct ocxl_fn *fn = afu->fn;
> +	int rc;
> +
> +	afu->dev.parent = &fn->dev;
> +	afu->dev.release = free_afu_dev;
> +	rc = dev_set_name(&afu->dev, "%s.%s.%hhu", afu->config.name, location,
> +		afu->config.idx);
> +	return rc;
> +}
> +
> +static int assign_afu_actag(struct ocxl_afu *afu, struct pci_dev *dev)
> +{
> +	struct ocxl_fn *fn = afu->fn;
> +	int actag_count, actag_offset;
> +
> +	/*
> +	 * if there were not enough actags for the function, each afu
> +	 * reduces its count as well
> +	 */
> +	actag_count = afu->config.actag_supported *
> +		fn->actag_enabled / fn->actag_supported;
> +	actag_offset = ocxl_actag_afu_alloc(fn, actag_count);
> +	if (actag_offset < 0) {
> +		dev_err(&afu->dev, "Can't allocate %d actags for AFU: %d\n",
> +			actag_count, actag_offset);
> +		return actag_offset;
> +	}
> +	afu->actag_base = fn->actag_base + actag_offset;
> +	afu->actag_enabled = actag_count;
> +
> +	ocxl_config_set_afu_actag(dev, afu->config.dvsec_afu_control_pos,
> +				afu->actag_base, afu->actag_enabled);
> +	dev_dbg(&afu->dev, "actag base=%d enabled=%d\n",
> +		afu->actag_base, afu->actag_enabled);
> +	return 0;
> +}
> +
> +static void reclaim_afu_actag(struct ocxl_afu *afu)
> +{
> +	struct ocxl_fn *fn = afu->fn;
> +	int start_offset, size;
> +
> +	start_offset = afu->actag_base - fn->actag_base;
> +	size = afu->actag_enabled;
> +	ocxl_actag_afu_free(afu->fn, start_offset, size);
> +}
> +
> +static int assign_afu_pasid(struct ocxl_afu *afu, struct pci_dev *dev)
> +{
> +	struct ocxl_fn *fn = afu->fn;
> +	int pasid_count, pasid_offset;
> +
> +	/*
> +	 * We only support the case where the function configuration
> +	 * requested enough PASIDs to cover all AFUs.
> +	 */
> +	pasid_count = 1 << afu->config.pasid_supported_log;
> +	pasid_offset = ocxl_pasid_afu_alloc(fn, pasid_count);
> +	if (pasid_offset < 0) {
> +		dev_err(&afu->dev, "Can't allocate %d PASIDs for AFU: %d\n",
> +			pasid_count, pasid_offset);
> +		return pasid_offset;
> +	}
> +	afu->pasid_base = fn->pasid_base + pasid_offset;
> +	afu->pasid_count = 0;
> +	afu->pasid_max = pasid_count;
> +
> +	ocxl_config_set_afu_pasid(dev, afu->config.dvsec_afu_control_pos,
> +				afu->pasid_base,
> +				afu->config.pasid_supported_log);
> +	dev_dbg(&afu->dev, "PASID base=%d, enabled=%d\n",
> +		afu->pasid_base, pasid_count);
> +	return 0;
> +}
> +
> +static void reclaim_afu_pasid(struct ocxl_afu *afu)
> +{
> +	struct ocxl_fn *fn = afu->fn;
> +	int start_offset, size;
> +
> +	start_offset = afu->pasid_base - fn->pasid_base;
> +	size = 1 << afu->config.pasid_supported_log;
> +	ocxl_pasid_afu_free(afu->fn, start_offset, size);
> +}
> +
> +static int reserve_fn_bar(struct ocxl_fn *fn, int bar)
> +{
> +	struct pci_dev *dev = to_pci_dev(fn->dev.parent);
> +	int rc, idx;
> +
> +	if (bar != 0 && bar != 2 && bar != 4)
> +		return -EINVAL;
> +
> +	idx = bar >> 1;
> +	if (fn->bar_used[idx]++ == 0) {
> +		rc = pci_request_region(dev, bar, "ocxl");
> +		if (rc)
> +			return rc;
> +	}
> +	return 0;
> +}
> +
> +static void release_fn_bar(struct ocxl_fn *fn, int bar)
> +{
> +	struct pci_dev *dev = to_pci_dev(fn->dev.parent);
> +	int idx;
> +
> +	if (bar != 0 && bar != 2 && bar != 4)
> +		return;
> +
> +	idx = bar >> 1;
> +	if (--fn->bar_used[idx] == 0)
> +		pci_release_region(dev, bar);
> +	WARN_ON(fn->bar_used[idx] < 0);
> +}
> +
> +static int map_mmio_areas(struct ocxl_afu *afu, struct pci_dev *dev)
> +{
> +	int rc;
> +
> +	rc = reserve_fn_bar(afu->fn, afu->config.global_mmio_bar);
> +	if (rc)
> +		return rc;
> +
> +	rc = reserve_fn_bar(afu->fn, afu->config.pp_mmio_bar);
> +	if (rc) {
> +		release_fn_bar(afu->fn, afu->config.global_mmio_bar);
> +		return rc;
> +	}
> +
> +	afu->global_mmio_start =
> +		pci_resource_start(dev, afu->config.global_mmio_bar) +
> +		afu->config.global_mmio_offset;
> +	afu->pp_mmio_start =
> +		pci_resource_start(dev, afu->config.pp_mmio_bar) +
> +		afu->config.pp_mmio_offset;
> +
> +	afu->global_mmio_ptr = ioremap(afu->global_mmio_start,
> +				afu->config.global_mmio_size);
> +	if (!afu->global_mmio_ptr) {
> +		release_fn_bar(afu->fn, afu->config.pp_mmio_bar);
> +		release_fn_bar(afu->fn, afu->config.global_mmio_bar);
> +		dev_err(&dev->dev, "Error mapping global mmio area\n");
> +		return -ENOMEM;
> +	}
> +
> +	/*
> +	 * Leave an empty page between the per-process mmio area and
> +	 * the AFU interrupt mappings
> +	 */
> +	afu->irq_base_offset = afu->config.pp_mmio_stride + PAGE_SIZE;
> +	return 0;
> +}
> +
> +static void unmap_mmio_areas(struct ocxl_afu *afu)
> +{
> +	if (afu->global_mmio_ptr) {
> +		iounmap(afu->global_mmio_ptr);
> +		afu->global_mmio_ptr = NULL;
> +	}
> +	afu->global_mmio_start = 0;
> +	afu->pp_mmio_start = 0;
> +	release_fn_bar(afu->fn, afu->config.pp_mmio_bar);
> +	release_fn_bar(afu->fn, afu->config.global_mmio_bar);
> +}
> +
> +static int configure_afu(struct ocxl_afu *afu, u8 afu_idx, struct pci_dev *dev)
> +{
> +	int rc;
> +
> +	rc = ocxl_config_read_afu(dev, &afu->fn->config, &afu->config, afu_idx);
> +	if (rc)
> +		return rc;
> +
> +	rc = set_afu_device(afu, dev_name(&dev->dev));
> +	if (rc)
> +		return rc;
> +
> +	rc = assign_afu_actag(afu, dev);
> +	if (rc)
> +		return rc;
> +
> +	rc = assign_afu_pasid(afu, dev);
> +	if (rc) {
> +		reclaim_afu_actag(afu);
> +		return rc;
> +	}
> +
> +	rc = map_mmio_areas(afu, dev);
> +	if (rc) {
> +		reclaim_afu_pasid(afu);
> +		reclaim_afu_actag(afu);
> +		return rc;
> +	}
> +	return 0;
> +}
> +
> +static void deconfigure_afu(struct ocxl_afu *afu)
> +{
> +	unmap_mmio_areas(afu);
> +	reclaim_afu_pasid(afu);
> +	reclaim_afu_actag(afu);
> +}
> +
> +static int activate_afu(struct pci_dev *dev, struct ocxl_afu *afu)
> +{
> +	int rc;
> +
> +	ocxl_config_set_afu_state(dev, afu->config.dvsec_afu_control_pos, 1);
> +	/*
> +	 * Char device creation is the last step, as processes can
> +	 * call our driver immediately, so all our inits must be finished.
> +	 */
> +	rc = ocxl_create_cdev(afu);
> +	if (rc)
> +		return rc;
> +	return 0;
> +}
> +
> +static void deactivate_afu(struct ocxl_afu *afu)
> +{
> +	struct pci_dev *dev = to_pci_dev(afu->fn->dev.parent);
> +
> +	ocxl_destroy_cdev(afu);
> +	ocxl_config_set_afu_state(dev, afu->config.dvsec_afu_control_pos, 0);
> +}
> +
> +int init_afu(struct pci_dev *dev, struct ocxl_fn *fn, u8 afu_idx)
> +{
> +	int rc;
> +	struct ocxl_afu *afu;
> +
> +	afu = alloc_afu(fn);
> +	if (!afu)
> +		return -ENOMEM;
> +
> +	rc = configure_afu(afu, afu_idx, dev);
> +	if (rc) {
> +		free_afu(afu);
> +		return rc;
> +	}
> +
> +	rc = ocxl_register_afu(afu);
> +	if (rc)
> +		goto err;
> +
> +	rc = ocxl_sysfs_add_afu(afu);
> +	if (rc)
> +		goto err;
> +
> +	rc = activate_afu(dev, afu);
> +	if (rc)
> +		goto err_sys;
> +
> +	list_add_tail(&afu->list, &fn->afu_list);
> +	return 0;
> +
> +err_sys:
> +	ocxl_sysfs_remove_afu(afu);
> +err:
> +	deconfigure_afu(afu);
> +	device_unregister(&afu->dev);
> +	return rc;
> +}
> +
> +void remove_afu(struct ocxl_afu *afu)
> +{
> +	list_del(&afu->list);
> +	ocxl_context_detach_all(afu);
> +	deactivate_afu(afu);
> +	ocxl_sysfs_remove_afu(afu);
> +	deconfigure_afu(afu);
> +	device_unregister(&afu->dev);
> +}
> +
> +static struct ocxl_fn *alloc_function(struct pci_dev *dev)
> +{
> +	struct ocxl_fn *fn;
> +
> +	fn = kzalloc(sizeof(struct ocxl_fn), GFP_KERNEL);
> +	if (!fn)
> +		return NULL;
> +
> +	INIT_LIST_HEAD(&fn->afu_list);
> +	INIT_LIST_HEAD(&fn->pasid_list);
> +	INIT_LIST_HEAD(&fn->actag_list);
> +	return fn;
> +}
> +
> +static void free_function(struct ocxl_fn *fn)
> +{
> +	WARN_ON(!list_empty(&fn->afu_list));
> +	WARN_ON(!list_empty(&fn->pasid_list));
> +	kfree(fn);
> +}
> +
> +static void free_function_dev(struct device *dev)
> +{
> +	struct ocxl_fn *fn = to_ocxl_function(dev);
> +
> +	free_function(fn);
> +}
> +
> +static int set_function_device(struct ocxl_fn *fn, struct pci_dev *dev)
> +{
> +	int rc;
> +
> +	fn->dev.parent = &dev->dev;
> +	fn->dev.release = free_function_dev;
> +	rc = dev_set_name(&fn->dev, "ocxlfn.%s", dev_name(&dev->dev));
> +	if (rc)
> +		return rc;
> +	pci_set_drvdata(dev, fn);
> +	return 0;
> +}
> +
> +static int assign_function_actag(struct ocxl_fn *fn)
> +{
> +	struct pci_dev *dev = to_pci_dev(fn->dev.parent);
> +	u16 base, enabled, supported;
> +	int rc;
> +
> +	rc = ocxl_config_get_actag_info(dev, &base, &enabled, &supported);
> +	if (rc)
> +		return rc;
> +
> +	fn->actag_base = base;
> +	fn->actag_enabled = enabled;
> +	fn->actag_supported = supported;
> +
> +	ocxl_config_set_actag(dev, fn->config.dvsec_function_pos,
> +			fn->actag_base,	fn->actag_enabled);
> +	dev_dbg(&fn->dev, "actag range starting at %d, enabled %d\n",
> +		fn->actag_base, fn->actag_enabled);
> +	return 0;
> +}
> +
> +static int set_function_pasid(struct ocxl_fn *fn)
> +{
> +	struct pci_dev *dev = to_pci_dev(fn->dev.parent);
> +	int rc, desired_count, max_count;
> +
> +	/* A function may not require any PASID */
> +	if (fn->config.max_pasid_log < 0)
> +		return 0;
> +
> +	rc = ocxl_config_get_pasid_info(dev, &max_count);
> +	if (rc)
> +		return rc;
> +
> +	desired_count = 1 << fn->config.max_pasid_log;
> +
> +	if (desired_count > max_count) {
> +		dev_err(&fn->dev,
> +			"Function requires more PASIDs than is available (%d vs. %d)\n",
> +			desired_count, max_count);
> +		return -ENOSPC;
> +	}
> +
> +	fn->pasid_base = 0;
> +	return 0;
> +}
> +
> +static int configure_function(struct ocxl_fn *fn, struct pci_dev *dev)
> +{
> +	int rc;
> +
> +	rc = pci_enable_device(dev);
> +	if (rc) {
> +		dev_err(&dev->dev, "pci_enable_device failed: %d\n", rc);
> +		return rc;
> +	}
> +
> +	/*
> +	 * Once it has been confirmed to work on our hardware, we
> +	 * should reset the function, to force the adapter to restart
> +	 * from scratch.
> +	 * A function reset would also reset all its AFUs.
> +	 *
> +	 * Some hints for implementation:
> +	 *
> +	 * - there's not status bit to know when the reset is done. We
> +	 *   should try reading the config space to know when it's
> +	 *   done.
> +	 * - probably something like:
> +	 *	Reset
> +	 *	wait 100ms
> +	 *	issue config read
> +	 *	allow device up to 1 sec to return success on config
> +	 *	read before declaring it broken
> +	 *
> +	 * Some shared logic on the card (CFG, TLX) won't be reset, so
> +	 * there's no guarantee that it will be enough.
> +	 */
> +	rc = ocxl_config_read_function(dev, &fn->config);
> +	if (rc)
> +		return rc;
> +
> +	rc = set_function_device(fn, dev);
> +	if (rc)
> +		return rc;
> +
> +	rc = assign_function_actag(fn);
> +	if (rc)
> +		return rc;
> +
> +	rc = set_function_pasid(fn);
> +	if (rc)
> +		return rc;
> +
> +	rc = ocxl_link_setup(dev, 0, &fn->link);
> +	if (rc)
> +		return rc;
> +
> +	rc = ocxl_config_set_TL(dev, fn->config.dvsec_tl_pos);
> +	if (rc) {
> +		ocxl_link_release(dev, fn->link);
> +		return rc;
> +	}
> +	return 0;
> +}
> +
> +static void deconfigure_function(struct ocxl_fn *fn)
> +{
> +	struct pci_dev *dev = to_pci_dev(fn->dev.parent);
> +
> +	ocxl_link_release(dev, fn->link);
> +	pci_disable_device(dev);
> +}
> +
> +struct ocxl_fn *init_function(struct pci_dev *dev)
> +{
> +	struct ocxl_fn *fn;
> +	int rc;
> +
> +	fn = alloc_function(dev);
> +	if (!fn)
> +		return ERR_PTR(-ENOMEM);
> +
> +	rc = configure_function(fn, dev);
> +	if (rc) {
> +		free_function(fn);
> +		return ERR_PTR(rc);
> +	}
> +
> +	rc = device_register(&fn->dev);
> +	if (rc) {
> +		deconfigure_function(fn);
> +		put_device(&fn->dev);
> +		return ERR_PTR(rc);
> +	}
> +	return fn;
> +}
> +
> +void remove_function(struct ocxl_fn *fn)
> +{
> +	deconfigure_function(fn);
> +	device_unregister(&fn->dev);
> +}
> diff --git a/drivers/misc/ocxl/ocxl_internal.h b/drivers/misc/ocxl/ocxl_internal.h
> index 06fd98c989c8..81086534dab5 100644
> --- a/drivers/misc/ocxl/ocxl_internal.h
> +++ b/drivers/misc/ocxl/ocxl_internal.h
> @@ -150,4 +150,9 @@ int ocxl_afu_irq_set_fd(struct ocxl_context *ctx, u64 irq_offset,
>   			int eventfd);
>   u64 ocxl_afu_irq_get_addr(struct ocxl_context *ctx, u64 irq_offset);
>   
> +struct ocxl_fn *init_function(struct pci_dev *dev);
> +void remove_function(struct ocxl_fn *fn);
> +int init_afu(struct pci_dev *dev, struct ocxl_fn *fn, u8 afu_idx);
> +void remove_afu(struct ocxl_afu *afu);
> +
>   #endif /* _OCXL_INTERNAL_H_ */
> diff --git a/drivers/misc/ocxl/pci.c b/drivers/misc/ocxl/pci.c
> index 21f425472a82..4ed7cb1a667f 100644
> --- a/drivers/misc/ocxl/pci.c
> +++ b/drivers/misc/ocxl/pci.c
> @@ -1,9 +1,6 @@
>   // SPDX-License-Identifier: GPL-2.0+
> -// Copyright 2017 IBM Corp.
> +// Copyright 2019 IBM Corp.
>   #include <linux/module.h>
> -#include <linux/pci.h>
> -#include <linux/idr.h>
> -#include <asm/pnv-ocxl.h>
>   #include "ocxl_internal.h"
>   
>   /*
> @@ -17,520 +14,6 @@ static const struct pci_device_id ocxl_pci_tbl[] = {
>   };
>   MODULE_DEVICE_TABLE(pci, ocxl_pci_tbl);
>   
> -
> -static struct ocxl_fn *ocxl_fn_get(struct ocxl_fn *fn)
> -{
> -	return (get_device(&fn->dev) == NULL) ? NULL : fn;
> -}
> -
> -static void ocxl_fn_put(struct ocxl_fn *fn)
> -{
> -	put_device(&fn->dev);
> -}
> -
> -struct ocxl_afu *ocxl_afu_get(struct ocxl_afu *afu)
> -{
> -	return (get_device(&afu->dev) == NULL) ? NULL : afu;
> -}
> -
> -void ocxl_afu_put(struct ocxl_afu *afu)
> -{
> -	put_device(&afu->dev);
> -}
> -
> -static struct ocxl_afu *alloc_afu(struct ocxl_fn *fn)
> -{
> -	struct ocxl_afu *afu;
> -
> -	afu = kzalloc(sizeof(struct ocxl_afu), GFP_KERNEL);
> -	if (!afu)
> -		return NULL;
> -
> -	mutex_init(&afu->contexts_lock);
> -	mutex_init(&afu->afu_control_lock);
> -	idr_init(&afu->contexts_idr);
> -	afu->fn = fn;
> -	ocxl_fn_get(fn);
> -	return afu;
> -}
> -
> -static void free_afu(struct ocxl_afu *afu)
> -{
> -	idr_destroy(&afu->contexts_idr);
> -	ocxl_fn_put(afu->fn);
> -	kfree(afu);
> -}
> -
> -static void free_afu_dev(struct device *dev)
> -{
> -	struct ocxl_afu *afu = to_ocxl_afu(dev);
> -
> -	ocxl_unregister_afu(afu);
> -	free_afu(afu);
> -}
> -
> -static int set_afu_device(struct ocxl_afu *afu, const char *location)
> -{
> -	struct ocxl_fn *fn = afu->fn;
> -	int rc;
> -
> -	afu->dev.parent = &fn->dev;
> -	afu->dev.release = free_afu_dev;
> -	rc = dev_set_name(&afu->dev, "%s.%s.%hhu", afu->config.name, location,
> -		afu->config.idx);
> -	return rc;
> -}
> -
> -static int assign_afu_actag(struct ocxl_afu *afu, struct pci_dev *dev)
> -{
> -	struct ocxl_fn *fn = afu->fn;
> -	int actag_count, actag_offset;
> -
> -	/*
> -	 * if there were not enough actags for the function, each afu
> -	 * reduces its count as well
> -	 */
> -	actag_count = afu->config.actag_supported *
> -		fn->actag_enabled / fn->actag_supported;
> -	actag_offset = ocxl_actag_afu_alloc(fn, actag_count);
> -	if (actag_offset < 0) {
> -		dev_err(&afu->dev, "Can't allocate %d actags for AFU: %d\n",
> -			actag_count, actag_offset);
> -		return actag_offset;
> -	}
> -	afu->actag_base = fn->actag_base + actag_offset;
> -	afu->actag_enabled = actag_count;
> -
> -	ocxl_config_set_afu_actag(dev, afu->config.dvsec_afu_control_pos,
> -				afu->actag_base, afu->actag_enabled);
> -	dev_dbg(&afu->dev, "actag base=%d enabled=%d\n",
> -		afu->actag_base, afu->actag_enabled);
> -	return 0;
> -}
> -
> -static void reclaim_afu_actag(struct ocxl_afu *afu)
> -{
> -	struct ocxl_fn *fn = afu->fn;
> -	int start_offset, size;
> -
> -	start_offset = afu->actag_base - fn->actag_base;
> -	size = afu->actag_enabled;
> -	ocxl_actag_afu_free(afu->fn, start_offset, size);
> -}
> -
> -static int assign_afu_pasid(struct ocxl_afu *afu, struct pci_dev *dev)
> -{
> -	struct ocxl_fn *fn = afu->fn;
> -	int pasid_count, pasid_offset;
> -
> -	/*
> -	 * We only support the case where the function configuration
> -	 * requested enough PASIDs to cover all AFUs.
> -	 */
> -	pasid_count = 1 << afu->config.pasid_supported_log;
> -	pasid_offset = ocxl_pasid_afu_alloc(fn, pasid_count);
> -	if (pasid_offset < 0) {
> -		dev_err(&afu->dev, "Can't allocate %d PASIDs for AFU: %d\n",
> -			pasid_count, pasid_offset);
> -		return pasid_offset;
> -	}
> -	afu->pasid_base = fn->pasid_base + pasid_offset;
> -	afu->pasid_count = 0;
> -	afu->pasid_max = pasid_count;
> -
> -	ocxl_config_set_afu_pasid(dev, afu->config.dvsec_afu_control_pos,
> -				afu->pasid_base,
> -				afu->config.pasid_supported_log);
> -	dev_dbg(&afu->dev, "PASID base=%d, enabled=%d\n",
> -		afu->pasid_base, pasid_count);
> -	return 0;
> -}
> -
> -static void reclaim_afu_pasid(struct ocxl_afu *afu)
> -{
> -	struct ocxl_fn *fn = afu->fn;
> -	int start_offset, size;
> -
> -	start_offset = afu->pasid_base - fn->pasid_base;
> -	size = 1 << afu->config.pasid_supported_log;
> -	ocxl_pasid_afu_free(afu->fn, start_offset, size);
> -}
> -
> -static int reserve_fn_bar(struct ocxl_fn *fn, int bar)
> -{
> -	struct pci_dev *dev = to_pci_dev(fn->dev.parent);
> -	int rc, idx;
> -
> -	if (bar != 0 && bar != 2 && bar != 4)
> -		return -EINVAL;
> -
> -	idx = bar >> 1;
> -	if (fn->bar_used[idx]++ == 0) {
> -		rc = pci_request_region(dev, bar, "ocxl");
> -		if (rc)
> -			return rc;
> -	}
> -	return 0;
> -}
> -
> -static void release_fn_bar(struct ocxl_fn *fn, int bar)
> -{
> -	struct pci_dev *dev = to_pci_dev(fn->dev.parent);
> -	int idx;
> -
> -	if (bar != 0 && bar != 2 && bar != 4)
> -		return;
> -
> -	idx = bar >> 1;
> -	if (--fn->bar_used[idx] == 0)
> -		pci_release_region(dev, bar);
> -	WARN_ON(fn->bar_used[idx] < 0);
> -}
> -
> -static int map_mmio_areas(struct ocxl_afu *afu, struct pci_dev *dev)
> -{
> -	int rc;
> -
> -	rc = reserve_fn_bar(afu->fn, afu->config.global_mmio_bar);
> -	if (rc)
> -		return rc;
> -
> -	rc = reserve_fn_bar(afu->fn, afu->config.pp_mmio_bar);
> -	if (rc) {
> -		release_fn_bar(afu->fn, afu->config.global_mmio_bar);
> -		return rc;
> -	}
> -
> -	afu->global_mmio_start =
> -		pci_resource_start(dev, afu->config.global_mmio_bar) +
> -		afu->config.global_mmio_offset;
> -	afu->pp_mmio_start =
> -		pci_resource_start(dev, afu->config.pp_mmio_bar) +
> -		afu->config.pp_mmio_offset;
> -
> -	afu->global_mmio_ptr = ioremap(afu->global_mmio_start,
> -				afu->config.global_mmio_size);
> -	if (!afu->global_mmio_ptr) {
> -		release_fn_bar(afu->fn, afu->config.pp_mmio_bar);
> -		release_fn_bar(afu->fn, afu->config.global_mmio_bar);
> -		dev_err(&dev->dev, "Error mapping global mmio area\n");
> -		return -ENOMEM;
> -	}
> -
> -	/*
> -	 * Leave an empty page between the per-process mmio area and
> -	 * the AFU interrupt mappings
> -	 */
> -	afu->irq_base_offset = afu->config.pp_mmio_stride + PAGE_SIZE;
> -	return 0;
> -}
> -
> -static void unmap_mmio_areas(struct ocxl_afu *afu)
> -{
> -	if (afu->global_mmio_ptr) {
> -		iounmap(afu->global_mmio_ptr);
> -		afu->global_mmio_ptr = NULL;
> -	}
> -	afu->global_mmio_start = 0;
> -	afu->pp_mmio_start = 0;
> -	release_fn_bar(afu->fn, afu->config.pp_mmio_bar);
> -	release_fn_bar(afu->fn, afu->config.global_mmio_bar);
> -}
> -
> -static int configure_afu(struct ocxl_afu *afu, u8 afu_idx, struct pci_dev *dev)
> -{
> -	int rc;
> -
> -	rc = ocxl_config_read_afu(dev, &afu->fn->config, &afu->config, afu_idx);
> -	if (rc)
> -		return rc;
> -
> -	rc = set_afu_device(afu, dev_name(&dev->dev));
> -	if (rc)
> -		return rc;
> -
> -	rc = assign_afu_actag(afu, dev);
> -	if (rc)
> -		return rc;
> -
> -	rc = assign_afu_pasid(afu, dev);
> -	if (rc) {
> -		reclaim_afu_actag(afu);
> -		return rc;
> -	}
> -
> -	rc = map_mmio_areas(afu, dev);
> -	if (rc) {
> -		reclaim_afu_pasid(afu);
> -		reclaim_afu_actag(afu);
> -		return rc;
> -	}
> -	return 0;
> -}
> -
> -static void deconfigure_afu(struct ocxl_afu *afu)
> -{
> -	unmap_mmio_areas(afu);
> -	reclaim_afu_pasid(afu);
> -	reclaim_afu_actag(afu);
> -}
> -
> -static int activate_afu(struct pci_dev *dev, struct ocxl_afu *afu)
> -{
> -	int rc;
> -
> -	ocxl_config_set_afu_state(dev, afu->config.dvsec_afu_control_pos, 1);
> -	/*
> -	 * Char device creation is the last step, as processes can
> -	 * call our driver immediately, so all our inits must be finished.
> -	 */
> -	rc = ocxl_create_cdev(afu);
> -	if (rc)
> -		return rc;
> -	return 0;
> -}
> -
> -static void deactivate_afu(struct ocxl_afu *afu)
> -{
> -	struct pci_dev *dev = to_pci_dev(afu->fn->dev.parent);
> -
> -	ocxl_destroy_cdev(afu);
> -	ocxl_config_set_afu_state(dev, afu->config.dvsec_afu_control_pos, 0);
> -}
> -
> -static int init_afu(struct pci_dev *dev, struct ocxl_fn *fn, u8 afu_idx)
> -{
> -	int rc;
> -	struct ocxl_afu *afu;
> -
> -	afu = alloc_afu(fn);
> -	if (!afu)
> -		return -ENOMEM;
> -
> -	rc = configure_afu(afu, afu_idx, dev);
> -	if (rc) {
> -		free_afu(afu);
> -		return rc;
> -	}
> -
> -	rc = ocxl_register_afu(afu);
> -	if (rc)
> -		goto err;
> -
> -	rc = ocxl_sysfs_add_afu(afu);
> -	if (rc)
> -		goto err;
> -
> -	rc = activate_afu(dev, afu);
> -	if (rc)
> -		goto err_sys;
> -
> -	list_add_tail(&afu->list, &fn->afu_list);
> -	return 0;
> -
> -err_sys:
> -	ocxl_sysfs_remove_afu(afu);
> -err:
> -	deconfigure_afu(afu);
> -	device_unregister(&afu->dev);
> -	return rc;
> -}
> -
> -static void remove_afu(struct ocxl_afu *afu)
> -{
> -	list_del(&afu->list);
> -	ocxl_context_detach_all(afu);
> -	deactivate_afu(afu);
> -	ocxl_sysfs_remove_afu(afu);
> -	deconfigure_afu(afu);
> -	device_unregister(&afu->dev);
> -}
> -
> -static struct ocxl_fn *alloc_function(struct pci_dev *dev)
> -{
> -	struct ocxl_fn *fn;
> -
> -	fn = kzalloc(sizeof(struct ocxl_fn), GFP_KERNEL);
> -	if (!fn)
> -		return NULL;
> -
> -	INIT_LIST_HEAD(&fn->afu_list);
> -	INIT_LIST_HEAD(&fn->pasid_list);
> -	INIT_LIST_HEAD(&fn->actag_list);
> -	return fn;
> -}
> -
> -static void free_function(struct ocxl_fn *fn)
> -{
> -	WARN_ON(!list_empty(&fn->afu_list));
> -	WARN_ON(!list_empty(&fn->pasid_list));
> -	kfree(fn);
> -}
> -
> -static void free_function_dev(struct device *dev)
> -{
> -	struct ocxl_fn *fn = to_ocxl_function(dev);
> -
> -	free_function(fn);
> -}
> -
> -static int set_function_device(struct ocxl_fn *fn, struct pci_dev *dev)
> -{
> -	int rc;
> -
> -	fn->dev.parent = &dev->dev;
> -	fn->dev.release = free_function_dev;
> -	rc = dev_set_name(&fn->dev, "ocxlfn.%s", dev_name(&dev->dev));
> -	if (rc)
> -		return rc;
> -	pci_set_drvdata(dev, fn);
> -	return 0;
> -}
> -
> -static int assign_function_actag(struct ocxl_fn *fn)
> -{
> -	struct pci_dev *dev = to_pci_dev(fn->dev.parent);
> -	u16 base, enabled, supported;
> -	int rc;
> -
> -	rc = ocxl_config_get_actag_info(dev, &base, &enabled, &supported);
> -	if (rc)
> -		return rc;
> -
> -	fn->actag_base = base;
> -	fn->actag_enabled = enabled;
> -	fn->actag_supported = supported;
> -
> -	ocxl_config_set_actag(dev, fn->config.dvsec_function_pos,
> -			fn->actag_base,	fn->actag_enabled);
> -	dev_dbg(&fn->dev, "actag range starting at %d, enabled %d\n",
> -		fn->actag_base, fn->actag_enabled);
> -	return 0;
> -}
> -
> -static int set_function_pasid(struct ocxl_fn *fn)
> -{
> -	struct pci_dev *dev = to_pci_dev(fn->dev.parent);
> -	int rc, desired_count, max_count;
> -
> -	/* A function may not require any PASID */
> -	if (fn->config.max_pasid_log < 0)
> -		return 0;
> -
> -	rc = ocxl_config_get_pasid_info(dev, &max_count);
> -	if (rc)
> -		return rc;
> -
> -	desired_count = 1 << fn->config.max_pasid_log;
> -
> -	if (desired_count > max_count) {
> -		dev_err(&fn->dev,
> -			"Function requires more PASIDs than is available (%d vs. %d)\n",
> -			desired_count, max_count);
> -		return -ENOSPC;
> -	}
> -
> -	fn->pasid_base = 0;
> -	return 0;
> -}
> -
> -static int configure_function(struct ocxl_fn *fn, struct pci_dev *dev)
> -{
> -	int rc;
> -
> -	rc = pci_enable_device(dev);
> -	if (rc) {
> -		dev_err(&dev->dev, "pci_enable_device failed: %d\n", rc);
> -		return rc;
> -	}
> -
> -	/*
> -	 * Once it has been confirmed to work on our hardware, we
> -	 * should reset the function, to force the adapter to restart
> -	 * from scratch.
> -	 * A function reset would also reset all its AFUs.
> -	 *
> -	 * Some hints for implementation:
> -	 *
> -	 * - there's not status bit to know when the reset is done. We
> -	 *   should try reading the config space to know when it's
> -	 *   done.
> -	 * - probably something like:
> -	 *	Reset
> -	 *	wait 100ms
> -	 *	issue config read
> -	 *	allow device up to 1 sec to return success on config
> -	 *	read before declaring it broken
> -	 *
> -	 * Some shared logic on the card (CFG, TLX) won't be reset, so
> -	 * there's no guarantee that it will be enough.
> -	 */
> -	rc = ocxl_config_read_function(dev, &fn->config);
> -	if (rc)
> -		return rc;
> -
> -	rc = set_function_device(fn, dev);
> -	if (rc)
> -		return rc;
> -
> -	rc = assign_function_actag(fn);
> -	if (rc)
> -		return rc;
> -
> -	rc = set_function_pasid(fn);
> -	if (rc)
> -		return rc;
> -
> -	rc = ocxl_link_setup(dev, 0, &fn->link);
> -	if (rc)
> -		return rc;
> -
> -	rc = ocxl_config_set_TL(dev, fn->config.dvsec_tl_pos);
> -	if (rc) {
> -		ocxl_link_release(dev, fn->link);
> -		return rc;
> -	}
> -	return 0;
> -}
> -
> -static void deconfigure_function(struct ocxl_fn *fn)
> -{
> -	struct pci_dev *dev = to_pci_dev(fn->dev.parent);
> -
> -	ocxl_link_release(dev, fn->link);
> -	pci_disable_device(dev);
> -}
> -
> -static struct ocxl_fn *init_function(struct pci_dev *dev)
> -{
> -	struct ocxl_fn *fn;
> -	int rc;
> -
> -	fn = alloc_function(dev);
> -	if (!fn)
> -		return ERR_PTR(-ENOMEM);
> -
> -	rc = configure_function(fn, dev);
> -	if (rc) {
> -		free_function(fn);
> -		return ERR_PTR(rc);
> -	}
> -
> -	rc = device_register(&fn->dev);
> -	if (rc) {
> -		deconfigure_function(fn);
> -		put_device(&fn->dev);
> -		return ERR_PTR(rc);
> -	}
> -	return fn;
> -}
> -
> -static void remove_function(struct ocxl_fn *fn)
> -{
> -	deconfigure_function(fn);
> -	device_unregister(&fn->dev);
> -}
> -
>   static int ocxl_probe(struct pci_dev *dev, const struct pci_device_id *id)
>   {
>   	int rc, afu_count = 0;
> 

-- 
Andrew Donnellan              OzLabs, ADL Canberra
andrew.donnellan@au1.ibm.com  IBM Australia Limited


^ permalink raw reply

* Re: [PATCH v5 1/8] powerpc/pci: Access PCI config space directly w/o pci_dn
From: Oliver @ 2019-04-30  4:22 UTC (permalink / raw)
  To: Sergey Miroshnichenko
  Cc: Stewart Smith, Alexey Kardashevskiy, linux-pci, linux,
	linuxppc-dev
In-Reply-To: <20190311115233.6514-2-s.miroshnichenko@yadro.com>

On Mon, Mar 11, 2019 at 10:52 PM Sergey Miroshnichenko
<s.miroshnichenko@yadro.com> wrote:
>
> To fetch an updated DT for the newly hotplugged device, OS must explicitly
> request it from the firmware via the pnv_php driver.
>
> If pnv_php wasn't triggered/loaded, it is still possible to discover new
> devices if PCIe I/O will not stop in absence of the pci_dn structure.
>
> Signed-off-by: Sergey Miroshnichenko <s.miroshnichenko@yadro.com>
> ---
>  arch/powerpc/kernel/rtas_pci.c       | 97 +++++++++++++++++++---------
>  arch/powerpc/platforms/powernv/pci.c | 64 ++++++++++++------
>  2 files changed, 109 insertions(+), 52 deletions(-)
>
> diff --git a/arch/powerpc/kernel/rtas_pci.c b/arch/powerpc/kernel/rtas_pci.c
> index c2b148b1634a..f675b5ecb5bc 100644
> --- a/arch/powerpc/kernel/rtas_pci.c
> +++ b/arch/powerpc/kernel/rtas_pci.c
> @@ -55,10 +55,26 @@ static inline int config_access_valid(struct pci_dn *dn, int where)
>         return 0;
>  }
>
> -int rtas_read_config(struct pci_dn *pdn, int where, int size, u32 *val)
> +static int rtas_read_raw_config(unsigned long buid, int busno, unsigned int devfn,
> +                               int where, int size, u32 *val)
>  {
>         int returnval = -1;
> -       unsigned long buid, addr;
> +       unsigned long addr = rtas_config_addr(busno, devfn, where);
> +       int ret;
> +
> +       if (buid) {
> +               ret = rtas_call(ibm_read_pci_config, 4, 2, &returnval,
> +                               addr, BUID_HI(buid), BUID_LO(buid), size);
> +       } else {
> +               ret = rtas_call(read_pci_config, 2, 2, &returnval, addr, size);
> +       }
> +       *val = returnval;
> +
> +       return ret;
> +}
> +
> +int rtas_read_config(struct pci_dn *pdn, int where, int size, u32 *val)
> +{
>         int ret;
>
>         if (!pdn)
> @@ -71,16 +87,8 @@ int rtas_read_config(struct pci_dn *pdn, int where, int size, u32 *val)
>                 return PCIBIOS_SET_FAILED;
>  #endif
>
> -       addr = rtas_config_addr(pdn->busno, pdn->devfn, where);
> -       buid = pdn->phb->buid;
> -       if (buid) {
> -               ret = rtas_call(ibm_read_pci_config, 4, 2, &returnval,
> -                               addr, BUID_HI(buid), BUID_LO(buid), size);
> -       } else {
> -               ret = rtas_call(read_pci_config, 2, 2, &returnval, addr, size);
> -       }
> -       *val = returnval;
> -
> +       ret = rtas_read_raw_config(pdn->phb->buid, pdn->busno, pdn->devfn,
> +                                  where, size, val);
>         if (ret)
>                 return PCIBIOS_DEVICE_NOT_FOUND;
>
> @@ -98,18 +106,44 @@ static int rtas_pci_read_config(struct pci_bus *bus,
>
>         pdn = pci_get_pdn_by_devfn(bus, devfn);
>
> -       /* Validity of pdn is checked in here */
> -       ret = rtas_read_config(pdn, where, size, val);
> -       if (*val == EEH_IO_ERROR_VALUE(size) &&
> -           eeh_dev_check_failure(pdn_to_eeh_dev(pdn)))
> -               return PCIBIOS_DEVICE_NOT_FOUND;
> +       if (pdn) {
> +               /* Validity of pdn is checked in here */
> +               ret = rtas_read_config(pdn, where, size, val);
> +
> +               if (*val == EEH_IO_ERROR_VALUE(size) &&
> +                   eeh_dev_check_failure(pdn_to_eeh_dev(pdn)))
> +                       ret = PCIBIOS_DEVICE_NOT_FOUND;
> +       } else {
> +               struct pci_controller *phb = pci_bus_to_host(bus);
> +
> +               ret = rtas_read_raw_config(phb->buid, bus->number, devfn,
> +                                          where, size, val);
> +       }
>
>         return ret;
>  }
>
> +static int rtas_write_raw_config(unsigned long buid, int busno, unsigned int devfn,
> +                                int where, int size, u32 val)
> +{
> +       unsigned long addr = rtas_config_addr(busno, devfn, where);
> +       int ret;
> +
> +       if (buid) {
> +               ret = rtas_call(ibm_write_pci_config, 5, 1, NULL, addr,
> +                               BUID_HI(buid), BUID_LO(buid), size, (ulong)val);
> +       } else {
> +               ret = rtas_call(write_pci_config, 3, 1, NULL, addr, size, (ulong)val);
> +       }
> +
> +       if (ret)
> +               return PCIBIOS_DEVICE_NOT_FOUND;
> +
> +       return PCIBIOS_SUCCESSFUL;
> +}
> +
>  int rtas_write_config(struct pci_dn *pdn, int where, int size, u32 val)
>  {
> -       unsigned long buid, addr;
>         int ret;
>
>         if (!pdn)
> @@ -122,15 +156,8 @@ int rtas_write_config(struct pci_dn *pdn, int where, int size, u32 val)
>                 return PCIBIOS_SET_FAILED;
>  #endif
>
> -       addr = rtas_config_addr(pdn->busno, pdn->devfn, where);
> -       buid = pdn->phb->buid;
> -       if (buid) {
> -               ret = rtas_call(ibm_write_pci_config, 5, 1, NULL, addr,
> -                       BUID_HI(buid), BUID_LO(buid), size, (ulong) val);
> -       } else {
> -               ret = rtas_call(write_pci_config, 3, 1, NULL, addr, size, (ulong)val);
> -       }
> -
> +       ret = rtas_write_raw_config(pdn->phb->buid, pdn->busno, pdn->devfn,
> +                                   where, size, val);
>         if (ret)
>                 return PCIBIOS_DEVICE_NOT_FOUND;
>
> @@ -141,12 +168,20 @@ static int rtas_pci_write_config(struct pci_bus *bus,
>                                  unsigned int devfn,
>                                  int where, int size, u32 val)
>  {
> -       struct pci_dn *pdn;
> +       struct pci_dn *pdn = pci_get_pdn_by_devfn(bus, devfn);
> +       int ret;
>
> -       pdn = pci_get_pdn_by_devfn(bus, devfn);
> +       if (pdn) {
> +               /* Validity of pdn is checked in here. */
> +               ret = rtas_write_config(pdn, where, size, val);
> +       } else {
> +               struct pci_controller *phb = pci_bus_to_host(bus);
>
> -       /* Validity of pdn is checked in here. */
> -       return rtas_write_config(pdn, where, size, val);
> +               ret = rtas_write_raw_config(phb->buid, bus->number, devfn,
> +                                           where, size, val);
> +       }
> +
> +       return ret;
>  }
>
>  static struct pci_ops rtas_pci_ops = {
> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> index ef9448a907c6..41a381dfc2a1 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -652,30 +652,29 @@ static void pnv_pci_config_check_eeh(struct pci_dn *pdn)
>         }
>  }
>
> -int pnv_pci_cfg_read(struct pci_dn *pdn,
> -                    int where, int size, u32 *val)
> +static int pnv_pci_cfg_read_raw(u64 phb_id, int busno, unsigned int devfn,
> +                               int where, int size, u32 *val)
>  {
> -       struct pnv_phb *phb = pdn->phb->private_data;
> -       u32 bdfn = (pdn->busno << 8) | pdn->devfn;
> +       u32 bdfn = (busno << 8) | devfn;
>         s64 rc;
>
>         switch (size) {
>         case 1: {
>                 u8 v8;
> -               rc = opal_pci_config_read_byte(phb->opal_id, bdfn, where, &v8);
> +               rc = opal_pci_config_read_byte(phb_id, bdfn, where, &v8);
>                 *val = (rc == OPAL_SUCCESS) ? v8 : 0xff;
>                 break;
>         }
>         case 2: {
>                 __be16 v16;
> -               rc = opal_pci_config_read_half_word(phb->opal_id, bdfn, where,
> -                                                  &v16);
> +               rc = opal_pci_config_read_half_word(phb_id, bdfn, where,
> +                                                   &v16);
>                 *val = (rc == OPAL_SUCCESS) ? be16_to_cpu(v16) : 0xffff;
>                 break;
>         }
>         case 4: {
>                 __be32 v32;
> -               rc = opal_pci_config_read_word(phb->opal_id, bdfn, where, &v32);
> +               rc = opal_pci_config_read_word(phb_id, bdfn, where, &v32);
>                 *val = (rc == OPAL_SUCCESS) ? be32_to_cpu(v32) : 0xffffffff;
>                 break;
>         }
> @@ -684,27 +683,28 @@ int pnv_pci_cfg_read(struct pci_dn *pdn,
>         }
>
>         pr_devel("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
> -                __func__, pdn->busno, pdn->devfn, where, size, *val);
> +                __func__, busno, devfn, where, size, *val);
> +
>         return PCIBIOS_SUCCESSFUL;
>  }
>
> -int pnv_pci_cfg_write(struct pci_dn *pdn,
> -                     int where, int size, u32 val)
> +static int pnv_pci_cfg_write_raw(u64 phb_id, int busno, unsigned int devfn,
> +                                int where, int size, u32 val)
>  {
> -       struct pnv_phb *phb = pdn->phb->private_data;
> -       u32 bdfn = (pdn->busno << 8) | pdn->devfn;
> +       u32 bdfn = (busno << 8) | devfn;
>
>         pr_devel("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
> -                __func__, pdn->busno, pdn->devfn, where, size, val);
> +                __func__, busno, devfn, where, size, val);
> +
>         switch (size) {
>         case 1:
> -               opal_pci_config_write_byte(phb->opal_id, bdfn, where, val);
> +               opal_pci_config_write_byte(phb_id, bdfn, where, val);
>                 break;
>         case 2:
> -               opal_pci_config_write_half_word(phb->opal_id, bdfn, where, val);
> +               opal_pci_config_write_half_word(phb_id, bdfn, where, val);
>                 break;
>         case 4:
> -               opal_pci_config_write_word(phb->opal_id, bdfn, where, val);
> +               opal_pci_config_write_word(phb_id, bdfn, where, val);
>                 break;
>         default:
>                 return PCIBIOS_FUNC_NOT_SUPPORTED;
> @@ -713,6 +713,24 @@ int pnv_pci_cfg_write(struct pci_dn *pdn,
>         return PCIBIOS_SUCCESSFUL;
>  }
>
> +int pnv_pci_cfg_read(struct pci_dn *pdn,
> +                    int where, int size, u32 *val)
> +{
> +       struct pnv_phb *phb = pdn->phb->private_data;
> +
> +       return pnv_pci_cfg_read_raw(phb->opal_id, pdn->busno, pdn->devfn,
> +                                   where, size, val);
> +}
> +
> +int pnv_pci_cfg_write(struct pci_dn *pdn,
> +                     int where, int size, u32 val)
> +{
> +       struct pnv_phb *phb = pdn->phb->private_data;
> +
> +       return pnv_pci_cfg_write_raw(phb->opal_id, pdn->busno, pdn->devfn,
> +                                    where, size, val);
> +}
> +
>  #if CONFIG_EEH
>  static bool pnv_pci_cfg_check(struct pci_dn *pdn)
>  {
> @@ -748,13 +766,15 @@ static int pnv_pci_read_config(struct pci_bus *bus,
>                                int where, int size, u32 *val)
>  {
>         struct pci_dn *pdn;
> -       struct pnv_phb *phb;
> +       struct pci_controller *hose = pci_bus_to_host(bus);
> +       struct pnv_phb *phb = hose->private_data;
>         int ret;
>
>         *val = 0xFFFFFFFF;
>         pdn = pci_get_pdn_by_devfn(bus, devfn);
>         if (!pdn)
> -               return PCIBIOS_DEVICE_NOT_FOUND;
> +               return pnv_pci_cfg_read_raw(phb->opal_id, bus->number, devfn,
> +                                           where, size, val);
>
>         if (!pnv_pci_cfg_check(pdn))
>                 return PCIBIOS_DEVICE_NOT_FOUND;
> @@ -777,12 +797,14 @@ static int pnv_pci_write_config(struct pci_bus *bus,
>                                 int where, int size, u32 val)
>  {
>         struct pci_dn *pdn;
> -       struct pnv_phb *phb;
> +       struct pci_controller *hose = pci_bus_to_host(bus);
> +       struct pnv_phb *phb = hose->private_data;
>         int ret;
>
>         pdn = pci_get_pdn_by_devfn(bus, devfn);
>         if (!pdn)
> -               return PCIBIOS_DEVICE_NOT_FOUND;
> +               return pnv_pci_cfg_write_raw(phb->opal_id, bus->number, devfn,
> +                                            where, size, val);
>
>         if (!pnv_pci_cfg_check(pdn))
>                 return PCIBIOS_DEVICE_NOT_FOUND;
> --
> 2.20.1
>

Reviewed-by: Oliver O'Halloran <oohall@gmail.com>

^ permalink raw reply

* Re: [PATCH v4 2/7] ocxl: Don't pass pci_dev around
From: Andrew Donnellan @ 2019-04-30  4:25 UTC (permalink / raw)
  To: Alastair D'Silva, alastair
  Cc: Frederic Barrat, Greg Kroah-Hartman, linuxppc-dev, linux-kernel,
	Arnd Bergmann
In-Reply-To: <20190327053137.15173-3-alastair@au1.ibm.com>

On 27/3/19 4:31 pm, Alastair D'Silva wrote:
> From: Alastair D'Silva <alastair@d-silva.org>
> 
> This data is already available in a struct
> 
> Signed-off-by: Alastair D'Silva <alastair@d-silva.org>

Acked-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>

> ---
>   drivers/misc/ocxl/core.c | 38 +++++++++++++++++++++-----------------
>   1 file changed, 21 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/misc/ocxl/core.c b/drivers/misc/ocxl/core.c
> index 1a4411b72d35..2f2fe12eac1e 100644
> --- a/drivers/misc/ocxl/core.c
> +++ b/drivers/misc/ocxl/core.c
> @@ -66,10 +66,11 @@ static int set_afu_device(struct ocxl_afu *afu, const char *location)
>   	return rc;
>   }
>   
> -static int assign_afu_actag(struct ocxl_afu *afu, struct pci_dev *dev)
> +static int assign_afu_actag(struct ocxl_afu *afu)
>   {
>   	struct ocxl_fn *fn = afu->fn;
>   	int actag_count, actag_offset;
> +	struct pci_dev *pci_dev = to_pci_dev(fn->dev.parent);
>   
>   	/*
>   	 * if there were not enough actags for the function, each afu
> @@ -79,16 +80,16 @@ static int assign_afu_actag(struct ocxl_afu *afu, struct pci_dev *dev)
>   		fn->actag_enabled / fn->actag_supported;
>   	actag_offset = ocxl_actag_afu_alloc(fn, actag_count);
>   	if (actag_offset < 0) {
> -		dev_err(&afu->dev, "Can't allocate %d actags for AFU: %d\n",
> +		dev_err(&pci_dev->dev, "Can't allocate %d actags for AFU: %d\n",
>   			actag_count, actag_offset);
>   		return actag_offset;
>   	}
>   	afu->actag_base = fn->actag_base + actag_offset;
>   	afu->actag_enabled = actag_count;
>   
> -	ocxl_config_set_afu_actag(dev, afu->config.dvsec_afu_control_pos,
> +	ocxl_config_set_afu_actag(pci_dev, afu->config.dvsec_afu_control_pos,
>   				afu->actag_base, afu->actag_enabled);
> -	dev_dbg(&afu->dev, "actag base=%d enabled=%d\n",
> +	dev_dbg(&pci_dev->dev, "actag base=%d enabled=%d\n",
>   		afu->actag_base, afu->actag_enabled);
>   	return 0;
>   }
> @@ -103,10 +104,11 @@ static void reclaim_afu_actag(struct ocxl_afu *afu)
>   	ocxl_actag_afu_free(afu->fn, start_offset, size);
>   }
>   
> -static int assign_afu_pasid(struct ocxl_afu *afu, struct pci_dev *dev)
> +static int assign_afu_pasid(struct ocxl_afu *afu)
>   {
>   	struct ocxl_fn *fn = afu->fn;
>   	int pasid_count, pasid_offset;
> +	struct pci_dev *pci_dev = to_pci_dev(fn->dev.parent);
>   
>   	/*
>   	 * We only support the case where the function configuration
> @@ -115,7 +117,7 @@ static int assign_afu_pasid(struct ocxl_afu *afu, struct pci_dev *dev)
>   	pasid_count = 1 << afu->config.pasid_supported_log;
>   	pasid_offset = ocxl_pasid_afu_alloc(fn, pasid_count);
>   	if (pasid_offset < 0) {
> -		dev_err(&afu->dev, "Can't allocate %d PASIDs for AFU: %d\n",
> +		dev_err(&pci_dev->dev, "Can't allocate %d PASIDs for AFU: %d\n",
>   			pasid_count, pasid_offset);
>   		return pasid_offset;
>   	}
> @@ -123,10 +125,10 @@ static int assign_afu_pasid(struct ocxl_afu *afu, struct pci_dev *dev)
>   	afu->pasid_count = 0;
>   	afu->pasid_max = pasid_count;
>   
> -	ocxl_config_set_afu_pasid(dev, afu->config.dvsec_afu_control_pos,
> +	ocxl_config_set_afu_pasid(pci_dev, afu->config.dvsec_afu_control_pos,
>   				afu->pasid_base,
>   				afu->config.pasid_supported_log);
> -	dev_dbg(&afu->dev, "PASID base=%d, enabled=%d\n",
> +	dev_dbg(&pci_dev->dev, "PASID base=%d, enabled=%d\n",
>   		afu->pasid_base, pasid_count);
>   	return 0;
>   }
> @@ -172,9 +174,10 @@ static void release_fn_bar(struct ocxl_fn *fn, int bar)
>   	WARN_ON(fn->bar_used[idx] < 0);
>   }
>   
> -static int map_mmio_areas(struct ocxl_afu *afu, struct pci_dev *dev)
> +static int map_mmio_areas(struct ocxl_afu *afu)
>   {
>   	int rc;
> +	struct pci_dev *pci_dev = to_pci_dev(afu->fn->dev.parent);
>   
>   	rc = reserve_fn_bar(afu->fn, afu->config.global_mmio_bar);
>   	if (rc)
> @@ -187,10 +190,10 @@ static int map_mmio_areas(struct ocxl_afu *afu, struct pci_dev *dev)
>   	}
>   
>   	afu->global_mmio_start =
> -		pci_resource_start(dev, afu->config.global_mmio_bar) +
> +		pci_resource_start(pci_dev, afu->config.global_mmio_bar) +
>   		afu->config.global_mmio_offset;
>   	afu->pp_mmio_start =
> -		pci_resource_start(dev, afu->config.pp_mmio_bar) +
> +		pci_resource_start(pci_dev, afu->config.pp_mmio_bar) +
>   		afu->config.pp_mmio_offset;
>   
>   	afu->global_mmio_ptr = ioremap(afu->global_mmio_start,
> @@ -198,7 +201,7 @@ static int map_mmio_areas(struct ocxl_afu *afu, struct pci_dev *dev)
>   	if (!afu->global_mmio_ptr) {
>   		release_fn_bar(afu->fn, afu->config.pp_mmio_bar);
>   		release_fn_bar(afu->fn, afu->config.global_mmio_bar);
> -		dev_err(&dev->dev, "Error mapping global mmio area\n");
> +		dev_err(&pci_dev->dev, "Error mapping global mmio area\n");
>   		return -ENOMEM;
>   	}
>   
> @@ -234,17 +237,17 @@ static int configure_afu(struct ocxl_afu *afu, u8 afu_idx, struct pci_dev *dev)
>   	if (rc)
>   		return rc;
>   
> -	rc = assign_afu_actag(afu, dev);
> +	rc = assign_afu_actag(afu);
>   	if (rc)
>   		return rc;
>   
> -	rc = assign_afu_pasid(afu, dev);
> +	rc = assign_afu_pasid(afu);
>   	if (rc) {
>   		reclaim_afu_actag(afu);
>   		return rc;
>   	}
>   
> -	rc = map_mmio_areas(afu, dev);
> +	rc = map_mmio_areas(afu);
>   	if (rc) {
>   		reclaim_afu_pasid(afu);
>   		reclaim_afu_actag(afu);
> @@ -331,7 +334,7 @@ void remove_afu(struct ocxl_afu *afu)
>   	device_unregister(&afu->dev);
>   }
>   
> -static struct ocxl_fn *alloc_function(struct pci_dev *dev)
> +static struct ocxl_fn *alloc_function(void)
>   {
>   	struct ocxl_fn *fn;
>   
> @@ -342,6 +345,7 @@ static struct ocxl_fn *alloc_function(struct pci_dev *dev)
>   	INIT_LIST_HEAD(&fn->afu_list);
>   	INIT_LIST_HEAD(&fn->pasid_list);
>   	INIT_LIST_HEAD(&fn->actag_list);
> +
>   	return fn;
>   }
>   
> @@ -491,7 +495,7 @@ struct ocxl_fn *init_function(struct pci_dev *dev)
>   	struct ocxl_fn *fn;
>   	int rc;
>   
> -	fn = alloc_function(dev);
> +	fn = alloc_function();
>   	if (!fn)
>   		return ERR_PTR(-ENOMEM);
>   
> 

-- 
Andrew Donnellan              OzLabs, ADL Canberra
andrew.donnellan@au1.ibm.com  IBM Australia Limited


^ permalink raw reply

* Re: [PATCH v5 2/8] powerpc/powernv/pci: Suppress an EEH error when reading an empty slot
From: Oliver @ 2019-04-30  4:26 UTC (permalink / raw)
  To: Sergey Miroshnichenko
  Cc: Stewart Smith, Alexey Kardashevskiy, linux-pci, linux,
	linuxppc-dev
In-Reply-To: <20190311115233.6514-3-s.miroshnichenko@yadro.com>

On Mon, Mar 11, 2019 at 10:52 PM Sergey Miroshnichenko
<s.miroshnichenko@yadro.com> wrote:
>
> Reading an empty slot returns all ones, which triggers a false
> EEH error event on PowerNV. This patch unfreezes the bus where
> it has happened.
>
> Signed-off-by: Sergey Miroshnichenko <s.miroshnichenko@yadro.com>
> ---
>  arch/powerpc/include/asm/ppc-pci.h   |  1 +
>  arch/powerpc/kernel/pci_dn.c         |  2 +-
>  arch/powerpc/platforms/powernv/pci.c | 31 +++++++++++++++++++++++++---
>  3 files changed, 30 insertions(+), 4 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h
> index f191ef0d2a0a..a22d52a9bb1f 100644
> --- a/arch/powerpc/include/asm/ppc-pci.h
> +++ b/arch/powerpc/include/asm/ppc-pci.h
> @@ -40,6 +40,7 @@ void *traverse_pci_dn(struct pci_dn *root,
>                       void *(*fn)(struct pci_dn *, void *),
>                       void *data);
>  extern void pci_devs_phb_init_dynamic(struct pci_controller *phb);
> +struct pci_dn *pci_bus_to_pdn(struct pci_bus *bus);
>
>  /* From rtas_pci.h */
>  extern void init_pci_config_tokens (void);
> diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
> index ab147a1909c8..341ed71250f1 100644
> --- a/arch/powerpc/kernel/pci_dn.c
> +++ b/arch/powerpc/kernel/pci_dn.c
> @@ -40,7 +40,7 @@
>   * one of PF's bridge. For other devices, their firmware
>   * data is linked to that of their bridge.
>   */
> -static struct pci_dn *pci_bus_to_pdn(struct pci_bus *bus)
> +struct pci_dn *pci_bus_to_pdn(struct pci_bus *bus)
>  {
>         struct pci_bus *pbus;
>         struct device_node *dn;
> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> index 41a381dfc2a1..8cc6661781e2 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -761,6 +761,21 @@ static inline pnv_pci_cfg_check(struct pci_dn *pdn)
>  }
>  #endif /* CONFIG_EEH */
>
> +static int get_bus_pe_number(struct pci_bus *bus)
> +{
> +       struct pci_dn *pdn = pci_bus_to_pdn(bus);
> +       struct pci_dn *child;
> +
> +       if (!pdn)
> +               return IODA_INVALID_PE;
> +
> +       list_for_each_entry(child, &pdn->child_list, list)
> +               if (child->pe_number != IODA_INVALID_PE)
> +                       return child->pe_number;
> +
> +       return IODA_INVALID_PE;
> +}
> +
>  static int pnv_pci_read_config(struct pci_bus *bus,
>                                unsigned int devfn,
>                                int where, int size, u32 *val)
> @@ -772,9 +787,19 @@ static int pnv_pci_read_config(struct pci_bus *bus,
>
>         *val = 0xFFFFFFFF;
>         pdn = pci_get_pdn_by_devfn(bus, devfn);
> -       if (!pdn)
> -               return pnv_pci_cfg_read_raw(phb->opal_id, bus->number, devfn,
> -                                           where, size, val);
> +       if (!pdn) {
> +               int pe_number = get_bus_pe_number(bus);
> +
> +               ret = pnv_pci_cfg_read_raw(phb->opal_id, bus->number, devfn,
> +                                          where, size, val);
> +
> +               if (!ret && (*val == EEH_IO_ERROR_VALUE(size)) && phb->unfreeze_pe)
> +                       phb->unfreeze_pe(phb, (pe_number == IODA_INVALID_PE) ?
> +                                        phb->ioda.reserved_pe_idx : pe_number,
> +                                        OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
> +
> +               return ret;
> +       }
>
>         if (!pnv_pci_cfg_check(pdn))
>                 return PCIBIOS_DEVICE_NOT_FOUND;
> --
> 2.20.1
>

Reviewed-by: Oliver O'Halloran <oohall@gmail.com>

^ permalink raw reply

* Re: [PATCH V32 01/27] Add the ability to lock down access to the running kernel image
From: Andrew Donnellan @ 2019-04-30  5:15 UTC (permalink / raw)
  To: Daniel Axtens, Matthew Garrett
  Cc: Linux API, cmr, James Morris, Linux Kernel Mailing List,
	David Howells, LSM List, Andy Lutomirski, linuxppc-dev
In-Reply-To: <87tvehxvh0.fsf@dja-thinkpad.axtens.net>

On 29/4/19 2:54 pm, Daniel Axtens wrote:
> Hi,
> 
>>>> I'm thinking about whether we should lock down the powerpc xmon debug
>>>> monitor - intuitively, I think the answer is yes if for no other reason
>>>> than Least Astonishment, when lockdown is enabled you probably don't
>>>> expect xmon to keep letting you access kernel memory.
>>>
>>> The original patchset contained a sysrq hotkey to allow physically
>>> present users to disable lockdown, so I'm not super concerned about
>>> this case - I could definitely be convinced otherwise, though.
> 
> So Mimi contacted me offlist and very helpfully provided me with a much
> better and less confused justification for disabling xmon in lockdown:
> 
> On x86, physical presence (== console access) is a trigger to
> disable/enable lockdown mode.
> 
> In lockdown mode, you're not supposed to be able to modify memory. xmon
> allows you to modify memory, and therefore shouldn't be allowed in
> lockdown.
> 
> So, if you can disable lockdown on the console that's probably OK, but
> it should be specifically disabling lockdown, not randomly editing
> memory with xmon.

That makes sense.

-- 
Andrew Donnellan              OzLabs, ADL Canberra
andrew.donnellan@au1.ibm.com  IBM Australia Limited


^ permalink raw reply

* Re: [PATCH 3/8] powerpc/eeh: Convert PNV_PHB_FLAG_EEH to global flag
From: Sam Bobroff @ 2019-04-30  5:30 UTC (permalink / raw)
  To: Oliver O'Halloran; +Cc: linuxppc-dev
In-Reply-To: <1434231d4e8216e5b12382ad081e81c886c33c14.camel@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 6586 bytes --]

On Thu, Apr 18, 2019 at 07:51:40PM +1000, Oliver O'Halloran wrote:
> On Wed, 2019-03-20 at 13:58 +1100, Sam Bobroff wrote:
> > The PHB flag, PNV_PHB_FLAG_EEH, is set (on PowerNV) individually on
> > each PHB once the EEH subsystem is ready. It is the only use of the
> > flags member of the phb struct.
> > 
> > However there is no need to store this separately on each PHB, so
> > convert it to a global flag. For symmetry, the flag is now also set
> > for pSeries; although it is currently unused it may be useful in the
> > future.
> 
> I'd drop this patch and keep it as a seperate flag. Making this a
> global flag doesn't really buy us anything either. It's also worth
> remembering that we do have virtual PHBs, like the NVLink ones, that
> don't have real EEH support and we should be doing something more
> intelligent about that.
> 
> If you are going to remove the PNV_PHB flag then i would look at moving
> that flag into the generic pci_controller structure that we use for
> tracking PHBs since that would give us a way to toggle EEH support on a
> per PHB basis on pseries and powernv.

OK. I want to keep these changes as small as possible, so I'll try a
simpler approach and use the exsiting flag. (I only touched this area
because it's now necessary to set the EEH_ENABLED flag both at boot time
and after it.)

> > Signed-off-by: Sam Bobroff <sbobroff@linux.ibm.com>
> > ---
> >  arch/powerpc/include/asm/eeh.h               | 11 +++++++++++
> >  arch/powerpc/platforms/powernv/eeh-powernv.c | 14 +++-----------
> >  arch/powerpc/platforms/powernv/pci.c         |  7 +++----
> >  arch/powerpc/platforms/powernv/pci.h         |  2 --
> >  arch/powerpc/platforms/pseries/pci.c         |  4 ++++
> >  5 files changed, 21 insertions(+), 17 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
> > index 3613a56281f2..fe4cf7208890 100644
> > --- a/arch/powerpc/include/asm/eeh.h
> > +++ b/arch/powerpc/include/asm/eeh.h
> > @@ -43,6 +43,7 @@ struct pci_dn;
> >  #define EEH_VALID_PE_ZERO	0x10	/* PE#0 is valid		     */
> >  #define EEH_ENABLE_IO_FOR_LOG	0x20	/* Enable IO for log		     */
> >  #define EEH_EARLY_DUMP_LOG	0x40	/* Dump log immediately		     */
> > +#define EEH_PHB_ENABLED		0x80	/* PHB recovery uses EEH	     */
> >  
> >  /*
> >   * Delay for PE reset, all in ms
> > @@ -245,6 +246,11 @@ static inline bool eeh_enabled(void)
> >  	return eeh_has_flag(EEH_ENABLED) && !eeh_has_flag(EEH_FORCE_DISABLED);
> >  }
> >  
> > +static inline bool eeh_phb_enabled(void)
> > +{
> > +	return eeh_has_flag(EEH_PHB_ENABLED);
> > +}
> > +
> >  static inline void eeh_serialize_lock(unsigned long *flags)
> >  {
> >  	raw_spin_lock_irqsave(&confirm_error_lock, *flags);
> > @@ -332,6 +338,11 @@ static inline bool eeh_enabled(void)
> >          return false;
> >  }
> >  
> > +static inline bool eeh_phb_enabled(void)
> > +{
> > +	return false;
> > +}
> > +
> >  static inline void eeh_probe_devices(void) { }
> >  
> >  static inline void *eeh_dev_init(struct pci_dn *pdn, void *data)
> > diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
> > index 6fc1a463b796..f0a95f663810 100644
> > --- a/arch/powerpc/platforms/powernv/eeh-powernv.c
> > +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
> > @@ -264,22 +264,14 @@ int pnv_eeh_post_init(void)
> >  		return ret;
> >  	}
> >  
> > -	if (!eeh_enabled())
> > +	if (eeh_enabled())
> > +		eeh_add_flag(EEH_PHB_ENABLED);
> > +	else
> >  		disable_irq(eeh_event_irq);
> >  
> >  	list_for_each_entry(hose, &hose_list, list_node) {
> >  		phb = hose->private_data;
> >  
> > -		/*
> > -		 * If EEH is enabled, we're going to rely on that.
> > -		 * Otherwise, we restore to conventional mechanism
> > -		 * to clear frozen PE during PCI config access.
> > -		 */
> > -		if (eeh_enabled())
> > -			phb->flags |= PNV_PHB_FLAG_EEH;
> > -		else
> > -			phb->flags &= ~PNV_PHB_FLAG_EEH;
> > -
> >  		/* Create debugfs entries */
> >  #ifdef CONFIG_DEBUG_FS
> >  		if (phb->has_dbgfs || !phb->dbgfs)
> > diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> > index 307181fd8a17..d2b50f3bf6b1 100644
> > --- a/arch/powerpc/platforms/powernv/pci.c
> > +++ b/arch/powerpc/platforms/powernv/pci.c
> > @@ -717,10 +717,9 @@ int pnv_pci_cfg_write(struct pci_dn *pdn,
> >  static bool pnv_pci_cfg_check(struct pci_dn *pdn)
> >  {
> >  	struct eeh_dev *edev = NULL;
> > -	struct pnv_phb *phb = pdn->phb->private_data;
> >  
> >  	/* EEH not enabled ? */
> > -	if (!(phb->flags & PNV_PHB_FLAG_EEH))
> > +	if (!eeh_phb_enabled())
> >  		return true;
> >  
> >  	/* PE reset or device removed ? */
> > @@ -761,7 +760,7 @@ static int pnv_pci_read_config(struct pci_bus *bus,
> >  
> >  	ret = pnv_pci_cfg_read(pdn, where, size, val);
> >  	phb = pdn->phb->private_data;
> > -	if (phb->flags & PNV_PHB_FLAG_EEH && pdn->edev) {
> > +	if (eeh_phb_enabled() && pdn->edev) {
> >  		if (*val == EEH_IO_ERROR_VALUE(size) &&
> >  		    eeh_dev_check_failure(pdn->edev))
> >                          return PCIBIOS_DEVICE_NOT_FOUND;
> > @@ -789,7 +788,7 @@ static int pnv_pci_write_config(struct pci_bus *bus,
> >  
> >  	ret = pnv_pci_cfg_write(pdn, where, size, val);
> >  	phb = pdn->phb->private_data;
> > -	if (!(phb->flags & PNV_PHB_FLAG_EEH))
> > +	if (!eeh_phb_enabled())
> >  		pnv_pci_config_check_eeh(pdn);
> >  
> >  	return ret;
> > diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> > index 8e36da379252..eb0add61397b 100644
> > --- a/arch/powerpc/platforms/powernv/pci.h
> > +++ b/arch/powerpc/platforms/powernv/pci.h
> > @@ -85,8 +85,6 @@ struct pnv_ioda_pe {
> >  	struct list_head	list;
> >  };
> >  
> > -#define PNV_PHB_FLAG_EEH	(1 << 0)
> > -
> >  struct pnv_phb {
> >  	struct pci_controller	*hose;
> >  	enum pnv_phb_type	type;
> > diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c
> > index 37a77e57893e..7be80882c08d 100644
> > --- a/arch/powerpc/platforms/pseries/pci.c
> > +++ b/arch/powerpc/platforms/pseries/pci.c
> > @@ -244,6 +244,10 @@ void __init pSeries_final_fixup(void)
> >  
> >  	eeh_probe_devices();
> >  	eeh_addr_cache_build();
> > +#ifdef CONFIG_EEH
> > +	if (eeh_enabled())
> > +		eeh_add_flag(EEH_PHB_ENABLED);
> > +#endif
> >  
> >  #ifdef CONFIG_PCI_IOV
> >  	ppc_md.pcibios_sriov_enable = pseries_pcibios_sriov_enable;
> 

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* Re: [PATCH 5/8] powerpc/eeh: Add eeh_show_enabled()
From: Sam Bobroff @ 2019-04-30  5:44 UTC (permalink / raw)
  To: Oliver O'Halloran; +Cc: linuxppc-dev
In-Reply-To: <7d93d678a1e37a1efb935f88fbc91f134d298ac8.camel@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 3490 bytes --]

On Thu, Apr 18, 2019 at 08:01:36PM +1000, Oliver O'Halloran wrote:
> On Wed, 2019-03-20 at 13:58 +1100, Sam Bobroff wrote:
> > Move the EEH enabled message into it's own function so that future
> > work can call it from multiple places.
> > 
> > Signed-off-by: Sam Bobroff <sbobroff@linux.ibm.com>
> > ---
> >  arch/powerpc/include/asm/eeh.h |  3 +++
> >  arch/powerpc/kernel/eeh.c      | 16 +++++++++++-----
> >  2 files changed, 14 insertions(+), 5 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
> > index fe4cf7208890..e217ccda55d0 100644
> > --- a/arch/powerpc/include/asm/eeh.h
> > +++ b/arch/powerpc/include/asm/eeh.h
> > @@ -289,6 +289,7 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe);
> >  
> >  struct eeh_dev *eeh_dev_init(struct pci_dn *pdn);
> >  void eeh_dev_phb_init_dynamic(struct pci_controller *phb);
> > +void eeh_show_enabled(void);
> >  void eeh_probe_devices(void);
> >  int __init eeh_ops_register(struct eeh_ops *ops);
> >  int __exit eeh_ops_unregister(const char *name);
> > @@ -338,6 +339,8 @@ static inline bool eeh_enabled(void)
> >          return false;
> >  }
> >  
> > +static inline void eeh_show_enabled(void) { }
> > +
> >  static inline bool eeh_phb_enabled(void)
> >  {
> >  	return false;
> > diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
> > index b14d89547895..3dcff29cb9b3 100644
> > --- a/arch/powerpc/kernel/eeh.c
> > +++ b/arch/powerpc/kernel/eeh.c
> > @@ -163,6 +163,16 @@ static int __init eeh_setup(char *str)
> >  }
> >  __setup("eeh=", eeh_setup);
> >  
> > +void eeh_show_enabled(void)
> > +{
> > +	if (eeh_has_flag(EEH_FORCE_DISABLED))
> > +		pr_info("EEH: PCI Enhanced I/O Error Handling DISABLED (by eeh=off)\n");
> 
> The allcaps looks kind of stupid.

Argh, true!

> > +	else if (eeh_enabled())
> > +		pr_info("EEH: PCI Enhanced I/O Error Handling ENABLED (capable adapter found)\n");
> > +	else
> > +		pr_info("EEH: PCI Enhanced I/O Error Handling DISABLED (no capable adapter found)\n");
> 
> If we really have to keep these messages then make it say "no EEH
> capable buses found" or something. EEH support has absolutely nothing

I don't think it's critical, but I'd like something that lets you
determine if EEH_ENABLED is set or not and why, since it's helpful for
debugging.

And I know what you mean about EEH support and adapters, but (before
these patches anyway) if you don't have an EEH capable adapter in the
machine at boot time, you won't ever get EEH recovery because
EEH_ENABLED won't be set. (I think we want to clean that up, so this
probably only matters until we do that.)

> to do with the adapters and I'm not even sure we can get the "DISABLED"
> message on PowerNV since the root port will always be there.

Yes, you'd have to force it off via the command line.

Anyway, I'll try to improve the messages (and no more caps) :-)

> > +}
> > +
> >  /*
> >   * This routine captures assorted PCI configuration space data
> >   * for the indicated PCI device, and puts them into a buffer
> > @@ -1166,11 +1176,7 @@ void eeh_probe_devices(void)
> >  		pdn = hose->pci_data;
> >  		traverse_pci_dn(pdn, eeh_ops->probe, NULL);
> >  	}
> > -	if (eeh_enabled())
> > -		pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n");
> > -	else
> > -		pr_info("EEH: No capable adapters found\n");
> > -
> > +	eeh_show_enabled();
> >  }
> >  
> >  /**
> 

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* Re: [PATCH kernel v3] powerpc/powernv: Isolate NVLinks between GV100GL on Witherspoon
From: Alistair Popple @ 2019-04-30  5:45 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Jose Ricardo Ziviani, kvm, Alexey Kardashevskiy,
	Daniel Henrique Barboza, Piotr Jaroszynski, kvm-ppc, Sam Bobroff,
	Alex Williamson, Leonardo Augusto Guimarães Garcia,
	Reza Arbab, David Gibson
In-Reply-To: <da41cd35-32f6-043e-13ab-9a225c4e910a@ozlabs.ru>

Alexey,

> >>> +void pnv_try_isolate_nvidia_v100(struct pci_dev *bridge)
> >>> +{
> >>> +	u32 mask, val;
> >>> +	void __iomem *bar0_0, *bar0_120000, *bar0_a00000;
> >>> +	struct pci_dev *pdev;
> >>> +	u16 cmd = 0, cmdmask = PCI_COMMAND_MEMORY;
> >>> +
> >>> +	if (!bridge->subordinate)
> >>> +		return;
> >>> +
> >>> +	pdev = list_first_entry_or_null(&bridge->subordinate->devices,
> >>> +			struct pci_dev, bus_list);
> >>> +	if (!pdev)
> >>> +		return;
> >>> +
> >>> +	if (pdev->vendor != PCI_VENDOR_ID_NVIDIA)

Don't you also need to check the PCIe devid to match only [PV]100 devices as 
well? I doubt there's any guarantee these registers will remain the same for 
all future (or older) NVIDIA devices.

IMHO this should really be done in the device driver in the guest. A malcious 
guest could load a modified driver that doesn't do this, but that should not 
compromise other guests which presumably load a non-compromised driver that 
disables the links on that guests GPU. However I guess in practice what you 
have here should work equally well.

- Alistair

> >>> +		return;
> >>> +
> >>> +	mask = nvlinkgpu_get_disable_mask(&pdev->dev);
> >>> +	if (!mask)
> >>> +		return;
> >>> +
> >>> +	bar0_0 = pci_iomap_range(pdev, 0, 0, 0x10000);
> >>> +	if (!bar0_0) {
> >>> +		pci_err(pdev, "Error mapping BAR0 @0\n");
> >>> +		return;
> >>> +	}
> >>> +	bar0_120000 = pci_iomap_range(pdev, 0, 0x120000, 0x10000);
> >>> +	if (!bar0_120000) {
> >>> +		pci_err(pdev, "Error mapping BAR0 @120000\n");
> >>> +		goto bar0_0_unmap;
> >>> +	}
> >>> +	bar0_a00000 = pci_iomap_range(pdev, 0, 0xA00000, 0x10000);
> >>> +	if (!bar0_a00000) {
> >>> +		pci_err(pdev, "Error mapping BAR0 @A00000\n");
> >>> +		goto bar0_120000_unmap;
> >>> +	}
> >> 
> >> Is it really necessary to do three separate ioremaps vs one that would
> >> cover them all here?  I suspect you're just sneaking in PAGE_SIZE with
> >> the 0x10000 size mappings anyway.  Seems like it would simplify setup,
> >> error reporting, and cleanup to to ioremap to the PAGE_ALIGN'd range
> >> of the highest register accessed. Thanks,
> > 
> > Sure I can map it once, I just do not see the point in mapping/unmapping
> > all 0xa10000>>16=161 system pages for a very short period of time while
> > we know precisely that we need just 3 pages.
> > 
> > Repost?
> 
> Ping?
> 
> Can this go in as it is (i.e. should I ping Michael) or this needs
> another round? It would be nice to get some formal acks. Thanks,
> 
> >> Alex
> >> 
> >>> +
> >>> +	pci_restore_state(pdev);
> >>> +	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
> >>> +	if ((cmd & cmdmask) != cmdmask)
> >>> +		pci_write_config_word(pdev, PCI_COMMAND, cmd | cmdmask);
> >>> +
> >>> +	/*
> >>> +	 * The sequence is from "Tesla P100 and V100 SXM2 NVLink Isolation on
> >>> +	 * Multi-Tenant Systems".
> >>> +	 * The register names are not provided there either, hence raw values.
> >>> +	 */
> >>> +	iowrite32(0x4, bar0_120000 + 0x4C);
> >>> +	iowrite32(0x2, bar0_120000 + 0x2204);
> >>> +	val = ioread32(bar0_0 + 0x200);
> >>> +	val |= 0x02000000;
> >>> +	iowrite32(val, bar0_0 + 0x200);
> >>> +	val = ioread32(bar0_a00000 + 0x148);
> >>> +	val |= mask;
> >>> +	iowrite32(val, bar0_a00000 + 0x148);
> >>> +
> >>> +	if ((cmd | cmdmask) != cmd)
> >>> +		pci_write_config_word(pdev, PCI_COMMAND, cmd);
> >>> +
> >>> +	pci_iounmap(pdev, bar0_a00000);
> >>> +bar0_120000_unmap:
> >>> +	pci_iounmap(pdev, bar0_120000);
> >>> +bar0_0_unmap:
> >>> +	pci_iounmap(pdev, bar0_0);
> >>> +}



^ permalink raw reply

* powernv/subpage_prot: kernel NULL pointer dereference running kernel self tests (subpage_prot)
From: Sachin Sant @ 2019-04-30  5:52 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Aneesh Kumar K. V

While running kselftests (subpage_prot) ran into following OOPS. 
This is against linuxppc merge branch. Does not happen against mainline.
Same problem is seen with latest linux-next tree as well.

Following recent commits have modified the code in question.

ef629cc5bf0543eb57d6e344ba776880ac35fd00 :
    powerc/mm/hash: Reduce hash_mm_context size

60458fba469a695a026334b364cf8adbcd5807e3: 
   powerpc/mm: Add helpers for accessing hash translation related variables


(47/47) avocado-misc-tests/kernel/kselftest.py:kselftest.test:  [17020.529615] BUG: Kernel NULL pointer dereference at 0x00001038
[17020.529639] Faulting instruction address: 0xc00000000008bdb4
[17020.529646] Oops: Kernel access of bad area, sig: 11 [#1]
[17020.529651] LE PAGE_SIZE=64K MMU=Radix MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV
[17020.529695] Dumping ftrace buffer:
[17020.529779]    (ftrace buffer empty)
[17020.529788] Modules linked in: iscsi_target_mod target_core_mod macsec tcp_diag udp_diag inet_diag unix_diag af_packet_diag netlink_diag binfmt_misc xt_CHECKSUM ipt_MASQUERADE tun ip6t_rpfilter ipt_REJECT nf_reject_ipv4 ip6t_REJECT nf_reject_ipv6 xt_conntrack ip_set nfnetlink ebtable_nat ebtable_broute bridge stp llc ip6table_nat ip6table_mangle ip6table_security ip6table_raw iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 libcrc32c iptable_mangle iptable_security iptable_raw ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter kvm_hv kvm i2c_dev sunrpc dm_mirror dm_region_hash dm_log dm_mod ses enclosure scsi_transport_sas sg ibmpowernv ipmi_powernv leds_powernv ipmi_devintf uio_pdrv_genirq ipmi_msghandler uio opal_prd powernv_op_panel ip_tables ext4 mbcache jbd2 sd_mod ipr libata tg3 ptp pps_core [last unloaded: kretprobe_example]
[17020.529873] CPU: 138 PID: 17457 Comm: subpage_prot Tainted: G           O      5.1.0-rc6-autotest-autotest #1
[17020.529880] NIP:  c00000000008bdb4 LR: c00000000000b688 CTR: c00000000008bd40
[17020.529886] REGS: c00020002c6b7aa0 TRAP: 0300   Tainted: G           O       (5.1.0-rc6-autotest-autotest)
[17020.529892] MSR:  9000000000009033 <SF,HV,EE,ME,IR,DR,RI,LE>  CR: 22000842  XER: 00000000
[17020.529901] CFAR: c00000000000b684 DAR: 0000000000001038 DSISR: 40000000 IRQMASK: 0 
[17020.529901] GPR00: c00000000000b688 c00020002c6b7d30 c0000000013b6600 0000000000000000 
[17020.529901] GPR04: 0000000000000000 0000000000000000 0000000000000001 0000000000000008 
[17020.529901] GPR08: 0000000000000000 0000000000000000 0000000000000000 c000000000ab1058 
[17020.529901] GPR12: c00000000008bd40 c0002007ff6e5900 0000000000000000 0000000000000000 
[17020.529901] GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 
[17020.529901] GPR20: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 
[17020.529901] GPR24: 0000000000000000 0000000000000000 0000000000000000 00007fff997afbb8 
[17020.529901] GPR28: 00007fff997b0000 0000000010001f50 0000000010001350 c000200662c3ea00 
[17020.529949] NIP [c00000000008bdb4] sys_subpage_prot+0x74/0x590
[17020.529955] LR [c00000000000b688] system_call+0x5c/0x70
[17020.529959] Call Trace:
[17020.529964] [c00020002c6b7d30] [c00020002c6b7d90] 0xc00020002c6b7d90 (unreliable)
[17020.529971] [c00020002c6b7e20] [c00000000000b688] system_call+0x5c/0x70
[17020.529976] Instruction dump:
[17020.529980] fb61ffd8 fb81ffe0 fba1ffe8 fbc1fff0 fbe1fff8 f821ff11 e92d1178 f9210068 
[17020.529987] 39200000 e92d0968 ebe90630 e93f03e8 <eb891038> 60000000 3860fffe e9410068 
[17020.529997] ---[ end trace 90c07b8228c575ad ]—

Thanks
-Sachin

^ permalink raw reply

* Re: [PATCH 6/8] powerpc/eeh: Initialize EEH address cache earlier
From: Sam Bobroff @ 2019-04-30  5:54 UTC (permalink / raw)
  To: Oliver O'Halloran; +Cc: linuxppc-dev
In-Reply-To: <ab420fc4bdfae1bd85f3e478eb1629b1e41f768c.camel@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 4606 bytes --]

On Thu, Apr 18, 2019 at 08:13:53PM +1000, Oliver O'Halloran wrote:
> On Wed, 2019-03-20 at 13:58 +1100, Sam Bobroff wrote:
> > The EEH address cache is currently initialized and populated by a
> > single function: eeh_addr_cache_build().  While the initial population
> > of the cache can only be done once resources are allocated,
> > initialization (just setting up a spinlock) could be done much
> > earlier.
> > 
> > So move the initialization step into a separate function and call it
> > from a core_initcall (rather than a subsys initcall).
> > 
> 
> > This will allow future work to make use of the cache during boot time
> > PCI scanning.
> 
> What's the idea there? Checking for overlaps in the BAR assignments?

The idea is just to be able to populate the cache earlier during boot
time, because that allows more code to be consolidated:

Before this set, the pcibios_add_device handlers were called during boot
but they couldn't populate the cache because it wasn't set up. Instead,
the handlers did nothing and a separate sweep was done after setting up
the cache. (This is annoying because when devices are added after boot,
the pcibios_add_device handlers are the only place where the cache can
be populated.)

> > Signed-off-by: Sam Bobroff <sbobroff@linux.ibm.com>
> > ---
> >  arch/powerpc/include/asm/eeh.h  |  3 +++
> >  arch/powerpc/kernel/eeh.c       |  2 ++
> >  arch/powerpc/kernel/eeh_cache.c | 13 +++++++++++--
> >  3 files changed, 16 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
> > index e217ccda55d0..791b9e6fcc45 100644
> > --- a/arch/powerpc/include/asm/eeh.h
> > +++ b/arch/powerpc/include/asm/eeh.h
> > @@ -295,6 +295,7 @@ int __init eeh_ops_register(struct eeh_ops *ops);
> >  int __exit eeh_ops_unregister(const char *name);
> >  int eeh_check_failure(const volatile void __iomem *token);
> >  int eeh_dev_check_failure(struct eeh_dev *edev);
> > +void eeh_addr_cache_init(void);
> >  void eeh_addr_cache_build(void);
> >  void eeh_add_device_early(struct pci_dn *);
> >  void eeh_add_device_tree_early(struct pci_dn *);
> > @@ -362,6 +363,8 @@ static inline int eeh_check_failure(const volatile void __iomem *token)
> >  
> >  #define eeh_dev_check_failure(x) (0)
> >  
> > +static inline void eeh_addr_cache_init(void) { }
> > +
> >  static inline void eeh_addr_cache_build(void) { }
> >  
> >  static inline void eeh_add_device_early(struct pci_dn *pdn) { }
> > diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
> > index 3dcff29cb9b3..7a406d58d2c0 100644
> > --- a/arch/powerpc/kernel/eeh.c
> > +++ b/arch/powerpc/kernel/eeh.c
> > @@ -1219,6 +1219,8 @@ static int eeh_init(void)
> >  	list_for_each_entry_safe(hose, tmp, &hose_list, list_node)
> >  		eeh_dev_phb_init_dynamic(hose);
> >  
> > +	eeh_addr_cache_init();
> > +
> >  	/* Initialize EEH event */
> >  	return eeh_event_init();
> >  }
> > diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c
> > index 9c68f0837385..f93dd5cf6a39 100644
> > --- a/arch/powerpc/kernel/eeh_cache.c
> > +++ b/arch/powerpc/kernel/eeh_cache.c
> > @@ -267,6 +267,17 @@ void eeh_addr_cache_rmv_dev(struct pci_dev *dev)
> >  	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
> >  }
> >  
> > +/**
> > + * eeh_addr_cache_init - Initialize a cache of I/O addresses
> > + *
> > + * Initialize a cache of pci i/o addresses.  This cache will be used to
> > + * find the pci device that corresponds to a given address.
> > + */
> > +void eeh_addr_cache_init(void)
> > +{
> > +	spin_lock_init(&pci_io_addr_cache_root.piar_lock);
> > +}
> 
> You could move this out of the pci_io_addr_cache structure and use
> DEFINE_SPINLOCK() too. We might even be able to get rid of the hand-
> rolled interval tree in eeh_cache.c in favour of the generic
> implementation (see mm/interval_tree.c). I'm pretty sure the generic
> one is a drop-in replacement, but I haven't had a chance to have a
> detailed look to see if there's any differences in behaviour.

Sounds good, I'll try to take a look at doing it in a future patch :-)

> > +
> >  /**
> >   * eeh_addr_cache_build - Build a cache of I/O addresses
> >   *
> > @@ -282,8 +293,6 @@ void eeh_addr_cache_build(void)
> >  	struct eeh_dev *edev;
> >  	struct pci_dev *dev = NULL;
> >  
> > -	spin_lock_init(&pci_io_addr_cache_root.piar_lock);
> > -
> >  	for_each_pci_dev(dev) {
> >  		pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn);
> >  		if (!pdn)
> 

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* Re: [PATCH v5 5/8] powerpc/pci/IOV: Add support for runtime enabling the VFs
From: Oliver O'Halloran @ 2019-04-30  6:00 UTC (permalink / raw)
  To: Sergey Miroshnichenko, linuxppc-dev, linux-pci
  Cc: Stewart Smith, Alexey Kardashevskiy, linux
In-Reply-To: <20190311115233.6514-6-s.miroshnichenko@yadro.com>

On Mon, 2019-03-11 at 14:52 +0300, Sergey Miroshnichenko wrote:

> When called within pcibios_sriov_enable(), the pci_sriov_get_totalvfs(pdev)
> returns zero, because the device is yet preparing to enable the VFs.

I don't think this is correct. The earliest pcibios_sriov_enable() can
be called is during a driver probe function. The totalvfs field is
initialised by pci_iov_init() which is called before the device has
been added to the bus. If it's returning zero then maybe the driver
limited the number of VFs to zero?

That said, you need to reset numvfs to zero before changing the value. 
So limiting the number of pci_dns that are created to the number
actually required rather than totalvfs doesn't hurt.

> With this patch it becomes possible to enable VFs via sysfs "sriov_numvfs"
> on PowerNV.

I tested on a few of our lab systems with random kernel versions
spanning from 4.15 to 5.0 and sriov_numvfs seemed to work fine on all
of them. Is there a specific configuration you're testing that needed
this change?

> Signed-off-by: Sergey Miroshnichenko <s.miroshnichenko@yadro.com>
> ---
>  arch/powerpc/include/asm/pci-bridge.h     |  4 +--
>  arch/powerpc/kernel/pci_dn.c              | 32 ++++++++++++++---------
>  arch/powerpc/platforms/powernv/pci-ioda.c |  4 +--
>  arch/powerpc/platforms/pseries/pci.c      |  4 +--
>  4 files changed, 25 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
> index fc188e0e9179..6479bc96e0b6 100644
> --- a/arch/powerpc/include/asm/pci-bridge.h
> +++ b/arch/powerpc/include/asm/pci-bridge.h
> @@ -225,8 +225,8 @@ struct pci_dn {
>  extern struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus,
>  					   int devfn);
>  extern struct pci_dn *pci_get_pdn(struct pci_dev *pdev);
> -extern struct pci_dn *add_dev_pci_data(struct pci_dev *pdev);
> -extern void remove_dev_pci_data(struct pci_dev *pdev);
> +extern struct pci_dn *pci_create_vf_pdns(struct pci_dev *pdev, int num_vfs);
> +extern void pci_destroy_vf_pdns(struct pci_dev *pdev);
>  extern struct pci_dn *pci_add_device_node_info(struct pci_controller *hose,
>  					       struct device_node *dn);
>  extern void pci_remove_device_node_info(struct device_node *dn);
> diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
> index 7f12882d8882..7fa362f8038d 100644
> --- a/arch/powerpc/kernel/pci_dn.c
> +++ b/arch/powerpc/kernel/pci_dn.c
> @@ -222,18 +222,19 @@ static struct pci_dn *pci_create_pdn_from_dev(struct pci_dev *pdev,
>  	return pdn;
>  }
>  
> -struct pci_dn *add_dev_pci_data(struct pci_dev *pdev)
> +struct pci_dn *pci_create_vf_pdns(struct pci_dev *pdev, int num_vfs)
>  {
> +	struct pci_dn *pdn = pci_get_pdn(pdev);
> +
>  #ifdef CONFIG_PCI_IOV
> -	struct pci_dn *parent, *pdn;
> +	struct pci_dn *parent;
>  	int i;
>  
>  	/* Only support IOV for now */
>  	if (!pdev->is_physfn)
> -		return pci_get_pdn(pdev);
> +		return pdn;
>  
>  	/* Check if VFs have been populated */
> -	pdn = pci_get_pdn(pdev);
>  	if (!pdn || (pdn->flags & PCI_DN_FLAG_IOV_VF))
>  		return NULL;
>  
> @@ -242,33 +243,38 @@ struct pci_dn *add_dev_pci_data(struct pci_dev *pdev)
>  	if (!parent)
>  		return NULL;
>  
> -	for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) {
> +	for (i = 0; i < num_vfs; i++) {
>  		struct eeh_dev *edev __maybe_unused;
> +		struct pci_dn *vpdn;
>  
> -		pdn = pci_alloc_pdn(parent,
> -				    pci_iov_virtfn_bus(pdev, i),
> -				    pci_iov_virtfn_devfn(pdev, i));
> -		if (!pdn) {
> +		vpdn = pci_alloc_pdn(parent,
> +				     pci_iov_virtfn_bus(pdev, i),
> +				     pci_iov_virtfn_devfn(pdev, i));
> +		if (!vpdn) {
>  			dev_warn(&pdev->dev, "%s: Cannot create firmware data for VF#%d\n",
>  				 __func__, i);
>  			return NULL;
>  		}
>  
> -		pdn->vf_index = i;
> +		vpdn->vf_index = i;
> +		vpdn->vendor_id = pdn->vendor_id;
> +		vpdn->device_id = pdn->device_id;
> +		vpdn->class_code = pdn->class_code;
> +		vpdn->pci_ext_config_space = 0;
>  
>  #ifdef CONFIG_EEH
>  		/* Create the EEH device for the VF */
> -		edev = eeh_dev_init(pdn);
> +		edev = eeh_dev_init(vpdn);
>  		BUG_ON(!edev);
>  		edev->physfn = pdev;
>  #endif /* CONFIG_EEH */
>  	}
>  #endif /* CONFIG_PCI_IOV */
>  
> -	return pci_get_pdn(pdev);
> +	return pdn;
>  }
>  
> -void remove_dev_pci_data(struct pci_dev *pdev)
> +void pci_destroy_vf_pdns(struct pci_dev *pdev)
>  {
>  #ifdef CONFIG_PCI_IOV
>  	struct pci_dn *parent;
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index ed500f51d449..979c901535f2 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -1720,14 +1720,14 @@ int pnv_pcibios_sriov_disable(struct pci_dev *pdev)
>  	pnv_pci_sriov_disable(pdev);
>  
>  	/* Release PCI data */
> -	remove_dev_pci_data(pdev);
> +	pci_destroy_vf_pdns(pdev);
>  	return 0;
>  }
>  
>  int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>  {
>  	/* Allocate PCI data */
> -	add_dev_pci_data(pdev);
> +	pci_create_vf_pdns(pdev, num_vfs);
>  
>  	return pnv_pci_sriov_enable(pdev, num_vfs);
>  }
> diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c
> index 37a77e57893e..5e87596903a6 100644
> --- a/arch/powerpc/platforms/pseries/pci.c
> +++ b/arch/powerpc/platforms/pseries/pci.c
> @@ -205,7 +205,7 @@ int pseries_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>  int pseries_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>  {
>  	/* Allocate PCI data */
> -	add_dev_pci_data(pdev);
> +	pci_create_vf_pdns(pdev, num_vfs);
>  	return pseries_pci_sriov_enable(pdev, num_vfs);
>  }
>  
> @@ -217,7 +217,7 @@ int pseries_pcibios_sriov_disable(struct pci_dev *pdev)
>  	/* Releasing pe_num_map */
>  	kfree(pdn->pe_num_map);
>  	/* Release PCI data */
> -	remove_dev_pci_data(pdev);
> +	pci_destroy_vf_pdns(pdev);
>  	pci_vf_drivers_autoprobe(pdev, true);
>  	return 0;
>  }


^ permalink raw reply

* Re: [PATCH kernel v3] powerpc/powernv: Isolate NVLinks between GV100GL on Witherspoon
From: Alexey Kardashevskiy @ 2019-04-30  6:14 UTC (permalink / raw)
  To: Alistair Popple, linuxppc-dev
  Cc: Jose Ricardo Ziviani, kvm, Sam Bobroff, Daniel Henrique Barboza,
	Piotr Jaroszynski, kvm-ppc, Alex Williamson,
	Leonardo Augusto Guimarães Garcia, Reza Arbab, David Gibson
In-Reply-To: <5149814.2BROG1NTNO@townsend>



On 30/04/2019 15:45, Alistair Popple wrote:
> Alexey,
> 
>>>>> +void pnv_try_isolate_nvidia_v100(struct pci_dev *bridge)
>>>>> +{
>>>>> +	u32 mask, val;
>>>>> +	void __iomem *bar0_0, *bar0_120000, *bar0_a00000;
>>>>> +	struct pci_dev *pdev;
>>>>> +	u16 cmd = 0, cmdmask = PCI_COMMAND_MEMORY;
>>>>> +
>>>>> +	if (!bridge->subordinate)
>>>>> +		return;
>>>>> +
>>>>> +	pdev = list_first_entry_or_null(&bridge->subordinate->devices,
>>>>> +			struct pci_dev, bus_list);
>>>>> +	if (!pdev)
>>>>> +		return;
>>>>> +
>>>>> +	if (pdev->vendor != PCI_VENDOR_ID_NVIDIA)
> 
> Don't you also need to check the PCIe devid to match only [PV]100 devices as 
> well? I doubt there's any guarantee these registers will remain the same for 
> all future (or older) NVIDIA devices.


I do not have the complete list of IDs and I already saw 3 different
device ids and this only works for machines with ibm,npu/gpu/nvlinks
properties so for now it works and for the future we are hoping to
either have an open source nvidia driver or some small minidriver (also
from nvidia, or may be a spec allowing us to write one) to allow
topology discovery on the host so we would not depend on the skiboot's
powernv DT.

> IMHO this should really be done in the device driver in the guest. A malcious 
> guest could load a modified driver that doesn't do this, but that should not 
> compromise other guests which presumably load a non-compromised driver that 
> disables the links on that guests GPU. However I guess in practice what you 
> have here should work equally well.

Doing it in the guest means a good guest needs to have an updated
driver, we do not really want to depend on this. The idea of IOMMU
groups is that the hypervisor provides isolation irrespective to what
the guest does.

Also vfio+qemu+slof needs to convey the nvlink topology to the guest,
seems like an unnecessary complication.



> - Alistair
> 
>>>>> +		return;
>>>>> +
>>>>> +	mask = nvlinkgpu_get_disable_mask(&pdev->dev);
>>>>> +	if (!mask)
>>>>> +		return;
>>>>> +
>>>>> +	bar0_0 = pci_iomap_range(pdev, 0, 0, 0x10000);
>>>>> +	if (!bar0_0) {
>>>>> +		pci_err(pdev, "Error mapping BAR0 @0\n");
>>>>> +		return;
>>>>> +	}
>>>>> +	bar0_120000 = pci_iomap_range(pdev, 0, 0x120000, 0x10000);
>>>>> +	if (!bar0_120000) {
>>>>> +		pci_err(pdev, "Error mapping BAR0 @120000\n");
>>>>> +		goto bar0_0_unmap;
>>>>> +	}
>>>>> +	bar0_a00000 = pci_iomap_range(pdev, 0, 0xA00000, 0x10000);
>>>>> +	if (!bar0_a00000) {
>>>>> +		pci_err(pdev, "Error mapping BAR0 @A00000\n");
>>>>> +		goto bar0_120000_unmap;
>>>>> +	}
>>>>
>>>> Is it really necessary to do three separate ioremaps vs one that would
>>>> cover them all here?  I suspect you're just sneaking in PAGE_SIZE with
>>>> the 0x10000 size mappings anyway.  Seems like it would simplify setup,
>>>> error reporting, and cleanup to to ioremap to the PAGE_ALIGN'd range
>>>> of the highest register accessed. Thanks,
>>>
>>> Sure I can map it once, I just do not see the point in mapping/unmapping
>>> all 0xa10000>>16=161 system pages for a very short period of time while
>>> we know precisely that we need just 3 pages.
>>>
>>> Repost?
>>
>> Ping?
>>
>> Can this go in as it is (i.e. should I ping Michael) or this needs
>> another round? It would be nice to get some formal acks. Thanks,
>>
>>>> Alex
>>>>
>>>>> +
>>>>> +	pci_restore_state(pdev);
>>>>> +	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
>>>>> +	if ((cmd & cmdmask) != cmdmask)
>>>>> +		pci_write_config_word(pdev, PCI_COMMAND, cmd | cmdmask);
>>>>> +
>>>>> +	/*
>>>>> +	 * The sequence is from "Tesla P100 and V100 SXM2 NVLink Isolation on
>>>>> +	 * Multi-Tenant Systems".
>>>>> +	 * The register names are not provided there either, hence raw values.
>>>>> +	 */
>>>>> +	iowrite32(0x4, bar0_120000 + 0x4C);
>>>>> +	iowrite32(0x2, bar0_120000 + 0x2204);
>>>>> +	val = ioread32(bar0_0 + 0x200);
>>>>> +	val |= 0x02000000;
>>>>> +	iowrite32(val, bar0_0 + 0x200);
>>>>> +	val = ioread32(bar0_a00000 + 0x148);
>>>>> +	val |= mask;
>>>>> +	iowrite32(val, bar0_a00000 + 0x148);
>>>>> +
>>>>> +	if ((cmd | cmdmask) != cmd)
>>>>> +		pci_write_config_word(pdev, PCI_COMMAND, cmd);
>>>>> +
>>>>> +	pci_iounmap(pdev, bar0_a00000);
>>>>> +bar0_120000_unmap:
>>>>> +	pci_iounmap(pdev, bar0_120000);
>>>>> +bar0_0_unmap:
>>>>> +	pci_iounmap(pdev, bar0_0);
>>>>> +}
> 
> 

-- 
Alexey

^ permalink raw reply

* Re: [PATCH v2 1/2] powerpc/perf: init pmu from core-book3s
From: Madhavan Srinivasan @ 2019-04-30  6:34 UTC (permalink / raw)
  To: Christophe Leroy, mpe; +Cc: linuxppc-dev
In-Reply-To: <b3678667-a550-8ab2-c904-99f92e4251be@c-s.fr>


On 29/04/19 11:08 AM, Christophe Leroy wrote:
>
>
> Le 29/04/2019 à 04:52, Madhavan Srinivasan a écrit :
>> Currenty pmu driver file for each ppc64 generation processor
>> has a __init call in itself. Refactor the code by moving the
>> __init call to core-books.c. This also clean's up compat mode
>> pmu driver registration.
>
> Can you explain the advantage of doing so ?

Was not comfortable having dependency on the link ordering, so
took this approach. This will avoid registering generic driver
when there is a platform specific driver.


> For me it makes more sense to have independant drivers with their own 
> init call.
>
>
>>
>> Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
>> Signed-off-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
>> ---
>> Changelog v1:
>> - Added "internal.h" file and moved the extern definitions to that file
>>
>>   arch/powerpc/perf/core-book3s.c | 28 ++++++++++++++++++++++++++++
>>   arch/powerpc/perf/internal.h    | 16 ++++++++++++++++
>>   arch/powerpc/perf/power5+-pmu.c |  4 +---
>>   arch/powerpc/perf/power5-pmu.c  |  4 +---
>>   arch/powerpc/perf/power6-pmu.c  |  4 +---
>>   arch/powerpc/perf/power7-pmu.c  |  4 +---
>>   arch/powerpc/perf/power8-pmu.c  |  3 +--
>>   arch/powerpc/perf/power9-pmu.c  |  3 +--
>>   arch/powerpc/perf/ppc970-pmu.c  |  4 +---
>>   9 files changed, 51 insertions(+), 19 deletions(-)
>>   create mode 100644 arch/powerpc/perf/internal.h
>>
>> diff --git a/arch/powerpc/perf/core-book3s.c 
>> b/arch/powerpc/perf/core-book3s.c
>> index b0723002a396..a96f9420139c 100644
>> --- a/arch/powerpc/perf/core-book3s.c
>> +++ b/arch/powerpc/perf/core-book3s.c
>> @@ -22,6 +22,10 @@
>>   #include <asm/ptrace.h>
>>   #include <asm/code-patching.h>
>>   +#ifdef CONFIG_PPC64
>
> Can we avoid that CONFIG_PPC64 ifdef ? Why isn't it compatible with 
> PPC32 ?

IIUC, Driver handled here are specific to server side ppc and secondly, 
infrastructure
can be extend for ppc32 if needed.
>> +#include "internal.h"
>> +#endif
>> +
>>   #define BHRB_MAX_ENTRIES    32
>>   #define BHRB_TARGET        0x0000000000000002
>>   #define BHRB_PREDICTION        0x0000000000000001
>> @@ -2294,3 +2298,27 @@ int register_power_pmu(struct power_pmu *pmu)
>>                 power_pmu_prepare_cpu, NULL);
>>       return 0;
>>   }
>> +
>> +#ifdef CONFIG_PPC64
>
> Same, why PPC64 ?
>
>> +static int __init init_ppc64_pmu(void)
>> +{
>> +    /* run through all the pmu drivers one at a time */
>> +    if (!init_power5_pmu())
>> +        return 0;
>> +    else if (!init_power5p_pmu())
>> +        return 0;
>> +    else if (!init_power6_pmu())
>> +        return 0;
>> +    else if (!init_power7_pmu())
>> +        return 0;
>> +    else if (!init_power8_pmu())
>> +        return 0;
>> +    else if (!init_power9_pmu())
>> +        return 0;
>> +    else if (!init_ppc970_pmu())
>> +        return 0;
>> +    else
>> +        return -ENODEV;
>> +}
>> +early_initcall(init_ppc64_pmu);
>> +#endif
>> diff --git a/arch/powerpc/perf/internal.h b/arch/powerpc/perf/internal.h
>> new file mode 100644
>> index 000000000000..e54d524d4283
>> --- /dev/null
>> +++ b/arch/powerpc/perf/internal.h
>> @@ -0,0 +1,16 @@
>> +/*
>> + * Copyright 2019 Madhavan Srinivasan, IBM Corporation.
>> + *
>> + * This program is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU General Public License
>> + * as published by the Free Software Foundation; either version
>> + * 2 of the License, or (at your option) any later version.
>> + */
>> +
>> +extern int init_ppc970_pmu(void);
>> +extern int init_power5_pmu(void);
>> +extern int init_power5p_pmu(void);
>> +extern int init_power6_pmu(void);
>> +extern int init_power7_pmu(void);
>> +extern int init_power8_pmu(void);
>> +extern int init_power9_pmu(void);
>
> 'extern' keyword is pointless, please remove it (checkpatch --strict 
> probably told it to you).

Ok will re-spin it (will use --strict in future patches thanks :) )

Thanks for review
Maddy

>
>
> Christophe
>
>
>> diff --git a/arch/powerpc/perf/power5+-pmu.c 
>> b/arch/powerpc/perf/power5+-pmu.c
>> index 0526dac66007..9aa803504cb2 100644
>> --- a/arch/powerpc/perf/power5+-pmu.c
>> +++ b/arch/powerpc/perf/power5+-pmu.c
>> @@ -677,7 +677,7 @@ static struct power_pmu power5p_pmu = {
>>       .cache_events        = &power5p_cache_events,
>>   };
>>   -static int __init init_power5p_pmu(void)
>> +int init_power5p_pmu(void)
>>   {
>>       if (!cur_cpu_spec->oprofile_cpu_type ||
>>           (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power5+")
>> @@ -686,5 +686,3 @@ static int __init init_power5p_pmu(void)
>>         return register_power_pmu(&power5p_pmu);
>>   }
>> -
>> -early_initcall(init_power5p_pmu);
>> diff --git a/arch/powerpc/perf/power5-pmu.c 
>> b/arch/powerpc/perf/power5-pmu.c
>> index 4dc99f9f7962..30cb13d081a9 100644
>> --- a/arch/powerpc/perf/power5-pmu.c
>> +++ b/arch/powerpc/perf/power5-pmu.c
>> @@ -618,7 +618,7 @@ static struct power_pmu power5_pmu = {
>>       .flags            = PPMU_HAS_SSLOT,
>>   };
>>   -static int __init init_power5_pmu(void)
>> +int init_power5_pmu(void)
>>   {
>>       if (!cur_cpu_spec->oprofile_cpu_type ||
>>           strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power5"))
>> @@ -626,5 +626,3 @@ static int __init init_power5_pmu(void)
>>         return register_power_pmu(&power5_pmu);
>>   }
>> -
>> -early_initcall(init_power5_pmu);
>> diff --git a/arch/powerpc/perf/power6-pmu.c 
>> b/arch/powerpc/perf/power6-pmu.c
>> index 9c9d646b68a1..80ec48632cfe 100644
>> --- a/arch/powerpc/perf/power6-pmu.c
>> +++ b/arch/powerpc/perf/power6-pmu.c
>> @@ -540,7 +540,7 @@ static struct power_pmu power6_pmu = {
>>       .cache_events        = &power6_cache_events,
>>   };
>>   -static int __init init_power6_pmu(void)
>> +int init_power6_pmu(void)
>>   {
>>       if (!cur_cpu_spec->oprofile_cpu_type ||
>>           strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power6"))
>> @@ -548,5 +548,3 @@ static int __init init_power6_pmu(void)
>>         return register_power_pmu(&power6_pmu);
>>   }
>> -
>> -early_initcall(init_power6_pmu);
>> diff --git a/arch/powerpc/perf/power7-pmu.c 
>> b/arch/powerpc/perf/power7-pmu.c
>> index 6dbae9884ec4..bb6efd5d2530 100644
>> --- a/arch/powerpc/perf/power7-pmu.c
>> +++ b/arch/powerpc/perf/power7-pmu.c
>> @@ -445,7 +445,7 @@ static struct power_pmu power7_pmu = {
>>       .cache_events        = &power7_cache_events,
>>   };
>>   -static int __init init_power7_pmu(void)
>> +int init_power7_pmu(void)
>>   {
>>       if (!cur_cpu_spec->oprofile_cpu_type ||
>>           strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power7"))
>> @@ -456,5 +456,3 @@ static int __init init_power7_pmu(void)
>>         return register_power_pmu(&power7_pmu);
>>   }
>> -
>> -early_initcall(init_power7_pmu);
>> diff --git a/arch/powerpc/perf/power8-pmu.c 
>> b/arch/powerpc/perf/power8-pmu.c
>> index d12a2db26353..bcc3409a06de 100644
>> --- a/arch/powerpc/perf/power8-pmu.c
>> +++ b/arch/powerpc/perf/power8-pmu.c
>> @@ -379,7 +379,7 @@ static struct power_pmu power8_pmu = {
>>       .bhrb_nr        = 32,
>>   };
>>   -static int __init init_power8_pmu(void)
>> +int init_power8_pmu(void)
>>   {
>>       int rc;
>>   @@ -399,4 +399,3 @@ static int __init init_power8_pmu(void)
>>         return 0;
>>   }
>> -early_initcall(init_power8_pmu);
>> diff --git a/arch/powerpc/perf/power9-pmu.c 
>> b/arch/powerpc/perf/power9-pmu.c
>> index 030544e35959..3a31ac6f4805 100644
>> --- a/arch/powerpc/perf/power9-pmu.c
>> +++ b/arch/powerpc/perf/power9-pmu.c
>> @@ -437,7 +437,7 @@ static struct power_pmu power9_pmu = {
>>       .bhrb_nr        = 32,
>>   };
>>   -static int __init init_power9_pmu(void)
>> +int init_power9_pmu(void)
>>   {
>>       int rc = 0;
>>       unsigned int pvr = mfspr(SPRN_PVR);
>> @@ -467,4 +467,3 @@ static int __init init_power9_pmu(void)
>>         return 0;
>>   }
>> -early_initcall(init_power9_pmu);
>> diff --git a/arch/powerpc/perf/ppc970-pmu.c 
>> b/arch/powerpc/perf/ppc970-pmu.c
>> index 8b6a8a36fa38..1d3370914022 100644
>> --- a/arch/powerpc/perf/ppc970-pmu.c
>> +++ b/arch/powerpc/perf/ppc970-pmu.c
>> @@ -490,7 +490,7 @@ static struct power_pmu ppc970_pmu = {
>>       .flags            = PPMU_NO_SIPR | PPMU_NO_CONT_SAMPLING,
>>   };
>>   -static int __init init_ppc970_pmu(void)
>> +int init_ppc970_pmu(void)
>>   {
>>       if (!cur_cpu_spec->oprofile_cpu_type ||
>>           (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/970")
>> @@ -499,5 +499,3 @@ static int __init init_ppc970_pmu(void)
>>         return register_power_pmu(&ppc970_pmu);
>>   }
>> -
>> -early_initcall(init_ppc970_pmu);
>>
>


^ permalink raw reply

* Re: [PATCH v2 2/2] powerpc/perf: Add generic compat mode pmu driver
From: Madhavan Srinivasan @ 2019-04-30  6:42 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <639fe9ec-365e-10dd-551e-44eb4bffd352@c-s.fr>


On 29/04/19 11:12 AM, Christophe Leroy wrote:
>
>
> Le 29/04/2019 à 04:52, Madhavan Srinivasan a écrit :
>> Most of the power processor generation performance monitoring
>> unit (PMU) driver code is bundled in the kernel and one of those
>> is enabled/registered based on the oprofile_cpu_type check at
>> the boot.
>>
>> But things get little tricky incase of "compat" mode boot.
>> IBM POWER System Server based processors has a compactibility
>> mode feature, which simpily put is, Nth generation processor
>> (lets say POWER8) will act and appear in a mode consistent
>> with an earlier generation (N-1) processor (that is POWER7).
>> And in this "compat" mode boot, kernel modify the
>> "oprofile_cpu_type" to be Nth generation (POWER8). If Nth
>> generation pmu driver is bundled (POWER8), it gets registered.
>>
>> Key dependency here is to have distro support for latest
>> processor performance monitoring support. Patch here adds
>> a generic "compat-mode" performance monitoring driver to
>> be register in absence of powernv platform specific pmu driver.
>>
>> Driver supports "cycles", "instruction" and "branch-miss" events.
>> "0x100F0" used as event code for "cycles", "0x00002"
>> used as event code for "instruction" events and "0x400F6"
>> used as event code for "branch miss". These are architected events
>> as part of ISA. New file called "generic-compat-pmu.c" is
>> created to contain the driver specific code. And base raw event
>> code format modeled on PPMU_ARCH_207S.
>>
>> Signed-off-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
>> ---
>> Changelog v1:
>> - Updated architected event opcodes
>> - included branch miss with architected event opcode
>>
>>   arch/powerpc/perf/Makefile             |   3 +-
>>   arch/powerpc/perf/core-book3s.c        |   2 +-
>>   arch/powerpc/perf/generic-compat-pmu.c | 245 
>> +++++++++++++++++++++++++++++++++
>>   arch/powerpc/perf/internal.h           |   1 +
>>   4 files changed, 249 insertions(+), 2 deletions(-)
>>   create mode 100644 arch/powerpc/perf/generic-compat-pmu.c
>>
>> diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile
>> index ab26df5bacb9..c155dcbb8691 100644
>> --- a/arch/powerpc/perf/Makefile
>> +++ b/arch/powerpc/perf/Makefile
>> @@ -5,7 +5,8 @@ obj-$(CONFIG_PERF_EVENTS)    += callchain.o perf_regs.o
>>   obj-$(CONFIG_PPC_PERF_CTRS)    += core-book3s.o bhrb.o
>>   obj64-$(CONFIG_PPC_PERF_CTRS)    += ppc970-pmu.o power5-pmu.o \
>>                      power5+-pmu.o power6-pmu.o power7-pmu.o \
>> -                   isa207-common.o power8-pmu.o power9-pmu.o
>> +                   isa207-common.o power8-pmu.o power9-pmu.o \
>> +                   generic-compat-pmu.o
>
> Isn't that name a bit long ? What about compat-pmu instead ?

yeah I guess. Will fix it.

>
>>   obj32-$(CONFIG_PPC_PERF_CTRS)    += mpc7450-pmu.o
>>     obj-$(CONFIG_PPC_POWERNV)    += imc-pmu.o
>> diff --git a/arch/powerpc/perf/core-book3s.c 
>> b/arch/powerpc/perf/core-book3s.c
>> index a96f9420139c..a66fb9c01c9e 100644
>> --- a/arch/powerpc/perf/core-book3s.c
>> +++ b/arch/powerpc/perf/core-book3s.c
>> @@ -2318,7 +2318,7 @@ static int __init init_ppc64_pmu(void)
>>       else if (!init_ppc970_pmu())
>>           return 0;
>>       else
>> -        return -ENODEV;
>> +        return init_generic_compat_pmu();
>>   }
>>   early_initcall(init_ppc64_pmu);
>>   #endif
>> diff --git a/arch/powerpc/perf/generic-compat-pmu.c 
>> b/arch/powerpc/perf/generic-compat-pmu.c
>> new file mode 100644
>> index 000000000000..9c2d4bbc5c87
>> --- /dev/null
>> +++ b/arch/powerpc/perf/generic-compat-pmu.c
>> @@ -0,0 +1,245 @@
>> +/*
>> + * Performance counter support.
>> + *
>> + * Copyright 2019 Madhavan Srinivasan, IBM Corporation.
>> + *
>> + * This program is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU General Public License
>> + * as published by the Free Software Foundation; either version
>> + * 2 of the License, or later version.
>
> Shouldn't we use the new licence format for new files ? ie:
>
> // SPDX-License-Identifier: GPL-2.0+

My bad. Thanks for pointing out.
Will fix and re-spin.

Thanks for review
Maddy


>
>> + */
>> +
>> +#define pr_fmt(fmt)    "generic-compat-pmu: " fmt
>> +
>> +#include "isa207-common.h"
>> +
>> +/*
>> + * Raw event encoding:
>> + *
>> + *        60        56        52        48        44 40        
>> 36        32
>> + * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - 
>> - - | - - - - |
>> + *
>> + *        28        24        20        16        12 8         
>> 4         0
>> + * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - 
>> - - | - - - - |
>> + *                                 [ pmc ]   [unit ]   [ ] m   [    
>> pmcxsel    ]
>> + *                                                     |     |
>> + *                                                     |     *- mark
>> + *                                                     |
>> + *                                                     |
>> + *                                                     *- combine
>> + *
>> + * Below uses IBM bit numbering.
>> + *
>> + * MMCR1[x:y] = unit    (PMCxUNIT)
>> + * MMCR1[24]   = pmc1combine[0]
>> + * MMCR1[25]   = pmc1combine[1]
>> + * MMCR1[26]   = pmc2combine[0]
>> + * MMCR1[27]   = pmc2combine[1]
>> + * MMCR1[28]   = pmc3combine[0]
>> + * MMCR1[29]   = pmc3combine[1]
>> + * MMCR1[30]   = pmc4combine[0]
>> + * MMCR1[31]   = pmc4combine[1]
>> + *
>> + */
>> +
>> +/*
>> + * Some power9 event codes.
>> + */
>> +#define EVENT(_name, _code)    _name = _code,
>> +
>> +enum {
>> +EVENT(PM_CYC,                    0x100F0)
>> +EVENT(PM_INST_CMPL,                0x00002)
>> +EVENT(PM_BR_MPRED_CMPL,                0x400F6)
>> +};
>> +
>> +#undef EVENT
>> +
>> +GENERIC_EVENT_ATTR(cpu-cycles,            PM_CYC);
>> +GENERIC_EVENT_ATTR(instructions,        PM_INST_CMPL);
>> +GENERIC_EVENT_ATTR(branch-misses, PM_BR_MPRED_CMPL);
>> +
>> +static struct attribute *generic_compat_events_attr[] = {
>> +    GENERIC_EVENT_PTR(PM_CYC),
>> +    GENERIC_EVENT_PTR(PM_INST_CMPL),
>> +    GENERIC_EVENT_PTR(PM_BR_MPRED_CMPL),
>> +    NULL
>> +};
>> +
>> +static struct attribute_group generic_compat_pmu_events_group = {
>> +    .name = "events",
>> +    .attrs = generic_compat_events_attr,
>> +};
>> +
>> +PMU_FORMAT_ATTR(event,        "config:0-19");
>> +PMU_FORMAT_ATTR(pmcxsel,    "config:0-7");
>> +PMU_FORMAT_ATTR(mark,        "config:8");
>> +PMU_FORMAT_ATTR(combine,    "config:10-11");
>> +PMU_FORMAT_ATTR(unit,        "config:12-15");
>> +PMU_FORMAT_ATTR(pmc,        "config:16-19");
>> +
>> +static struct attribute *generic_compat_pmu_format_attr[] = {
>> +    &format_attr_event.attr,
>> +    &format_attr_pmcxsel.attr,
>> +    &format_attr_mark.attr,
>> +    &format_attr_combine.attr,
>> +    &format_attr_unit.attr,
>> +    &format_attr_pmc.attr,
>> +    NULL,
>> +};
>> +
>> +static struct attribute_group generic_compat_pmu_format_group = {
>> +    .name = "format",
>> +    .attrs = generic_compat_pmu_format_attr,
>> +};
>> +
>> +static const struct attribute_group 
>> *generic_compat_pmu_attr_groups[] = {
>> +    &generic_compat_pmu_format_group,
>> +    &generic_compat_pmu_events_group,
>> +    NULL,
>> +};
>> +
>> +static int compat_generic_events[] = {
>> +    [PERF_COUNT_HW_CPU_CYCLES] =            PM_CYC,
>> +    [PERF_COUNT_HW_INSTRUCTIONS] =            PM_INST_CMPL,
>> +    [PERF_COUNT_HW_BRANCH_MISSES] = PM_BR_MPRED_CMPL,
>> +};
>> +
>> +#define C(x)    PERF_COUNT_HW_CACHE_##x
>> +
>> +/*
>> + * Table of generalized cache-related events.
>> + * 0 means not supported, -1 means nonsensical, other values
>> + * are event codes.
>> + */
>> +static int 
>> generic_compat_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
>> +    [ C(L1D) ] = {
>> +        [ C(OP_READ) ] = {
>> +            [ C(RESULT_ACCESS) ] = 0,
>> +            [ C(RESULT_MISS)   ] = 0,
>> +        },
>> +        [ C(OP_WRITE) ] = {
>> +            [ C(RESULT_ACCESS) ] = 0,
>> +            [ C(RESULT_MISS)   ] = 0,
>> +        },
>> +        [ C(OP_PREFETCH) ] = {
>> +            [ C(RESULT_ACCESS) ] = 0,
>> +            [ C(RESULT_MISS)   ] = 0,
>> +        },
>> +    },
>> +    [ C(L1I) ] = {
>> +        [ C(OP_READ) ] = {
>> +            [ C(RESULT_ACCESS) ] = 0,
>> +            [ C(RESULT_MISS)   ] = 0,
>> +        },
>> +        [ C(OP_WRITE) ] = {
>> +            [ C(RESULT_ACCESS) ] = 0,
>> +            [ C(RESULT_MISS)   ] = -1,
>> +        },
>> +        [ C(OP_PREFETCH) ] = {
>> +            [ C(RESULT_ACCESS) ] = 0,
>> +            [ C(RESULT_MISS)   ] = 0,
>> +        },
>> +    },
>> +    [ C(LL) ] = {
>> +        [ C(OP_READ) ] = {
>> +            [ C(RESULT_ACCESS) ] = 0,
>> +            [ C(RESULT_MISS)   ] = 0,
>> +        },
>> +        [ C(OP_WRITE) ] = {
>> +            [ C(RESULT_ACCESS) ] = 0,
>> +            [ C(RESULT_MISS)   ] = 0,
>> +        },
>> +        [ C(OP_PREFETCH) ] = {
>> +            [ C(RESULT_ACCESS) ] = 0,
>> +            [ C(RESULT_MISS)   ] = 0,
>> +        },
>> +    },
>> +    [ C(DTLB) ] = {
>> +        [ C(OP_READ) ] = {
>> +            [ C(RESULT_ACCESS) ] = 0,
>> +            [ C(RESULT_MISS)   ] = 0,
>> +        },
>> +        [ C(OP_WRITE) ] = {
>> +            [ C(RESULT_ACCESS) ] = -1,
>> +            [ C(RESULT_MISS)   ] = -1,
>> +        },
>> +        [ C(OP_PREFETCH) ] = {
>> +            [ C(RESULT_ACCESS) ] = -1,
>> +            [ C(RESULT_MISS)   ] = -1,
>> +        },
>> +    },
>> +    [ C(ITLB) ] = {
>> +        [ C(OP_READ) ] = {
>> +            [ C(RESULT_ACCESS) ] = 0,
>> +            [ C(RESULT_MISS)   ] = 0,
>> +        },
>> +        [ C(OP_WRITE) ] = {
>> +            [ C(RESULT_ACCESS) ] = -1,
>> +            [ C(RESULT_MISS)   ] = -1,
>> +        },
>> +        [ C(OP_PREFETCH) ] = {
>> +            [ C(RESULT_ACCESS) ] = -1,
>> +            [ C(RESULT_MISS)   ] = -1,
>> +        },
>> +    },
>> +    [ C(BPU) ] = {
>> +        [ C(OP_READ) ] = {
>> +            [ C(RESULT_ACCESS) ] = 0,
>> +            [ C(RESULT_MISS)   ] = 0,
>> +        },
>> +        [ C(OP_WRITE) ] = {
>> +            [ C(RESULT_ACCESS) ] = -1,
>> +            [ C(RESULT_MISS)   ] = -1,
>> +        },
>> +        [ C(OP_PREFETCH) ] = {
>> +            [ C(RESULT_ACCESS) ] = -1,
>> +            [ C(RESULT_MISS)   ] = -1,
>> +        },
>> +    },
>> +    [ C(NODE) ] = {
>> +        [ C(OP_READ) ] = {
>> +            [ C(RESULT_ACCESS) ] = -1,
>> +            [ C(RESULT_MISS)   ] = -1,
>> +        },
>> +        [ C(OP_WRITE) ] = {
>> +            [ C(RESULT_ACCESS) ] = -1,
>> +            [ C(RESULT_MISS)   ] = -1,
>> +        },
>> +        [ C(OP_PREFETCH) ] = {
>> +            [ C(RESULT_ACCESS) ] = -1,
>> +            [ C(RESULT_MISS)   ] = -1,
>> +        },
>> +    },
>> +};
>> +
>> +#undef C
>> +
>> +static struct power_pmu generic_compat_pmu = {
>> +    .name            = "GENERIC_COMPAT",
>> +    .n_counter        = MAX_PMU_COUNTERS,
>> +    .add_fields        = ISA207_ADD_FIELDS,
>> +    .test_adder        = ISA207_TEST_ADDER,
>> +    .compute_mmcr        = isa207_compute_mmcr,
>> +    .get_constraint        = isa207_get_constraint,
>> +    .disable_pmc        = isa207_disable_pmc,
>> +    .flags            = PPMU_HAS_SIER | PPMU_ARCH_207S,
>> +    .n_generic        = ARRAY_SIZE(compat_generic_events),
>> +    .generic_events        = compat_generic_events,
>> +    .cache_events        = &generic_compat_cache_events,
>> +    .attr_groups        = generic_compat_pmu_attr_groups,
>> +};
>> +
>> +int init_generic_compat_pmu(void)
>> +{
>> +    int rc = 0;
>> +
>> +    rc = register_power_pmu(&generic_compat_pmu);
>> +    if (rc)
>> +        return rc;
>> +
>> +    /* Tell userspace that EBB is supported */
>> +    cur_cpu_spec->cpu_user_features2 |= PPC_FEATURE2_EBB;
>> +
>> +    return 0;
>> +}
>> diff --git a/arch/powerpc/perf/internal.h b/arch/powerpc/perf/internal.h
>> index e54d524d4283..185a40d1adff 100644
>> --- a/arch/powerpc/perf/internal.h
>> +++ b/arch/powerpc/perf/internal.h
>> @@ -14,3 +14,4 @@ extern int init_power6_pmu(void);
>>   extern int init_power7_pmu(void);
>>   extern int init_power8_pmu(void);
>>   extern int init_power9_pmu(void);
>> +extern int init_generic_compat_pmu(void);
>
> Don't use 'extern' keyword.
>
> Christophe
>


^ permalink raw reply

* [PATCH] powerpc/mm/radix: Fix kernel crash when running subpage protect test
From: Aneesh Kumar K.V @ 2019-04-30  7:59 UTC (permalink / raw)
  To: npiggin, paulus, mpe; +Cc: Sachin Sant, Aneesh Kumar K.V, linuxppc-dev

This patch fixes the below crash by making sure we touch the subpage protection
related structures only if we know they are allocated on the platform. With
radix translation we don't allocate hash context at all and trying to access
subpage_prot_table results in

 Faulting instruction address: 0xc00000000008bdb4
 Oops: Kernel access of bad area, sig: 11 [#1]
 LE PAGE_SIZE=64K MMU=Radix MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV
 ....
 NIP [c00000000008bdb4] sys_subpage_prot+0x74/0x590
 LR [c00000000000b688] system_call+0x5c/0x70
 Call Trace:
 [c00020002c6b7d30] [c00020002c6b7d90] 0xc00020002c6b7d90 (unreliable)
 [c00020002c6b7e20] [c00000000000b688] system_call+0x5c/0x70
 Instruction dump:
 fb61ffd8 fb81ffe0 fba1ffe8 fbc1fff0 fbe1fff8 f821ff11 e92d1178 f9210068
 39200000 e92d0968 ebe90630 e93f03e8 <eb891038> 60000000 3860fffe e9410068

We also move the subpage_prot_table with mmp_sem held to avoid racec
between two parallel subpage_prot syscall.

Reported-by: Sachin Sant <sachinp@linux.vnet.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/mm/subpage-prot.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index c9dff4e1f295..473dd430e306 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -90,16 +90,18 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
 static void subpage_prot_clear(unsigned long addr, unsigned long len)
 {
 	struct mm_struct *mm = current->mm;
-	struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context);
+	struct subpage_prot_table *spt;
 	u32 **spm, *spp;
 	unsigned long i;
 	size_t nw;
 	unsigned long next, limit;
 
+	down_write(&mm->mmap_sem);
+
+	spt = mm_ctx_subpage_prot(&mm->context);
 	if (!spt)
-		return ;
+		goto err_out;
 
-	down_write(&mm->mmap_sem);
 	limit = addr + len;
 	if (limit > spt->maxaddr)
 		limit = spt->maxaddr;
@@ -127,6 +129,8 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
 		/* now flush any existing HPTEs for the range */
 		hpte_flush_range(mm, addr, nw);
 	}
+
+err_out:
 	up_write(&mm->mmap_sem);
 }
 
@@ -189,7 +193,7 @@ SYSCALL_DEFINE3(subpage_prot, unsigned long, addr,
 		unsigned long, len, u32 __user *, map)
 {
 	struct mm_struct *mm = current->mm;
-	struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context);
+	struct subpage_prot_table *spt;
 	u32 **spm, *spp;
 	unsigned long i;
 	size_t nw;
@@ -219,6 +223,7 @@ SYSCALL_DEFINE3(subpage_prot, unsigned long, addr,
 
 	down_write(&mm->mmap_sem);
 
+	spt = mm_ctx_subpage_prot(&mm->context);
 	if (!spt) {
 		/*
 		 * Allocate subpage prot table if not already done.
-- 
2.20.1


^ permalink raw reply related

* Re: [PATCH] powerpc: Fix kobject memleak
From: Greg Kroah-Hartman @ 2019-04-30  8:42 UTC (permalink / raw)
  To: Tobin C. Harding; +Cc: Paul Mackerras, linuxppc-dev, linux-kernel
In-Reply-To: <20190430010923.17092-1-tobin@kernel.org>

On Tue, Apr 30, 2019 at 11:09:23AM +1000, Tobin C. Harding wrote:
> Currently error return from kobject_init_and_add() is not followed by a
> call to kobject_put().  This means there is a memory leak.
> 
> Add call to kobject_put() in error path of kobject_init_and_add().
> 
> Signed-off-by: Tobin C. Harding <tobin@kernel.org>

Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>


^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox