[rfc 0/3] Revisit MCE handling for UE errors

linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed

* [rfc 0/3] Revisit MCE handling for UE errors
@ 2017-09-05  4:15 Balbir Singh
  2017-09-05  4:15 ` [rfc 1/3] powerpc/mce.c: Remove unused function get_mce_fault_addr() Balbir Singh
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Balbir Singh @ 2017-09-05  4:15 UTC (permalink / raw)
  To: npiggin, mahesh, alistair; +Cc: linuxppc-dev, Balbir Singh

This patch series is designed to hook up memory_failure on
UE errors, this is specially helpful for user_mode UE errors.

The first patch is a cleanup patch, it removes dead code.
I could not find any users of get_mce_fault_addr().
The second patch walks kernel/user mode page tables in
real mode to extract the effective address of the instruction
that caused the UE error and the effective address it was
trying to access (for load/store).
The third patch hooks up memory_failure to the MCE patch.

TODO:
Log the address in NVRAM, so that we can recover from
bad pages at boot and keep the blacklist persistent.

Balbir Singh (3):
  powerpc/mce.c: Remove unused function get_mce_fault_addr()
  powerpc/mce: Extract physical_address for UE errors
  powerpc/mce: hookup memory_failure for UE errors

 arch/powerpc/include/asm/mce.h  |   4 +-
 arch/powerpc/kernel/mce.c       | 107 ++++++++++++++++++++++++----------------
 arch/powerpc/kernel/mce_power.c |  60 ++++++++++++++++++++--
 3 files changed, 120 insertions(+), 51 deletions(-)

-- 
2.9.5

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [rfc 1/3] powerpc/mce.c: Remove unused function get_mce_fault_addr()
  2017-09-05  4:15 [rfc 0/3] Revisit MCE handling for UE errors Balbir Singh
@ 2017-09-05  4:15 ` Balbir Singh
  2017-09-05  4:15 ` [rfc 2/3] powerpc/mce: Extract physical_address for UE errors Balbir Singh
  2017-09-05  4:15 ` [rfc 3/3] powerpc/mce: hookup memory_failure " Balbir Singh
  2 siblings, 0 replies; 9+ messages in thread
From: Balbir Singh @ 2017-09-05  4:15 UTC (permalink / raw)
  To: npiggin, mahesh, alistair; +Cc: linuxppc-dev, Balbir Singh

There are no users of get_mce_fault_addr()

Fixes: b63a0ff ("powerpc/powernv: Machine check exception handling.")

Signed-off-by: Balbir Singh <bsingharora@gmail.com>
---
 arch/powerpc/include/asm/mce.h |  2 --
 arch/powerpc/kernel/mce.c      | 39 ---------------------------------------
 2 files changed, 41 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 190d69a..75292c7 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -210,6 +210,4 @@ extern void release_mce_event(void);
 extern void machine_check_queue_event(void);
 extern void machine_check_print_event_info(struct machine_check_event *evt,
 					   bool user_mode);
-extern uint64_t get_mce_fault_addr(struct machine_check_event *evt);
-
 #endif /* __ASM_PPC64_MCE_H__ */
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 9b2ea7e..e254399 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -411,45 +411,6 @@ void machine_check_print_event_info(struct machine_check_event *evt,
 }
 EXPORT_SYMBOL_GPL(machine_check_print_event_info);
 
-uint64_t get_mce_fault_addr(struct machine_check_event *evt)
-{
-	switch (evt->error_type) {
-	case MCE_ERROR_TYPE_UE:
-		if (evt->u.ue_error.effective_address_provided)
-			return evt->u.ue_error.effective_address;
-		break;
-	case MCE_ERROR_TYPE_SLB:
-		if (evt->u.slb_error.effective_address_provided)
-			return evt->u.slb_error.effective_address;
-		break;
-	case MCE_ERROR_TYPE_ERAT:
-		if (evt->u.erat_error.effective_address_provided)
-			return evt->u.erat_error.effective_address;
-		break;
-	case MCE_ERROR_TYPE_TLB:
-		if (evt->u.tlb_error.effective_address_provided)
-			return evt->u.tlb_error.effective_address;
-		break;
-	case MCE_ERROR_TYPE_USER:
-		if (evt->u.user_error.effective_address_provided)
-			return evt->u.user_error.effective_address;
-		break;
-	case MCE_ERROR_TYPE_RA:
-		if (evt->u.ra_error.effective_address_provided)
-			return evt->u.ra_error.effective_address;
-		break;
-	case MCE_ERROR_TYPE_LINK:
-		if (evt->u.link_error.effective_address_provided)
-			return evt->u.link_error.effective_address;
-		break;
-	default:
-	case MCE_ERROR_TYPE_UNKNOWN:
-		break;
-	}
-	return 0;
-}
-EXPORT_SYMBOL(get_mce_fault_addr);
-
 /*
  * This function is called in real mode. Strictly no printk's please.
  *
-- 
2.9.5

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [rfc 2/3] powerpc/mce: Extract physical_address for UE errors
  2017-09-05  4:15 [rfc 0/3] Revisit MCE handling for UE errors Balbir Singh
  2017-09-05  4:15 ` [rfc 1/3] powerpc/mce.c: Remove unused function get_mce_fault_addr() Balbir Singh
@ 2017-09-05  4:15 ` Balbir Singh
  2017-09-06  0:36   ` Nicholas Piggin
                     ` (2 more replies)
  2017-09-05  4:15 ` [rfc 3/3] powerpc/mce: hookup memory_failure " Balbir Singh
  2 siblings, 3 replies; 9+ messages in thread
From: Balbir Singh @ 2017-09-05  4:15 UTC (permalink / raw)
  To: npiggin, mahesh, alistair; +Cc: linuxppc-dev, Balbir Singh

Walk the page table for NIP and extract the instruction. Then
use the instruction to find the effective address via analyse_instr().

We might have page table walking races, but we expect them to
be rare, the physical address extraction is best effort. The idea
is to then hook up this infrastructure to memory failure eventually.

Signed-off-by: Balbir Singh <bsingharora@gmail.com>
---
 arch/powerpc/include/asm/mce.h  |  2 +-
 arch/powerpc/kernel/mce.c       |  6 ++++-
 arch/powerpc/kernel/mce_power.c | 60 +++++++++++++++++++++++++++++++++++++----
 3 files changed, 61 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 75292c7..3a1226e 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -204,7 +204,7 @@ struct mce_error_info {
 
 extern void save_mce_event(struct pt_regs *regs, long handled,
 			   struct mce_error_info *mce_err, uint64_t nip,
-			   uint64_t addr);
+			   uint64_t addr, uint64_t phys_addr);
 extern int get_mce_event(struct machine_check_event *mce, bool release);
 extern void release_mce_event(void);
 extern void machine_check_queue_event(void);
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index e254399..f41a75d 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -82,7 +82,7 @@ static void mce_set_error_info(struct machine_check_event *mce,
  */
 void save_mce_event(struct pt_regs *regs, long handled,
 		    struct mce_error_info *mce_err,
-		    uint64_t nip, uint64_t addr)
+		    uint64_t nip, uint64_t addr, uint64_t phys_addr)
 {
 	int index = __this_cpu_inc_return(mce_nest_count) - 1;
 	struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]);
@@ -140,6 +140,10 @@ void save_mce_event(struct pt_regs *regs, long handled,
 	} else if (mce->error_type == MCE_ERROR_TYPE_UE) {
 		mce->u.ue_error.effective_address_provided = true;
 		mce->u.ue_error.effective_address = addr;
+		if (phys_addr != ULONG_MAX) {
+			mce->u.ue_error.physical_address_provided = true;
+			mce->u.ue_error.physical_address = phys_addr;
+		}
 	}
 	return;
 }
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index b76ca19..b77a698 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -27,6 +27,25 @@
 #include <asm/mmu.h>
 #include <asm/mce.h>
 #include <asm/machdep.h>
+#include <asm/pgtable.h>
+#include <asm/pte-walk.h>
+#include <asm/sstep.h>
+
+static unsigned long addr_to_pfn(struct mm_struct *mm, unsigned long addr)
+{
+	pte_t *ptep;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	if (mm == current->mm)
+		ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL);
+	else
+		ptep = find_init_mm_pte(addr, NULL);
+	local_irq_restore(flags);
+	if (!ptep)
+		return ULONG_MAX;
+	return pte_pfn(*ptep);
+}
 
 static void flush_tlb_206(unsigned int num_sets, unsigned int action)
 {
@@ -489,7 +508,8 @@ static int mce_handle_ierror(struct pt_regs *regs,
 
 static int mce_handle_derror(struct pt_regs *regs,
 		const struct mce_derror_table table[],
-		struct mce_error_info *mce_err, uint64_t *addr)
+		struct mce_error_info *mce_err, uint64_t *addr,
+		uint64_t *phys_addr)
 {
 	uint64_t dsisr = regs->dsisr;
 	int handled = 0;
@@ -555,7 +575,37 @@ static int mce_handle_derror(struct pt_regs *regs,
 		mce_err->initiator = table[i].initiator;
 		if (table[i].dar_valid)
 			*addr = regs->dar;
-
+		else if (mce_err->severity == MCE_SEV_ERROR_SYNC &&
+				table[i].error_type == MCE_ERROR_TYPE_UE) {
+			/*
+			 * Carefully look at the NIP to determine
+			 * the instruction to analyse. Reading the NIP
+			 * in real-mode is tricky and can lead to recursive
+			 * faults
+			 */
+			int instr;
+			struct mm_struct *mm;
+			unsigned long nip = regs->nip;
+			unsigned long pfn = 0, instr_addr;
+			struct instruction_op op;
+			struct pt_regs tmp = *regs;
+
+			if (user_mode(regs))
+				mm = current->mm;
+			else
+				mm = &init_mm;
+
+			pfn = addr_to_pfn(mm, nip);
+			if (pfn != ULONG_MAX) {
+				instr_addr = (pfn << PAGE_SHIFT) + (nip & ~PAGE_MASK);
+				instr = *(unsigned int *)(instr_addr);
+				if (!analyse_instr(&op, &tmp, instr)) {
+					pfn = addr_to_pfn(mm, op.ea);
+					*addr = op.ea;
+					*phys_addr = pfn;
+				}
+			}
+		}
 		found = 1;
 	}
 
@@ -592,19 +642,19 @@ static long mce_handle_error(struct pt_regs *regs,
 		const struct mce_ierror_table itable[])
 {
 	struct mce_error_info mce_err = { 0 };
-	uint64_t addr;
+	uint64_t addr, phys_addr;
 	uint64_t srr1 = regs->msr;
 	long handled;
 
 	if (SRR1_MC_LOADSTORE(srr1))
-		handled = mce_handle_derror(regs, dtable, &mce_err, &addr);
+		handled = mce_handle_derror(regs, dtable, &mce_err, &addr, &phys_addr);
 	else
 		handled = mce_handle_ierror(regs, itable, &mce_err, &addr);
 
 	if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE)
 		handled = mce_handle_ue_error(regs);
 
-	save_mce_event(regs, handled, &mce_err, regs->nip, addr);
+	save_mce_event(regs, handled, &mce_err, regs->nip, addr, phys_addr);
 
 	return handled;
 }
-- 
2.9.5

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [rfc 2/3] powerpc/mce: Extract physical_address for UE errors
  2017-09-05  4:15 ` [rfc 2/3] powerpc/mce: Extract physical_address for UE errors Balbir Singh
@ 2017-09-06  0:36   ` Nicholas Piggin
  2017-09-06  4:37     ` Balbir Singh
  2017-09-06 22:56   ` Benjamin Herrenschmidt
  2017-09-08  3:41   ` Mahesh Jagannath Salgaonkar
  2 siblings, 1 reply; 9+ messages in thread
From: Nicholas Piggin @ 2017-09-06  0:36 UTC (permalink / raw)
  To: Balbir Singh; +Cc: mahesh, alistair, linuxppc-dev, Paul Mackerras

On Tue,  5 Sep 2017 14:15:54 +1000
Balbir Singh <bsingharora@gmail.com> wrote:

> Walk the page table for NIP and extract the instruction. Then
> use the instruction to find the effective address via analyse_instr().
> 
> We might have page table walking races, but we expect them to
> be rare, the physical address extraction is best effort. The idea
> is to then hook up this infrastructure to memory failure eventually.

Cool. Too bad hardware doesn't give us the RA.

> 
> Signed-off-by: Balbir Singh <bsingharora@gmail.com>
> ---
>  arch/powerpc/include/asm/mce.h  |  2 +-
>  arch/powerpc/kernel/mce.c       |  6 ++++-
>  arch/powerpc/kernel/mce_power.c | 60 +++++++++++++++++++++++++++++++++++++----
>  3 files changed, 61 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
> index 75292c7..3a1226e 100644
> --- a/arch/powerpc/include/asm/mce.h
> +++ b/arch/powerpc/include/asm/mce.h
> @@ -204,7 +204,7 @@ struct mce_error_info {
>  
>  extern void save_mce_event(struct pt_regs *regs, long handled,
>  			   struct mce_error_info *mce_err, uint64_t nip,
> -			   uint64_t addr);
> +			   uint64_t addr, uint64_t phys_addr);
>  extern int get_mce_event(struct machine_check_event *mce, bool release);
>  extern void release_mce_event(void);
>  extern void machine_check_queue_event(void);
> diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
> index e254399..f41a75d 100644
> --- a/arch/powerpc/kernel/mce.c
> +++ b/arch/powerpc/kernel/mce.c
> @@ -82,7 +82,7 @@ static void mce_set_error_info(struct machine_check_event *mce,
>   */
>  void save_mce_event(struct pt_regs *regs, long handled,
>  		    struct mce_error_info *mce_err,
> -		    uint64_t nip, uint64_t addr)
> +		    uint64_t nip, uint64_t addr, uint64_t phys_addr)
>  {
>  	int index = __this_cpu_inc_return(mce_nest_count) - 1;
>  	struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]);
> @@ -140,6 +140,10 @@ void save_mce_event(struct pt_regs *regs, long handled,
>  	} else if (mce->error_type == MCE_ERROR_TYPE_UE) {
>  		mce->u.ue_error.effective_address_provided = true;
>  		mce->u.ue_error.effective_address = addr;
> +		if (phys_addr != ULONG_MAX) {
> +			mce->u.ue_error.physical_address_provided = true;
> +			mce->u.ue_error.physical_address = phys_addr;
> +		}
>  	}
>  	return;
>  }
> diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
> index b76ca19..b77a698 100644
> --- a/arch/powerpc/kernel/mce_power.c
> +++ b/arch/powerpc/kernel/mce_power.c
> @@ -27,6 +27,25 @@
>  #include <asm/mmu.h>
>  #include <asm/mce.h>
>  #include <asm/machdep.h>
> +#include <asm/pgtable.h>
> +#include <asm/pte-walk.h>
> +#include <asm/sstep.h>
> +
> +static unsigned long addr_to_pfn(struct mm_struct *mm, unsigned long addr)
> +{
> +	pte_t *ptep;
> +	unsigned long flags;
> +
> +	local_irq_save(flags);
> +	if (mm == current->mm)
> +		ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL);
> +	else
> +		ptep = find_init_mm_pte(addr, NULL);
> +	local_irq_restore(flags);
> +	if (!ptep)
> +		return ULONG_MAX;
> +	return pte_pfn(*ptep);

I think you need to check that it's still cacheable memory here?
!pte_speical && pfn <= highest_memmap_pfn?


> +}
>  
>  static void flush_tlb_206(unsigned int num_sets, unsigned int action)
>  {
> @@ -489,7 +508,8 @@ static int mce_handle_ierror(struct pt_regs *regs,
>  
>  static int mce_handle_derror(struct pt_regs *regs,
>  		const struct mce_derror_table table[],
> -		struct mce_error_info *mce_err, uint64_t *addr)
> +		struct mce_error_info *mce_err, uint64_t *addr,
> +		uint64_t *phys_addr)
>  {
>  	uint64_t dsisr = regs->dsisr;
>  	int handled = 0;
> @@ -555,7 +575,37 @@ static int mce_handle_derror(struct pt_regs *regs,
>  		mce_err->initiator = table[i].initiator;
>  		if (table[i].dar_valid)
>  			*addr = regs->dar;
> -
> +		else if (mce_err->severity == MCE_SEV_ERROR_SYNC &&
> +				table[i].error_type == MCE_ERROR_TYPE_UE) {
> +			/*
> +			 * Carefully look at the NIP to determine
> +			 * the instruction to analyse. Reading the NIP
> +			 * in real-mode is tricky and can lead to recursive
> +			 * faults
> +			 */

What recursive faults? If you ensure NIP is cacheable memory, I guess you
can get a recursive machine check from reading it, but that's probably
tolerable.

> +			int instr;
> +			struct mm_struct *mm;
> +			unsigned long nip = regs->nip;
> +			unsigned long pfn = 0, instr_addr;
> +			struct instruction_op op;
> +			struct pt_regs tmp = *regs;
> +
> +			if (user_mode(regs))
> +				mm = current->mm;
> +			else
> +				mm = &init_mm;
> +
> +			pfn = addr_to_pfn(mm, nip);
> +			if (pfn != ULONG_MAX) {
> +				instr_addr = (pfn << PAGE_SHIFT) + (nip & ~PAGE_MASK);
> +				instr = *(unsigned int *)(instr_addr);
> +				if (!analyse_instr(&op, &tmp, instr)) {
> +					pfn = addr_to_pfn(mm, op.ea);
> +					*addr = op.ea;
> +					*phys_addr = pfn;
> +				}

Instruction may no longer be a load/store at this point, right? Or
instruction or page tables could have changed so this does not point to
a valid pfn of cacheable memory. memory_failure() has some checks, but
I wouldn't mind if you put some checks in here so you can enumerate all
the ways this can go wrong :P

Hopefully after Paulus's instruction analyzer rework you'll be able to
avoid the pt_regs on stack, but that's probably okay for a backport.
MCEs have a lot of stack and don't use too much.

Thanks,
Nick

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [rfc 2/3] powerpc/mce: Extract physical_address for UE errors
  2017-09-06  0:36   ` Nicholas Piggin
@ 2017-09-06  4:37     ` Balbir Singh
  0 siblings, 0 replies; 9+ messages in thread
From: Balbir Singh @ 2017-09-06  4:37 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: Mahesh Jagannath Salgaonkar, Alistair Popple, linuxppc-dev,
	Paul Mackerras

On Wed, Sep 6, 2017 at 10:36 AM, Nicholas Piggin <npiggin@gmail.com> wrote:
> On Tue,  5 Sep 2017 14:15:54 +1000
> Balbir Singh <bsingharora@gmail.com> wrote:
>
>> Walk the page table for NIP and extract the instruction. Then
>> use the instruction to find the effective address via analyse_instr().
>>
>> We might have page table walking races, but we expect them to
>> be rare, the physical address extraction is best effort. The idea
>> is to then hook up this infrastructure to memory failure eventually.
>
> Cool. Too bad hardware doesn't give us the RA.
>
>>
>> Signed-off-by: Balbir Singh <bsingharora@gmail.com>
>> ---
>>  arch/powerpc/include/asm/mce.h  |  2 +-
>>  arch/powerpc/kernel/mce.c       |  6 ++++-
>>  arch/powerpc/kernel/mce_power.c | 60 +++++++++++++++++++++++++++++++++++++----
>>  3 files changed, 61 insertions(+), 7 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
>> index 75292c7..3a1226e 100644
>> --- a/arch/powerpc/include/asm/mce.h
>> +++ b/arch/powerpc/include/asm/mce.h
>> @@ -204,7 +204,7 @@ struct mce_error_info {
>>
>>  extern void save_mce_event(struct pt_regs *regs, long handled,
>>                          struct mce_error_info *mce_err, uint64_t nip,
>> -                        uint64_t addr);
>> +                        uint64_t addr, uint64_t phys_addr);
>>  extern int get_mce_event(struct machine_check_event *mce, bool release);
>>  extern void release_mce_event(void);
>>  extern void machine_check_queue_event(void);
>> diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
>> index e254399..f41a75d 100644
>> --- a/arch/powerpc/kernel/mce.c
>> +++ b/arch/powerpc/kernel/mce.c
>> @@ -82,7 +82,7 @@ static void mce_set_error_info(struct machine_check_event *mce,
>>   */
>>  void save_mce_event(struct pt_regs *regs, long handled,
>>                   struct mce_error_info *mce_err,
>> -                 uint64_t nip, uint64_t addr)
>> +                 uint64_t nip, uint64_t addr, uint64_t phys_addr)
>>  {
>>       int index = __this_cpu_inc_return(mce_nest_count) - 1;
>>       struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]);
>> @@ -140,6 +140,10 @@ void save_mce_event(struct pt_regs *regs, long handled,
>>       } else if (mce->error_type == MCE_ERROR_TYPE_UE) {
>>               mce->u.ue_error.effective_address_provided = true;
>>               mce->u.ue_error.effective_address = addr;
>> +             if (phys_addr != ULONG_MAX) {
>> +                     mce->u.ue_error.physical_address_provided = true;
>> +                     mce->u.ue_error.physical_address = phys_addr;
>> +             }
>>       }
>>       return;
>>  }
>> diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
>> index b76ca19..b77a698 100644
>> --- a/arch/powerpc/kernel/mce_power.c
>> +++ b/arch/powerpc/kernel/mce_power.c
>> @@ -27,6 +27,25 @@
>>  #include <asm/mmu.h>
>>  #include <asm/mce.h>
>>  #include <asm/machdep.h>
>> +#include <asm/pgtable.h>
>> +#include <asm/pte-walk.h>
>> +#include <asm/sstep.h>
>> +
>> +static unsigned long addr_to_pfn(struct mm_struct *mm, unsigned long addr)
>> +{
>> +     pte_t *ptep;
>> +     unsigned long flags;
>> +
>> +     local_irq_save(flags);
>> +     if (mm == current->mm)
>> +             ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL);
>> +     else
>> +             ptep = find_init_mm_pte(addr, NULL);
>> +     local_irq_restore(flags);
>> +     if (!ptep)
>> +             return ULONG_MAX;
>> +     return pte_pfn(*ptep);
>
> I think you need to check that it's still cacheable memory here?
> !pte_speical && pfn <= highest_memmap_pfn?
>

find_*pte will return a NULL PTE, so we do have a check there. !pte_special is a
good check to have, I'll add it


>
>> +}
>>
>>  static void flush_tlb_206(unsigned int num_sets, unsigned int action)
>>  {
>> @@ -489,7 +508,8 @@ static int mce_handle_ierror(struct pt_regs *regs,
>>
>>  static int mce_handle_derror(struct pt_regs *regs,
>>               const struct mce_derror_table table[],
>> -             struct mce_error_info *mce_err, uint64_t *addr)
>> +             struct mce_error_info *mce_err, uint64_t *addr,
>> +             uint64_t *phys_addr)
>>  {
>>       uint64_t dsisr = regs->dsisr;
>>       int handled = 0;
>> @@ -555,7 +575,37 @@ static int mce_handle_derror(struct pt_regs *regs,
>>               mce_err->initiator = table[i].initiator;
>>               if (table[i].dar_valid)
>>                       *addr = regs->dar;
>> -
>> +             else if (mce_err->severity == MCE_SEV_ERROR_SYNC &&
>> +                             table[i].error_type == MCE_ERROR_TYPE_UE) {
>> +                     /*
>> +                      * Carefully look at the NIP to determine
>> +                      * the instruction to analyse. Reading the NIP
>> +                      * in real-mode is tricky and can lead to recursive
>> +                      * faults
>> +                      */
>
> What recursive faults? If you ensure NIP is cacheable memory, I guess you
> can get a recursive machine check from reading it, but that's probably
> tolerable.


Yep, just wanted to call it out here.

>
>> +                     int instr;
>> +                     struct mm_struct *mm;
>> +                     unsigned long nip = regs->nip;
>> +                     unsigned long pfn = 0, instr_addr;
>> +                     struct instruction_op op;
>> +                     struct pt_regs tmp = *regs;
>> +
>> +                     if (user_mode(regs))
>> +                             mm = current->mm;
>> +                     else
>> +                             mm = &init_mm;
>> +
>> +                     pfn = addr_to_pfn(mm, nip);
>> +                     if (pfn != ULONG_MAX) {
>> +                             instr_addr = (pfn << PAGE_SHIFT) + (nip & ~PAGE_MASK);
>> +                             instr = *(unsigned int *)(instr_addr);
>> +                             if (!analyse_instr(&op, &tmp, instr)) {
>> +                                     pfn = addr_to_pfn(mm, op.ea);
>> +                                     *addr = op.ea;
>> +                                     *phys_addr = pfn;
>> +                             }
>
> Instruction may no longer be a load/store at this point, right? Or
> instruction or page tables could have changed so this does not point to
> a valid pfn of cacheable memory. memory_failure() has some checks, but
> I wouldn't mind if you put some checks in here so you can enumerate all
> the ways this can go wrong :P

OK.. I'll add a comment or a warning to indicate the error, I suspect at this
point, it means we raced w.r.t pte entry or we had a bad NIP/EA. It could
also mean the opcode was not a load store.

>
> Hopefully after Paulus's instruction analyzer rework you'll be able to
> avoid the pt_regs on stack, but that's probably okay for a backport.
> MCEs have a lot of stack and don't use too much.
>

Yep, I kept it so that it could be backported, but I can change it in
a follow-up
patch

Thanks for the detailed review!

Balbir Singh.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [rfc 2/3] powerpc/mce: Extract physical_address for UE errors
  2017-09-05  4:15 ` [rfc 2/3] powerpc/mce: Extract physical_address for UE errors Balbir Singh
  2017-09-06  0:36   ` Nicholas Piggin
@ 2017-09-06 22:56   ` Benjamin Herrenschmidt
  2017-09-07  1:52     ` Balbir Singh
  2017-09-08  3:41   ` Mahesh Jagannath Salgaonkar
  2 siblings, 1 reply; 9+ messages in thread
From: Benjamin Herrenschmidt @ 2017-09-06 22:56 UTC (permalink / raw)
  To: Balbir Singh, npiggin, mahesh, alistair; +Cc: linuxppc-dev

On Tue, 2017-09-05 at 14:15 +1000, Balbir Singh wrote:
>  void save_mce_event(struct pt_regs *regs, long handled,
>                     struct mce_error_info *mce_err,
> -                   uint64_t nip, uint64_t addr)
> +                   uint64_t nip, uint64_t addr, uint64_t phys_addr)
>  {
>         int index = __this_cpu_inc_return(mce_nest_count) - 1;
>         struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]);
> @@ -140,6 +140,10 @@ void save_mce_event(struct pt_regs *regs, long handled,
>         } else if (mce->error_type == MCE_ERROR_TYPE_UE) {
>                 mce->u.ue_error.effective_address_provided = true;
>                 mce->u.ue_error.effective_address = addr;
> +               if (phys_addr != ULONG_MAX) {
> +                       mce->u.ue_error.physical_address_provided = true;
> +                       mce->u.ue_error.physical_address = phys_addr;
> +               }
>         }
>         return;

Where is "addr" coming from ? Keep in mind that on P9 at least, a UE
will *not* give you an EA in DAR in most cases.

Ben.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [rfc 2/3] powerpc/mce: Extract physical_address for UE errors
  2017-09-06 22:56   ` Benjamin Herrenschmidt
@ 2017-09-07  1:52     ` Balbir Singh
  0 siblings, 0 replies; 9+ messages in thread
From: Balbir Singh @ 2017-09-07  1:52 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Nicholas Piggin, Mahesh Jagannath Salgaonkar, Alistair Popple,
	linuxppc-dev

On Thu, Sep 7, 2017 at 8:56 AM, Benjamin Herrenschmidt <benh@au1.ibm.com> wrote:
> On Tue, 2017-09-05 at 14:15 +1000, Balbir Singh wrote:
>>  void save_mce_event(struct pt_regs *regs, long handled,
>>                     struct mce_error_info *mce_err,
>> -                   uint64_t nip, uint64_t addr)
>> +                   uint64_t nip, uint64_t addr, uint64_t phys_addr)
>>  {
>>         int index = __this_cpu_inc_return(mce_nest_count) - 1;
>>         struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]);
>> @@ -140,6 +140,10 @@ void save_mce_event(struct pt_regs *regs, long handled,
>>         } else if (mce->error_type == MCE_ERROR_TYPE_UE) {
>>                 mce->u.ue_error.effective_address_provided = true;
>>                 mce->u.ue_error.effective_address = addr;
>> +               if (phys_addr != ULONG_MAX) {
>> +                       mce->u.ue_error.physical_address_provided = true;
>> +                       mce->u.ue_error.physical_address = phys_addr;
>> +               }
>>         }
>>         return;
>
> Where is "addr" coming from ? Keep in mind that on P9 at least, a UE
> will *not* give you an EA in DAR in most cases.

The EA is derived in mce_handle_derror() via page table walk and
analyse_instr(), its the best
we can do FWIK

Balbir Singh.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [rfc 2/3] powerpc/mce: Extract physical_address for UE errors
  2017-09-05  4:15 ` [rfc 2/3] powerpc/mce: Extract physical_address for UE errors Balbir Singh
  2017-09-06  0:36   ` Nicholas Piggin
  2017-09-06 22:56   ` Benjamin Herrenschmidt
@ 2017-09-08  3:41   ` Mahesh Jagannath Salgaonkar
  2 siblings, 0 replies; 9+ messages in thread
From: Mahesh Jagannath Salgaonkar @ 2017-09-08  3:41 UTC (permalink / raw)
  To: Balbir Singh, npiggin, alistair; +Cc: linuxppc-dev

On 09/05/2017 09:45 AM, Balbir Singh wrote:
> Walk the page table for NIP and extract the instruction. Then
> use the instruction to find the effective address via analyse_instr().
> 
> We might have page table walking races, but we expect them to
> be rare, the physical address extraction is best effort. The idea
> is to then hook up this infrastructure to memory failure eventually.
> 
> Signed-off-by: Balbir Singh <bsingharora@gmail.com>
> ---
>  arch/powerpc/include/asm/mce.h  |  2 +-
>  arch/powerpc/kernel/mce.c       |  6 ++++-
>  arch/powerpc/kernel/mce_power.c | 60 +++++++++++++++++++++++++++++++++++++----
>  3 files changed, 61 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
> index 75292c7..3a1226e 100644
> --- a/arch/powerpc/include/asm/mce.h
> +++ b/arch/powerpc/include/asm/mce.h
> @@ -204,7 +204,7 @@ struct mce_error_info {
> 
>  extern void save_mce_event(struct pt_regs *regs, long handled,
>  			   struct mce_error_info *mce_err, uint64_t nip,
> -			   uint64_t addr);
> +			   uint64_t addr, uint64_t phys_addr);
>  extern int get_mce_event(struct machine_check_event *mce, bool release);
>  extern void release_mce_event(void);
>  extern void machine_check_queue_event(void);
> diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
> index e254399..f41a75d 100644
> --- a/arch/powerpc/kernel/mce.c
> +++ b/arch/powerpc/kernel/mce.c
> @@ -82,7 +82,7 @@ static void mce_set_error_info(struct machine_check_event *mce,
>   */
>  void save_mce_event(struct pt_regs *regs, long handled,
>  		    struct mce_error_info *mce_err,
> -		    uint64_t nip, uint64_t addr)
> +		    uint64_t nip, uint64_t addr, uint64_t phys_addr)
>  {
>  	int index = __this_cpu_inc_return(mce_nest_count) - 1;
>  	struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]);
> @@ -140,6 +140,10 @@ void save_mce_event(struct pt_regs *regs, long handled,
>  	} else if (mce->error_type == MCE_ERROR_TYPE_UE) {
>  		mce->u.ue_error.effective_address_provided = true;
>  		mce->u.ue_error.effective_address = addr;
> +		if (phys_addr != ULONG_MAX) {
> +			mce->u.ue_error.physical_address_provided = true;
> +			mce->u.ue_error.physical_address = phys_addr;
> +		}
>  	}
>  	return;
>  }
> diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
> index b76ca19..b77a698 100644
> --- a/arch/powerpc/kernel/mce_power.c
> +++ b/arch/powerpc/kernel/mce_power.c
> @@ -27,6 +27,25 @@
>  #include <asm/mmu.h>
>  #include <asm/mce.h>
>  #include <asm/machdep.h>
> +#include <asm/pgtable.h>
> +#include <asm/pte-walk.h>
> +#include <asm/sstep.h>
> +
> +static unsigned long addr_to_pfn(struct mm_struct *mm, unsigned long addr)
> +{
> +	pte_t *ptep;
> +	unsigned long flags;
> +
> +	local_irq_save(flags);
> +	if (mm == current->mm)
> +		ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL);
> +	else
> +		ptep = find_init_mm_pte(addr, NULL);
> +	local_irq_restore(flags);
> +	if (!ptep)
> +		return ULONG_MAX;
> +	return pte_pfn(*ptep);
> +}
> 
>  static void flush_tlb_206(unsigned int num_sets, unsigned int action)
>  {
> @@ -489,7 +508,8 @@ static int mce_handle_ierror(struct pt_regs *regs,
> 
>  static int mce_handle_derror(struct pt_regs *regs,
>  		const struct mce_derror_table table[],
> -		struct mce_error_info *mce_err, uint64_t *addr)
> +		struct mce_error_info *mce_err, uint64_t *addr,
> +		uint64_t *phys_addr)
>  {
>  	uint64_t dsisr = regs->dsisr;
>  	int handled = 0;
> @@ -555,7 +575,37 @@ static int mce_handle_derror(struct pt_regs *regs,
>  		mce_err->initiator = table[i].initiator;
>  		if (table[i].dar_valid)
>  			*addr = regs->dar;
> -
> +		else if (mce_err->severity == MCE_SEV_ERROR_SYNC &&
> +				table[i].error_type == MCE_ERROR_TYPE_UE) {
> +			/*
> +			 * Carefully look at the NIP to determine
> +			 * the instruction to analyse. Reading the NIP
> +			 * in real-mode is tricky and can lead to recursive
> +			 * faults
> +			 */
> +			int instr;
> +			struct mm_struct *mm;
> +			unsigned long nip = regs->nip;
> +			unsigned long pfn = 0, instr_addr;
> +			struct instruction_op op;
> +			struct pt_regs tmp = *regs;
> +
> +			if (user_mode(regs))
> +				mm = current->mm;
> +			else
> +				mm = &init_mm;
> +
> +			pfn = addr_to_pfn(mm, nip);
> +			if (pfn != ULONG_MAX) {
> +				instr_addr = (pfn << PAGE_SHIFT) + (nip & ~PAGE_MASK);
> +				instr = *(unsigned int *)(instr_addr);
> +				if (!analyse_instr(&op, &tmp, instr)) {
> +					pfn = addr_to_pfn(mm, op.ea);
> +					*addr = op.ea;
> +					*phys_addr = pfn;
> +				}
> +			}
> +		}
>  		found = 1;
>  	}
> 
> @@ -592,19 +642,19 @@ static long mce_handle_error(struct pt_regs *regs,
>  		const struct mce_ierror_table itable[])
>  {
>  	struct mce_error_info mce_err = { 0 };
> -	uint64_t addr;
> +	uint64_t addr, phys_addr;

You may want to initialize phys_addr with ULONG_MAX.

>  	uint64_t srr1 = regs->msr;
>  	long handled;
> 
>  	if (SRR1_MC_LOADSTORE(srr1))
> -		handled = mce_handle_derror(regs, dtable, &mce_err, &addr);
> +		handled = mce_handle_derror(regs, dtable, &mce_err, &addr, &phys_addr);
>  	else
>  		handled = mce_handle_ierror(regs, itable, &mce_err, &addr);
> 
>  	if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE)
>  		handled = mce_handle_ue_error(regs);
> 
> -	save_mce_event(regs, handled, &mce_err, regs->nip, addr);
> +	save_mce_event(regs, handled, &mce_err, regs->nip, addr, phys_addr);
> 
>  	return handled;
>  }
> 

Thanks,
-Mahesh.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [rfc 3/3] powerpc/mce: hookup memory_failure for UE errors
  2017-09-05  4:15 [rfc 0/3] Revisit MCE handling for UE errors Balbir Singh
  2017-09-05  4:15 ` [rfc 1/3] powerpc/mce.c: Remove unused function get_mce_fault_addr() Balbir Singh
  2017-09-05  4:15 ` [rfc 2/3] powerpc/mce: Extract physical_address for UE errors Balbir Singh
@ 2017-09-05  4:15 ` Balbir Singh
  2 siblings, 0 replies; 9+ messages in thread
From: Balbir Singh @ 2017-09-05  4:15 UTC (permalink / raw)
  To: npiggin, mahesh, alistair; +Cc: linuxppc-dev, Balbir Singh

If we are in user space and hit a UE error, we now have the
basic infrastructure to walk the page tables and find out
the effective address that was accessed, since the DAR
is not valid.

We use a work_queue content to hookup the bad pfn, any
other context causes problems, since memory_failure itself
can call into schedule() via lru_drain_ bits.

We could probably poison the struct page to avoid a race
between detection and taking corrective action.

Signed-off-by: Balbir Singh <bsingharora@gmail.com>
---
 arch/powerpc/kernel/mce.c | 62 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 59 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index f41a75d..39986d5 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -39,11 +39,20 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);
 static DEFINE_PER_CPU(int, mce_queue_count);
 static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue);
 
+/* Queue for delayed MCE UE events. */
+static DEFINE_PER_CPU(int, mce_ue_count);
+static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_ue_event_queue);
+
 static void machine_check_process_queued_event(struct irq_work *work);
+void machine_check_ue_event(struct machine_check_event *evt);
+static void machine_process_ue_event(struct work_struct *work);
+
 static struct irq_work mce_event_process_work = {
         .func = machine_check_process_queued_event,
 };
 
+DECLARE_WORK(mce_ue_event_work, machine_process_ue_event);
+
 static void mce_set_error_info(struct machine_check_event *mce,
 			       struct mce_error_info *mce_err)
 {
@@ -143,6 +152,7 @@ void save_mce_event(struct pt_regs *regs, long handled,
 		if (phys_addr != ULONG_MAX) {
 			mce->u.ue_error.physical_address_provided = true;
 			mce->u.ue_error.physical_address = phys_addr;
+			machine_check_ue_event(mce);
 		}
 	}
 	return;
@@ -197,6 +207,26 @@ void release_mce_event(void)
 	get_mce_event(NULL, true);
 }
 
+
+/*
+ * Queue up the MCE event which then can be handled later.
+ */
+void machine_check_ue_event(struct machine_check_event *evt)
+{
+	int index;
+
+	index = __this_cpu_inc_return(mce_ue_count) - 1;
+	/* If queue is full, just return for now. */
+	if (index >= MAX_MC_EVT) {
+		__this_cpu_dec(mce_ue_count);
+		return;
+	}
+	memcpy(this_cpu_ptr(&mce_ue_event_queue[index]), evt, sizeof(*evt));
+
+	/* Queue work to process this event later. */
+	schedule_work(&mce_ue_event_work);
+}
+
 /*
  * Queue up the MCE event which then can be handled later.
  */
@@ -219,7 +249,32 @@ void machine_check_queue_event(void)
 	/* Queue irq work to process this event later. */
 	irq_work_queue(&mce_event_process_work);
 }
-
+/*
+ * process pending MCE event from the mce event queue. This function will be
+ * called during syscall exit.
+ */
+static void machine_process_ue_event(struct work_struct *work)
+{
+	int index;
+	struct machine_check_event *evt;
+
+	while (__this_cpu_read(mce_ue_count) > 0) {
+		index = __this_cpu_read(mce_ue_count) - 1;
+		evt = this_cpu_ptr(&mce_ue_event_queue[index]);
+#ifdef CONFIG_MEMORY_FAILURE
+		/*
+		 * This should probably queued elsewhere, but
+		 * oh! well
+		 */
+		if (evt->error_type == MCE_ERROR_TYPE_UE) {
+			if (evt->u.ue_error.physical_address_provided)
+				memory_failure(evt->u.ue_error.physical_address,
+						SIGBUS, 0);
+		}
+#endif
+		__this_cpu_dec(mce_ue_count);
+	}
+}
 /*
  * process pending MCE event from the mce event queue. This function will be
  * called during syscall exit.
@@ -227,6 +282,7 @@ void machine_check_queue_event(void)
 static void machine_check_process_queued_event(struct irq_work *work)
 {
 	int index;
+	struct machine_check_event *evt;
 
 	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
 
@@ -236,8 +292,8 @@ static void machine_check_process_queued_event(struct irq_work *work)
 	 */
 	while (__this_cpu_read(mce_queue_count) > 0) {
 		index = __this_cpu_read(mce_queue_count) - 1;
-		machine_check_print_event_info(
-				this_cpu_ptr(&mce_event_queue[index]), false);
+		evt = this_cpu_ptr(&mce_event_queue[index]);
+		machine_check_print_event_info(evt, false);
 		__this_cpu_dec(mce_queue_count);
 	}
 }
-- 
2.9.5

^ permalink raw reply related	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2017-09-08  3:41 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-09-05  4:15 [rfc 0/3] Revisit MCE handling for UE errors Balbir Singh
2017-09-05  4:15 ` [rfc 1/3] powerpc/mce.c: Remove unused function get_mce_fault_addr() Balbir Singh
2017-09-05  4:15 ` [rfc 2/3] powerpc/mce: Extract physical_address for UE errors Balbir Singh
2017-09-06  0:36   ` Nicholas Piggin
2017-09-06  4:37     ` Balbir Singh
2017-09-06 22:56   ` Benjamin Herrenschmidt
2017-09-07  1:52     ` Balbir Singh
2017-09-08  3:41   ` Mahesh Jagannath Salgaonkar
2017-09-05  4:15 ` [rfc 3/3] powerpc/mce: hookup memory_failure " Balbir Singh

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).