* [PATCH V2] sparc64: sun4v TLB error power off events
@ 2014-09-16 13:26 Bob Picco
2014-09-16 21:47 ` David Miller
` (7 more replies)
0 siblings, 8 replies; 9+ messages in thread
From: Bob Picco @ 2014-09-16 13:26 UTC (permalink / raw)
To: sparclinux
From: bob picco <bpicco@meloft.net>
We've witnessed a few TLB events causing the machine to power off because
of prom_halt(). In one case it was some NFS-related area during rmmod. Another
was an mmapper of /dev/mem. A more recent one is an ITLB issue with
a bad pagesize which could be a hardware bug. Bugs happen, but we should
attempt not to power off the machine and/or hang it when possible.
This is a DTLB error from an mmapper of /dev/mem:
[root@sparcie ~]# SUN4V-DTLB: Error at TPC[fffff80100903e6c], tl 1
SUN4V-DTLB: TPC<0xfffff80100903e6c>
SUN4V-DTLB: O7[fffff801081979d0]
SUN4V-DTLB: O7<0xfffff801081979d0>
SUN4V-DTLB: vaddr[fffff80100000000] ctx[1250] pte[98000000000f0610] error[2]
.
This is recent mainline for ITLB:
[ 3708.179864] SUN4V-ITLB: TPC<0xfffffc010071cefc>
[ 3708.188866] SUN4V-ITLB: O7[fffffc010071cee8]
[ 3708.197377] SUN4V-ITLB: O7<0xfffffc010071cee8>
[ 3708.206539] SUN4V-ITLB: vaddr[e0003] ctx[1a3c] pte[2900000dcc800eeb] error[4]
.
Normally sun4v_itlb_error_report() and sun4v_dtlb_error_report() would call
prom_halt() and drop us to the OF command prompt "ok". This isn't the case for
LDOMs, where the machine powers off instead.
For the HV-reported error HV_ENORADDR returned for HV_MMU_MAP_ADDR_TRAP, we
cause a SIGBUS error by qualifying it within do_sparc64_fault() for fault code
mask of FAULT_CODE_BAD_RA. This is done when the trap level (%tl) is less than
or equal to one ("1"). Otherwise, for %tl > 1, we eventually proceed to
die_if_kernel().
The logic of this patch was partially inspired by David Miller's feedback.
Powering off large sparc64 machines is painful. Plus, die_if_kernel() provides
more context. A reset sequence isn't brief on large sparc64 machines, but it is
better than a power-off/power-on sequence.
Cc: sparclinux@vger.kernel.org
Signed-off-by: Bob Picco <bob.picco@oracle.com>
---
V2: bus error for cases at %tl <= 1.
for %tl > 1 always call die_if_kernel().
arch/sparc/include/asm/thread_info_64.h | 1 +
arch/sparc/kernel/sun4v_tlb_miss.S | 35 ++++++++++++++++++------------
arch/sparc/kernel/traps_64.c | 15 ++++++++-----
arch/sparc/mm/fault_64.c | 3 ++
4 files changed, 34 insertions(+), 20 deletions(-)
diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h
index a5f01ac..f85dc85 100644
--- a/arch/sparc/include/asm/thread_info_64.h
+++ b/arch/sparc/include/asm/thread_info_64.h
@@ -102,6 +102,7 @@ struct thread_info {
#define FAULT_CODE_ITLB 0x04 /* Miss happened in I-TLB */
#define FAULT_CODE_WINFIXUP 0x08 /* Miss happened during spill/fill */
#define FAULT_CODE_BLKCOMMIT 0x10 /* Use blk-commit ASI in copy_page */
+#define FAULT_CODE_BAD_RA 0x20 /* Bad RA for sun4v */
#if PAGE_SHIFT == 13
#define THREAD_SIZE (2*PAGE_SIZE)
diff --git a/arch/sparc/kernel/sun4v_tlb_miss.S b/arch/sparc/kernel/sun4v_tlb_miss.S
index e0c09bf..6179e19 100644
--- a/arch/sparc/kernel/sun4v_tlb_miss.S
+++ b/arch/sparc/kernel/sun4v_tlb_miss.S
@@ -195,6 +195,11 @@
ldx [%g2 + TRAP_PER_CPU_PGD_PADDR], %g7
sun4v_itlb_error:
+ rdpr %tl, %g1
+ cmp %g1, 1
+ ble,pt %icc, sun4v_bad_ra
+ or %g0, FAULT_CODE_BAD_RA | FAULT_CODE_ITLB, %g1
+
sethi %hi(sun4v_err_itlb_vaddr), %g1
stx %g4, [%g1 + %lo(sun4v_err_itlb_vaddr)]
sethi %hi(sun4v_err_itlb_ctx), %g1
@@ -206,15 +211,10 @@
sethi %hi(sun4v_err_itlb_error), %g1
stx %o0, [%g1 + %lo(sun4v_err_itlb_error)]
+ sethi %hi(1f), %g7
rdpr %tl, %g4
- cmp %g4, 1
- ble,pt %icc, 1f
- sethi %hi(2f), %g7
ba,pt %xcc, etraptl1
- or %g7, %lo(2f), %g7
-
-1: ba,pt %xcc, etrap
-2: or %g7, %lo(2b), %g7
+1: or %g7, %lo(1f), %g7
mov %l4, %o1
call sun4v_itlb_error_report
add %sp, PTREGS_OFF, %o0
@@ -222,6 +222,11 @@
/* NOTREACHED */
sun4v_dtlb_error:
+ rdpr %tl, %g1
+ cmp %g1, 1
+ ble,pt %icc, sun4v_bad_ra
+ or %g0, FAULT_CODE_BAD_RA | FAULT_CODE_DTLB, %g1
+
sethi %hi(sun4v_err_dtlb_vaddr), %g1
stx %g4, [%g1 + %lo(sun4v_err_dtlb_vaddr)]
sethi %hi(sun4v_err_dtlb_ctx), %g1
@@ -233,21 +238,23 @@
sethi %hi(sun4v_err_dtlb_error), %g1
stx %o0, [%g1 + %lo(sun4v_err_dtlb_error)]
+ sethi %hi(1f), %g7
rdpr %tl, %g4
- cmp %g4, 1
- ble,pt %icc, 1f
- sethi %hi(2f), %g7
ba,pt %xcc, etraptl1
- or %g7, %lo(2f), %g7
-
-1: ba,pt %xcc, etrap
-2: or %g7, %lo(2b), %g7
+1: or %g7, %lo(1f), %g7
mov %l4, %o1
call sun4v_dtlb_error_report
add %sp, PTREGS_OFF, %o0
/* NOTREACHED */
+sun4v_bad_ra:
+ or %g0, %g4, %g5
+ ba,pt %xcc, sparc64_realfault_common
+ or %g1, %g0, %g4
+
+ /* NOTREACHED */
+
/* Instruction Access Exception, tl0. */
sun4v_iacc:
ldxa [%g0] ASI_SCRATCHPAD, %g2
diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
index fb6640e..981a769 100644
--- a/arch/sparc/kernel/traps_64.c
+++ b/arch/sparc/kernel/traps_64.c
@@ -2104,6 +2104,11 @@ void sun4v_nonresum_overflow(struct pt_regs *regs)
atomic_inc(&sun4v_nonresum_oflow_cnt);
}
+static void sun4v_tlb_error(struct pt_regs *regs)
+{
+ die_if_kernel("TLB/TSB error", regs);
+}
+
unsigned long sun4v_err_itlb_vaddr;
unsigned long sun4v_err_itlb_ctx;
unsigned long sun4v_err_itlb_pte;
@@ -2111,8 +2116,7 @@ unsigned long sun4v_err_itlb_error;
void sun4v_itlb_error_report(struct pt_regs *regs, int tl)
{
- if (tl > 1)
- dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
+ dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
printk(KERN_EMERG "SUN4V-ITLB: Error at TPC[%lx], tl %d\n",
regs->tpc, tl);
@@ -2125,7 +2129,7 @@ void sun4v_itlb_error_report(struct pt_regs *regs, int tl)
sun4v_err_itlb_vaddr, sun4v_err_itlb_ctx,
sun4v_err_itlb_pte, sun4v_err_itlb_error);
- prom_halt();
+ sun4v_tlb_error(regs);
}
unsigned long sun4v_err_dtlb_vaddr;
@@ -2135,8 +2139,7 @@ unsigned long sun4v_err_dtlb_error;
void sun4v_dtlb_error_report(struct pt_regs *regs, int tl)
{
- if (tl > 1)
- dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
+ dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
printk(KERN_EMERG "SUN4V-DTLB: Error at TPC[%lx], tl %d\n",
regs->tpc, tl);
@@ -2149,7 +2152,7 @@ void sun4v_dtlb_error_report(struct pt_regs *regs, int tl)
sun4v_err_dtlb_vaddr, sun4v_err_dtlb_ctx,
sun4v_err_dtlb_pte, sun4v_err_dtlb_error);
- prom_halt();
+ sun4v_tlb_error(regs);
}
void hypervisor_tlbop_error(unsigned long err, unsigned long op)
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index 587cd05..18fcd71 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -346,6 +346,9 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
down_read(&mm->mmap_sem);
}
+ if (fault_code & FAULT_CODE_BAD_RA)
+ goto do_sigbus;
+
vma = find_vma(mm, address);
if (!vma)
goto bad_area;
--
1.7.1
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH V2] sparc64: sun4v TLB error power off events
2014-09-16 13:26 [PATCH V2] sparc64: sun4v TLB error power off events Bob Picco
@ 2014-09-16 21:47 ` David Miller
2014-09-17 0:16 ` David Miller
` (6 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: David Miller @ 2014-09-16 21:47 UTC (permalink / raw)
To: sparclinux
From: Bob Picco <bpicco@meloft.net>
Date: Tue, 16 Sep 2014 09:26:47 -0400
I just looked into this a bit while reviewing this patch.
> A more recent one is an ITLB issue with a bad pagesize which could
> be a hardware bug.
The error is signalled solely by the hypervisor. Let's look at the
PTE it doesn't like.
> This is recent mainline for ITLB:
> [ 3708.179864] SUN4V-ITLB: TPC<0xfffffc010071cefc>
> [ 3708.188866] SUN4V-ITLB: O7[fffffc010071cee8]
> [ 3708.197377] SUN4V-ITLB: O7<0xfffffc010071cee8>
> [ 3708.206539] SUN4V-ITLB: vaddr[e0003] ctx[1a3c] pte[2900000dcc800eeb] error[4]
Indeed bad pagesize is signalled. The page size bits are in the low
3 bits of the PTE, which here is 0x3 which should encode a 4MB page.
However I notice that 0x8 is set, which is a reserved bit. That
shouldn't happen, and could be what the hypervisor really doesn't
like.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH V2] sparc64: sun4v TLB error power off events
2014-09-16 13:26 [PATCH V2] sparc64: sun4v TLB error power off events Bob Picco
2014-09-16 21:47 ` David Miller
@ 2014-09-17 0:16 ` David Miller
2014-09-17 0:35 ` David Miller
` (5 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: David Miller @ 2014-09-17 0:16 UTC (permalink / raw)
To: sparclinux
From: David Miller <davem@davemloft.net>
Date: Tue, 16 Sep 2014 17:47:25 -0400 (EDT)
> However I notice that 0x8 is set, which is a reserved bit. That
> shouldn't happen, and could be what the hypervisor really doesn't
> like.
I just confirmed that this is why the hypervisor throws an error. It
actually inspects all of 0xf in the PTE as a size field (rather than
just 0x7) via the TTE_SIZE() macro (in include/sun4v/mmu.h), which
uses TTE_SZ_MASK (also in include/sun4v/mmu.h), which is defined as
0xf.
It then makes sure that only CPU supported sizes are set in the PTE by
using the TTE_VALIDSIZEARRAY (f.e. in greatlakes/ontario/include/mmu.h
or greatlakes/huron/include/mmu.h) bitmask, which is different for
each platform hypervisor build.
Anyways, we have to figure out why 0x8 is set in the PTE that we try
to load into ITLB. I haven't found any smoking guns yet.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH V2] sparc64: sun4v TLB error power off events
2014-09-16 13:26 [PATCH V2] sparc64: sun4v TLB error power off events Bob Picco
2014-09-16 21:47 ` David Miller
2014-09-17 0:16 ` David Miller
@ 2014-09-17 0:35 ` David Miller
2014-09-17 0:49 ` David Miller
` (4 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: David Miller @ 2014-09-17 0:35 UTC (permalink / raw)
To: sparclinux
From: David Miller <davem@davemloft.net>
Date: Tue, 16 Sep 2014 20:16:51 -0400 (EDT)
> Anyways, we have to figure out why 0x8 is set in the PTE that we try
> to load into ITLB. I haven't found any smoking guns yet.
Looking more closely, the PTE 0x2900000dcc800eeb doesn't even have
the valid bit set.
Non-valid PTEs should not even get into the TSB, and non-valid PTEs
found via page table lookup should vector us to the full fault path.
There used to be a bug where we'd put non-valid TTEs into the TSB
but that was fixed by commit:
commit 18f38132528c3e603c66ea464727b29e9bbcb91b
Author: David S. Miller <davem@davemloft.net>
Date: Mon Aug 4 16:34:01 2014 -0700
sparc64: Do not insert non-valid PTEs into the TSB hash table.
Do you know if the kernels that triggered those ITLB BADRA errors had
that fix or not?
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH V2] sparc64: sun4v TLB error power off events
2014-09-16 13:26 [PATCH V2] sparc64: sun4v TLB error power off events Bob Picco
` (2 preceding siblings ...)
2014-09-17 0:35 ` David Miller
@ 2014-09-17 0:49 ` David Miller
2014-09-17 12:19 ` Bob Picco
` (3 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: David Miller @ 2014-09-17 0:49 UTC (permalink / raw)
To: sparclinux
From: Bob Picco <bpicco@meloft.net>
Date: Tue, 16 Sep 2014 09:26:47 -0400
> Normally sun4v_itlb_error_report() and sun4v_dtlb_error_report() would call
> prom_halt() and drop us to OF command prompt "ok". This isn't the case for
> LDOMs and the machine powers off.
>
> For the HV reported error of HV_ENORADDR for HV HV_MMU_MAP_ADDR_TRAP we cause
> a SIGBUS error by qualifying it within do_sparc64_fault() for fault code mask
> of FAULT_CODE_BAD_RA. This is done when trap level (%tl) is less or equal
> one("1"). Otherwise, for %tl > 1, we proceed eventually to die_if_kernel().
This patch is great, applied, thanks Bob.
I think I'd like to request a follow-on patch to this if you don't
mind.
If %tl > 1, the only situation we can possibly be in is to be taking a
TLB miss during a window spill/fill trap, in which case we should
up-trap back up through the spill/fill handler and into the long path
fault code via winfix_trampoline.
Then you can kill all of the {i,d}tlb error code paths.
Thanks!
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH V2] sparc64: sun4v TLB error power off events
2014-09-16 13:26 [PATCH V2] sparc64: sun4v TLB error power off events Bob Picco
` (3 preceding siblings ...)
2014-09-17 0:49 ` David Miller
@ 2014-09-17 12:19 ` Bob Picco
2014-09-17 12:24 ` Bob Picco
` (2 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Bob Picco @ 2014-09-17 12:19 UTC (permalink / raw)
To: sparclinux
David Miller wrote: [Tue Sep 16 2014, 05:47:25PM EDT]
> From: Bob Picco <bpicco@meloft.net>
> Date: Tue, 16 Sep 2014 09:26:47 -0400
>
> I just looked into this a bit while reviewing this patch.
Okay.
>
> > A more recent one is an ITLB issue with a bad pagesize which could
> > be a hardware bug.
>
> The error is signalled solely by the hypervisor. Let's look at the
> PTE it doesn't like.
I agree and forgot about this in the context of bus error.
Note, I've seen this issue once or twice. Perhaps only on my ~P0 T4-2.
I couldn't find the issue in my T5-2 log.
>
> > This is recent mainline for ITLB:
> > [ 3708.179864] SUN4V-ITLB: TPC<0xfffffc010071cefc>
> > [ 3708.188866] SUN4V-ITLB: O7[fffffc010071cee8]
> > [ 3708.197377] SUN4V-ITLB: O7<0xfffffc010071cee8>
> > [ 3708.206539] SUN4V-ITLB: vaddr[e0003] ctx[1a3c] pte[2900000dcc800eeb] error[4]
>
> Indeed bad pagesize is signalled. The page size bits are in the low
> 3 bits of the PTE, which here is 0x3 which should encode a 4MB page.
Yes.
>
> However I notice that 0x8 is set, which is a reserved bit. That
> shouldn't happen, and could be what the hypervisor really doesn't
> like.
Also _PAGE_VALID isn't set.
> --
> To unsubscribe from this list: send the line "unsubscribe sparclinux" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH V2] sparc64: sun4v TLB error power off events
2014-09-16 13:26 [PATCH V2] sparc64: sun4v TLB error power off events Bob Picco
` (4 preceding siblings ...)
2014-09-17 12:19 ` Bob Picco
@ 2014-09-17 12:24 ` Bob Picco
2014-09-17 12:40 ` Bob Picco
2014-09-17 16:00 ` David Miller
7 siblings, 0 replies; 9+ messages in thread
From: Bob Picco @ 2014-09-17 12:24 UTC (permalink / raw)
To: sparclinux
David Miller wrote: [Tue Sep 16 2014, 08:35:15PM EDT]
> From: David Miller <davem@davemloft.net>
> Date: Tue, 16 Sep 2014 20:16:51 -0400 (EDT)
>
> > Anyways, we have to figure out why 0x8 is set in the PTE that we try
> > to load into ITLB. I haven't found any smoking guns yet.
>
> Looking more closely, the PTE 0x2900000dcc800eeb doesn't even have
> the valid bit set.
Yes.
>
> Non-valid PTEs should not even get into the TSB, and non-valid PTEs
> found via page table lookup should vector us to the full fault path.
>
> There used to be a bug where we'd put non-valid TTEs into the TSB
> but that was fixed by commit:
>
> commit 18f38132528c3e603c66ea464727b29e9bbcb91b
> Author: David S. Miller <davem@davemloft.net>
> Date: Mon Aug 4 16:34:01 2014 -0700
>
> sparc64: Do not insert non-valid PTEs into the TSB hash table.
>
> Do you know if the kernels that triggered those ITLB BADRA errors had
> that fix or not?
I doubt it. I just checked my log for T4-2 and none of the kernels appears
close to this:
[bpicco@zareason linus.git]$ git describe --contains 18f38132528c3e603c66ea464727b29e9bbcb91b
v3.17-rc1~105^2~1^2~6
when the issue was revealed.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH V2] sparc64: sun4v TLB error power off events
2014-09-16 13:26 [PATCH V2] sparc64: sun4v TLB error power off events Bob Picco
` (5 preceding siblings ...)
2014-09-17 12:24 ` Bob Picco
@ 2014-09-17 12:40 ` Bob Picco
2014-09-17 16:00 ` David Miller
7 siblings, 0 replies; 9+ messages in thread
From: Bob Picco @ 2014-09-17 12:40 UTC (permalink / raw)
To: sparclinux
David Miller wrote: [Tue Sep 16 2014, 08:49:43PM EDT]
> From: Bob Picco <bpicco@meloft.net>
> Date: Tue, 16 Sep 2014 09:26:47 -0400
>
> > Normally sun4v_itlb_error_report() and sun4v_dtlb_error_report() would call
> > prom_halt() and drop us to OF command prompt "ok". This isn't the case for
> > LDOMs and the machine powers off.
> >
> > For the HV reported error of HV_ENORADDR for HV HV_MMU_MAP_ADDR_TRAP we cause
> > a SIGBUS error by qualifying it within do_sparc64_fault() for fault code mask
> > of FAULT_CODE_BAD_RA. This is done when trap level (%tl) is less or equal
> > one("1"). Otherwise, for %tl > 1, we proceed eventually to die_if_kernel().
>
> This patch is great, applied, thanks Bob.
>
> I think I'd like to request a follow-on patch to this if you don't
> mind.
Fine by me.
>
> If %tl > 1, the only situation we can possibly be in is to be taking a
> TLB miss during a window spill/fill trap, in which case we should
> up-trap back up through the spill/fill handler and into the long path
> fault code via winfix_trampoline.
Yes I agree.
>
> Then you can kill all of the {i,d}tlb error code paths.
Let me look at this.
>
> Thanks!
you're welcome and thanx!
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH V2] sparc64: sun4v TLB error power off events
2014-09-16 13:26 [PATCH V2] sparc64: sun4v TLB error power off events Bob Picco
` (6 preceding siblings ...)
2014-09-17 12:40 ` Bob Picco
@ 2014-09-17 16:00 ` David Miller
7 siblings, 0 replies; 9+ messages in thread
From: David Miller @ 2014-09-17 16:00 UTC (permalink / raw)
To: sparclinux
From: Bob Picco <bpicco@meloft.net>
Date: Wed, 17 Sep 2014 08:24:23 -0400
> David Miller wrote: [Tue Sep 16 2014, 08:35:15PM EDT]
>> From: David Miller <davem@davemloft.net>
>> Date: Tue, 16 Sep 2014 20:16:51 -0400 (EDT)
>>
>> commit 18f38132528c3e603c66ea464727b29e9bbcb91b
>> Author: David S. Miller <davem@davemloft.net>
>> Date: Mon Aug 4 16:34:01 2014 -0700
>>
>> sparc64: Do not insert non-valid PTEs into the TSB hash table.
>>
>> Do you know if the kernels that triggered those ITLB BADRA errors had
>> that fix or not?
> I doubt it. I just checked my log for T4-2 and none of the kernels appears
> close to this:
> [bpicco@zareason linus.git]$ git describe --contains 18f38132528c3e603c66ea464727b29e9bbcb91b
> v3.17-rc1~105^2~1^2~6
> when the issue was revealed.
So I believe this bug should be fixed by the above commit, let me know
if you see this ever again.
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2014-09-17 16:00 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-09-16 13:26 [PATCH V2] sparc64: sun4v TLB error power off events Bob Picco
2014-09-16 21:47 ` David Miller
2014-09-17 0:16 ` David Miller
2014-09-17 0:35 ` David Miller
2014-09-17 0:49 ` David Miller
2014-09-17 12:19 ` Bob Picco
2014-09-17 12:24 ` Bob Picco
2014-09-17 12:40 ` Bob Picco
2014-09-17 16:00 ` David Miller
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.