[patch] MIPS64 R4k TLB refill CP0 hazards

Linux MIPS Architecture development
 help / color / mirror / Atom feed

* [patch] MIPS64 R4k TLB refill CP0 hazards
@ 2002-07-29 15:23 Maciej W. Rozycki
  2002-07-29 20:10 ` Carsten Langgaard
  0 siblings, 1 reply; 21+ messages in thread
From: Maciej W. Rozycki @ 2002-07-29 15:23 UTC (permalink / raw)
  To: Ralf Baechle, linux-mips, linux-mips

Hello,

 The except_vec1_r4k() function in arch/mips64/mm/tlbex-r4k.S is quite new
and seems specifically written to handle the EntryLo vs "tlbwr" R4k CP0
hazard by adding an extra "nop" before the "tlbwr" beyond what
except_vec1_r10k() puts.  Unfortunately, it does not work on my R4400SC
anyway.  OTOH, the 32-bit MIPS version does, so I tried bits from that for
MIPS64 and now the function works.

 Here is the resulting patch.  Since barring the hazard fragment the
functions are identical, I removed the redundant part and made
except_vec1_r4k() make use of the LOAD_PTE2 and PTE_RELOAD macros. 

 OK to apply?

  Maciej

-- 
+  Maciej W. Rozycki, Technical University of Gdansk, Poland   +
+--------------------------------------------------------------+
+        e-mail: macro@ds2.pg.gda.pl, PGP key available        +

patch-mips-2.4.19-rc1-20020726-mips64-tlbex-r4k-1
diff -up --recursive --new-file linux-mips-2.4.19-rc1-20020726.macro/arch/mips64/mm/tlbex-r4k.S linux-mips-2.4.19-rc1-20020726/arch/mips64/mm/tlbex-r4k.S
--- linux-mips-2.4.19-rc1-20020726.macro/arch/mips64/mm/tlbex-r4k.S	2002-07-25 02:57:02.000000000 +0000
+++ linux-mips-2.4.19-rc1-20020726/arch/mips64/mm/tlbex-r4k.S	2002-07-28 22:27:08.000000000 +0000
@@ -5,6 +5,7 @@
  *
  * Copyright (C) 2000 Silicon Graphics, Inc.
  * Written by Ulf Carlsson (ulfc@engr.sgi.com)
+ * Copyright (C) 2002  Maciej W. Rozycki
  */
 #include <linux/config.h>
 #include <linux/init.h>
@@ -23,7 +24,7 @@
 	 * that caused the fault in in PTR.
 	 */
 
-	.macro	LOAD_PTE2, ptr, tmp
+	.macro	LOAD_PTE2, ptr, tmp, kaddr
 #ifdef CONFIG_SMP
 	dmfc0	\ptr, CP0_CONTEXT
 	dmfc0	\tmp, CP0_BADVADDR
@@ -32,8 +33,8 @@
 	dmfc0	\tmp, CP0_BADVADDR
 	dla	\ptr, pgd_current
 #endif
-	bltz	\tmp, kaddr
-	ld	\ptr, (\ptr)
+	bltz	\tmp, \kaddr
+	 ld	\ptr, (\ptr)
 	dsrl	\tmp, (PGDIR_SHIFT-3)		# get pgd offset in bytes
 	andi	\tmp, ((PTRS_PER_PGD - 1)<<3)
 	daddu	\ptr, \tmp			# add in pgd offset
@@ -75,34 +76,16 @@ FEXPORT(except_vec0)
 	.align  5
 LEAF(except_vec1_r4k)
 	.set    noat
-	dla     k1, pgd_current
-	dmfc0   k0, CP0_BADVADDR
-	ld      k1, (k1)
-	bltz    k0, vmaddr
-	 dsrl   k0, (PGDIR_SHIFT-3)             # get pgd offset in bytes
-	andi    k0, ((PTRS_PER_PGD - 1)<<3)
-	daddu   k1, k0                          # add in pgd offset
-	dmfc0   k0, CP0_BADVADDR
-	ld      k1, (k1)                        # get pmd pointer
-	dsrl    k0, (PMD_SHIFT-3)               # get pmd offset in bytes
-	andi    k0, ((PTRS_PER_PMD - 1)<<3)
-	daddu   k1, k0                          # add in pmd offset
-	dmfc0   k0, CP0_XCONTEXT
-	andi    k0, 0xff0                       # get pte offset
-	ld      k1, (k1)                        # get pte pointer
-	daddu   k1, k0
-	ld      k0, 0(k1)                       # get even pte
-	ld      k1, 8(k1)                       # get odd pte
-	dsrl    k0, 6                           # convert to entrylo0
-	dmtc0   k0, CP0_ENTRYLO0                # load it
-	dsrl    k1, 6                           # convert to entrylo1
-	dmtc0   k1, CP0_ENTRYLO1                # load it
-	nop                                     # Need 2 cycles between mtc0
-	nop                                     #  and tlbwr (CP0 hazard).
+	LOAD_PTE2 k1 k0 9f
+	ld	k0, 0(k1)			# get even pte
+	ld	k1, 8(k1)			# get odd pte
+	PTE_RELOAD k0 k1
+	b	1f
 	tlbwr
+1:
 	nop
 	eret
-vmaddr:
+9:
 	dla     k0, handle_vmalloc_address
 	jr      k0
 	 nop
@@ -116,14 +99,14 @@ END(except_vec1_r4k)
 	.align	5
 LEAF(except_vec1_r10k)
 	.set	noat
-	LOAD_PTE2 k1 k0
+	LOAD_PTE2 k1 k0 9f
 	ld	k0, 0(k1)			# get even pte
 	ld	k1, 8(k1)			# get odd pte
 	PTE_RELOAD k0 k1
 	nop
 	tlbwr
 	eret
-kaddr:
+9:
 	dla	k0, handle_vmalloc_address	# MAPPED kernel needs this
 	jr	k0
 	 nop

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [patch] MIPS64 R4k TLB refill CP0 hazards
  2002-07-29 15:23 [patch] MIPS64 R4k TLB refill CP0 hazards Maciej W. Rozycki
@ 2002-07-29 20:10 ` Carsten Langgaard
  2002-07-30  6:59   ` Carsten Langgaard
  0 siblings, 1 reply; 21+ messages in thread
From: Carsten Langgaard @ 2002-07-29 20:10 UTC (permalink / raw)
  To: Maciej W. Rozycki; +Cc: Ralf Baechle, linux-mips, linux-mips

"Maciej W. Rozycki" wrote:

> Hello,
>
>  The except_vec1_r4k() function in arch/mips64/mm/tlbex-r4k.S is quite new
> and seems specifically written to handle the EntryLo vs "tlbwr" R4k CP0
> hazard by adding an extra "nop" before the "tlbwr" beyond what
> except_vec1_r10k() puts.  Unfortunately, it does not work on my R4400SC
> anyway.  OTOH, the 32-bit MIPS version does, so I tried bits from that for
> MIPS64 and now the function works.
>
>  Here is the resulting patch.  Since barring the hazard fragment the
> functions are identical, I removed the redundant part and made
> except_vec1_r4k() make use of the LOAD_PTE2 and PTE_RELOAD macros.
>
>  OK to apply?

I'm the one who added the except_vec1_r4k function, it works fine on my 5Kc, 20Kc and RM5261.
I can't tell if your patch works for me, before trying it on one of the above CPUs, will do that tomorrow.


>
>   Maciej
>
> --
> +  Maciej W. Rozycki, Technical University of Gdansk, Poland   +
> +--------------------------------------------------------------+
> +        e-mail: macro@ds2.pg.gda.pl, PGP key available        +
>
> patch-mips-2.4.19-rc1-20020726-mips64-tlbex-r4k-1
> diff -up --recursive --new-file linux-mips-2.4.19-rc1-20020726.macro/arch/mips64/mm/tlbex-r4k.S linux-mips-2.4.19-rc1-20020726/arch/mips64/mm/tlbex-r4k.S
> --- linux-mips-2.4.19-rc1-20020726.macro/arch/mips64/mm/tlbex-r4k.S     2002-07-25 02:57:02.000000000 +0000
> +++ linux-mips-2.4.19-rc1-20020726/arch/mips64/mm/tlbex-r4k.S   2002-07-28 22:27:08.000000000 +0000
> @@ -5,6 +5,7 @@
>   *
>   * Copyright (C) 2000 Silicon Graphics, Inc.
>   * Written by Ulf Carlsson (ulfc@engr.sgi.com)
> + * Copyright (C) 2002  Maciej W. Rozycki
>   */
>  #include <linux/config.h>
>  #include <linux/init.h>
> @@ -23,7 +24,7 @@
>          * that caused the fault in in PTR.
>          */
>
> -       .macro  LOAD_PTE2, ptr, tmp
> +       .macro  LOAD_PTE2, ptr, tmp, kaddr
>  #ifdef CONFIG_SMP
>         dmfc0   \ptr, CP0_CONTEXT
>         dmfc0   \tmp, CP0_BADVADDR
> @@ -32,8 +33,8 @@
>         dmfc0   \tmp, CP0_BADVADDR
>         dla     \ptr, pgd_current
>  #endif
> -       bltz    \tmp, kaddr
> -       ld      \ptr, (\ptr)
> +       bltz    \tmp, \kaddr
> +        ld     \ptr, (\ptr)
>         dsrl    \tmp, (PGDIR_SHIFT-3)           # get pgd offset in bytes
>         andi    \tmp, ((PTRS_PER_PGD - 1)<<3)
>         daddu   \ptr, \tmp                      # add in pgd offset
> @@ -75,34 +76,16 @@ FEXPORT(except_vec0)
>         .align  5
>  LEAF(except_vec1_r4k)
>         .set    noat
> -       dla     k1, pgd_current
> -       dmfc0   k0, CP0_BADVADDR
> -       ld      k1, (k1)
> -       bltz    k0, vmaddr
> -        dsrl   k0, (PGDIR_SHIFT-3)             # get pgd offset in bytes
> -       andi    k0, ((PTRS_PER_PGD - 1)<<3)
> -       daddu   k1, k0                          # add in pgd offset
> -       dmfc0   k0, CP0_BADVADDR
> -       ld      k1, (k1)                        # get pmd pointer
> -       dsrl    k0, (PMD_SHIFT-3)               # get pmd offset in bytes
> -       andi    k0, ((PTRS_PER_PMD - 1)<<3)
> -       daddu   k1, k0                          # add in pmd offset
> -       dmfc0   k0, CP0_XCONTEXT
> -       andi    k0, 0xff0                       # get pte offset
> -       ld      k1, (k1)                        # get pte pointer
> -       daddu   k1, k0
> -       ld      k0, 0(k1)                       # get even pte
> -       ld      k1, 8(k1)                       # get odd pte
> -       dsrl    k0, 6                           # convert to entrylo0
> -       dmtc0   k0, CP0_ENTRYLO0                # load it
> -       dsrl    k1, 6                           # convert to entrylo1
> -       dmtc0   k1, CP0_ENTRYLO1                # load it
> -       nop                                     # Need 2 cycles between mtc0
> -       nop                                     #  and tlbwr (CP0 hazard).
> +       LOAD_PTE2 k1 k0 9f
> +       ld      k0, 0(k1)                       # get even pte
> +       ld      k1, 8(k1)                       # get odd pte
> +       PTE_RELOAD k0 k1
> +       b       1f
>         tlbwr
> +1:
>         nop
>         eret
> -vmaddr:
> +9:
>         dla     k0, handle_vmalloc_address
>         jr      k0
>          nop
> @@ -116,14 +99,14 @@ END(except_vec1_r4k)
>         .align  5
>  LEAF(except_vec1_r10k)
>         .set    noat
> -       LOAD_PTE2 k1 k0
> +       LOAD_PTE2 k1 k0 9f
>         ld      k0, 0(k1)                       # get even pte
>         ld      k1, 8(k1)                       # get odd pte
>         PTE_RELOAD k0 k1
>         nop
>         tlbwr
>         eret
> -kaddr:
> +9:
>         dla     k0, handle_vmalloc_address      # MAPPED kernel needs this
>         jr      k0
>          nop

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [patch] MIPS64 R4k TLB refill CP0 hazards
  2002-07-29 20:10 ` Carsten Langgaard
@ 2002-07-30  6:59   ` Carsten Langgaard
  2002-07-30 11:29     ` Ralf Baechle
  0 siblings, 1 reply; 21+ messages in thread
From: Carsten Langgaard @ 2002-07-30  6:59 UTC (permalink / raw)
  To: Maciej W. Rozycki, Ralf Baechle, linux-mips, linux-mips

Carsten Langgaard wrote:

> "Maciej W. Rozycki" wrote:
>
> > Hello,
> >
> >  The except_vec1_r4k() function in arch/mips64/mm/tlbex-r4k.S is quite new
> > and seems specifically written to handle the EntryLo vs "tlbwr" R4k CP0
> > hazard by adding an extra "nop" before the "tlbwr" beyond what
> > except_vec1_r10k() puts.  Unfortunately, it does not work on my R4400SC
> > anyway.  OTOH, the 32-bit MIPS version does, so I tried bits from that for
> > MIPS64 and now the function works.
> >
> >  Here is the resulting patch.  Since barring the hazard fragment the
> > functions are identical, I removed the redundant part and made
> > except_vec1_r4k() make use of the LOAD_PTE2 and PTE_RELOAD macros.
> >
> >  OK to apply?
>
> I'm the one who added the except_vec1_r4k function, it works fine on my 5Kc, 20Kc and RM5261.
> I can't tell if your patch works for me, before trying it on one of the above CPUs, will do that tomorrow.
>

Your patch seems to work fine for me.
But now that we are looking at the TLB exception handler code, I have a few comments.
If the 2 exception handler functions are identical, maybe we could stick with just one function and make the hazard barrier depend on the
CPU configuration.
We have been discussing this before, but I really don't like the idea of solving the hazard problem with a branch. On some CPUs (especially if
they have a long pipeline) the branch will incur a much bigger penalty than is actually needed to solve the hazard. On other CPUs (with branch prediction) we may not even solve
the hazard problem.
The 'nop' I used is not the solution either; instead we should use 'ssnop' instructions, which will make sure we also solve the hazard problem on superscalar
CPUs.
We also need to have a hazard barrier in the code labeled "not_vmalloc".
What do you think?


>
> >
> >   Maciej
> >
> > --
> > +  Maciej W. Rozycki, Technical University of Gdansk, Poland   +
> > +--------------------------------------------------------------+
> > +        e-mail: macro@ds2.pg.gda.pl, PGP key available        +
> >
> > patch-mips-2.4.19-rc1-20020726-mips64-tlbex-r4k-1
> > diff -up --recursive --new-file linux-mips-2.4.19-rc1-20020726.macro/arch/mips64/mm/tlbex-r4k.S linux-mips-2.4.19-rc1-20020726/arch/mips64/mm/tlbex-r4k.S
> > --- linux-mips-2.4.19-rc1-20020726.macro/arch/mips64/mm/tlbex-r4k.S     2002-07-25 02:57:02.000000000 +0000
> > +++ linux-mips-2.4.19-rc1-20020726/arch/mips64/mm/tlbex-r4k.S   2002-07-28 22:27:08.000000000 +0000
> > @@ -5,6 +5,7 @@
> >   *
> >   * Copyright (C) 2000 Silicon Graphics, Inc.
> >   * Written by Ulf Carlsson (ulfc@engr.sgi.com)
> > + * Copyright (C) 2002  Maciej W. Rozycki
> >   */
> >  #include <linux/config.h>
> >  #include <linux/init.h>
> > @@ -23,7 +24,7 @@
> >          * that caused the fault in in PTR.
> >          */
> >
> > -       .macro  LOAD_PTE2, ptr, tmp
> > +       .macro  LOAD_PTE2, ptr, tmp, kaddr
> >  #ifdef CONFIG_SMP
> >         dmfc0   \ptr, CP0_CONTEXT
> >         dmfc0   \tmp, CP0_BADVADDR
> > @@ -32,8 +33,8 @@
> >         dmfc0   \tmp, CP0_BADVADDR
> >         dla     \ptr, pgd_current
> >  #endif
> > -       bltz    \tmp, kaddr
> > -       ld      \ptr, (\ptr)
> > +       bltz    \tmp, \kaddr
> > +        ld     \ptr, (\ptr)
> >         dsrl    \tmp, (PGDIR_SHIFT-3)           # get pgd offset in bytes
> >         andi    \tmp, ((PTRS_PER_PGD - 1)<<3)
> >         daddu   \ptr, \tmp                      # add in pgd offset
> > @@ -75,34 +76,16 @@ FEXPORT(except_vec0)
> >         .align  5
> >  LEAF(except_vec1_r4k)
> >         .set    noat
> > -       dla     k1, pgd_current
> > -       dmfc0   k0, CP0_BADVADDR
> > -       ld      k1, (k1)
> > -       bltz    k0, vmaddr
> > -        dsrl   k0, (PGDIR_SHIFT-3)             # get pgd offset in bytes
> > -       andi    k0, ((PTRS_PER_PGD - 1)<<3)
> > -       daddu   k1, k0                          # add in pgd offset
> > -       dmfc0   k0, CP0_BADVADDR
> > -       ld      k1, (k1)                        # get pmd pointer
> > -       dsrl    k0, (PMD_SHIFT-3)               # get pmd offset in bytes
> > -       andi    k0, ((PTRS_PER_PMD - 1)<<3)
> > -       daddu   k1, k0                          # add in pmd offset
> > -       dmfc0   k0, CP0_XCONTEXT
> > -       andi    k0, 0xff0                       # get pte offset
> > -       ld      k1, (k1)                        # get pte pointer
> > -       daddu   k1, k0
> > -       ld      k0, 0(k1)                       # get even pte
> > -       ld      k1, 8(k1)                       # get odd pte
> > -       dsrl    k0, 6                           # convert to entrylo0
> > -       dmtc0   k0, CP0_ENTRYLO0                # load it
> > -       dsrl    k1, 6                           # convert to entrylo1
> > -       dmtc0   k1, CP0_ENTRYLO1                # load it
> > -       nop                                     # Need 2 cycles between mtc0
> > -       nop                                     #  and tlbwr (CP0 hazard).
> > +       LOAD_PTE2 k1 k0 9f
> > +       ld      k0, 0(k1)                       # get even pte
> > +       ld      k1, 8(k1)                       # get odd pte
> > +       PTE_RELOAD k0 k1
> > +       b       1f
> >         tlbwr
> > +1:
> >         nop
> >         eret
> > -vmaddr:
> > +9:
> >         dla     k0, handle_vmalloc_address
> >         jr      k0
> >          nop
> > @@ -116,14 +99,14 @@ END(except_vec1_r4k)
> >         .align  5
> >  LEAF(except_vec1_r10k)
> >         .set    noat
> > -       LOAD_PTE2 k1 k0
> > +       LOAD_PTE2 k1 k0 9f
> >         ld      k0, 0(k1)                       # get even pte
> >         ld      k1, 8(k1)                       # get odd pte
> >         PTE_RELOAD k0 k1
> >         nop
> >         tlbwr
> >         eret
> > -kaddr:
> > +9:
> >         dla     k0, handle_vmalloc_address      # MAPPED kernel needs this
> >         jr      k0
> >          nop

--
_    _ ____  ___   Carsten Langgaard   Mailto:carstenl@mips.com
|\  /|||___)(___   MIPS Denmark        Direct: +45 4486 5527
| \/ |||    ____)  Lautrupvang 4B      Switch: +45 4486 5555
  TECHNOLOGIES     2750 Ballerup       Fax...: +45 4486 5556
                   Denmark             http://www.mips.com

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [patch] MIPS64 R4k TLB refill CP0 hazards
  2002-07-30  6:59   ` Carsten Langgaard
@ 2002-07-30 11:29     ` Ralf Baechle
  2002-07-30 12:09       ` Carsten Langgaard
  2002-07-30 12:39       ` Kevin D. Kissell
  0 siblings, 2 replies; 21+ messages in thread
From: Ralf Baechle @ 2002-07-30 11:29 UTC (permalink / raw)
  To: Carsten Langgaard; +Cc: Maciej W. Rozycki, linux-mips, linux-mips

On Tue, Jul 30, 2002 at 08:59:17AM +0200, Carsten Langgaard wrote:

> We have been discussing this before, but I really don't like the idea of
> solving the hazard problem with a branch. The branch will on some CPUs
> (especially if they have a long pipeline) be a much bigger penalty than
> we actually wants to solve the hazard. On other CPU (with branch
> prediction) we may not even solve the hazard problem.

The branch - which is used by other OSes btw. - is meant for the R4000 / R4400, where
this kind of taken branch implies a total delay of three cycles.  One for
the branch delay slot plus two extra cycles for the killed instructions
following the branch delay slot.  For R4600, R4700, R5000 and a bunch of
derivatives I've verified that according to the documentation this extra
penalty of two cycles does not exist, nor do we need two extra cycles to handle
the hazard.  In other words the branch trick - which also is used by
some other commercial OS btw. - is providing best possible performance on
a wide range of processors.

> The 'nop' I used is not the solution either, instead we should use
> 'ssnop' instructions, which will make sure we also solve the hazard
> problem on superscalar CPUs.  We also need to have a hazard barrier in
> the code labeled "not_vmalloc".

Above trick was written with single issue CPUs in mind.  I'd have to
verify the pipeline timing again against CPU manuals but off my memory
at least SB1 and R1x000 are fully protected against the hazards in
question.

  Ralf

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [patch] MIPS64 R4k TLB refill CP0 hazards
  2002-07-30 11:29     ` Ralf Baechle
@ 2002-07-30 12:09       ` Carsten Langgaard
  2002-07-30 12:44         ` Maciej W. Rozycki
  2002-07-30 12:39       ` Kevin D. Kissell
  1 sibling, 1 reply; 21+ messages in thread
From: Carsten Langgaard @ 2002-07-30 12:09 UTC (permalink / raw)
  To: Ralf Baechle; +Cc: Maciej W. Rozycki, linux-mips, linux-mips

Ralf Baechle wrote:

> On Tue, Jul 30, 2002 at 08:59:17AM +0200, Carsten Langgaard wrote:
>
> > We have been discussing this before, but I really don't like the idea of
> > solving the hazard problem with a branch. The branch will on some CPUs
> > (especially if they have a long pipeline) be a much bigger penalty than
> > we actually wants to solve the hazard. On other CPU (with branch
> > prediction) we may not even solve the hazard problem.
>
> The branch - which is used by other OSes btw. - for the R4000 / R4400 where
> this kind of taken branch implies a total delay of three cycles.  One for
> the branch delay slot plus two extra cycles for the killed instructions
> following the branch delay slot.  For R4600, R4700, R5000 and a bunch of
> derivates I've verified that according to the documentation this extra
> penalty of two cycles does not exist nor we need two extra cycles to handle
> the hazard.  In other words the branch trick - which also is used by
> some other commercial OS btw. - is providing best possible performance on
> a wide range of processors.

If we are going to make the exception generic and usable for as many CPUs as
possible, I don't think the branch trick is safe.
Why not make a hazard barrier that contains 0, 1 or 2 'ssnop's depending on
the CPU configuration ?
This way we will have the exact number of 'ssnop's to solve the hazard, without
adding extra penalty for other CPUs.


>
> > The 'nop' I used is not the solution either, instead we should use
> > 'ssnop' instructions, which will make sure we also solve the hazard
> > problem on superscalar CPUs.  We also need to have a hazard barrier in
> > the code labeled "not_vmalloc".
>
> Above trick was written with single issue CPUs in mind.  I'd have to
> verify the pipeline timing again against CPU manuals but off my memory
> at least SB1 and R1x000 are fully protected against the hazards in
> question.
>

Yes, I guess that is true.
I guess most dual issue CPUs have full protection against this type of
hazard, because it would be hard to say exactly how many 'ssnop's are needed to
solve the hazard in software.


>
>   Ralf

--
_    _ ____  ___   Carsten Langgaard   Mailto:carstenl@mips.com
|\  /|||___)(___   MIPS Denmark        Direct: +45 4486 5527
| \/ |||    ____)  Lautrupvang 4B      Switch: +45 4486 5555
  TECHNOLOGIES     2750 Ballerup       Fax...: +45 4486 5556
                   Denmark             http://www.mips.com

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [patch] MIPS64 R4k TLB refill CP0 hazards
  2002-07-30 12:09       ` Carsten Langgaard
@ 2002-07-30 12:44         ` Maciej W. Rozycki
  2002-07-30 22:47           ` Ralf Baechle
  0 siblings, 1 reply; 21+ messages in thread
From: Maciej W. Rozycki @ 2002-07-30 12:44 UTC (permalink / raw)
  To: Carsten Langgaard; +Cc: Ralf Baechle, linux-mips, linux-mips

On Tue, 30 Jul 2002, Carsten Langgaard wrote:

> > The branch - which is used by other OSes btw. - for the R4000 / R4400 where
> > this kind of taken branch implies a total delay of three cycles.  One for
> > the branch delay slot plus two extra cycles for the killed instructions
> > following the branch delay slot.  For R4600, R4700, R5000 and a bunch of
> > derivates I've verified that according to the documentation this extra
> > penalty of two cycles does not exist nor we need two extra cycles to handle
> > the hazard.  In other words the branch trick - which also is used by
> > some other commercial OS btw. - is providing best possible performance on
> > a wide range of processors.
> 
> If we are going to make the exception generic and usable for as many CPUs as
> possible, I don't thing the branch trick is save.
> Why not make a hazard barrier that contains 0, 1 or 2 'ssnop's depending on
> the CPU configuration ?
> This way we will have the exact number of 'ssnop' to solve the hazard, without
> adding extra penalty for other CPUs.

 Since the handler is critical for performance, it would be desirable to
have separate versions tuned for particular CPUs.  The branch for the
R4400 seems appropriate as it works unlike the documented code: the
R4000/R4400 manual as available from the MIPS site states a single
intervening instruction is needed before the last move to EntryLo and a
"tlbwr" or "tlbwi" (see Table F-1 and F-2).  So I conclude the branch is
really a workaround for a kind of erratum or a specification change. 

-- 
+  Maciej W. Rozycki, Technical University of Gdansk, Poland   +
+--------------------------------------------------------------+
+        e-mail: macro@ds2.pg.gda.pl, PGP key available        +

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [patch] MIPS64 R4k TLB refill CP0 hazards
  2002-07-30 12:44         ` Maciej W. Rozycki
@ 2002-07-30 22:47           ` Ralf Baechle
  2002-07-31 11:34             ` Maciej W. Rozycki
  0 siblings, 1 reply; 21+ messages in thread
From: Ralf Baechle @ 2002-07-30 22:47 UTC (permalink / raw)
  To: Maciej W. Rozycki; +Cc: Carsten Langgaard, linux-mips, linux-mips

On Tue, Jul 30, 2002 at 02:44:32PM +0200, Maciej W. Rozycki wrote:

>  Since the handler is critical for performance, it would be desireable to
> have separate versions tuned for particular CPUs.  The branch for the
> R4400 seems appropriate as it works unlike the documented code: the
> R4000/R4400 manual as available from the MIPS site states a single
> intervening instruction is needed before the last move to EntryLo and a
> "tlbwr" or "tlbwi" (see Table F-1 and F-2).  So I conclude the branch is
> really a workaround for a kind of erratum or a specification change. 

Nope, on R4000 four cycles are needed between the tlbwr and an eret
instruction; on the R4600 just two.

  Ralf

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [patch] MIPS64 R4k TLB refill CP0 hazards
  2002-07-30 22:47           ` Ralf Baechle
@ 2002-07-31 11:34             ` Maciej W. Rozycki
  2002-07-31 20:31               ` Ralf Baechle
  0 siblings, 1 reply; 21+ messages in thread
From: Maciej W. Rozycki @ 2002-07-31 11:34 UTC (permalink / raw)
  To: Ralf Baechle; +Cc: Carsten Langgaard, linux-mips, linux-mips

On Wed, 31 Jul 2002, Ralf Baechle wrote:

> Nope, on R4000 four cycles are needed between the tlbwr and a eret
> instruction; on the R4600 just two.

 Ugh, I missed this entirely, thanks for pointing it out.  The doc implies
three cycles for the R4000 actually, though. 

-- 
+  Maciej W. Rozycki, Technical University of Gdansk, Poland   +
+--------------------------------------------------------------+
+        e-mail: macro@ds2.pg.gda.pl, PGP key available        +

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [patch] MIPS64 R4k TLB refill CP0 hazards
  2002-07-31 11:34             ` Maciej W. Rozycki
@ 2002-07-31 20:31               ` Ralf Baechle
  2002-08-01 15:24                 ` Maciej W. Rozycki
  0 siblings, 1 reply; 21+ messages in thread
From: Ralf Baechle @ 2002-07-31 20:31 UTC (permalink / raw)
  To: Maciej W. Rozycki; +Cc: Carsten Langgaard, linux-mips, linux-mips

On Wed, Jul 31, 2002 at 01:34:17PM +0200, Maciej W. Rozycki wrote:

> > Nope, on R4000 four cycles are needed between the tlbwr and a eret
> > instruction; on the R4600 just two.
> 
>  Ugh, I missed this entirely, thanks for pointing it out.  The doc implies
> three cycles for the R4000 actually, though. 

I doublechecked the docs for the R4700 as well - just one cycle needed
between a tlbw and eret.

  Ralf

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [patch] MIPS64 R4k TLB refill CP0 hazards
  2002-07-31 20:31               ` Ralf Baechle
@ 2002-08-01 15:24                 ` Maciej W. Rozycki
  2002-08-01 17:18                   ` Ralf Baechle
  0 siblings, 1 reply; 21+ messages in thread
From: Maciej W. Rozycki @ 2002-08-01 15:24 UTC (permalink / raw)
  To: Ralf Baechle; +Cc: Carsten Langgaard, linux-mips, linux-mips

On Wed, 31 Jul 2002, Ralf Baechle wrote:

> I doublechecked the docs for the R4700 as well - just one cycle needed
> between a tlbw and eret.

 After looking at the generated assembly I discovered the handlers don't
fit in 128 bytes.  They didn't crash since I have modules disabled for
now, so the vmalloc path didn't get hit and the user path happened to fit,
but it was pure luck.  The path got hit before I fixed a bug in gas though
-- that's the explanation of the false cache error exceptions I used to
observe. 

 Here is a temporary corrected version I use now.  It works, but
ultimately the chosen handler should get copied somewhere above
KSEG0+0x200.  Also the "not_vmalloc" path looks bogus -- k0 gets loaded
with a random value from (k1) (if it doesn't happen to fault again), k1
retains the pointer and then both get loaded into the TLB -- intriguing... 

 I believe the patch should get applied for now to avoid surprises.  OK?

  Maciej

-- 
+  Maciej W. Rozycki, Technical University of Gdansk, Poland   +
+--------------------------------------------------------------+
+        e-mail: macro@ds2.pg.gda.pl, PGP key available        +

patch-mips-2.4.19-rc1-20020726-mips64-tlbex-r4k-2
diff -up --recursive --new-file linux-mips-2.4.19-rc1-20020726.macro/arch/mips64/mm/tlbex-r4k.S linux-mips-2.4.19-rc1-20020726/arch/mips64/mm/tlbex-r4k.S
--- linux-mips-2.4.19-rc1-20020726.macro/arch/mips64/mm/tlbex-r4k.S	2002-07-25 02:57:02.000000000 +0000
+++ linux-mips-2.4.19-rc1-20020726/arch/mips64/mm/tlbex-r4k.S	2002-07-31 23:20:21.000000000 +0000
@@ -5,6 +5,7 @@
  *
  * Copyright (C) 2000 Silicon Graphics, Inc.
  * Written by Ulf Carlsson (ulfc@engr.sgi.com)
+ * Copyright (C) 2002  Maciej W. Rozycki
  */
 #include <linux/config.h>
 #include <linux/init.h>
@@ -22,8 +23,7 @@
 	 * After this macro runs we have a pointer to the pte of the address
 	 * that caused the fault in in PTR.
 	 */
-
-	.macro	LOAD_PTE2, ptr, tmp
+	.macro	LOAD_PTE2, ptr, tmp, kaddr
 #ifdef CONFIG_SMP
 	dmfc0	\ptr, CP0_CONTEXT
 	dmfc0	\tmp, CP0_BADVADDR
@@ -32,8 +32,8 @@
 	dmfc0	\tmp, CP0_BADVADDR
 	dla	\ptr, pgd_current
 #endif
-	bltz	\tmp, kaddr
-	ld	\ptr, (\ptr)
+	bltz	\tmp, \kaddr
+	 ld	\ptr, (\ptr)
 	dsrl	\tmp, (PGDIR_SHIFT-3)		# get pgd offset in bytes
 	andi	\tmp, ((PTRS_PER_PGD - 1)<<3)
 	daddu	\ptr, \tmp			# add in pgd offset
@@ -48,6 +48,35 @@
 	daddu	\ptr, \tmp
 	.endm
 
+
+	/*
+	 * Ditto for the kernel table.
+	 */
+	.macro	LOAD_KPTE2, ptr, tmp, not_vmalloc
+	/*
+	 * First, determine that the address is in/above vmalloc range.
+	 */
+	dmfc0	\tmp, CP0_BADVADDR
+	dli	\ptr, VMALLOC_START
+
+	/*
+	 * Now find offset into kptbl.
+	 */
+	dsubu	\tmp, \tmp, \ptr
+	dla	\ptr, kptbl
+	dsrl	\tmp, (PAGE_SHIFT+1)		# get vpn2
+	dsll	\tmp, 4				# byte offset of pte
+	daddu	\ptr, \ptr, \tmp
+
+	/*
+	 * Determine that fault address is within vmalloc range.
+	 */
+	dla	\tmp, ekptbl
+	sltu	\tmp, \ptr, \tmp
+	beqz	\tmp, \not_vmalloc		# not vmalloc
+	.endm
+	
+
 	/*
 	 * This places the even/odd pte pair in the page table at the pte
 	 * entry pointed to by PTE into ENTRYLO0 and ENTRYLO1.
@@ -59,6 +88,7 @@
 	dmtc0	\pte1, CP0_ENTRYLO1		# load it
 	.endm
 
+
 	.text
 	.set	noreorder
 	.set	mips3
@@ -66,105 +96,93 @@
 	__INIT
 
 	.align	5
-FEXPORT(except_vec0)
+LEAF(except_vec0)
 	.set	noat
 	PANIC("Unused vector called")
 1:	b	1b
 	 nop
+END(except_vec0)
 
+
+	/*
+	 * TLB refill handler for the R4000.
+	 * Attention:  We may only use 32 instructions / 128 bytes.
+	 */
 	.align  5
 LEAF(except_vec1_r4k)
 	.set    noat
-	dla     k1, pgd_current
-	dmfc0   k0, CP0_BADVADDR
-	ld      k1, (k1)
-	bltz    k0, vmaddr
-	 dsrl   k0, (PGDIR_SHIFT-3)             # get pgd offset in bytes
-	andi    k0, ((PTRS_PER_PGD - 1)<<3)
-	daddu   k1, k0                          # add in pgd offset
-	dmfc0   k0, CP0_BADVADDR
-	ld      k1, (k1)                        # get pmd pointer
-	dsrl    k0, (PMD_SHIFT-3)               # get pmd offset in bytes
-	andi    k0, ((PTRS_PER_PMD - 1)<<3)
-	daddu   k1, k0                          # add in pmd offset
-	dmfc0   k0, CP0_XCONTEXT
-	andi    k0, 0xff0                       # get pte offset
-	ld      k1, (k1)                        # get pte pointer
-	daddu   k1, k0
-	ld      k0, 0(k1)                       # get even pte
-	ld      k1, 8(k1)                       # get odd pte
-	dsrl    k0, 6                           # convert to entrylo0
-	dmtc0   k0, CP0_ENTRYLO0                # load it
-	dsrl    k1, 6                           # convert to entrylo1
-	dmtc0   k1, CP0_ENTRYLO1                # load it
-	nop                                     # Need 2 cycles between mtc0
-	nop                                     #  and tlbwr (CP0 hazard).
-	tlbwr
-	nop
-	eret
-vmaddr:
-	dla     k0, handle_vmalloc_address
+	dla     k0, handle_vec1_r4k
 	jr      k0
 	 nop
 END(except_vec1_r4k)
-	
+
+	__FINIT
+
+	.align  5
+LEAF(handle_vec1_r4k)
+	.set    noat
+	LOAD_PTE2 k1 k0 9f
+	ld	k0, 0(k1)			# get even pte
+	ld	k1, 8(k1)			# get odd pte
+	PTE_RELOAD k0 k1
+	b	1f
+	 tlbwr
+1:	nop
+	eret
+
+9:						# handle the vmalloc range
+	LOAD_KPTE2 k1 k0 invalid_vmalloc_address
+	ld	k0, 0(k1)			# get even pte
+	ld	k1, 8(k1)			# get odd pte
+	PTE_RELOAD k0 k1
+	b	1f
+	 tlbwr
+1:	nop
+	eret
+END(handle_vec1_r4k)
+
+	__INIT
+
 	/*
 	 * TLB refill handler for the R10000.
 	 * Attention:  We may only use 32 instructions / 128 bytes.
 	 */
-
 	.align	5
 LEAF(except_vec1_r10k)
+	.set    noat
+	dla     k0, handle_vec1_r10k
+	jr      k0
+	 nop
+END(except_vec1_r10k)
+
+	__FINIT
+
+	.align	5
+LEAF(handle_vec1_r10k)
 	.set	noat
-	LOAD_PTE2 k1 k0
+	LOAD_PTE2 k1 k0 9f
 	ld	k0, 0(k1)			# get even pte
 	ld	k1, 8(k1)			# get odd pte
 	PTE_RELOAD k0 k1
 	nop
 	tlbwr
 	eret
-kaddr:
-	dla	k0, handle_vmalloc_address	# MAPPED kernel needs this
-	jr	k0
-	 nop
-	END(except_vec1_r10k)
-
-	__FINIT
 
-	.align	5
-LEAF(handle_vmalloc_address)
-	.set	noat
-	/*
-	 * First, determine that the address is in/above vmalloc range.
-	 */
-	dmfc0	k0, CP0_BADVADDR
-	dli	k1, VMALLOC_START
-
-	/*
-	 * Now find offset into kptbl.
-	 */
-	dsubu	k0, k0, k1
-	dla	k1, kptbl
-	dsrl	k0, (PAGE_SHIFT+1)		# get vpn2
-	dsll	k0, 4				# byte offset of pte
-	daddu	k1, k1, k0
-
-	/*
-	 * Determine that fault address is within vmalloc range.
-	 */
-	dla	k0, ekptbl
-	sltu	k0, k1, k0
-	beqz	k0, not_vmalloc
-
-	/*
-	 * Load cp0 registers.
-	 */
+9:						# handle the vmalloc range
+	LOAD_KPTE2 k1 k0 invalid_vmalloc_address
 	ld	k0, 0(k1)			# get even pte
 	ld	k1, 8(k1)			# get odd pte
-
-not_vmalloc:
 	PTE_RELOAD k0 k1
 	nop
 	tlbwr
 	eret
-	END(handle_vmalloc_address)
+END(handle_vec1_r10k)
+
+
+	.align	5
+LEAF(invalid_vmalloc_address)
+	.set	noat
+	PANIC("Invalid kernel address")
+1:	b	1b
+	 nop
+END(invalid_vmalloc_address)

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [patch] MIPS64 R4k TLB refill CP0 hazards
  2002-08-01 15:24                 ` Maciej W. Rozycki
@ 2002-08-01 17:18                   ` Ralf Baechle
  2002-08-02  9:32                     ` Carsten Langgaard
  0 siblings, 1 reply; 21+ messages in thread
From: Ralf Baechle @ 2002-08-01 17:18 UTC (permalink / raw)
  To: Maciej W. Rozycki; +Cc: Carsten Langgaard, linux-mips, linux-mips

On Thu, Aug 01, 2002 at 05:24:43PM +0200, Maciej W. Rozycki wrote:

>  After looking at the generated assembly I discovered the handlers don't
> fit in 128 bytes.  They didn't crash since I have modules disabled for
> now, so the vmalloc path didn't get hit and the user path happened to fit,
> but it was pure luck.  The path got hit before I fixed a bug in gas though
> -- that's the explanation of the false cache error exceptions I used to
> observe. 

Ouch.  It was a known problem but we simply ignored it for a while as that
handler just overwrites the cache error handler which normally should be
used extremely rarely, if at all.  The problem is somewhat itching by now
as we're supporting the SB1 core which in its revision one may throw
spurious cache errors, so the handler is actually used ...

  Ralf

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [patch] MIPS64 R4k TLB refill CP0 hazards
  2002-08-01 17:18                   ` Ralf Baechle
@ 2002-08-02  9:32                     ` Carsten Langgaard
  2002-08-02 11:05                       ` Ralf Baechle
  0 siblings, 1 reply; 21+ messages in thread
From: Carsten Langgaard @ 2002-08-02  9:32 UTC (permalink / raw)
  To: Ralf Baechle; +Cc: Maciej W. Rozycki, linux-mips, linux-mips

Ralf Baechle wrote:

> On Thu, Aug 01, 2002 at 05:24:43PM +0200, Maciej W. Rozycki wrote:
>
> >  After looking at the generated assembly I discovered the handlers don't
> > fit in 128 bytes.  They didn't crash since I have modules disabled for
> > now, so the vmalloc path didn't get hit and the user path happened to fit,
> > but it was pure luck.  The path got hit before I fixed a bug in gas though
> > -- that's the explanation of the false cache error exceptions I used to
> > observe.
>
> Ouch.  It was a known problem but we simply ignored it for a while as that
> handler just overwrites the cache error handler which normally should be
> used extremly rarely, if at all.  The problem is somewhat itching by now
> as we're supporting the SB1 core which in it's revision one may throw
> spurious cache errors, so the handler is actually used ...
>

Maybe it's time for some intelligent check for the size of these exception
routines.

/Carsten



--
_    _ ____  ___   Carsten Langgaard   Mailto:carstenl@mips.com
|\  /|||___)(___   MIPS Denmark        Direct: +45 4486 5527
| \/ |||    ____)  Lautrupvang 4B      Switch: +45 4486 5555
  TECHNOLOGIES     2750 Ballerup       Fax...: +45 4486 5556
                   Denmark             http://www.mips.com

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [patch] MIPS64 R4k TLB refill CP0 hazards
  2002-08-02  9:32                     ` Carsten Langgaard
@ 2002-08-02 11:05                       ` Ralf Baechle
  2002-08-02 11:09                         ` Carsten Langgaard
  0 siblings, 1 reply; 21+ messages in thread
From: Ralf Baechle @ 2002-08-02 11:05 UTC (permalink / raw)
  To: Carsten Langgaard; +Cc: Maciej W. Rozycki, linux-mips, linux-mips

On Fri, Aug 02, 2002 at 11:32:18AM +0200, Carsten Langgaard wrote:

> > >  After looking at the generated assembly I discovered the handlers don't
> > > fit in 128 bytes.  They didn't crash since I have modules disabled for
> > > now, so the vmalloc path didn't get hit and the user path happened to fit,
> > > but it was pure luck.  The path got hit before I fixed a bug in gas though
> > > -- that's the explanation of the false cache error exceptions I used to
> > > observe.
> >
> > Ouch.  It was a known problem but we simply ignored it for a while as that
> > handler just overwrites the cache error handler which normally should be
> > used extremly rarely, if at all.  The problem is somewhat itching by now
> > as we're supporting the SB1 core which in it's revision one may throw
> > spurious cache errors, so the handler is actually used ...
> >
> 
> Maybe it's time for some intelligent check for the size of these exception
> routine.

Easy trick at compile time which will just inflate the object code a little
bit:

        .align  5
LEAF(except_vec1_r4k)
	[...]
	END(except_vec1_r4k)
	.org	except_vec1_r4k + 0x80

This will result in an assembler error if the body of the except_vec1_r4k
function is bigger than 0x80.

  Ralf

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [patch] MIPS64 R4k TLB refill CP0 hazards
  2002-08-02 11:05                       ` Ralf Baechle
@ 2002-08-02 11:09                         ` Carsten Langgaard
  0 siblings, 0 replies; 21+ messages in thread
From: Carsten Langgaard @ 2002-08-02 11:09 UTC (permalink / raw)
  To: Ralf Baechle; +Cc: Maciej W. Rozycki, linux-mips, linux-mips

Ralf Baechle wrote:

> On Fri, Aug 02, 2002 at 11:32:18AM +0200, Carsten Langgaard wrote:
>
> > > >  After looking at the generated assembly I discovered the handlers don't
> > > > fit in 128 bytes.  They didn't crash since I have modules disabled for
> > > > now, so the vmalloc path didn't get hit and the user path happened to fit,
> > > > but it was pure luck.  The path got hit before I fixed a bug in gas though
> > > > -- that's the explanation of the false cache error exceptions I used to
> > > > observe.
> > >
> > > Ouch.  It was a known problem but we simply ignored it for a while as that
> > > handler just overwrites the cache error handler which normally should be
> > > used extremly rarely, if at all.  The problem is somewhat itching by now
> > > as we're supporting the SB1 core which in it's revision one may throw
> > > spurious cache errors, so the handler is actually used ...
> > >
> >
> > Maybe it's time for some intelligent check for the size of these exception
> > routine.
>
> Easy trick at compile time which will just inflate the object code a little
> bit:
>
>         .align  5
> LEAF(except_vec1_r4k)
>         [...]
>         END(except_vec1_r4k)
>         .org    except_vec1_r4k + 0x80
>
> This will result in an assembler error if the body of the except_vec1_r4k
> function is bigger than 0x80.
>

Great idea, please put it in the code.

/Carsten


--
_    _ ____  ___   Carsten Langgaard   Mailto:carstenl@mips.com
|\  /|||___)(___   MIPS Denmark        Direct: +45 4486 5527
| \/ |||    ____)  Lautrupvang 4B      Switch: +45 4486 5555
  TECHNOLOGIES     2750 Ballerup       Fax...: +45 4486 5556
                   Denmark             http://www.mips.com

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [patch] MIPS64 R4k TLB refill CP0 hazards
  2002-07-30 11:29     ` Ralf Baechle
  2002-07-30 12:09       ` Carsten Langgaard
@ 2002-07-30 12:39       ` Kevin D. Kissell
  2002-07-30 12:39         ` Kevin D. Kissell
  2002-07-31  2:05         ` Ralf Baechle
  1 sibling, 2 replies; 21+ messages in thread
From: Kevin D. Kissell @ 2002-07-30 12:39 UTC (permalink / raw)
  To: Ralf Baechle, Carsten Langgaard; +Cc: Maciej W. Rozycki, linux-mips, linux-mips

From: "Ralf Baechle" <ralf@oss.sgi.com>
> On Tue, Jul 30, 2002 at 08:59:17AM +0200, Carsten Langgaard wrote:
> 
> > We have been discussing this before, but I really don't like the idea of
> > solving the hazard problem with a branch. The branch will on some CPUs
> > (especially if they have a long pipeline) be a much bigger penalty than
> > we actually need to solve the hazard. On other CPUs (with branch
> > prediction) we may not even solve the hazard problem.
> 
> The branch - which is used by other OSes btw. - is for the R4000 / R4400 where
> this kind of taken branch implies a total delay of three cycles.  One for
> the branch delay slot plus two extra cycles for the killed instructions
> following the branch delay slot.  For R4600, R4700, R5000 and a bunch of
> derivatives I've verified that according to the documentation this extra
> penalty of two cycles does not exist nor do we need two extra cycles to handle
> the hazard.  In other words the branch trick - which also is used by
> some other commercial OS btw. - is providing best possible performance on
> a wide range of processors.

Which would be a fairly compelling argument if (a) we were constrained
for some reason to only have one handler and (b) the majority of MIPS
Linux systems being built had R4000/4400/4600/4700/5000 CPUs in
them.  But neither of those assumptions is true.  I don't see any cases
in the kernel of assembler functions being put into the .init segment of
the kernel image, but I would think that it could be (and anyway should
be) done with the various exception vectors, and in any case they are
dynamically installed based on the detected CPU.  If people using
old workstations want to use a branch-based timing hack in their
TLB handlers, that's all well and good.  But there is no guarantee that
the trick will work on all future (or even current) MIPS CPUs, and
I agree with Carsten that it is inappropriate for the generic or default
MIPS32 handlers.  I guess we need to propose a patch to allow
the Indy/Decstation crowd to retain their branch-based scheme,
but to quarantine it from the rest of the MIPS/Linux universe.

            Regards,

            Kevin K.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [patch] MIPS64 R4k TLB refill CP0 hazards
  2002-07-30 12:39       ` Kevin D. Kissell
@ 2002-07-30 12:39         ` Kevin D. Kissell
  2002-07-31  2:05         ` Ralf Baechle
  1 sibling, 0 replies; 21+ messages in thread
From: Kevin D. Kissell @ 2002-07-30 12:39 UTC (permalink / raw)
  To: Ralf Baechle, Carsten Langgaard; +Cc: Maciej W. Rozycki, linux-mips, linux-mips

From: "Ralf Baechle" <ralf@oss.sgi.com>
> On Tue, Jul 30, 2002 at 08:59:17AM +0200, Carsten Langgaard wrote:
> 
> > We have been discussing this before, but I really don't like the idea of
> > solving the hazard problem with a branch. The branch will on some CPUs
> > (especially if they have a long pipeline) be a much bigger penalty than
> > we actually need to solve the hazard. On other CPUs (with branch
> > prediction) we may not even solve the hazard problem.
> 
> The branch - which is used by other OSes btw. - is for the R4000 / R4400 where
> this kind of taken branch implies a total delay of three cycles.  One for
> the branch delay slot plus two extra cycles for the killed instructions
> following the branch delay slot.  For R4600, R4700, R5000 and a bunch of
> derivatives I've verified that according to the documentation this extra
> penalty of two cycles does not exist nor do we need two extra cycles to handle
> the hazard.  In other words the branch trick - which also is used by
> some other commercial OS btw. - is providing best possible performance on
> a wide range of processors.

Which would be a fairly compelling argument if (a) we were constrained
for some reason to only have one handler and (b) the majority of MIPS
Linux systems being built had R4000/4400/4600/4700/5000 CPUs in
them.  But neither of those assumptions is true.  I don't see any cases
in the kernel of assembler functions being put into the .init segment of
the kernel image, but I would think that it could be (and anyway should
be) done with the various exception vectors, and in any case they are
dynamically installed based on the detected CPU.  If people using
old workstations want to use a branch-based timing hack in their
TLB handlers, that's all well and good.  But there is no guarantee that
the trick will work on all future (or even current) MIPS CPUs, and
I agree with Carsten that it is inappropriate for the generic or default
MIPS32 handlers.  I guess we need to propose a patch to allow
the Indy/Decstation crowd to retain their branch-based scheme,
but to quarantine it from the rest of the MIPS/Linux universe.

            Regards,

            Kevin K.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [patch] MIPS64 R4k TLB refill CP0 hazards
  2002-07-30 12:39       ` Kevin D. Kissell
  2002-07-30 12:39         ` Kevin D. Kissell
@ 2002-07-31  2:05         ` Ralf Baechle
  2002-07-31  7:28           ` Kevin D. Kissell
  1 sibling, 1 reply; 21+ messages in thread
From: Ralf Baechle @ 2002-07-31  2:05 UTC (permalink / raw)
  To: Kevin D. Kissell
  Cc: Carsten Langgaard, Maciej W. Rozycki, linux-mips, linux-mips

On Tue, Jul 30, 2002 at 02:39:24PM +0200, Kevin D. Kissell wrote:

> > following the branch delay slot.  For R4600, R4700, R5000 and a bunch of
> > derivatives I've verified that according to the documentation this extra
> > penalty of two cycles does not exist nor do we need two extra cycles to handle
> > the hazard.  In other words the branch trick - which also is used by
> > some other commercial OS btw. - is providing best possible performance on
> > a wide range of processors.
> 
> Which would be a fairly compelling argument if (a) we were constrained
> for some reason to only have one handler and (b) the majority of MIPS
> Linux systems being built had R4000/4400/4600/4700/5000 CPUs in
> them.  But neither of those assumptions is true.  I don't see any cases
> in the kernel of assembler functions being put into the .init segment of
> the kernel image, but I would think that it could be (and anyway should
> be) done with the various exception vectors, and in any case they are
> dynamically installed based on the detected CPU.  If people using
> old workstations want to use a branch-based timing hack in their
> TLB handlers, that's all well and good.  But there is no guarantee that
> the trick will work on all future (or even current) MIPS CPUs, and
> I agree with Carsten that it is inappropriate for the generic or default
> MIPS32 handlers.  I guess we need to propose a patch to allow
> the Indy/Decstation crowd to retain their branch-based scheme,
> but to quarantine it from the rest of the MIPS/Linux universe.

Basically we have two groups of interrupt handlers.  Some contain
workarounds for hardware bugs; the rest are very similar except having
to handle different hazards.  I was already thinking about building the
actual exception handlers from a piece of code that inserts the right
number of (ss)nops etc. as required into the right place, thereby
producing an optimal handler for every CPU.

  Ralf

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [patch] MIPS64 R4k TLB refill CP0 hazards
  2002-07-31  2:05         ` Ralf Baechle
@ 2002-07-31  7:28           ` Kevin D. Kissell
  2002-07-31  7:28             ` Kevin D. Kissell
  2002-07-31 11:49             ` Maciej W. Rozycki
  0 siblings, 2 replies; 21+ messages in thread
From: Kevin D. Kissell @ 2002-07-31  7:28 UTC (permalink / raw)
  To: Ralf Baechle; +Cc: Carsten Langgaard, Maciej W. Rozycki, linux-mips, linux-mips

From: "Ralf Baechle" <ralf@oss.sgi.com>:
> Basically we have two groups of interrupt handlers.  Some contain
> workarounds for hardware bugs; the rest are very similar except having
> to handle different hazards.  I was already thinking about building the
> actuall exception handlers from a piece of code that inserts the right
> number of (ss)nops etc. as required into the right place, thereby
> producing an optimal handler for every CPU.

I really don't think that's a good idea.  That implies that we
could no longer simply inspect the exception handlers in
the source code or disassembled kernel binary file to 
analyse them for correctness, and I think it would lead
to unnecessary and hard-to-find bugs.  My personal
recommendation would be to keep the model we have
today, wherein handlers are selected at boot time from
some set of candidates built into the kernel binary, with
the slight modification that the templates be loaded into 
the init segment, so that the memory consumed can be
reclaimed at run time.  That would eliminate the only
argument I can see against having a larger set of 
statically-built optimized handlers.  The current
selection process is ad-hoc based on CPU ID.
We could easily formalize that a bit, and even
provide a boot command line option to override
the automatic selection with something "safer".

            Regards,

            Kevin K.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [patch] MIPS64 R4k TLB refill CP0 hazards
  2002-07-31  7:28           ` Kevin D. Kissell
@ 2002-07-31  7:28             ` Kevin D. Kissell
  2002-07-31 11:49             ` Maciej W. Rozycki
  1 sibling, 0 replies; 21+ messages in thread
From: Kevin D. Kissell @ 2002-07-31  7:28 UTC (permalink / raw)
  To: Ralf Baechle; +Cc: Carsten Langgaard, Maciej W. Rozycki, linux-mips, linux-mips

From: "Ralf Baechle" <ralf@oss.sgi.com>:
> Basically we have two groups of interrupt handlers.  Some contain
> workarounds for hardware bugs; the rest are very similar except having
> to handle different hazards.  I was already thinking about building the
> actuall exception handlers from a piece of code that inserts the right
> number of (ss)nops etc. as required into the right place, thereby
> producing an optimal handler for every CPU.

I really don't think that's a good idea.  That implies that we
could no longer simply inspect the exception handlers in
the source code or disassembled kernel binary file to 
analyse them for correctness, and I think it would lead
to unnecessary and hard-to-find bugs.  My personal
recommendation would be to keep the model we have
today, wherein handlers are selected at boot time from
some set of candidates built into the kernel binary, with
the slight modification that the templates be loaded into 
the init segment, so that the memory consumed can be
reclaimed at run time.  That would eliminate the only
argument I can see against having a larger set of 
statically-built optimized handlers.  The current
selection process is ad-hoc based on CPU ID.
We could easily formalize that a bit, and even
provide a boot command line option to override
the automatic selection with something "safer".

            Regards,

            Kevin K.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [patch] MIPS64 R4k TLB refill CP0 hazards
  2002-07-31  7:28           ` Kevin D. Kissell
  2002-07-31  7:28             ` Kevin D. Kissell
@ 2002-07-31 11:49             ` Maciej W. Rozycki
  2002-07-31 18:22               ` Ralf Baechle
  1 sibling, 1 reply; 21+ messages in thread
From: Maciej W. Rozycki @ 2002-07-31 11:49 UTC (permalink / raw)
  To: Kevin D. Kissell; +Cc: Ralf Baechle, Carsten Langgaard, linux-mips, linux-mips

On Wed, 31 Jul 2002, Kevin D. Kissell wrote:

> I really don't think that's a good idea.  That implies that we
> could no longer simply inspect the exception handlers in
> the source code or disassembled kernel binary file to 
> analyse them for correctness, and I think it would lead
> to unnecessary and hard-to-find bugs.  My personal
> recommendation would be to keep the model we have
> today, wherein handlers are selected at boot time from
> some set of candidates built into the kernel binary, with

 Well, as long as we don't have an insane number of variations (say 32+),
I tend to agree.  Thanks to macros, maintaining source code is not that
hard.  If we ever reach the sanity limit, we may rearrange the source
again.

> the slight modification that the templates be loaded into 
> the init segment, so that the memory consumed can be
> reclaimed at run time.  That would eliminate the only

 That already happens now.  Except for the vmalloc path, which could
likely be handled this way as well, by copying the appropriate handler
to KSEG0 somewhere above standard exception vectors.  That would have a
micro-optimization advantage: we could use the "b" instruction, instead of
the much longer "dla/jr" pair.  Still possibly we can have a single
vmalloc handler only as the epilogue should be the same as for the user
path -- we need to find a way to hook a jump back somehow in this
case.

> argument I can see against having a larger set of 
> statically-built optimized handlers.  The current
> selection process is ad-hoc based on CPU ID.
> We could easily formalize that a bit, and even

 Well, the current approach seems appropriate.  Only a comment here and
there might be useful, to explain why a particular handler is used (with
an erratum text included if applicable).

> provide a boot command line option to override
> the automatic selection with something "safer".

 Hmm, I think that's overkill, although for debugging purposes, a
single extremely conservative handler (possibly with some status output to
the log) might be selectable as an alternative.

  Maciej

-- 
+  Maciej W. Rozycki, Technical University of Gdansk, Poland   +
+--------------------------------------------------------------+
+        e-mail: macro@ds2.pg.gda.pl, PGP key available        +

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [patch] MIPS64 R4k TLB refill CP0 hazards
  2002-07-31 11:49             ` Maciej W. Rozycki
@ 2002-07-31 18:22               ` Ralf Baechle
  0 siblings, 0 replies; 21+ messages in thread
From: Ralf Baechle @ 2002-07-31 18:22 UTC (permalink / raw)
  To: Maciej W. Rozycki
  Cc: Kevin D. Kissell, Carsten Langgaard, linux-mips, linux-mips

On Wed, Jul 31, 2002 at 01:49:57PM +0200, Maciej W. Rozycki wrote:

>  Hmm, I think that's an overkill, although for debugging purposes, a
> single extremely conservative handler (possibly with some status output to
> the log) might be selectable as an alternative.

Look at the C variation of the exception handler in the mips64 code.  It
was pretty useful to add debugging checks during early mips64 development.

  Ralf

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2002-08-02 11:08 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2002-07-29 15:23 [patch] MIPS64 R4k TLB refill CP0 hazards Maciej W. Rozycki
2002-07-29 20:10 ` Carsten Langgaard
2002-07-30  6:59   ` Carsten Langgaard
2002-07-30 11:29     ` Ralf Baechle
2002-07-30 12:09       ` Carsten Langgaard
2002-07-30 12:44         ` Maciej W. Rozycki
2002-07-30 22:47           ` Ralf Baechle
2002-07-31 11:34             ` Maciej W. Rozycki
2002-07-31 20:31               ` Ralf Baechle
2002-08-01 15:24                 ` Maciej W. Rozycki
2002-08-01 17:18                   ` Ralf Baechle
2002-08-02  9:32                     ` Carsten Langgaard
2002-08-02 11:05                       ` Ralf Baechle
2002-08-02 11:09                         ` Carsten Langgaard
2002-07-30 12:39       ` Kevin D. Kissell
2002-07-30 12:39         ` Kevin D. Kissell
2002-07-31  2:05         ` Ralf Baechle
2002-07-31  7:28           ` Kevin D. Kissell
2002-07-31  7:28             ` Kevin D. Kissell
2002-07-31 11:49             ` Maciej W. Rozycki
2002-07-31 18:22               ` Ralf Baechle

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox