All of lore.kernel.org
 help / color / mirror / Atom feed
* [patch] pg-r4k.c cp0 hazards for R4000/R4400
@ 2004-01-26 17:12 Maciej W. Rozycki
  2004-02-02 15:08 ` Ralf Baechle
  0 siblings, 1 reply; 6+ messages in thread
From: Maciej W. Rozycki @ 2004-01-26 17:12 UTC (permalink / raw)
  To: Ralf Baechle; +Cc: linux-mips

Ralf,

 The R4000/R4400 has a coprocessor 0 hazard when a P-cache operation is
less than two non-load, non-cache instructions apart from a store to the
same line.  For processors without a secondary cache, the code in pg-r4k.c
currently issues a Create Dirty Exclusive D-cache operation and then
immediately executes consecutive stores to the same line, therefore 
fulfilling the conditions for the hazard.

 The following patch changes the problematic operations to be performed on 
the cache line following the one to be written immediately.  It is safe to 
do so, because the cache operations are only a performance hint and are 
not required for data coherency.  However it is essential not to bypass 
the end of the page, so the trailing area of the page is excluded from 
these cache operation, similarly to what has already been done for 
prefetching.

 Actually, I'd like to optimize the functions a bit further, specifically 
to avoid multiple cacheops to the same line (if you don't mind), but 
currently I'd like to apply this change to assure correct operation.  As I 
have no non-SC R4000/R4400 system, this was untested, but perhaps studying 
the problem covered by the -scache patch sent previously will show if the 
hazard is indeed avoided.

 The patch also increases the buffers a bit for three reasons:

1. copy_page_array is already too small for the 128-byte S-cache line 
case. ;-)

2. The trail for non-SC R4000/R4400 increases buffer consumption and I was 
too lazy to calculate the requirements.

3. The planned optimization will likely require a little bit more space as 
well.

BTW, I was unable to reproduce your instruction count calculation for the
prefetch case; other results seem OK.

 OK to apply?

  Maciej

-- 
+  Maciej W. Rozycki, Technical University of Gdansk, Poland   +
+--------------------------------------------------------------+
+        e-mail: macro@ds2.pg.gda.pl, PGP key available        +

patch-mips-2.4.24-pre2-20040116-mips-pg-r4k-hazard-7
diff -up --recursive --new-file linux-mips-2.4.24-pre2-20040116.macro/arch/mips/mm/pg-r4k.c linux-mips-2.4.24-pre2-20040116/arch/mips/mm/pg-r4k.c
--- linux-mips-2.4.24-pre2-20040116.macro/arch/mips/mm/pg-r4k.c	2004-01-26 16:05:38.000000000 +0000
+++ linux-mips-2.4.24-pre2-20040116/arch/mips/mm/pg-r4k.c	2004-01-26 16:06:21.000000000 +0000
@@ -34,7 +34,7 @@
  * With prefetching, 16 byte strides			0xa0 bytes
  */
 
-static unsigned int clear_page_array[0xa0 / 4];
+static unsigned int clear_page_array[0x100 / 4];
 
 void clear_page(void * page) __attribute__((alias("clear_page_array")));
 
@@ -46,7 +46,7 @@ void clear_page(void * page) __attribute
  * R4600 v2.0:						0x84 bytes
  * With prefetching, 16 byte strides			0xb8 bytes
  */
-static unsigned int copy_page_array[0xb8 / 4];
+static unsigned int copy_page_array[0x100 / 4];
 
 void copy_page(void *to, void *from) __attribute__((alias("copy_page_array")));
 
@@ -159,7 +159,7 @@ static inline void build_cdex(void)
 	mi.c_format.rs         = 4;		/* $a0 */
 	mi.c_format.c_op       = 3;		/* Create Dirty Exclusive */
 	mi.c_format.cache      = 1;		/* Data Cache */
-	mi.c_format.simmediate = store_offset;
+	mi.c_format.simmediate = store_offset + cpu_dcache_line_size();
 
 	*epc++ = mi.word;
 }
@@ -300,6 +300,8 @@ static inline void build_jr_ra(void)
 
 void __init build_clear_page(void)
 {
+	int lead_size, loop_size;
+
 	epc = (unsigned int *) &clear_page_array;
 
 	if (cpu_has_prefetch) {
@@ -316,7 +318,20 @@ void __init build_clear_page(void)
 		}
 	}
 
-	build_addiu_a2_a0(PAGE_SIZE - (cpu_has_prefetch ? pref_offset_clear : 0));
+	if (cpu_has_prefetch)
+		lead_size = PAGE_SIZE - pref_offset_clear;
+	else if (cpu_has_cache_cdex_p && !cpu_has_cache_cdex_s) {
+		loop_size = 4;
+		if (cpu_has_64bit_registers)
+			loop_size *= 2;
+		loop_size *= 8;
+		if (loop_size < cpu_dcache_line_size())
+			loop_size = cpu_dcache_line_size();
+		lead_size = PAGE_SIZE - loop_size;
+	} else
+		lead_size = PAGE_SIZE;
+
+	build_addiu_a2_a0(lead_size);
 
 	if (R4600_V2_HIT_CACHEOP_WAR && ((read_c0_prid() & 0xfff0) == 0x2020)) {
 		*epc++ = 0x40026000;		/* mfc0    $v0, $12	*/
@@ -354,8 +369,8 @@ dest = epc;
 	build_bne(dest);
 	 build_store_reg(0);
 
-	if (cpu_has_prefetch && pref_offset_clear) {
-		build_addiu_a2_a0(pref_offset_clear);
+	if (lead_size < PAGE_SIZE) {
+		build_addiu_a2_a0(PAGE_SIZE - lead_size);
 	dest = epc;
 		__build_store_reg(0);
 		__build_store_reg(0);
@@ -383,9 +398,26 @@ dest = epc;
 
 void __init build_copy_page(void)
 {
+	int lead_size, loop_size;
+
 	epc = (unsigned int *) &copy_page_array;
 
-	build_addiu_a2_a0(PAGE_SIZE - (cpu_has_prefetch ? pref_offset_copy : 0));
+	if (cpu_has_prefetch)
+		lead_size = PAGE_SIZE - pref_offset_copy;
+	else if (cpu_has_cache_cdex_p && !cpu_has_cache_cdex_s) {
+		loop_size = 4;
+#ifdef CONFIG_MIPS64
+		loop_size *= 2;
+#endif
+		loop_size *= 8;
+		if (loop_size < cpu_dcache_line_size())
+			loop_size = cpu_dcache_line_size();
+		lead_size = PAGE_SIZE - loop_size;
+	} else
+		lead_size = PAGE_SIZE;
+
+	build_addiu_a2_a0(lead_size);
+
 
 	if (R4600_V2_HIT_CACHEOP_WAR && ((read_c0_prid() & 0xfff0) == 0x2020)) {
 		*epc++ = 0x40026000;		/* mfc0    $v0, $12	*/
@@ -440,8 +472,8 @@ dest = epc;
 	build_bne(dest);
 	 build_store_reg(11);
 
-	if (cpu_has_prefetch && pref_offset_copy) {
-		build_addiu_a2_a0(pref_offset_copy);
+	if (lead_size < PAGE_SIZE) {
+		build_addiu_a2_a0(PAGE_SIZE - lead_size);
 	dest = epc;
 		__build_load_reg( 8);
 		__build_load_reg( 9);
diff -up --recursive --new-file linux-mips-2.4.24-pre2-20040116.macro/arch/mips64/mm/pg-r4k.c linux-mips-2.4.24-pre2-20040116/arch/mips64/mm/pg-r4k.c
--- linux-mips-2.4.24-pre2-20040116.macro/arch/mips64/mm/pg-r4k.c	2004-01-26 16:05:38.000000000 +0000
+++ linux-mips-2.4.24-pre2-20040116/arch/mips64/mm/pg-r4k.c	2004-01-26 16:06:21.000000000 +0000
@@ -34,7 +34,7 @@
  * With prefetching, 16 byte strides			0xa0 bytes
  */
 
-static unsigned int clear_page_array[0xa0 / 4];
+static unsigned int clear_page_array[0x100 / 4];
 
 void clear_page(void * page) __attribute__((alias("clear_page_array")));
 
@@ -46,7 +46,7 @@ void clear_page(void * page) __attribute
  * R4600 v2.0:						0x84 bytes
  * With prefetching, 16 byte strides			0xb8 bytes
  */
-static unsigned int copy_page_array[0xb8 / 4];
+static unsigned int copy_page_array[0x100 / 4];
 
 void copy_page(void *to, void *from) __attribute__((alias("copy_page_array")));
 
@@ -159,7 +159,7 @@ static inline void build_cdex(void)
 	mi.c_format.rs         = 4;		/* $a0 */
 	mi.c_format.c_op       = 3;		/* Create Dirty Exclusive */
 	mi.c_format.cache      = 1;		/* Data Cache */
-	mi.c_format.simmediate = store_offset;
+	mi.c_format.simmediate = store_offset + cpu_dcache_line_size();
 
 	*epc++ = mi.word;
 }
@@ -300,6 +300,8 @@ static inline void build_jr_ra(void)
 
 void __init build_clear_page(void)
 {
+	int lead_size, loop_size;
+
 	epc = (unsigned int *) &clear_page_array;
 
 	if (cpu_has_prefetch) {
@@ -316,7 +318,20 @@ void __init build_clear_page(void)
 		}
 	}
 
-	build_addiu_a2_a0(PAGE_SIZE - (cpu_has_prefetch ? pref_offset_clear : 0));
+	if (cpu_has_prefetch)
+		lead_size = PAGE_SIZE - pref_offset_clear;
+	else if (cpu_has_cache_cdex_p && !cpu_has_cache_cdex_s) {
+		loop_size = 4;
+		if (cpu_has_64bit_registers)
+			loop_size *= 2;
+		loop_size *= 8;
+		if (loop_size < cpu_dcache_line_size())
+			loop_size = cpu_dcache_line_size();
+		lead_size = PAGE_SIZE - loop_size;
+	} else
+		lead_size = PAGE_SIZE;
+
+	build_addiu_a2_a0(lead_size);
 
 	if (R4600_V2_HIT_CACHEOP_WAR && ((read_c0_prid() & 0xfff0) == 0x2020)) {
 		*epc++ = 0x40026000;		/* mfc0    $v0, $12	*/
@@ -354,8 +369,8 @@ dest = epc;
 	build_bne(dest);
 	 build_store_reg(0);
 
-	if (cpu_has_prefetch && pref_offset_clear) {
-		build_addiu_a2_a0(pref_offset_clear);
+	if (lead_size < PAGE_SIZE) {
+		build_addiu_a2_a0(PAGE_SIZE - lead_size);
 	dest = epc;
 		__build_store_reg(0);
 		__build_store_reg(0);
@@ -383,9 +398,26 @@ dest = epc;
 
 void __init build_copy_page(void)
 {
+	int lead_size, loop_size;
+
 	epc = (unsigned int *) &copy_page_array;
 
-	build_addiu_a2_a0(PAGE_SIZE - (cpu_has_prefetch ? pref_offset_copy : 0));
+	if (cpu_has_prefetch)
+		lead_size = PAGE_SIZE - pref_offset_copy;
+	else if (cpu_has_cache_cdex_p && !cpu_has_cache_cdex_s) {
+		loop_size = 4;
+#ifdef CONFIG_MIPS64
+		loop_size *= 2;
+#endif
+		loop_size *= 8;
+		if (loop_size < cpu_dcache_line_size())
+			loop_size = cpu_dcache_line_size();
+		lead_size = PAGE_SIZE - loop_size;
+	} else
+		lead_size = PAGE_SIZE;
+
+	build_addiu_a2_a0(lead_size);
+
 
 	if (R4600_V2_HIT_CACHEOP_WAR && ((read_c0_prid() & 0xfff0) == 0x2020)) {
 		*epc++ = 0x40026000;		/* mfc0    $v0, $12	*/
@@ -440,8 +472,8 @@ dest = epc;
 	build_bne(dest);
 	 build_store_reg(11);
 
-	if (cpu_has_prefetch && pref_offset_copy) {
-		build_addiu_a2_a0(pref_offset_copy);
+	if (lead_size < PAGE_SIZE) {
+		build_addiu_a2_a0(PAGE_SIZE - lead_size);
 	dest = epc;
 		__build_load_reg( 8);
 		__build_load_reg( 9);

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [patch] pg-r4k.c cp0 hazards for R4000/R4400
  2004-01-26 17:12 [patch] pg-r4k.c cp0 hazards for R4000/R4400 Maciej W. Rozycki
@ 2004-02-02 15:08 ` Ralf Baechle
  2004-02-02 18:38   ` Karsten Merker
  2004-02-03 15:11   ` Maciej W. Rozycki
  0 siblings, 2 replies; 6+ messages in thread
From: Ralf Baechle @ 2004-02-02 15:08 UTC (permalink / raw)
  To: Maciej W. Rozycki; +Cc: linux-mips

On Mon, Jan 26, 2004 at 06:12:43PM +0100, Maciej W. Rozycki wrote:

>  The R4000/R4400 has a coprocessor 0 hazard when a P-cache operation is
> less than two non-load, non-cache instructions apart from a store to the
> same line.  For processors without a secondary cache, the code in pg-r4k.c
> currently issues a Create Dirty Exclusive D-cache operation and then
> immediately executes consecutive stores to the same line, therefore 
> fulfilling the conditions for the hazard.

The wording is "There must be two non-load, non-CACHE instructions between
a store and a CACHE instruction directed to the same primary cache line as
the store."  My interpretation of that is the problem only exists if a
store instruction is followed by a cache instruction, not if the
cache instruction is followed by the store?  I've not found any hints in
the manual to verify or falsify this theory.  In case you're right we've
violated this hazard since almost beginning of the time, see
http://www.linux-mips.org/cvsweb/linux/arch/mips/mm/Attic/pg-r4k.S?rev=1.9
and I've not heared of any problems arising from this.

Any DECstations using the R4[04]00PC CPU variant?

>  The following patch changes the problematic operations to be performed on 
> the cache line following the one to be written immediately.  It is safe to 
> do so, because the cache operations are only a performance hint and are 
> not required for data coherency.  However it is essential not to bypass 
> the end of the page, so the trailing area of the page is excluded from 
> these cache operation, similarly to what has already been done for 
> prefetching.
> 
>  Actually, I'd like to optimize the functions a bit further, specifically 
> to avoid multiple cacheops to the same line (if you don't mind),

Please do so - cacheops are fairly expensive, we must use them as
intelligently as possible.

> but 
> currently I'd like to apply this change to assure correct operation.  As I 
> have no non-SC R4000/R4400 system, this was untested, but perhaps studying 
> the problem covered by the -scache patch sent previously will show if the 
> hazard is indeed avoided.

Have you actually been hit by that hazard?  

>  The patch also increases the buffers a bit for three reasons:
> 
> 1. copy_page_array is already too small for the 128-byte S-cache line 
> case. ;-)
>
> 2. The trail for non-SC R4000/R4400 increases buffer consumption and I was 
> too lazy to calculate the requirements.
> 
> 3. The planned optimization will likely require a little bit more space as 
> well.
> 
> BTW, I was unable to reproduce your instruction count calculation for the
> prefetch case; other results seem OK.

Yep, the patch I just checked in increased the allocation size to handle
the worst case appropriately.  In the end I think we really want to trade
memory for performance here.

>  OK to apply?

Most of your changes conflict with my fixes, so I guess that need redoing ...

  Ralf

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [patch] pg-r4k.c cp0 hazards for R4000/R4400
  2004-02-02 15:08 ` Ralf Baechle
@ 2004-02-02 18:38   ` Karsten Merker
  2004-02-03 15:11   ` Maciej W. Rozycki
  1 sibling, 0 replies; 6+ messages in thread
From: Karsten Merker @ 2004-02-02 18:38 UTC (permalink / raw)
  To: linux-mips

On Mon, Feb 02, 2004 at 04:08:06PM +0100, Ralf Baechle wrote:

> Any DECstations using the R4[04]00PC CPU variant?

Not that I know of - only R4[04]00SC.

Regards,
Karsten
-- 
#include <standard_disclaimer>
Nach Paragraph 28 Abs. 3 Bundesdatenschutzgesetz widerspreche ich der Nutzung
oder Uebermittlung meiner Daten fuer Werbezwecke oder fuer die Markt- oder
Meinungsforschung.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [patch] pg-r4k.c cp0 hazards for R4000/R4400
  2004-02-02 15:08 ` Ralf Baechle
  2004-02-02 18:38   ` Karsten Merker
@ 2004-02-03 15:11   ` Maciej W. Rozycki
  2004-02-03 15:42     ` Ralf Baechle
  1 sibling, 1 reply; 6+ messages in thread
From: Maciej W. Rozycki @ 2004-02-03 15:11 UTC (permalink / raw)
  To: Ralf Baechle; +Cc: linux-mips

On Mon, 2 Feb 2004, Ralf Baechle wrote:

> >  The R4000/R4400 has a coprocessor 0 hazard when a P-cache operation is
> > less than two non-load, non-cache instructions apart from a store to the
> > same line.  For processors without a secondary cache, the code in pg-r4k.c
> > currently issues a Create Dirty Exclusive D-cache operation and then
> > immediately executes consecutive stores to the same line, therefore 
> > fulfilling the conditions for the hazard.
> 
> The wording is "There must be two non-load, non-CACHE instructions between
> a store and a CACHE instruction directed to the same primary cache line as
> the store."  My interpretation of that is the problem only exists if a
> store instruction is followed by a cache instruction, not if the
> cache instruction is followed by the store?  I've not found any hints in
> the manual to verify or falsify this theory.  In case you're right we've

 I'm pretty sure the hazard is in both directions -- why?  Because it's 
marked both in the "CP0 Data Used, Stage Used" and the "CP0 Data Written, 
Stage Available" column.  I interpret that as a requirement for the CACHE 
instructions to "start using data" two instructions ahead and "finish 
writing data" two instructions after itself.  If your assumption was true, 
I'd put the marking only in the former column.  Of course that does not 
mean the table is correct, but I'd assume so, for safety, if not anything 
else.

> violated this hazard since almost beginning of the time, see
> http://www.linux-mips.org/cvsweb/linux/arch/mips/mm/Attic/pg-r4k.S?rev=1.9
> and I've not heared of any problems arising from this.

 Perhaps it wasn't really tested.  Have you ever run the code on a PC 
variant?  Has anyone else?

> Any DECstations using the R4[04]00PC CPU variant?

 None.  That would normally be a downgrade in memory throughput as the
R2k/R3k DECstations used to have 64kB of I-cache + 64kB of D-cache,
typically, and sometimes (with the 33MHz variant) even 128kB of D-cache.

> > but 
> > currently I'd like to apply this change to assure correct operation.  As I 
> > have no non-SC R4000/R4400 system, this was untested, but perhaps studying 
> > the problem covered by the -scache patch sent previously will show if the 
> > hazard is indeed avoided.
> 
> Have you actually been hit by that hazard?  

 I suppose so -- without the "mips-pg-r4k-scache" patch my system is very
unstable and the difference is essentially that in addition to
Create_Dirty_Excl_SD there are additional Create_Dirty_Excl_D ones, that,
apart from being a performance hit, shouldn't have any effect.  I have to
investigate it further yet.

> >  OK to apply?
> 
> Most of your changes conflict with my fixes, so I guess that need redoing ...

 I can see you've already resolved a few problems -- I'll check if any
remained.

  Maciej

-- 
+  Maciej W. Rozycki, Technical University of Gdansk, Poland   +
+--------------------------------------------------------------+
+        e-mail: macro@ds2.pg.gda.pl, PGP key available        +

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [patch] pg-r4k.c cp0 hazards for R4000/R4400
  2004-02-03 15:11   ` Maciej W. Rozycki
@ 2004-02-03 15:42     ` Ralf Baechle
  2004-02-03 16:03       ` Maciej W. Rozycki
  0 siblings, 1 reply; 6+ messages in thread
From: Ralf Baechle @ 2004-02-03 15:42 UTC (permalink / raw)
  To: Maciej W. Rozycki; +Cc: linux-mips

On Tue, Feb 03, 2004 at 04:11:43PM +0100, Maciej W. Rozycki wrote:

>  I'm pretty sure the hazard is in both directions -- why?  Because it's 
> marked both in the "CP0 Data Used, Stage Used" and the "CP0 Data Written, 
> Stage Available" column.  I interpret that as a requirement for the CACHE 
> instructions to "start using data" two instructions ahead and "finish 
> writing data" two instructions after itself.  If your assumption was true, 
> I'd put the marking only in the former column.  Of course that does not 
> mean the table is correct, but I'd assume so, for safety, if not anything 
> else.

I was just trying to explain why the PC variants used to work fine.

> > violated this hazard since almost beginning of the time, see
> > http://www.linux-mips.org/cvsweb/linux/arch/mips/mm/Attic/pg-r4k.S?rev=1.9
> > and I've not heared of any problems arising from this.
> 
>  Perhaps it wasn't really tested.  Have you ever run the code on a PC 
> variant?  Has anyone else?

Yes, it has.  Olivetti M700-10, around 2.2 or so.  The code used at that
time in arch/mips/mm/r4xx0.c was not much different from what pg-r4k.c is
generating now that is it violates this hazard.

> > Any DECstations using the R4[04]00PC CPU variant?
> 
>  None.  That would normally be a downgrade in memory throughput as the
> R2k/R3k DECstations used to have 64kB of I-cache + 64kB of D-cache,
> typically, and sometimes (with the 33MHz variant) even 128kB of D-cache.

>  I suppose so -- without the "mips-pg-r4k-scache" patch my system is very
> unstable and the difference is essentially that in addition to
> Create_Dirty_Excl_SD there are additional Create_Dirty_Excl_D ones, that,
> apart from being a performance hit, shouldn't have any effect.  I have to
> investigate it further yet.

Interesting, I was expecting somewhat better performance from the
combination of both.  Anyway, what is now in CVS is tested on R4400SC V4.0.

  Ralf

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [patch] pg-r4k.c cp0 hazards for R4000/R4400
  2004-02-03 15:42     ` Ralf Baechle
@ 2004-02-03 16:03       ` Maciej W. Rozycki
  0 siblings, 0 replies; 6+ messages in thread
From: Maciej W. Rozycki @ 2004-02-03 16:03 UTC (permalink / raw)
  To: Ralf Baechle; +Cc: linux-mips

On Tue, 3 Feb 2004, Ralf Baechle wrote:

> >  Perhaps it wasn't really tested.  Have you ever run the code on a PC 
> > variant?  Has anyone else?
> 
> Yes, it has.  Olivetti M700-10, around 2.2 or so.  The code used at that
> time in arch/mips/mm/r4xx0.c was not much different from what pg-r4k.c is
> generating now that is it violates this hazard.

 OK then.

> >  I suppose so -- without the "mips-pg-r4k-scache" patch my system is very
> > unstable and the difference is essentially that in addition to
> > Create_Dirty_Excl_SD there are additional Create_Dirty_Excl_D ones, that,
> > apart from being a performance hit, shouldn't have any effect.  I have to
> > investigate it further yet.
> 
> Interesting, I was expecting somewhat better performance from the
> combination of both.  Anyway, what is now in CVS is tested on R4400SC V4.0.

 After rereading the description I agree with you, so I am going to get at 
the code again soon.

-- 
+  Maciej W. Rozycki, Technical University of Gdansk, Poland   +
+--------------------------------------------------------------+
+        e-mail: macro@ds2.pg.gda.pl, PGP key available        +

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2004-02-03 16:03 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-01-26 17:12 [patch] pg-r4k.c cp0 hazards for R4000/R4400 Maciej W. Rozycki
2004-02-02 15:08 ` Ralf Baechle
2004-02-02 18:38   ` Karsten Merker
2004-02-03 15:11   ` Maciej W. Rozycki
2004-02-03 15:42     ` Ralf Baechle
2004-02-03 16:03       ` Maciej W. Rozycki

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.