[Qemu-devel] SH: Fix movca.l/ocbi emulation.

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

* [Qemu-devel] SH: Fix movca.l/ocbi emulation.
@ 2008-12-11 18:34 Vladimir Prus
  2009-01-14 20:45 ` Edgar E. Iglesias
  2009-01-18 16:38 ` Shin-ichiro KAWASAKI
  0 siblings, 2 replies; 7+ messages in thread
From: Vladimir Prus @ 2008-12-11 18:34 UTC (permalink / raw)
  To: qemu-devel

[-- Attachment #1: Type: text/plain, Size: 919 bytes --]


This patch fixes the emulation of movca.l and ocbi
instructions. movca.l is documented to allocate a cache
like, and write into it. ocbi invalidates a cache line.
So, given code like:

			asm volatile("movca.l r0, @%0\n\t"
			     "movca.l r0, @%1\n\t"
			     "ocbi @%0\n\t"
			     "ocbi @%1" : :
			     "r" (a0), "r" (a1));

Nothing is actually written to memory. Code like this can be
found in arch/sh/mm/cache-sh4.c and is used to flush the cache.

Current QEMU implements movca.l the same way as ordinary move,
and the code above just corrupts memory. Doing full cache emulation
is out of question, so this patch implements a hack. Stores
done by movca.l are delayed. If we execute an instructions that is
neither movca.l nor ocbi, we flush all pending stores. If we execute
ocbi, we look at pending stores and delete a store to the invalidated
address. This appears to work fairly well in practice.

- Volodya

[-- Attachment #2: 0004-Fix-movcal.l-ocbi-emulation.patch --]
[-- Type: text/x-diff, Size: 9115 bytes --]

From 2b28d4213e3b16a93d53eb2e4d522c5824de1647 Mon Sep 17 00:00:00 2001
From: Vladimir Prus <vladimir@codesourcery.com>
Date: Tue, 11 Nov 2008 12:29:02 +0300
Subject: [PATCH] Fix movcal.l/ocbi emulation.
To: qemu-devel@nongnu.org
X-KMail-Transport: CodeSourcery
X-KMail-Identity: 901867920

	* target-sh4/cpu.h (store_request_t): New.
	(CPUSH4State): New fields store_requests and store_request_tail.
	* target-sh4/helper.h (helper_movcal, herlper_do_stores, helper_ocbi): New.
	* target-sh4/op_helper.c (helper_movcal, herlper_do_stores)
	(helper_ocbi): New.
	* target-sh4/translate.c (DisasContext): New field has_movcal.
	(sh4_defs): Update CVS for SH7785.
	(cpu_sh4_init): Initialize env->store_request_tail;
	(_decode_opc): Flush pending movca.l-originated stores.
	Make use of helper_movcal and helper_ocbi.
	(gen_intermediate_code_internal): Initialize has_movcal to 1.
---
 cpu-exec.c             |    2 +-
 target-sh4/cpu.h       |   17 +++++++++++++--
 target-sh4/helper.h    |    4 +++
 target-sh4/op_helper.c |   49 ++++++++++++++++++++++++++++++++++++++++++++++++
 target-sh4/translate.c |   44 +++++++++++++++++++++++++++++++++++++++---
 5 files changed, 108 insertions(+), 8 deletions(-)

diff --git a/cpu-exec.c b/cpu-exec.c
index 9a35a59..64b0845 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -174,7 +174,7 @@ static inline TranslationBlock *tb_find_fast(void)
     /* we record a subset of the CPU state. It will
        always be the same before a given translated block
        is executed. */
-    cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
+    cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);            
     tb = env->tb_jmp_cache[tb_jmp_cache_hash_func(pc)];
     if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
                  tb->flags != flags)) {
diff --git a/target-sh4/cpu.h b/target-sh4/cpu.h
index ae434d1..eed3b1b 100644
--- a/target-sh4/cpu.h
+++ b/target-sh4/cpu.h
@@ -93,6 +93,12 @@ enum sh_features {
     SH_FEATURE_SH4A = 1,
 };
 
+typedef struct store_request_t {
+    uint32_t address;
+    uint32_t value;
+    struct store_request_t *next;
+} store_request_t;
+
 typedef struct CPUSH4State {
     int id;			/* CPU model */
 
@@ -141,6 +147,8 @@ typedef struct CPUSH4State {
     tlb_t itlb[ITLB_SIZE];	/* instruction translation table */
     void *intc_handle;
     int intr_at_halt;		/* SR_BL ignored during sleep */
+    store_request_t *store_requests;
+    store_request_t **store_request_tail;
 } CPUSH4State;
 
 CPUSH4State *cpu_sh4_init(const char *cpu_model);
@@ -281,16 +289,19 @@ static inline void cpu_pc_from_tb(CPUState *env, TranslationBlock *tb)
     env->flags = tb->flags;
 }
 
+#define TB_FLAG_PENDING_MOVCA  (1 << 4)
+
 static inline void cpu_get_tb_cpu_state(CPUState *env, target_ulong *pc,
                                         target_ulong *cs_base, int *flags)
 {
     *pc = env->pc;
     *cs_base = 0;
     *flags = (env->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL
-                    | DELAY_SLOT_TRUE | DELAY_SLOT_CLEARME))   /* Bits  0- 3 */
-            | (env->fpscr & (FPSCR_FR | FPSCR_SZ | FPSCR_PR))  /* Bits 19-21 */
+                    | DELAY_SLOT_TRUE | DELAY_SLOT_CLEARME))     /* Bits  0- 3 */
+            | (env->fpscr & (FPSCR_FR | FPSCR_SZ | FPSCR_PR))    /* Bits 19-21 */
             | (env->sr & (SR_MD | SR_RB))                      /* Bits 29-30 */
-            | (env->sr & SR_FD);                               /* Bit 15 */
+            | (env->sr & SR_FD)                                /* Bit 15 */
+            | (env->store_requests ? TB_FLAG_PENDING_MOVCA : 0); /* Bit 4 */
 }
 
 #endif				/* _CPU_SH4_H */
diff --git a/target-sh4/helper.h b/target-sh4/helper.h
index 631e7e1..d995688 100644
--- a/target-sh4/helper.h
+++ b/target-sh4/helper.h
@@ -9,6 +9,10 @@ DEF_HELPER_0(debug, void)
 DEF_HELPER_1(sleep, void, i32)
 DEF_HELPER_1(trapa, void, i32)
 
+DEF_HELPER_2(movcal, void, i32, i32)
+DEF_HELPER_0(do_stores, void)
+DEF_HELPER_1(ocbi, void, i32)
+
 DEF_HELPER_2(addv, i32, i32, i32)
 DEF_HELPER_2(addc, i32, i32, i32)
 DEF_HELPER_2(subv, i32, i32, i32)
diff --git a/target-sh4/op_helper.c b/target-sh4/op_helper.c
index 6352219..b4982b0 100644
--- a/target-sh4/op_helper.c
+++ b/target-sh4/op_helper.c
@@ -122,6 +122,55 @@ void helper_trapa(uint32_t tra)
     cpu_loop_exit();
 }
 
+void helper_movcal(uint32_t address, uint32_t value)
+{
+  store_request_t *r = (store_request_t *)malloc (sizeof(store_request_t));
+  r->address = address;
+  r->value = value;
+  r->next = NULL;
+
+  *(env->store_request_tail) = r;
+  env->store_request_tail = &(r->next);
+}
+
+void helper_do_stores(void)
+{
+  store_request_t *current = env->store_requests;
+
+  while(current)
+    {
+      uint32_t a = current->address, v = current->value;
+      store_request_t *next = current->next;
+      free (current);
+      env->store_requests = current = next;
+      if (current == 0)
+	env->store_request_tail = &(env->store_requests);
+
+      stl_data(a, v);
+    } 
+}
+
+void helper_ocbi(uint32_t address)
+{
+  store_request_t **current = &(env->store_requests);
+  while (*current)
+    {
+      if ((*current)->address == address)
+	{
+	  store_request_t *next = (*current)->next;
+
+	  if (next == 0)
+	    {
+	      env->store_request_tail = current;
+	    }
+
+	  free (*current);
+	  *current = next;
+	  break;
+	}
+    }
+}
+
 uint32_t helper_addc(uint32_t arg0, uint32_t arg1)
 {
     uint32_t tmp0, tmp1;
diff --git a/target-sh4/translate.c b/target-sh4/translate.c
index ba9db14..949cb06 100644
--- a/target-sh4/translate.c
+++ b/target-sh4/translate.c
@@ -50,6 +50,7 @@ typedef struct DisasContext {
     uint32_t delayed_pc;
     int singlestep_enabled;
     uint32_t features;
+    int has_movcal;
 } DisasContext;
 
 #if defined(CONFIG_USER_ONLY)
@@ -278,6 +279,7 @@ CPUSH4State *cpu_sh4_init(const char *cpu_model)
 	return NULL;
     env->features = def->features;
     cpu_exec_init(env);
+    env->store_request_tail = &(env->store_requests);
     sh4_translate_init();
     env->cpu_model_str = cpu_model;
     cpu_sh4_reset(env);
@@ -490,6 +492,40 @@ static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
 
 static void _decode_opc(DisasContext * ctx)
 {
+    /* This code tries to make movcal emulation sufficiently
+       accurate for Linux purposes.  This instruction writes
+       memory, and prior to that, always allocates a cache line.
+       It is used in two contexts:
+       - in memcpy, where data is copied in blocks, the first write
+       of to a block uses movca.l. I presume this is because writing
+       all data into cache, and then having the data sent into memory
+       later, via store buffer, is faster than, in case of write-through
+       cache configuration, to wait for memory write on each store.
+       - in arch/sh/mm/cache-sh4.c, movcal.l + ocbi combination is used
+       to flush the cache. Here, the data written by movcal.l is never
+       written to memory, and the data written is just bogus.
+
+       To simulate this, we keep a list of store requests initiated
+       by movcal.l, see env->store_requests.  movcal.l only adds new entry
+       to this list.  When we see an instruction that is neither movca.l
+       nor ocbi, we perform the stores recorded in this list.  When we see
+       ocbi, we check if the stores list has the address being invalidated.
+       If so, we remove the address from the list.  
+
+       To optimize, we only try to flush stores when we're at the start of
+       TB, or if we already saw movca.l in this TB and did not flush stores
+       yet.  */
+    if (ctx->has_movcal)
+	{
+	  int opcode = ctx->opcode & 0xf0ff;
+	  if (opcode != 0x0093 /* ocbi */
+	      && opcode != 0x00c3 /* movca.l */)
+	      {
+		  gen_helper_do_stores ();
+		  ctx->has_movcal = 0;
+	      }
+	}
+
 #if 0
     fprintf(stderr, "Translating opcode 0x%04x\n", ctx->opcode);
 #endif
@@ -1529,7 +1565,8 @@ static void _decode_opc(DisasContext * ctx)
 	}
 	return;
     case 0x00c3:		/* movca.l R0,@Rm */
-	tcg_gen_qemu_st32(REG(0), REG(B11_8), ctx->memidx);
+	gen_helper_movcal (REG(B11_8), REG(0));	
+	ctx->has_movcal = 1;
 	return;
     case 0x40a9:
 	/* MOVUA.L @Rm,R0 (Rm) -> R0
@@ -1578,9 +1615,7 @@ static void _decode_opc(DisasContext * ctx)
 	    break;
     case 0x0093:		/* ocbi @Rn */
 	{
-	    TCGv dummy = tcg_temp_new();
-	    tcg_gen_qemu_ld32s(dummy, REG(B11_8), ctx->memidx);
-	    tcg_temp_free(dummy);
+	    gen_helper_ocbi (REG(B11_8));
 	}
 	return;
     case 0x00a3:		/* ocbp @Rn */
@@ -1858,6 +1893,7 @@ gen_intermediate_code_internal(CPUState * env, TranslationBlock * tb,
     ctx.tb = tb;
     ctx.singlestep_enabled = env->singlestep_enabled;
     ctx.features = env->features;
+    ctx.has_movcal = (tb->flags & TB_FLAG_PENDING_MOVCA);
 
 #ifdef DEBUG_DISAS
     if (loglevel & CPU_LOG_TB_CPU) {
-- 
1.5.3.5


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [Qemu-devel] SH: Fix movca.l/ocbi emulation.
  2008-12-11 18:34 [Qemu-devel] SH: Fix movca.l/ocbi emulation Vladimir Prus
@ 2009-01-14 20:45 ` Edgar E. Iglesias
  2009-03-31 20:58   ` Vladimir Prus
  2009-01-18 16:38 ` Shin-ichiro KAWASAKI
  1 sibling, 1 reply; 7+ messages in thread
From: Edgar E. Iglesias @ 2009-01-14 20:45 UTC (permalink / raw)
  To: Vladimir Prus; +Cc: Shin-ichiro KAWASAKI, qemu-devel

On Thu, Dec 11, 2008 at 09:34:30PM +0300, Vladimir Prus wrote:
> 
> This patch fixes the emulation of movca.l and ocbi
> instructions. movca.l is documented to allocate a cache
> like, and write into it. ocbi invalidates a cache line.
> So, given code like:
> 
> 			asm volatile("movca.l r0, @%0\n\t"
> 			     "movca.l r0, @%1\n\t"
> 			     "ocbi @%0\n\t"
> 			     "ocbi @%1" : :
> 			     "r" (a0), "r" (a1));
> 
> Nothing is actually written to memory. Code like this can be
> found in arch/sh/mm/cache-sh4.c and is used to flush the cache.
> 
> Current QEMU implements movca.l the same way as ordinary move,
> and the code above just corrupts memory. Doing full cache emulation
> is out of question, so this patch implements a hack. Stores
> done by movca.l are delayed. If we execute an instructions that is
> neither movca.l nor ocbi, we flush all pending stores. If we execute
> ocbi, we look at pending stores and delete a store to the invalidated
> address. This appears to work fairly well in practice.
> 
> - Volodya

Hello and thanks for the patch.

If you wonder why the late sudden interest in this, see the following
thread :)
http://lists.gnu.org/archive/html/qemu-devel/2009-01/msg00460.html

I've got a few comments on your patch. Lets start with the general ones.

It would be good if the patch handled when the data cache gets disabled or
put into writethrough mode. There are also memory areas that are not
cache:able (TLB feature). Supporting the latter can get messy for very
little win, IMO not needed.

In the other thread discussing this issue, I raised conerns about the
delayed stores beeing issued on insns where they normaly wouldn't. Let me
give you an example of the kind of thing I'm thinking of.

movca
---             <-- Page boundary, broken TB.
mov             <-- I TLB happens to miss. SH jumps to TLB miss handler.
xxx             <-  I TLB refill handlers first insn issues delayed store
                    and the delayed store happens to fault!

That scenario could cause a D-TLB fault inside a I-TLB refill handler. Some
archs don't handle that. If it's a real problem for sh (which I beleive it
is) you could turn the stake and make the movca into a load + store, saving
the overwritten memory value. ocbi could then just bring back the old state.
I beleive that approach would not break the exception rules of sh.



> From 2b28d4213e3b16a93d53eb2e4d522c5824de1647 Mon Sep 17 00:00:00 2001
> From: Vladimir Prus <vladimir@codesourcery.com>
> Date: Tue, 11 Nov 2008 12:29:02 +0300
> Subject: [PATCH] Fix movcal.l/ocbi emulation.
> To: qemu-devel@nongnu.org
> X-KMail-Transport: CodeSourcery
> X-KMail-Identity: 901867920
> 
> 	* target-sh4/cpu.h (store_request_t): New.
> 	(CPUSH4State): New fields store_requests and store_request_tail.
> 	* target-sh4/helper.h (helper_movcal, herlper_do_stores, helper_ocbi): New.
> 	* target-sh4/op_helper.c (helper_movcal, herlper_do_stores)
> 	(helper_ocbi): New.
> 	* target-sh4/translate.c (DisasContext): New field has_movcal.
> 	(sh4_defs): Update CVS for SH7785.
> 	(cpu_sh4_init): Initialize env->store_request_tail;
> 	(_decode_opc): Flush pending movca.l-originated stores.
> 	Make use of helper_movcal and helper_ocbi.
> 	(gen_intermediate_code_internal): Initialize has_movcal to 1.
> ---
>  cpu-exec.c             |    2 +-
>  target-sh4/cpu.h       |   17 +++++++++++++--
>  target-sh4/helper.h    |    4 +++
>  target-sh4/op_helper.c |   49 ++++++++++++++++++++++++++++++++++++++++++++++++
>  target-sh4/translate.c |   44 +++++++++++++++++++++++++++++++++++++++---
>  5 files changed, 108 insertions(+), 8 deletions(-)
> 
> diff --git a/cpu-exec.c b/cpu-exec.c
> index 9a35a59..64b0845 100644
> --- a/cpu-exec.c
> +++ b/cpu-exec.c
> @@ -174,7 +174,7 @@ static inline TranslationBlock *tb_find_fast(void)
>      /* we record a subset of the CPU state. It will
>         always be the same before a given translated block
>         is executed. */
> -    cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
> +    cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);            
>      tb = env->tb_jmp_cache[tb_jmp_cache_hash_func(pc)];
>      if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
>                   tb->flags != flags)) {
> diff --git a/target-sh4/cpu.h b/target-sh4/cpu.h
> index ae434d1..eed3b1b 100644
> --- a/target-sh4/cpu.h
> +++ b/target-sh4/cpu.h
> @@ -93,6 +93,12 @@ enum sh_features {
>      SH_FEATURE_SH4A = 1,
>  };
>  
> +typedef struct store_request_t {
> +    uint32_t address;
> +    uint32_t value;
> +    struct store_request_t *next;
> +} store_request_t;
> +
>  typedef struct CPUSH4State {
>      int id;			/* CPU model */
>  
> @@ -141,6 +147,8 @@ typedef struct CPUSH4State {
>      tlb_t itlb[ITLB_SIZE];	/* instruction translation table */
>      void *intc_handle;
>      int intr_at_halt;		/* SR_BL ignored during sleep */
> +    store_request_t *store_requests;
> +    store_request_t **store_request_tail;
>  } CPUSH4State;
>  
>  CPUSH4State *cpu_sh4_init(const char *cpu_model);
> @@ -281,16 +289,19 @@ static inline void cpu_pc_from_tb(CPUState *env, TranslationBlock *tb)
>      env->flags = tb->flags;
>  }
>  
> +#define TB_FLAG_PENDING_MOVCA  (1 << 4)
> +
>  static inline void cpu_get_tb_cpu_state(CPUState *env, target_ulong *pc,
>                                          target_ulong *cs_base, int *flags)
>  {
>      *pc = env->pc;
>      *cs_base = 0;
>      *flags = (env->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL
> -                    | DELAY_SLOT_TRUE | DELAY_SLOT_CLEARME))   /* Bits  0- 3 */
> -            | (env->fpscr & (FPSCR_FR | FPSCR_SZ | FPSCR_PR))  /* Bits 19-21 */
> +                    | DELAY_SLOT_TRUE | DELAY_SLOT_CLEARME))     /* Bits  0- 3 */
> +            | (env->fpscr & (FPSCR_FR | FPSCR_SZ | FPSCR_PR))    /* Bits 19-21 */
>              | (env->sr & (SR_MD | SR_RB))                      /* Bits 29-30 */
> -            | (env->sr & SR_FD);                               /* Bit 15 */
> +            | (env->sr & SR_FD)                                /* Bit 15 */
> +            | (env->store_requests ? TB_FLAG_PENDING_MOVCA : 0); /* Bit 4 */
>  }
>  
>  #endif				/* _CPU_SH4_H */
> diff --git a/target-sh4/helper.h b/target-sh4/helper.h
> index 631e7e1..d995688 100644
> --- a/target-sh4/helper.h
> +++ b/target-sh4/helper.h
> @@ -9,6 +9,10 @@ DEF_HELPER_0(debug, void)
>  DEF_HELPER_1(sleep, void, i32)
>  DEF_HELPER_1(trapa, void, i32)
>  
> +DEF_HELPER_2(movcal, void, i32, i32)
> +DEF_HELPER_0(do_stores, void)
> +DEF_HELPER_1(ocbi, void, i32)
> +
>  DEF_HELPER_2(addv, i32, i32, i32)
>  DEF_HELPER_2(addc, i32, i32, i32)
>  DEF_HELPER_2(subv, i32, i32, i32)
> diff --git a/target-sh4/op_helper.c b/target-sh4/op_helper.c
> index 6352219..b4982b0 100644
> --- a/target-sh4/op_helper.c
> +++ b/target-sh4/op_helper.c
> @@ -122,6 +122,55 @@ void helper_trapa(uint32_t tra)
>      cpu_loop_exit();
>  }
>  
> +void helper_movcal(uint32_t address, uint32_t value)
> +{
> +  store_request_t *r = (store_request_t *)malloc (sizeof(store_request_t));
> +  r->address = address;
> +  r->value = value;
> +  r->next = NULL;
> +
> +  *(env->store_request_tail) = r;
> +  env->store_request_tail = &(r->next);
> +}
> +
> +void helper_do_stores(void)
> +{
> +  store_request_t *current = env->store_requests;
> +
> +  while(current)
> +    {
> +      uint32_t a = current->address, v = current->value;
> +      store_request_t *next = current->next;
> +      free (current);
> +      env->store_requests = current = next;
> +      if (current == 0)
> +	env->store_request_tail = &(env->store_requests);
> +
> +      stl_data(a, v);

Shouldn't you remove the delayed store record _after_ stl() returns?
Otherwise you might loose delayed stores in case of TLB exceptions or?



> +    } 
> +}
> +
> +void helper_ocbi(uint32_t address)
> +{
> +  store_request_t **current = &(env->store_requests);
> +  while (*current)
> +    {
> +      if ((*current)->address == address)
> +	{


Nitpick:
It may not be very significant but you can in an easy way catch a few more
movca+ocbi use cases by not comparing line offsets. I.e, just mask off
the last five bits from the compared addresses to zap all recorded movca's
made to the same cache-line ocbi is invalidating.



> +	  store_request_t *next = (*current)->next;
> +
> +	  if (next == 0)
> +	    {
> +	      env->store_request_tail = current;
> +	    }
> +
> +	  free (*current);
> +	  *current = next;
> +	  break;
> +	}
> +    }
> +}
> +
>  uint32_t helper_addc(uint32_t arg0, uint32_t arg1)
>  {
>      uint32_t tmp0, tmp1;
> diff --git a/target-sh4/translate.c b/target-sh4/translate.c
> index ba9db14..949cb06 100644
> --- a/target-sh4/translate.c
> +++ b/target-sh4/translate.c
> @@ -50,6 +50,7 @@ typedef struct DisasContext {
>      uint32_t delayed_pc;
>      int singlestep_enabled;
>      uint32_t features;
> +    int has_movcal;
>  } DisasContext;
>  
>  #if defined(CONFIG_USER_ONLY)
> @@ -278,6 +279,7 @@ CPUSH4State *cpu_sh4_init(const char *cpu_model)
>  	return NULL;
>      env->features = def->features;
>      cpu_exec_init(env);
> +    env->store_request_tail = &(env->store_requests);
>      sh4_translate_init();
>      env->cpu_model_str = cpu_model;
>      cpu_sh4_reset(env);
> @@ -490,6 +492,40 @@ static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
>  
>  static void _decode_opc(DisasContext * ctx)
>  {
> +    /* This code tries to make movcal emulation sufficiently
> +       accurate for Linux purposes.  This instruction writes
> +       memory, and prior to that, always allocates a cache line.
> +       It is used in two contexts:
> +       - in memcpy, where data is copied in blocks, the first write
> +       of to a block uses movca.l. I presume this is because writing
> +       all data into cache, and then having the data sent into memory
> +       later, via store buffer, is faster than, in case of write-through
> +       cache configuration, to wait for memory write on each store.

I dont think this is the reason for using movca. In fact, according to the
documentation I've got, movca behaves like a normal mov for caches in
writethrough mode. I beleive the reason is to avoid the line fetch in case
the store from a normal mov misses the cache in writeback mode. Anyway,
there's no reason to speculate, IMO we can just remove the comment on why
and just leave the note about it beeing used for fast block copies.



> +       - in arch/sh/mm/cache-sh4.c, movcal.l + ocbi combination is used
> +       to flush the cache. Here, the data written by movcal.l is never
> +       written to memory, and the data written is just bogus.

Agreed.

Cheers

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Qemu-devel] SH: Fix movca.l/ocbi emulation.
  2009-01-14 20:45 ` Edgar E. Iglesias
@ 2009-03-31 20:58   ` Vladimir Prus
  2009-04-01 12:44     ` Edgar E. Iglesias
  0 siblings, 1 reply; 7+ messages in thread
From: Vladimir Prus @ 2009-03-31 20:58 UTC (permalink / raw)
  To: Edgar E. Iglesias; +Cc: Shin-ichiro KAWASAKI, qemu-devel

[-- Attachment #1: Type: Text/Plain, Size: 5515 bytes --]

On Wednesday 14 January 2009 23:45:11 Edgar E. Iglesias wrote:
> On Thu, Dec 11, 2008 at 09:34:30PM +0300, Vladimir Prus wrote:
> > 
> > This patch fixes the emulation of movca.l and ocbi
> > instructions. movca.l is documented to allocate a cache
> > like, and write into it. ocbi invalidates a cache line.
> > So, given code like:
> > 
> > 			asm volatile("movca.l r0, @%0\n\t"
> > 			     "movca.l r0, @%1\n\t"
> > 			     "ocbi @%0\n\t"
> > 			     "ocbi @%1" : :
> > 			     "r" (a0), "r" (a1));
> > 
> > Nothing is actually written to memory. Code like this can be
> > found in arch/sh/mm/cache-sh4.c and is used to flush the cache.
> > 
> > Current QEMU implements movca.l the same way as ordinary move,
> > and the code above just corrupts memory. Doing full cache emulation
> > is out of question, so this patch implements a hack. Stores
> > done by movca.l are delayed. If we execute an instructions that is
> > neither movca.l nor ocbi, we flush all pending stores. If we execute
> > ocbi, we look at pending stores and delete a store to the invalidated
> > address. This appears to work fairly well in practice.
> > 
> > - Volodya
> 
> Hello and thanks for the patch.
> 
> If you wonder why the late sudden interest in this, see the following
> thread :)
> http://lists.gnu.org/archive/html/qemu-devel/2009-01/msg00460.html

Hi Edgar,

apologies for belated reply.

> I've got a few comments on your patch. Lets start with the general ones.
> 
> It would be good if the patch handled when the data cache gets disabled or
> put into writethrough mode. There are also memory areas that are not
> cache:able (TLB feature). Supporting the latter can get messy for very
> little win, IMO not needed.

I am afraid I might not be the best person to accurately catch all cases
where cache might be enabled or disabled. KAWASAKI-san has posted a function
that can be used to check if caching is enabled, in:

	http://article.gmane.org/gmane.comp.emulators.qemu/36348

Does that one address your suggestion? If so, I can incorporate that function,
and checking for it.

> In the other thread discussing this issue, I raised conerns about the
> delayed stores beeing issued on insns where they normaly wouldn't. Let me
> give you an example of the kind of thing I'm thinking of.
> 
> movca
> ---             <-- Page boundary, broken TB.
> mov             <-- I TLB happens to miss. SH jumps to TLB miss handler.
> xxx             <-  I TLB refill handlers first insn issues delayed store
>                     and the delayed store happens to fault!
> 
> That scenario could cause a D-TLB fault inside a I-TLB refill handler. Some
> archs don't handle that. If it's a real problem for sh (which I beleive it
> is) you could turn the stake and make the movca into a load + store, saving
> the overwritten memory value. ocbi could then just bring back the old state.
> I beleive that approach would not break the exception rules of sh.

I think your approach will indeed be more reliable, so I've reworked my patch
accordingly.

> > +void helper_do_stores(void)
> > +{
> > +  store_request_t *current = env->store_requests;
> > +
> > +  while(current)
> > +    {
> > +      uint32_t a = current->address, v = current->value;
> > +      store_request_t *next = current->next;
> > +      free (current);
> > +      env->store_requests = current = next;
> > +      if (current == 0)
> > +	env->store_request_tail = &(env->store_requests);
> > +
> > +      stl_data(a, v);
> 
> Shouldn't you remove the delayed store record _after_ stl() returns?
> Otherwise you might loose delayed stores in case of TLB exceptions or?

I recall that I have explicitly used this ordering, but I no longer 
remember exactly why. Anyway, this is not an issue after patch is
revised as you suggested above.
> 
> 
> 
> > +    } 
> > +}
> > +
> > +void helper_ocbi(uint32_t address)
> > +{
> > +  store_request_t **current = &(env->store_requests);
> > +  while (*current)
> > +    {
> > +      if ((*current)->address == address)
> > +	{
> 
> 
> Nitpick:
> It may not be very significant but you can in an easy way catch a few more
> movca+ocbi use cases by not comparing line offsets. I.e, just mask off
> the last five bits from the compared addresses to zap all recorded movca's
> made to the same cache-line ocbi is invalidating.

I did so.



> >  static void _decode_opc(DisasContext * ctx)
> >  {
> > +    /* This code tries to make movcal emulation sufficiently
> > +       accurate for Linux purposes.  This instruction writes
> > +       memory, and prior to that, always allocates a cache line.
> > +       It is used in two contexts:
> > +       - in memcpy, where data is copied in blocks, the first write
> > +       of to a block uses movca.l. I presume this is because writing
> > +       all data into cache, and then having the data sent into memory
> > +       later, via store buffer, is faster than, in case of write-through
> > +       cache configuration, to wait for memory write on each store.
> 
> I dont think this is the reason for using movca. In fact, according to the
> documentation I've got, movca behaves like a normal mov for caches in
> writethrough mode. I beleive the reason is to avoid the line fetch in case
> the store from a normal mov misses the cache in writeback mode. Anyway,
> there's no reason to speculate, IMO we can just remove the comment on why
> and just leave the note about it beeing used for fast block copies.

OK.

I am attaching a revised patch -- what do you think?

- Volodya


[-- Attachment #2: movcal-ocbi.diff --]
[-- Type: text/x-patch, Size: 8808 bytes --]

commit 1902e15049a9179ced0c924c5ce7c43f69e74001
Author: Vladimir Prus <vladimir@codesourcery.com>
Date:   Tue Nov 11 12:29:02 2008 +0300

    Fix movcal.l/ocbi emulation.
    
    	* target-sh4/cpu.h (memory_content_t): New.
    	(CPUSH4State): New fields movcal_backup and movcal_backup_tail.
    	* target-sh4/helper.h (helper_movcal)
    	(helper_discard_movcal_backup, helper_ocbi): New.
    	* target-sh4/op_helper.c (helper_movcal)
    	(helper_discard_movcal_backup, helper_ocbi): New.
    	* target-sh4/translate.c (DisasContext): New field has_movcal.
    	(sh4_defs): Update CVS for SH7785.
    	(cpu_sh4_init): Initialize env->movcal_backup_tail.
    	(_decode_opc): Discard movca.l-backup.
    	Make use of helper_movcal and helper_ocbi.
    	(gen_intermediate_code_internal): Initialize has_movcal to 1.

diff --git a/cpu-exec.c b/cpu-exec.c
index cf7c1fb..b0f0b5e 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -174,7 +174,7 @@ static inline TranslationBlock *tb_find_fast(void)
     /* we record a subset of the CPU state. It will
        always be the same before a given translated block
        is executed. */
-    cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
+    cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);            
     tb = env->tb_jmp_cache[tb_jmp_cache_hash_func(pc)];
     if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
                  tb->flags != flags)) {
diff --git a/target-sh4/cpu.h b/target-sh4/cpu.h
index e9eee2c..67741f6 100644
--- a/target-sh4/cpu.h
+++ b/target-sh4/cpu.h
@@ -100,6 +100,12 @@ enum sh_features {
     SH_FEATURE_BCR3_AND_BCR4 = 2,
 };
 
+typedef struct memory_content_t {
+    uint32_t address;
+    uint32_t value;
+    struct memory_content_t *next;
+} memory_content_t;
+
 typedef struct CPUSH4State {
     int id;			/* CPU model */
 
@@ -149,6 +155,8 @@ typedef struct CPUSH4State {
     tlb_t itlb[ITLB_SIZE];	/* instruction translation table */
     void *intc_handle;
     int intr_at_halt;		/* SR_BL ignored during sleep */
+    memory_content_t *movcal_backup;
+    memory_content_t **movcal_backup_tail;
 } CPUSH4State;
 
 CPUSH4State *cpu_sh4_init(const char *cpu_model);
@@ -294,16 +302,19 @@ static inline void cpu_pc_from_tb(CPUState *env, TranslationBlock *tb)
     env->flags = tb->flags;
 }
 
+#define TB_FLAG_PENDING_MOVCA  (1 << 4)
+
 static inline void cpu_get_tb_cpu_state(CPUState *env, target_ulong *pc,
                                         target_ulong *cs_base, int *flags)
 {
     *pc = env->pc;
     *cs_base = 0;
     *flags = (env->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL
-                    | DELAY_SLOT_TRUE | DELAY_SLOT_CLEARME))   /* Bits  0- 3 */
-            | (env->fpscr & (FPSCR_FR | FPSCR_SZ | FPSCR_PR))  /* Bits 19-21 */
+                    | DELAY_SLOT_TRUE | DELAY_SLOT_CLEARME))     /* Bits  0- 3 */
+            | (env->fpscr & (FPSCR_FR | FPSCR_SZ | FPSCR_PR))    /* Bits 19-21 */
             | (env->sr & (SR_MD | SR_RB))                      /* Bits 29-30 */
-            | (env->sr & SR_FD);                               /* Bit 15 */
+            | (env->sr & SR_FD)                                /* Bit 15 */
+            | (env->movcal_backup ? TB_FLAG_PENDING_MOVCA : 0); /* Bit 4 */
 }
 
 #endif				/* _CPU_SH4_H */
diff --git a/target-sh4/helper.h b/target-sh4/helper.h
index e665185..4b2fcdd 100644
--- a/target-sh4/helper.h
+++ b/target-sh4/helper.h
@@ -9,6 +9,10 @@ DEF_HELPER_0(debug, void)
 DEF_HELPER_1(sleep, void, i32)
 DEF_HELPER_1(trapa, void, i32)
 
+DEF_HELPER_2(movcal, void, i32, i32)
+DEF_HELPER_0(discard_movcal_backup, void)
+DEF_HELPER_1(ocbi, void, i32)
+
 DEF_HELPER_2(addv, i32, i32, i32)
 DEF_HELPER_2(addc, i32, i32, i32)
 DEF_HELPER_2(subv, i32, i32, i32)
diff --git a/target-sh4/op_helper.c b/target-sh4/op_helper.c
index 84e1ad3..e6b71a0 100644
--- a/target-sh4/op_helper.c
+++ b/target-sh4/op_helper.c
@@ -18,6 +18,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA  02110-1301 USA
  */
 #include <assert.h>
+#include <stdlib.h>
 #include "exec.h"
 #include "helper.h"
 
@@ -122,6 +123,55 @@ void helper_trapa(uint32_t tra)
     cpu_loop_exit();
 }
 
+void helper_movcal(uint32_t address, uint32_t value)
+{
+    memory_content_t *r = (memory_content_t *)malloc (sizeof(memory_content_t));
+    r->address = address;
+    r->value = value;
+    r->next = NULL;
+
+    *(env->movcal_backup_tail) = r;
+    env->movcal_backup_tail = &(r->next);
+}
+
+void helper_discard_movcal_backup(void)
+{
+    memory_content_t *current = env->movcal_backup;
+
+    while(current)
+    {
+        memory_content_t *next = current->next;
+        free (current);
+        env->movcal_backup = current = next;
+        if (current == 0)
+            env->movcal_backup_tail = &(env->movcal_backup);
+    } 
+}
+
+void helper_ocbi(uint32_t address)
+{
+    memory_content_t **current = &(env->movcal_backup);
+    while (*current)
+    {
+        uint32_t a = (*current)->address;
+        if ((a & ~0x1F) == (address & ~0x1F))
+	{
+            memory_content_t *next = (*current)->next;
+
+            stl_data(a, (*current)->value);
+
+            if (next == 0)
+	    {
+                env->movcal_backup_tail = current;
+	    }
+
+            free (*current);
+            *current = next;
+            break;
+	}
+    }
+}
+
 uint32_t helper_addc(uint32_t arg0, uint32_t arg1)
 {
     uint32_t tmp0, tmp1;
diff --git a/target-sh4/translate.c b/target-sh4/translate.c
index cc9f886..4ced176 100644
--- a/target-sh4/translate.c
+++ b/target-sh4/translate.c
@@ -50,6 +50,7 @@ typedef struct DisasContext {
     uint32_t delayed_pc;
     int singlestep_enabled;
     uint32_t features;
+    int has_movcal;
 } DisasContext;
 
 #if defined(CONFIG_USER_ONLY)
@@ -283,6 +284,7 @@ CPUSH4State *cpu_sh4_init(const char *cpu_model)
     env = qemu_mallocz(sizeof(CPUSH4State));
     env->features = def->features;
     cpu_exec_init(env);
+    env->movcal_backup_tail = &(env->movcal_backup);
     sh4_translate_init();
     env->cpu_model_str = cpu_model;
     cpu_sh4_reset(env);
@@ -495,6 +497,37 @@ static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
 
 static void _decode_opc(DisasContext * ctx)
 {
+    /* This code tries to make movcal emulation sufficiently
+       accurate for Linux purposes.  This instruction writes
+       memory, and prior to that, always allocates a cache line.
+       It is used in two contexts:
+       - in memcpy, where data is copied in blocks, the first write
+       of to a block uses movca.l for performance.
+       - in arch/sh/mm/cache-sh4.c, movcal.l + ocbi combination is used
+       to flush the cache. Here, the data written by movcal.l is never
+       written to memory, and the data written is just bogus.
+
+       To simulate this, we simulate movcal.l, we store the value to memory,
+       but we also remember the previous content. If we see ocbi, we check
+       if movcal.l for that address was done previously. If so, the write should
+       not have hit the memory, so we restore the previous content.
+       When we see an instruction that is neither movca.l
+       nor ocbi, the previous content is discarded.
+
+       To optimize, we only try to flush stores when we're at the start of
+       TB, or if we already saw movca.l in this TB and did not flush stores
+       yet.  */
+    if (ctx->has_movcal)
+	{
+	  int opcode = ctx->opcode & 0xf0ff;
+	  if (opcode != 0x0093 /* ocbi */
+	      && opcode != 0x00c3 /* movca.l */)
+	      {
+		  gen_helper_discard_movcal_backup ();
+		  ctx->has_movcal = 0;
+	      }
+	}
+
 #if 0
     fprintf(stderr, "Translating opcode 0x%04x\n", ctx->opcode);
 #endif
@@ -1545,7 +1578,13 @@ static void _decode_opc(DisasContext * ctx)
 	}
 	return;
     case 0x00c3:		/* movca.l R0,@Rm */
-	tcg_gen_qemu_st32(REG(0), REG(B11_8), ctx->memidx);
+        {
+            TCGv val = tcg_temp_new();
+            tcg_gen_qemu_ld32u(val, REG(B11_8), ctx->memidx);
+            gen_helper_movcal (REG(B11_8), val);            
+            tcg_gen_qemu_st32(REG(0), REG(B11_8), ctx->memidx);
+        }
+        ctx->has_movcal = 1;
 	return;
     case 0x40a9:
 	/* MOVUA.L @Rm,R0 (Rm) -> R0
@@ -1594,9 +1633,7 @@ static void _decode_opc(DisasContext * ctx)
 	    break;
     case 0x0093:		/* ocbi @Rn */
 	{
-	    TCGv dummy = tcg_temp_new();
-	    tcg_gen_qemu_ld32s(dummy, REG(B11_8), ctx->memidx);
-	    tcg_temp_free(dummy);
+	    gen_helper_ocbi (REG(B11_8));
 	}
 	return;
     case 0x00a3:		/* ocbp @Rn */
@@ -1876,6 +1913,7 @@ gen_intermediate_code_internal(CPUState * env, TranslationBlock * tb,
     ctx.tb = tb;
     ctx.singlestep_enabled = env->singlestep_enabled;
     ctx.features = env->features;
+    ctx.has_movcal = (tb->flags & TB_FLAG_PENDING_MOVCA);
 
 #ifdef DEBUG_DISAS
     qemu_log_mask(CPU_LOG_TB_CPU,

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [Qemu-devel] SH: Fix movca.l/ocbi emulation.
  2009-03-31 20:58   ` Vladimir Prus
@ 2009-04-01 12:44     ` Edgar E. Iglesias
  2009-04-01 13:36       ` Vladimir Prus
  0 siblings, 1 reply; 7+ messages in thread
From: Edgar E. Iglesias @ 2009-04-01 12:44 UTC (permalink / raw)
  To: Vladimir Prus; +Cc: Shin-ichiro KAWASAKI, qemu-devel, Edgar E. Iglesias

On Wed, Apr 01, 2009 at 12:58:06AM +0400, Vladimir Prus wrote:
> On Wednesday 14 January 2009 23:45:11 Edgar E. Iglesias wrote:
> > On Thu, Dec 11, 2008 at 09:34:30PM +0300, Vladimir Prus wrote:
> > > 
> > > This patch fixes the emulation of movca.l and ocbi
> > > instructions. movca.l is documented to allocate a cache
> > > like, and write into it. ocbi invalidates a cache line.
> > > So, given code like:
> > > 
> > > 			asm volatile("movca.l r0, @%0\n\t"
> > > 			     "movca.l r0, @%1\n\t"
> > > 			     "ocbi @%0\n\t"
> > > 			     "ocbi @%1" : :
> > > 			     "r" (a0), "r" (a1));
> > > 
> > > Nothing is actually written to memory. Code like this can be
> > > found in arch/sh/mm/cache-sh4.c and is used to flush the cache.
> > > 
> > > Current QEMU implements movca.l the same way as ordinary move,
> > > and the code above just corrupts memory. Doing full cache emulation
> > > is out of question, so this patch implements a hack. Stores
> > > done by movca.l are delayed. If we execute an instructions that is
> > > neither movca.l nor ocbi, we flush all pending stores. If we execute
> > > ocbi, we look at pending stores and delete a store to the invalidated
> > > address. This appears to work fairly well in practice.
> > > 
> > > - Volodya
> > 
> > Hello and thanks for the patch.
> > 
> > If you wonder why the late sudden interest in this, see the following
> > thread :)
> > http://lists.gnu.org/archive/html/qemu-devel/2009-01/msg00460.html
> 
> Hi Edgar,
> 
> apologies for belated reply.

Hi, Don't worry about it..


> 
> > I've got a few comments on your patch. Lets start with the general ones.
> > 
> > It would be good if the patch handled when the data cache gets disabled or
> > put into writethrough mode. There are also memory areas that are not
> > cache:able (TLB feature). Supporting the latter can get messy for very
> > little win, IMO not needed.
> 
> I am afraid I might not be the best person to accurately catch all cases
> where cache might be enabled or disabled. KAWASAKI-san has posted a function
> that can be used to check if caching is enabled, in:
> 
> 	http://article.gmane.org/gmane.comp.emulators.qemu/36348
> 
> Does that one address your suggestion? If so, I can incorporate that function,
> and checking for it.

I dont have the manuals here but yes, I think something like that will be
needed. On other archs I've seen ppl do memcpy to IO areas, don't know
about sh but to be on the safe side we better not do any funny stuff
with uncached stores.

[cut]
> I am attaching a revised patch -- what do you think?

Thanks it looks good, at least I think it will handle more of the
realistic use cases.

A few inlined minor comments on the cosmetics though:


> 
> - Volodya
> 

> commit 1902e15049a9179ced0c924c5ce7c43f69e74001
> Author: Vladimir Prus <vladimir@codesourcery.com>
> Date:   Tue Nov 11 12:29:02 2008 +0300
> 
>     Fix movcal.l/ocbi emulation.
>     
>     	* target-sh4/cpu.h (memory_content_t): New.
>     	(CPUSH4State): New fields movcal_backup and movcal_backup_tail.
>     	* target-sh4/helper.h (helper_movcal)
>     	(helper_discard_movcal_backup, helper_ocbi): New.
>     	* target-sh4/op_helper.c (helper_movcal)
>     	(helper_discard_movcal_backup, helper_ocbi): New.
>     	* target-sh4/translate.c (DisasContext): New field has_movcal.
>     	(sh4_defs): Update CVS for SH7785.
>     	(cpu_sh4_init): Initialize env->movcal_backup_tail.
>     	(_decode_opc): Discard movca.l-backup.
>     	Make use of helper_movcal and helper_ocbi.
>     	(gen_intermediate_code_internal): Initialize has_movcal to 1.
> 
> diff --git a/cpu-exec.c b/cpu-exec.c
> index cf7c1fb..b0f0b5e 100644
> --- a/cpu-exec.c
> +++ b/cpu-exec.c
> @@ -174,7 +174,7 @@ static inline TranslationBlock *tb_find_fast(void)
>      /* we record a subset of the CPU state. It will
>         always be the same before a given translated block
>         is executed. */
> -    cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
> +    cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);            
                                                          ^^^
Stray spaces.


>      tb = env->tb_jmp_cache[tb_jmp_cache_hash_func(pc)];
>      if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
>                   tb->flags != flags)) {
> diff --git a/target-sh4/cpu.h b/target-sh4/cpu.h
> index e9eee2c..67741f6 100644
> --- a/target-sh4/cpu.h
> +++ b/target-sh4/cpu.h
> @@ -100,6 +100,12 @@ enum sh_features {
>      SH_FEATURE_BCR3_AND_BCR4 = 2,
>  };
>  
> +typedef struct memory_content_t {
> +    uint32_t address;
> +    uint32_t value;
> +    struct memory_content_t *next;
> +} memory_content_t;
> +

Please drop the _t (reserved names).


>  typedef struct CPUSH4State {
>      int id;			/* CPU model */
>  
> @@ -149,6 +155,8 @@ typedef struct CPUSH4State {
>      tlb_t itlb[ITLB_SIZE];	/* instruction translation table */
>      void *intc_handle;
>      int intr_at_halt;		/* SR_BL ignored during sleep */
> +    memory_content_t *movcal_backup;
> +    memory_content_t **movcal_backup_tail;

Do you really need the _tail member?
Seems to me like you're only maintaining it but not really using it.



>  } CPUSH4State;
>  
>  CPUSH4State *cpu_sh4_init(const char *cpu_model);
> @@ -294,16 +302,19 @@ static inline void cpu_pc_from_tb(CPUState *env, TranslationBlock *tb)
>      env->flags = tb->flags;
>  }
>  
> +#define TB_FLAG_PENDING_MOVCA  (1 << 4)
> +
>  static inline void cpu_get_tb_cpu_state(CPUState *env, target_ulong *pc,
>                                          target_ulong *cs_base, int *flags)
>  {
>      *pc = env->pc;
>      *cs_base = 0;
>      *flags = (env->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL
> -                    | DELAY_SLOT_TRUE | DELAY_SLOT_CLEARME))   /* Bits  0- 3 */
> -            | (env->fpscr & (FPSCR_FR | FPSCR_SZ | FPSCR_PR))  /* Bits 19-21 */
> +                    | DELAY_SLOT_TRUE | DELAY_SLOT_CLEARME))     /* Bits  0- 3 */
> +            | (env->fpscr & (FPSCR_FR | FPSCR_SZ | FPSCR_PR))    /* Bits 19-21 */
>              | (env->sr & (SR_MD | SR_RB))                      /* Bits 29-30 */
> -            | (env->sr & SR_FD);                               /* Bit 15 */
> +            | (env->sr & SR_FD)                                /* Bit 15 */
> +            | (env->movcal_backup ? TB_FLAG_PENDING_MOVCA : 0); /* Bit 4 */
>  }
>  
>  #endif				/* _CPU_SH4_H */
> diff --git a/target-sh4/helper.h b/target-sh4/helper.h
> index e665185..4b2fcdd 100644
> --- a/target-sh4/helper.h
> +++ b/target-sh4/helper.h
> @@ -9,6 +9,10 @@ DEF_HELPER_0(debug, void)
>  DEF_HELPER_1(sleep, void, i32)
>  DEF_HELPER_1(trapa, void, i32)
>  
> +DEF_HELPER_2(movcal, void, i32, i32)
> +DEF_HELPER_0(discard_movcal_backup, void)
> +DEF_HELPER_1(ocbi, void, i32)
> +
>  DEF_HELPER_2(addv, i32, i32, i32)
>  DEF_HELPER_2(addc, i32, i32, i32)
>  DEF_HELPER_2(subv, i32, i32, i32)
> diff --git a/target-sh4/op_helper.c b/target-sh4/op_helper.c
> index 84e1ad3..e6b71a0 100644
> --- a/target-sh4/op_helper.c
> +++ b/target-sh4/op_helper.c
> @@ -18,6 +18,7 @@
>   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA  02110-1301 USA
>   */
>  #include <assert.h>
> +#include <stdlib.h>
>  #include "exec.h"
>  #include "helper.h"
>  
> @@ -122,6 +123,55 @@ void helper_trapa(uint32_t tra)
>      cpu_loop_exit();
>  }
>  
> +void helper_movcal(uint32_t address, uint32_t value)
> +{
> +    memory_content_t *r = (memory_content_t *)malloc (sizeof(memory_content_t));
> +    r->address = address;
> +    r->value = value;
> +    r->next = NULL;
> +
> +    *(env->movcal_backup_tail) = r;
> +    env->movcal_backup_tail = &(r->next);
> +}
> +
> +void helper_discard_movcal_backup(void)
> +{
> +    memory_content_t *current = env->movcal_backup;
> +
> +    while(current)
> +    {
> +        memory_content_t *next = current->next;
> +        free (current);
> +        env->movcal_backup = current = next;
> +        if (current == 0)
> +            env->movcal_backup_tail = &(env->movcal_backup);
> +    } 
> +}
> +
> +void helper_ocbi(uint32_t address)
> +{
> +    memory_content_t **current = &(env->movcal_backup);
> +    while (*current)
> +    {
> +        uint32_t a = (*current)->address;
> +        if ((a & ~0x1F) == (address & ~0x1F))
> +	{
> +            memory_content_t *next = (*current)->next;
> +
> +            stl_data(a, (*current)->value);
> +
> +            if (next == 0)
> +	    {
> +                env->movcal_backup_tail = current;
> +	    }
> +
> +            free (*current);
> +            *current = next;
> +            break;
> +	}

The indentation got messed up here.

Cheers

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Qemu-devel] SH: Fix movca.l/ocbi emulation.
  2009-04-01 12:44     ` Edgar E. Iglesias
@ 2009-04-01 13:36       ` Vladimir Prus
  2009-04-01 13:48         ` Edgar E. Iglesias
  0 siblings, 1 reply; 7+ messages in thread
From: Vladimir Prus @ 2009-04-01 13:36 UTC (permalink / raw)
  To: Edgar E. Iglesias; +Cc: Shin-ichiro KAWASAKI, qemu-devel

[-- Attachment #1: Type: Text/Plain, Size: 3531 bytes --]

On Wednesday 01 April 2009 16:44:49 Edgar E. Iglesias wrote:

Hi Edgar,


> > I am afraid I might not be the best person to accurately catch all cases
> > where cache might be enabled or disabled. KAWASAKI-san has posted a function
> > that can be used to check if caching is enabled, in:
> > 
> > 	http://article.gmane.org/gmane.comp.emulators.qemu/36348
> > 
> > Does that one address your suggestion? If so, I can incorporate that function,
> > and checking for it.
> 
> I dont have the manuals here but yes, I think something like that will be
> needed. On other archs I've seen ppl do memcpy to IO areas, don't know
> about sh but to be on the safe side we better not do any funny stuff
> with uncached stores.

I have added the check, using the routine provided by Kawasaki-san.

> > diff --git a/cpu-exec.c b/cpu-exec.c
> > index cf7c1fb..b0f0b5e 100644
> > --- a/cpu-exec.c
> > +++ b/cpu-exec.c
> > @@ -174,7 +174,7 @@ static inline TranslationBlock *tb_find_fast(void)
> >      /* we record a subset of the CPU state. It will
> >         always be the same before a given translated block
> >         is executed. */
> > -    cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
> > +    cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);            
>                                                           ^^^
> Stray spaces.

Doh -- I had unsaved emacs buffer with this issue fixed.

> >      tb = env->tb_jmp_cache[tb_jmp_cache_hash_func(pc)];
> >      if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
> >                   tb->flags != flags)) {
> > diff --git a/target-sh4/cpu.h b/target-sh4/cpu.h
> > index e9eee2c..67741f6 100644
> > --- a/target-sh4/cpu.h
> > +++ b/target-sh4/cpu.h
> > @@ -100,6 +100,12 @@ enum sh_features {
> >      SH_FEATURE_BCR3_AND_BCR4 = 2,
> >  };
> >  
> > +typedef struct memory_content_t {
> > +    uint32_t address;
> > +    uint32_t value;
> > +    struct memory_content_t *next;
> > +} memory_content_t;
> > +
> 
> Please drop the _t (reserved names).

Fixed.

> >  typedef struct CPUSH4State {
> >      int id;			/* CPU model */
> >  
> > @@ -149,6 +155,8 @@ typedef struct CPUSH4State {
> >      tlb_t itlb[ITLB_SIZE];	/* instruction translation table */
> >      void *intc_handle;
> >      int intr_at_halt;		/* SR_BL ignored during sleep */
> > +    memory_content_t *movcal_backup;
> > +    memory_content_t **movcal_backup_tail;
> 
> Do you really need the _tail member?
> Seems to me like you're only maintaining it but not really using it.

See helper_movcal:

	*(env->movcal_backup_tail) = r;

I think having movcal backup list in-order is good. Say, if we have two
movcal to the same address and ocbi, we want to restore the saved when
the first movcal was executed, and given that helper_ocbi traversed
the list from the front we need to put elements in-order.


> > +void helper_ocbi(uint32_t address)
> > +{
> > +    memory_content_t **current = &(env->movcal_backup);
> > +    while (*current)
> > +    {
> > +        uint32_t a = (*current)->address;
> > +        if ((a & ~0x1F) == (address & ~0x1F))
> > +	{
> > +            memory_content_t *next = (*current)->next;
> > +
> > +            stl_data(a, (*current)->value);
> > +
> > +            if (next == 0)
> > +	    {
> > +                env->movcal_backup_tail = current;
> > +	    }
> > +
> > +            free (*current);
> > +            *current = next;
> > +            break;
> > +	}
> 
> The indentation got messed up here.

I have fixed it. Revised patch is attached.

Thanks,
Volodya


[-- Attachment #2: movcal-2.diff --]
[-- Type: text/x-patch, Size: 9997 bytes --]

commit d5caa92b7e792c526306b3438e697bfab3dd0c03
Author: Vladimir Prus <vladimir@codesourcery.com>
Date:   Tue Nov 11 12:29:02 2008 +0300

    Fix movcal.l/ocbi emulation.
    
    	* target-sh4/cpu.h (memory_content_t): New.
    	(CPUSH4State): New fields movcal_backup and movcal_backup_tail.
    	* target-sh4/helper.h (helper_movcal)
    	(helper_discard_movcal_backup, helper_ocbi): New.
    	* target-sh4/op_helper.c (helper_movcal)
    	(helper_discard_movcal_backup, helper_ocbi): New.
    	* target-sh4/translate.c (DisasContext): New field has_movcal.
    	(sh4_defs): Update CVS for SH7785.
    	(cpu_sh4_init): Initialize env->movcal_backup_tail.
    	(_decode_opc): Discard movca.l-backup.
    	Make use of helper_movcal and helper_ocbi.
    	(gen_intermediate_code_internal): Initialize has_movcal to 1.

diff --git a/target-sh4/cpu.h b/target-sh4/cpu.h
index e9eee2c..d754512 100644
--- a/target-sh4/cpu.h
+++ b/target-sh4/cpu.h
@@ -100,6 +100,12 @@ enum sh_features {
     SH_FEATURE_BCR3_AND_BCR4 = 2,
 };
 
+typedef struct memory_content {
+    uint32_t address;
+    uint32_t value;
+    struct memory_content *next;
+} memory_content;
+
 typedef struct CPUSH4State {
     int id;			/* CPU model */
 
@@ -149,6 +155,8 @@ typedef struct CPUSH4State {
     tlb_t itlb[ITLB_SIZE];	/* instruction translation table */
     void *intc_handle;
     int intr_at_halt;		/* SR_BL ignored during sleep */
+    memory_content *movcal_backup;
+    memory_content **movcal_backup_tail;
 } CPUSH4State;
 
 CPUSH4State *cpu_sh4_init(const char *cpu_model);
@@ -163,6 +171,8 @@ void sh4_cpu_list(FILE *f, int (*cpu_fprintf)(FILE *f, const char *fmt, ...));
 void cpu_sh4_write_mmaped_utlb_addr(CPUSH4State *s, target_phys_addr_t addr,
 				    uint32_t mem_value);
 
+int cpu_sh4_is_cached(CPUSH4State * env, target_ulong addr);
+
 static inline void cpu_set_tls(CPUSH4State *env, target_ulong newtls)
 {
   env->gbr = newtls;
@@ -294,16 +304,19 @@ static inline void cpu_pc_from_tb(CPUState *env, TranslationBlock *tb)
     env->flags = tb->flags;
 }
 
+#define TB_FLAG_PENDING_MOVCA  (1 << 4)
+
 static inline void cpu_get_tb_cpu_state(CPUState *env, target_ulong *pc,
                                         target_ulong *cs_base, int *flags)
 {
     *pc = env->pc;
     *cs_base = 0;
     *flags = (env->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL
-                    | DELAY_SLOT_TRUE | DELAY_SLOT_CLEARME))   /* Bits  0- 3 */
-            | (env->fpscr & (FPSCR_FR | FPSCR_SZ | FPSCR_PR))  /* Bits 19-21 */
+                    | DELAY_SLOT_TRUE | DELAY_SLOT_CLEARME))     /* Bits  0- 3 */
+            | (env->fpscr & (FPSCR_FR | FPSCR_SZ | FPSCR_PR))    /* Bits 19-21 */
             | (env->sr & (SR_MD | SR_RB))                      /* Bits 29-30 */
-            | (env->sr & SR_FD);                               /* Bit 15 */
+            | (env->sr & SR_FD)                                /* Bit 15 */
+            | (env->movcal_backup ? TB_FLAG_PENDING_MOVCA : 0); /* Bit 4 */
 }
 
 #endif				/* _CPU_SH4_H */
diff --git a/target-sh4/helper.c b/target-sh4/helper.c
index f1feca7..5fa4fb4 100644
--- a/target-sh4/helper.c
+++ b/target-sh4/helper.c
@@ -652,4 +652,48 @@ void cpu_sh4_write_mmaped_utlb_addr(CPUSH4State *s, target_phys_addr_t addr,
     }
 }
 
+int cpu_sh4_is_cached(CPUSH4State * env, target_ulong addr)
+{
+    int n;
+    int use_asid = (env->mmucr & MMUCR_SV) == 0 || (env->sr & SR_MD) == 0;
+
+    /* check area */
+    if (env->sr & SR_MD) {
+        /* For previledged mode, P2 and P4 area is not cachable. */
+        if ((0xA0000000 <= addr && addr < 0xC0000000) || 0xE0000000 <= addr)
+            return 0;
+    } else {
+        /* For user mode, only U0 area is cachable. */
+        if (0x80000000 <= addr)
+            return 0;
+    }
+
+    /*
+     * TODO : Evaluate CCR and check if the cache is on or off.
+     *        Now CCR is not in CPUSH4State, but in SH7750State.
+     *        When you move the ccr inot CPUSH4State, the code will be
+     *        as follows.
+     */
+#if 0
+    /* check if operand cache is enabled or not. */
+    if (!(env->ccr & 1))
+        return 0;
+#endif
+
+    /* if MMU is off, no check for TLB. */
+    if (env->mmucr & MMUCR_AT)
+        return 1;
+
+    /* check TLB */
+    n = find_tlb_entry(env, addr, env->itlb, ITLB_SIZE, use_asid);
+    if (n >= 0)
+        return env->itlb[n].c;
+
+    n = find_tlb_entry(env, addr, env->utlb, UTLB_SIZE, use_asid);
+    if (n >= 0)
+        return env->utlb[n].c;
+
+    return 0;
+}
+
 #endif
diff --git a/target-sh4/helper.h b/target-sh4/helper.h
index e665185..4b2fcdd 100644
--- a/target-sh4/helper.h
+++ b/target-sh4/helper.h
@@ -9,6 +9,10 @@ DEF_HELPER_0(debug, void)
 DEF_HELPER_1(sleep, void, i32)
 DEF_HELPER_1(trapa, void, i32)
 
+DEF_HELPER_2(movcal, void, i32, i32)
+DEF_HELPER_0(discard_movcal_backup, void)
+DEF_HELPER_1(ocbi, void, i32)
+
 DEF_HELPER_2(addv, i32, i32, i32)
 DEF_HELPER_2(addc, i32, i32, i32)
 DEF_HELPER_2(subv, i32, i32, i32)
diff --git a/target-sh4/op_helper.c b/target-sh4/op_helper.c
index 84e1ad3..90c94dc 100644
--- a/target-sh4/op_helper.c
+++ b/target-sh4/op_helper.c
@@ -18,6 +18,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA  02110-1301 USA
  */
 #include <assert.h>
+#include <stdlib.h>
 #include "exec.h"
 #include "helper.h"
 
@@ -122,6 +123,57 @@ void helper_trapa(uint32_t tra)
     cpu_loop_exit();
 }
 
+void helper_movcal(uint32_t address, uint32_t value)
+{
+    if (cpu_sh4_is_cached (env, address))
+    {
+	memory_content *r = (memory_content *)malloc (sizeof(memory_content));
+	r->address = address;
+	r->value = value;
+	r->next = NULL;
+
+	*(env->movcal_backup_tail) = r;
+	env->movcal_backup_tail = &(r->next);
+    }
+}
+
+void helper_discard_movcal_backup(void)
+{
+    memory_content *current = env->movcal_backup;
+
+    while(current)
+    {
+	memory_content *next = current->next;
+	free (current);
+	env->movcal_backup = current = next;
+	if (current == 0)
+	    env->movcal_backup_tail = &(env->movcal_backup);
+    } 
+}
+
+void helper_ocbi(uint32_t address)
+{
+    memory_content **current = &(env->movcal_backup);
+    while (*current)
+    {
+	uint32_t a = (*current)->address;
+	if ((a & ~0x1F) == (address & ~0x1F))
+	{
+	    memory_content *next = (*current)->next;
+	    stl_data(a, (*current)->value);
+	    
+	    if (next == 0)
+	    {
+		env->movcal_backup_tail = current;
+	    }
+
+	    free (*current);
+	    *current = next;
+	    break;
+	}
+    }
+}
+
 uint32_t helper_addc(uint32_t arg0, uint32_t arg1)
 {
     uint32_t tmp0, tmp1;
diff --git a/target-sh4/translate.c b/target-sh4/translate.c
index cc9f886..4ced176 100644
--- a/target-sh4/translate.c
+++ b/target-sh4/translate.c
@@ -50,6 +50,7 @@ typedef struct DisasContext {
     uint32_t delayed_pc;
     int singlestep_enabled;
     uint32_t features;
+    int has_movcal;
 } DisasContext;
 
 #if defined(CONFIG_USER_ONLY)
@@ -283,6 +284,7 @@ CPUSH4State *cpu_sh4_init(const char *cpu_model)
     env = qemu_mallocz(sizeof(CPUSH4State));
     env->features = def->features;
     cpu_exec_init(env);
+    env->movcal_backup_tail = &(env->movcal_backup);
     sh4_translate_init();
     env->cpu_model_str = cpu_model;
     cpu_sh4_reset(env);
@@ -495,6 +497,37 @@ static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
 
 static void _decode_opc(DisasContext * ctx)
 {
+    /* This code tries to make movcal emulation sufficiently
+       accurate for Linux purposes.  This instruction writes
+       memory, and prior to that, always allocates a cache line.
+       It is used in two contexts:
+       - in memcpy, where data is copied in blocks, the first write
+       of to a block uses movca.l for performance.
+       - in arch/sh/mm/cache-sh4.c, movcal.l + ocbi combination is used
+       to flush the cache. Here, the data written by movcal.l is never
+       written to memory, and the data written is just bogus.
+
+       To simulate this, we simulate movcal.l, we store the value to memory,
+       but we also remember the previous content. If we see ocbi, we check
+       if movcal.l for that address was done previously. If so, the write should
+       not have hit the memory, so we restore the previous content.
+       When we see an instruction that is neither movca.l
+       nor ocbi, the previous content is discarded.
+
+       To optimize, we only try to flush stores when we're at the start of
+       TB, or if we already saw movca.l in this TB and did not flush stores
+       yet.  */
+    if (ctx->has_movcal)
+	{
+	  int opcode = ctx->opcode & 0xf0ff;
+	  if (opcode != 0x0093 /* ocbi */
+	      && opcode != 0x00c3 /* movca.l */)
+	      {
+		  gen_helper_discard_movcal_backup ();
+		  ctx->has_movcal = 0;
+	      }
+	}
+
 #if 0
     fprintf(stderr, "Translating opcode 0x%04x\n", ctx->opcode);
 #endif
@@ -1545,7 +1578,13 @@ static void _decode_opc(DisasContext * ctx)
 	}
 	return;
     case 0x00c3:		/* movca.l R0,@Rm */
-	tcg_gen_qemu_st32(REG(0), REG(B11_8), ctx->memidx);
+        {
+            TCGv val = tcg_temp_new();
+            tcg_gen_qemu_ld32u(val, REG(B11_8), ctx->memidx);
+            gen_helper_movcal (REG(B11_8), val);            
+            tcg_gen_qemu_st32(REG(0), REG(B11_8), ctx->memidx);
+        }
+        ctx->has_movcal = 1;
 	return;
     case 0x40a9:
 	/* MOVUA.L @Rm,R0 (Rm) -> R0
@@ -1594,9 +1633,7 @@ static void _decode_opc(DisasContext * ctx)
 	    break;
     case 0x0093:		/* ocbi @Rn */
 	{
-	    TCGv dummy = tcg_temp_new();
-	    tcg_gen_qemu_ld32s(dummy, REG(B11_8), ctx->memidx);
-	    tcg_temp_free(dummy);
+	    gen_helper_ocbi (REG(B11_8));
 	}
 	return;
     case 0x00a3:		/* ocbp @Rn */
@@ -1876,6 +1913,7 @@ gen_intermediate_code_internal(CPUState * env, TranslationBlock * tb,
     ctx.tb = tb;
     ctx.singlestep_enabled = env->singlestep_enabled;
     ctx.features = env->features;
+    ctx.has_movcal = (tb->flags & TB_FLAG_PENDING_MOVCA);
 
 #ifdef DEBUG_DISAS
     qemu_log_mask(CPU_LOG_TB_CPU,

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [Qemu-devel] SH: Fix movca.l/ocbi emulation.
  2009-04-01 13:36       ` Vladimir Prus
@ 2009-04-01 13:48         ` Edgar E. Iglesias
  0 siblings, 0 replies; 7+ messages in thread
From: Edgar E. Iglesias @ 2009-04-01 13:48 UTC (permalink / raw)
  To: Vladimir Prus; +Cc: Shin-ichiro KAWASAKI, qemu-devel, Edgar E. Iglesias

On Wed, Apr 01, 2009 at 05:36:43PM +0400, Vladimir Prus wrote:
> On Wednesday 01 April 2009 16:44:49 Edgar E. Iglesias wrote:
> 
> Hi Edgar,
> 
> 
> > > I am afraid I might not be the best person to accurately catch all cases
> > > where cache might be enabled or disabled. KAWASAKI-san has posted a function
> > > that can be used to check if caching is enabled, in:
> > > 
> > > 	http://article.gmane.org/gmane.comp.emulators.qemu/36348
> > > 
> > > Does that one address your suggestion? If so, I can incorporate that function,
> > > and checking for it.
> > 
> > I dont have the manuals here but yes, I think something like that will be
> > needed. On other archs I've seen ppl do memcpy to IO areas, don't know
> > about sh but to be on the safe side we better not do any funny stuff
> > with uncached stores.
> 
> I have added the check, using the routine provided by Kawasaki-san.

[cut]
 
> > >  typedef struct CPUSH4State {
> > >      int id;			/* CPU model */
> > >  
> > > @@ -149,6 +155,8 @@ typedef struct CPUSH4State {
> > >      tlb_t itlb[ITLB_SIZE];	/* instruction translation table */
> > >      void *intc_handle;
> > >      int intr_at_halt;		/* SR_BL ignored during sleep */
> > > +    memory_content_t *movcal_backup;
> > > +    memory_content_t **movcal_backup_tail;
> > 
> > Do you really need the _tail member?
> > Seems to me like you're only maintaining it but not really using it.
> 
> See helper_movcal:
> 
> 	*(env->movcal_backup_tail) = r;
> 
> I think having movcal backup list in-order is good. Say, if we have two
> movcal to the same address and ocbi, we want to restore the saved when
> the first movcal was executed, and given that helper_ocbi traversed
> the list from the front we need to put elements in-order.
> 

I see, thanks for clarifying that.


> > > +void helper_ocbi(uint32_t address)
> > > +{
> > > +    memory_content_t **current = &(env->movcal_backup);
> > > +    while (*current)
> > > +    {
> > > +        uint32_t a = (*current)->address;
> > > +        if ((a & ~0x1F) == (address & ~0x1F))
> > > +	{
> > > +            memory_content_t *next = (*current)->next;
> > > +
> > > +            stl_data(a, (*current)->value);
> > > +
> > > +            if (next == 0)
> > > +	    {
> > > +                env->movcal_backup_tail = current;
> > > +	    }
> > > +
> > > +            free (*current);
> > > +            *current = next;
> > > +            break;
> > > +	}
> > 
> > The indentation got messed up here.
> 
> I have fixed it. Revised patch is attached.


Thanks, ATM I don't have any tools/kernel to test with but AFAICS it
looks OK.

Acked-by: Edgar E. Iglesias <edgar.iglesias@gmail.com>



> commit d5caa92b7e792c526306b3438e697bfab3dd0c03
> Author: Vladimir Prus <vladimir@codesourcery.com>
> Date:   Tue Nov 11 12:29:02 2008 +0300
> 
>     Fix movcal.l/ocbi emulation.
>     
>     	* target-sh4/cpu.h (memory_content_t): New.
>     	(CPUSH4State): New fields movcal_backup and movcal_backup_tail.
>     	* target-sh4/helper.h (helper_movcal)
>     	(helper_discard_movcal_backup, helper_ocbi): New.
>     	* target-sh4/op_helper.c (helper_movcal)
>     	(helper_discard_movcal_backup, helper_ocbi): New.
>     	* target-sh4/translate.c (DisasContext): New field has_movcal.
>     	(sh4_defs): Update CVS for SH7785.
>     	(cpu_sh4_init): Initialize env->movcal_backup_tail.
>     	(_decode_opc): Discard movca.l-backup.
>     	Make use of helper_movcal and helper_ocbi.
>     	(gen_intermediate_code_internal): Initialize has_movcal to 1.
> 
> diff --git a/target-sh4/cpu.h b/target-sh4/cpu.h
> index e9eee2c..d754512 100644
> --- a/target-sh4/cpu.h
> +++ b/target-sh4/cpu.h
> @@ -100,6 +100,12 @@ enum sh_features {
>      SH_FEATURE_BCR3_AND_BCR4 = 2,
>  };
>  
> +typedef struct memory_content {
> +    uint32_t address;
> +    uint32_t value;
> +    struct memory_content *next;
> +} memory_content;
> +
>  typedef struct CPUSH4State {
>      int id;			/* CPU model */
>  
> @@ -149,6 +155,8 @@ typedef struct CPUSH4State {
>      tlb_t itlb[ITLB_SIZE];	/* instruction translation table */
>      void *intc_handle;
>      int intr_at_halt;		/* SR_BL ignored during sleep */
> +    memory_content *movcal_backup;
> +    memory_content **movcal_backup_tail;
>  } CPUSH4State;
>  
>  CPUSH4State *cpu_sh4_init(const char *cpu_model);
> @@ -163,6 +171,8 @@ void sh4_cpu_list(FILE *f, int (*cpu_fprintf)(FILE *f, const char *fmt, ...));
>  void cpu_sh4_write_mmaped_utlb_addr(CPUSH4State *s, target_phys_addr_t addr,
>  				    uint32_t mem_value);
>  
> +int cpu_sh4_is_cached(CPUSH4State * env, target_ulong addr);
> +
>  static inline void cpu_set_tls(CPUSH4State *env, target_ulong newtls)
>  {
>    env->gbr = newtls;
> @@ -294,16 +304,19 @@ static inline void cpu_pc_from_tb(CPUState *env, TranslationBlock *tb)
>      env->flags = tb->flags;
>  }
>  
> +#define TB_FLAG_PENDING_MOVCA  (1 << 4)
> +
>  static inline void cpu_get_tb_cpu_state(CPUState *env, target_ulong *pc,
>                                          target_ulong *cs_base, int *flags)
>  {
>      *pc = env->pc;
>      *cs_base = 0;
>      *flags = (env->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL
> -                    | DELAY_SLOT_TRUE | DELAY_SLOT_CLEARME))   /* Bits  0- 3 */
> -            | (env->fpscr & (FPSCR_FR | FPSCR_SZ | FPSCR_PR))  /* Bits 19-21 */
> +                    | DELAY_SLOT_TRUE | DELAY_SLOT_CLEARME))     /* Bits  0- 3 */
> +            | (env->fpscr & (FPSCR_FR | FPSCR_SZ | FPSCR_PR))    /* Bits 19-21 */
>              | (env->sr & (SR_MD | SR_RB))                      /* Bits 29-30 */
> -            | (env->sr & SR_FD);                               /* Bit 15 */
> +            | (env->sr & SR_FD)                                /* Bit 15 */
> +            | (env->movcal_backup ? TB_FLAG_PENDING_MOVCA : 0); /* Bit 4 */
>  }
>  
>  #endif				/* _CPU_SH4_H */
> diff --git a/target-sh4/helper.c b/target-sh4/helper.c
> index f1feca7..5fa4fb4 100644
> --- a/target-sh4/helper.c
> +++ b/target-sh4/helper.c
> @@ -652,4 +652,48 @@ void cpu_sh4_write_mmaped_utlb_addr(CPUSH4State *s, target_phys_addr_t addr,
>      }
>  }
>  
> +int cpu_sh4_is_cached(CPUSH4State * env, target_ulong addr)
> +{
> +    int n;
> +    int use_asid = (env->mmucr & MMUCR_SV) == 0 || (env->sr & SR_MD) == 0;
> +
> +    /* check area */
> +    if (env->sr & SR_MD) {
> +        /* For previledged mode, P2 and P4 area is not cachable. */
> +        if ((0xA0000000 <= addr && addr < 0xC0000000) || 0xE0000000 <= addr)
> +            return 0;
> +    } else {
> +        /* For user mode, only U0 area is cachable. */
> +        if (0x80000000 <= addr)
> +            return 0;
> +    }
> +
> +    /*
> +     * TODO : Evaluate CCR and check if the cache is on or off.
> +     *        Now CCR is not in CPUSH4State, but in SH7750State.
> +     *        When you move the ccr inot CPUSH4State, the code will be
> +     *        as follows.
> +     */
> +#if 0
> +    /* check if operand cache is enabled or not. */
> +    if (!(env->ccr & 1))
> +        return 0;
> +#endif
> +
> +    /* if MMU is off, no check for TLB. */
> +    if (env->mmucr & MMUCR_AT)
> +        return 1;
> +
> +    /* check TLB */
> +    n = find_tlb_entry(env, addr, env->itlb, ITLB_SIZE, use_asid);
> +    if (n >= 0)
> +        return env->itlb[n].c;
> +
> +    n = find_tlb_entry(env, addr, env->utlb, UTLB_SIZE, use_asid);
> +    if (n >= 0)
> +        return env->utlb[n].c;
> +
> +    return 0;
> +}
> +
>  #endif
> diff --git a/target-sh4/helper.h b/target-sh4/helper.h
> index e665185..4b2fcdd 100644
> --- a/target-sh4/helper.h
> +++ b/target-sh4/helper.h
> @@ -9,6 +9,10 @@ DEF_HELPER_0(debug, void)
>  DEF_HELPER_1(sleep, void, i32)
>  DEF_HELPER_1(trapa, void, i32)
>  
> +DEF_HELPER_2(movcal, void, i32, i32)
> +DEF_HELPER_0(discard_movcal_backup, void)
> +DEF_HELPER_1(ocbi, void, i32)
> +
>  DEF_HELPER_2(addv, i32, i32, i32)
>  DEF_HELPER_2(addc, i32, i32, i32)
>  DEF_HELPER_2(subv, i32, i32, i32)
> diff --git a/target-sh4/op_helper.c b/target-sh4/op_helper.c
> index 84e1ad3..90c94dc 100644
> --- a/target-sh4/op_helper.c
> +++ b/target-sh4/op_helper.c
> @@ -18,6 +18,7 @@
>   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA  02110-1301 USA
>   */
>  #include <assert.h>
> +#include <stdlib.h>
>  #include "exec.h"
>  #include "helper.h"
>  
> @@ -122,6 +123,57 @@ void helper_trapa(uint32_t tra)
>      cpu_loop_exit();
>  }
>  
> +void helper_movcal(uint32_t address, uint32_t value)
> +{
> +    if (cpu_sh4_is_cached (env, address))
> +    {
> +	memory_content *r = (memory_content *)malloc (sizeof(memory_content));
> +	r->address = address;
> +	r->value = value;
> +	r->next = NULL;
> +
> +	*(env->movcal_backup_tail) = r;
> +	env->movcal_backup_tail = &(r->next);
> +    }
> +}
> +
> +void helper_discard_movcal_backup(void)
> +{
> +    memory_content *current = env->movcal_backup;
> +
> +    while(current)
> +    {
> +	memory_content *next = current->next;
> +	free (current);
> +	env->movcal_backup = current = next;
> +	if (current == 0)
> +	    env->movcal_backup_tail = &(env->movcal_backup);
> +    } 
> +}
> +
> +void helper_ocbi(uint32_t address)
> +{
> +    memory_content **current = &(env->movcal_backup);
> +    while (*current)
> +    {
> +	uint32_t a = (*current)->address;
> +	if ((a & ~0x1F) == (address & ~0x1F))
> +	{
> +	    memory_content *next = (*current)->next;
> +	    stl_data(a, (*current)->value);
> +	    
> +	    if (next == 0)
> +	    {
> +		env->movcal_backup_tail = current;
> +	    }
> +
> +	    free (*current);
> +	    *current = next;
> +	    break;
> +	}
> +    }
> +}
> +
>  uint32_t helper_addc(uint32_t arg0, uint32_t arg1)
>  {
>      uint32_t tmp0, tmp1;
> diff --git a/target-sh4/translate.c b/target-sh4/translate.c
> index cc9f886..4ced176 100644
> --- a/target-sh4/translate.c
> +++ b/target-sh4/translate.c
> @@ -50,6 +50,7 @@ typedef struct DisasContext {
>      uint32_t delayed_pc;
>      int singlestep_enabled;
>      uint32_t features;
> +    int has_movcal;
>  } DisasContext;
>  
>  #if defined(CONFIG_USER_ONLY)
> @@ -283,6 +284,7 @@ CPUSH4State *cpu_sh4_init(const char *cpu_model)
>      env = qemu_mallocz(sizeof(CPUSH4State));
>      env->features = def->features;
>      cpu_exec_init(env);
> +    env->movcal_backup_tail = &(env->movcal_backup);
>      sh4_translate_init();
>      env->cpu_model_str = cpu_model;
>      cpu_sh4_reset(env);
> @@ -495,6 +497,37 @@ static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
>  
>  static void _decode_opc(DisasContext * ctx)
>  {
> +    /* This code tries to make movcal emulation sufficiently
> +       accurate for Linux purposes.  This instruction writes
> +       memory, and prior to that, always allocates a cache line.
> +       It is used in two contexts:
> +       - in memcpy, where data is copied in blocks, the first write
> +       of to a block uses movca.l for performance.
> +       - in arch/sh/mm/cache-sh4.c, movcal.l + ocbi combination is used
> +       to flush the cache. Here, the data written by movcal.l is never
> +       written to memory, and the data written is just bogus.
> +
> +       To simulate this, we simulate movcal.l, we store the value to memory,
> +       but we also remember the previous content. If we see ocbi, we check
> +       if movcal.l for that address was done previously. If so, the write should
> +       not have hit the memory, so we restore the previous content.
> +       When we see an instruction that is neither movca.l
> +       nor ocbi, the previous content is discarded.
> +
> +       To optimize, we only try to flush stores when we're at the start of
> +       TB, or if we already saw movca.l in this TB and did not flush stores
> +       yet.  */
> +    if (ctx->has_movcal)
> +	{
> +	  int opcode = ctx->opcode & 0xf0ff;
> +	  if (opcode != 0x0093 /* ocbi */
> +	      && opcode != 0x00c3 /* movca.l */)
> +	      {
> +		  gen_helper_discard_movcal_backup ();
> +		  ctx->has_movcal = 0;
> +	      }
> +	}
> +
>  #if 0
>      fprintf(stderr, "Translating opcode 0x%04x\n", ctx->opcode);
>  #endif
> @@ -1545,7 +1578,13 @@ static void _decode_opc(DisasContext * ctx)
>  	}
>  	return;
>      case 0x00c3:		/* movca.l R0,@Rm */
> -	tcg_gen_qemu_st32(REG(0), REG(B11_8), ctx->memidx);
> +        {
> +            TCGv val = tcg_temp_new();
> +            tcg_gen_qemu_ld32u(val, REG(B11_8), ctx->memidx);
> +            gen_helper_movcal (REG(B11_8), val);            
> +            tcg_gen_qemu_st32(REG(0), REG(B11_8), ctx->memidx);
> +        }
> +        ctx->has_movcal = 1;
>  	return;
>      case 0x40a9:
>  	/* MOVUA.L @Rm,R0 (Rm) -> R0
> @@ -1594,9 +1633,7 @@ static void _decode_opc(DisasContext * ctx)
>  	    break;
>      case 0x0093:		/* ocbi @Rn */
>  	{
> -	    TCGv dummy = tcg_temp_new();
> -	    tcg_gen_qemu_ld32s(dummy, REG(B11_8), ctx->memidx);
> -	    tcg_temp_free(dummy);
> +	    gen_helper_ocbi (REG(B11_8));
>  	}
>  	return;
>      case 0x00a3:		/* ocbp @Rn */
> @@ -1876,6 +1913,7 @@ gen_intermediate_code_internal(CPUState * env, TranslationBlock * tb,
>      ctx.tb = tb;
>      ctx.singlestep_enabled = env->singlestep_enabled;
>      ctx.features = env->features;
> +    ctx.has_movcal = (tb->flags & TB_FLAG_PENDING_MOVCA);
>  
>  #ifdef DEBUG_DISAS
>      qemu_log_mask(CPU_LOG_TB_CPU,

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Qemu-devel] SH: Fix movca.l/ocbi emulation.
  2008-12-11 18:34 [Qemu-devel] SH: Fix movca.l/ocbi emulation Vladimir Prus
  2009-01-14 20:45 ` Edgar E. Iglesias
@ 2009-01-18 16:38 ` Shin-ichiro KAWASAKI
  1 sibling, 0 replies; 7+ messages in thread
From: Shin-ichiro KAWASAKI @ 2009-01-18 16:38 UTC (permalink / raw)
  To: qemu-devel

Hi, Volodya-san.

Thank you for the patch.  I encountered the movca.l/ocbi
problem when I added SE7750 board support, and tried to 
create a patch like yours by mistake.  But same two patches
can never be good idea.
I hope your patch to be authorized, and want to provide
two comments on it from my point of view.

> diff --git a/cpu-exec.c b/cpu-exec.c
> index 9a35a59..64b0845 100644
> --- a/cpu-exec.c
> +++ b/cpu-exec.c
> @@ -174,7 +174,7 @@ static inline TranslationBlock *tb_find_fast(void)
>      /* we record a subset of the CPU state. It will
>         always be the same before a given translated block
>         is executed. */
> -    cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
> +    cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);  
I'm not sure if this modification should be in this patch.

(snip)

> +void helper_movcal(uint32_t address, uint32_t value)
> +{
> +  store_request_t *r = (store_request_t *)malloc (sizeof(store_request_t));
> +  r->address = address;
> +  r->value = value;
> +  r->next = NULL;
> +
> +  *(env->store_request_tail) = r;
> +  env->store_request_tail = &(r->next);
> +}
> +
> +void helper_do_stores(void)
> +{
> +  store_request_t *current = env->store_requests;
> +
> +  while(current)
> +    {
> +      uint32_t a = current->address, v = current->value;
> +      store_request_t *next = current->next;
> +      free (current);
> +      env->store_requests = current = next;
> +      if (current == 0)
> +	env->store_request_tail = &(env->store_requests);
> +
> +      stl_data(a, v);
> +    } 
> +}
> +
> +void helper_ocbi(uint32_t address)
> +{
> +  store_request_t **current = &(env->store_requests);
> +  while (*current)
> +    {
> +      if ((*current)->address == address)
> +	{
> +	  store_request_t *next = (*current)->next;
> +
> +	  if (next == 0)
> +	    {
> +	      env->store_request_tail = current;
> +	    }
> +
> +	  free (*current);
> +	  *current = next;
> +	  break;
> +	}
> +    }
> +}
> +
>  uint32_t helper_addc(uint32_t arg0, uint32_t arg1)
>  {
>      uint32_t tmp0, tmp1;

These movca.l/ocbi helpers should change their behavior
depending on cache status : area type, privilege mode,
cache enabling, and tlb entries' cache flags (Paul Mundt
and Edgar Iglesias pointed out it when they reviewed my patch) .

I added the cache mode checking routine to the end of this
mail.  Could you evaluate it and take into your patch?
That is_cached() function is to be placed in target-sh4/helper.c,
and to be invoked by helper_movca() or helper_ocbi().

Regards,
Shin-ichiro KAWASAKI


-------------------------------------------------------

int is_cached(CPUSH4State * env, target_ulong addr)
{
    int n;
    int use_asid = (env->mmucr & MMUCR_SV) == 0 || (env->sr & SR_MD) == 0;

    /* check area */
    if (env->sr & SR_MD) {
	/* For previledged mode, P2 and P4 area is not cachable. */
	if ((0xA0000000 <= addr && addr < 0xC0000000) || 0xE0000000 <= addr)
	    return 0;
    } else {
	/* For user mode, only U0 area is cachable. */
	if (0x80000000 <= addr)
	    return 0;
    }

    /*
     * TODO : Evaluate CCR and check if the cache is on or off.
     *        Now CCR is not in CPUSH4State, but in SH7750State.
     *        When you move the ccr inot CPUSH4State, the code will be
     *        as follows.
     */
#if 0
    /* check if operand cache is enabled or not. */
    if (!(env->ccr & 1))
	return 0;
#endif

    /* if MMU is off, no check for TLB. */
    if (env->mmucr & MMUCR_AT)
	return 1;

    /* check TLB */
    n = find_tlb_entry(env, addr, env->itlb, ITLB_SIZE, use_asid);
    if (n >= 0)
	return env->itlb[n].c;

    n = find_tlb_entry(env, addr, env->utlb, UTLB_SIZE, use_asid);
    if (n >= 0)
	return env->utlb[n].c;

    return 0;
}

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2009-04-01 13:48 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-12-11 18:34 [Qemu-devel] SH: Fix movca.l/ocbi emulation Vladimir Prus
2009-01-14 20:45 ` Edgar E. Iglesias
2009-03-31 20:58   ` Vladimir Prus
2009-04-01 12:44     ` Edgar E. Iglesias
2009-04-01 13:36       ` Vladimir Prus
2009-04-01 13:48         ` Edgar E. Iglesias
2009-01-18 16:38 ` Shin-ichiro KAWASAKI

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).