[Qemu-devel] RFC: Code fetch optimisation

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

* [Qemu-devel] RFC: Code fetch optimisation
@ 2007-10-14 11:44 J. Mayer
  2007-10-15  2:30 ` Paul Brook
  0 siblings, 1 reply; 17+ messages in thread
From: J. Mayer @ 2007-10-14 11:44 UTC (permalink / raw)
  To: qemu-devel

[-- Attachment #1: Type: text/plain, Size: 480 bytes --]

Here's an updated version of the code fetch optimisation patch against
current CVS.
As a remainder, this patch avoid use of softmmu helpers to fetch the
code in most case. A new target define TARGET_HAS_VLE_INSNS has been
added which is used to handle the case of an instruction that span 2
pages, when the target CPU uses a variable-length instructions encoding.
For pure RISC, the code fetch is done using raw access routines.


-- 
J. Mayer <l_indien@magic.fr>
Never organized

[-- Attachment #2: code_raw_optim.diff --]
[-- Type: text/x-patch, Size: 57829 bytes --]

Index: cpu-all.h
===================================================================
RCS file: /sources/qemu/qemu/cpu-all.h,v
retrieving revision 1.76
diff -u -d -d -p -r1.76 cpu-all.h
--- cpu-all.h	23 Sep 2007 15:28:03 -0000	1.76
+++ cpu-all.h	14 Oct 2007 11:35:53 -0000
@@ -646,6 +646,13 @@ static inline void stfq_be_p(void *ptr, 
 #define ldl_code(p) ldl_raw(p)
 #define ldq_code(p) ldq_raw(p)
 
+#define ldub_code_p(sp, pp, p) ldub_raw(p)
+#define ldsb_code_p(sp, pp, p) ldsb_raw(p)
+#define lduw_code_p(sp, pp, p) lduw_raw(p)
+#define ldsw_code_p(sp, pp, p) ldsw_raw(p)
+#define ldl_code_p(sp, pp, p) ldl_raw(p)
+#define ldq_code_p(sp, pp, p) ldq_raw(p)
+
 #define ldub_kernel(p) ldub_raw(p)
 #define ldsb_kernel(p) ldsb_raw(p)
 #define lduw_kernel(p) lduw_raw(p)
Index: cpu-exec.c
===================================================================
RCS file: /sources/qemu/qemu/cpu-exec.c,v
retrieving revision 1.120
diff -u -d -d -p -r1.120 cpu-exec.c
--- cpu-exec.c	14 Oct 2007 07:07:04 -0000	1.120
+++ cpu-exec.c	14 Oct 2007 11:35:53 -0000
@@ -133,6 +133,7 @@ static TranslationBlock *tb_find_slow(ta
     tb->tc_ptr = tc_ptr;
     tb->cs_base = cs_base;
     tb->flags = flags;
+    tb->page_addr[0] = phys_page1;
     cpu_gen_code(env, tb, CODE_GEN_MAX_SIZE, &code_gen_size);
     code_gen_ptr = (void *)(((unsigned long)code_gen_ptr + code_gen_size + CODE_GEN_ALIGN - 1) & ~(CODE_GEN_ALIGN - 1));
 
Index: softmmu_header.h
===================================================================
RCS file: /sources/qemu/qemu/softmmu_header.h,v
retrieving revision 1.18
diff -u -d -d -p -r1.18 softmmu_header.h
--- softmmu_header.h	14 Oct 2007 07:07:05 -0000	1.18
+++ softmmu_header.h	14 Oct 2007 11:35:53 -0000
@@ -289,6 +289,68 @@ static inline void glue(glue(st, SUFFIX)
     }
 }
 
+#else
+
+#if DATA_SIZE <= 2
+static inline RES_TYPE glue(glue(glue(lds,SUFFIX),MEMSUFFIX),_p)(unsigned long *start_pc,
+                                                                 unsigned long phys_pc,
+                                                                 target_ulong virt_pc)
+{
+    RES_TYPE opc;
+
+    /* XXX: Target executing code from MMIO ares is not supported for now */
+#if defined(TARGET_HAS_VLE_INSNS) /* || defined(TARGET_MMIO_CODE) */
+    if (unlikely((*start_pc ^
+                  (phys_pc + sizeof(RES_TYPE) - 1)) >> TARGET_PAGE_BITS)) {
+        /* Slow path: phys_pc is not in the same page than start_pc
+         *            or the insn is spanning two pages
+         */
+        opc = glue(glue(lds,SUFFIX),MEMSUFFIX)(virt_pc);
+        /* Avoid softmmu access on next load */
+        /* XXX: dont: phys PC is not correct anymore
+         *      We could call get_phys_addr_code(env, pc); and remove the else
+         *      condition, here.
+         */
+        //*start_pc = phys_pc;
+    } else
+#endif
+    {
+        opc = glue(glue(lds,SUFFIX),_raw)(phys_pc);
+    }
+
+    return opc;
+}
+#endif
+
+static inline RES_TYPE glue(glue(glue(ld,USUFFIX),MEMSUFFIX),_p)(unsigned long *start_pc,
+                                                                 unsigned long phys_pc,
+                                                                 target_ulong virt_pc)
+{
+    RES_TYPE opc;
+
+    /* XXX: Target executing code from MMIO ares is not supported for now */
+#if defined(TARGET_HAS_VLE_INSNS) /* || defined(TARGET_MMIO_CODE) */
+    if (unlikely((*start_pc ^
+                  (phys_pc + sizeof(RES_TYPE) - 1)) >> TARGET_PAGE_BITS)) {
+        /* Slow path: phys_pc is not in the same page than start_pc
+         *            or the insn is spanning two pages
+         */
+        opc = glue(glue(ld,USUFFIX),MEMSUFFIX)(virt_pc);
+        /* Avoid softmmu access on next load */
+        /* XXX: dont: phys PC is not correct anymore
+         *      We could call get_phys_addr_code(env, pc); and remove the else
+         *      condition, here.
+         */
+        //*start_pc = phys_pc;
+    } else
+#endif
+    {
+        opc = glue(glue(ld,USUFFIX),_raw)(phys_pc);
+    }
+
+    return opc;
+}
+
 #endif /* ACCESS_TYPE != (NB_MMU_MODES + 1) */
 
 #endif /* !asm */
Index: target-alpha/translate.c
===================================================================
RCS file: /sources/qemu/qemu/target-alpha/translate.c,v
retrieving revision 1.6
diff -u -d -d -p -r1.6 translate.c
--- target-alpha/translate.c	14 Oct 2007 08:50:17 -0000	1.6
+++ target-alpha/translate.c	14 Oct 2007 11:35:54 -0000
@@ -1965,6 +1965,7 @@ int gen_intermediate_code_internal (CPUS
     static int insn_count;
 #endif
     DisasContext ctx, *ctxp = &ctx;
+    unsigned long phys_pc, phys_pc_start;
     target_ulong pc_start;
     uint32_t insn;
     uint16_t *gen_opc_end;
@@ -1972,6 +1973,9 @@ int gen_intermediate_code_internal (CPUS
     int ret;
 
     pc_start = tb->pc;
+    phys_pc_start = (unsigned long)phys_ram_base + tb->page_addr[0] +
+        (pc_start & ~TARGET_PAGE_MASK);
+    phys_pc = phys_pc_start;
     gen_opc_ptr = gen_opc_buf;
     gen_opc_end = gen_opc_buf + OPC_MAX_SIZE;
     gen_opparam_ptr = gen_opparam_buf;
@@ -2010,7 +2014,7 @@ int gen_intermediate_code_internal (CPUS
                     ctx.pc, ctx.mem_idx);
         }
 #endif
-        insn = ldl_code(ctx.pc);
+        insn = ldl_code_p(&phys_pc_start, phys_pc, ctx.pc);
 #if defined ALPHA_DEBUG_DISAS
         insn_count++;
         if (logfile != NULL) {
@@ -2018,6 +2022,7 @@ int gen_intermediate_code_internal (CPUS
         }
 #endif
         ctx.pc += 4;
+        phys_pc += 4;
         ret = translate_one(ctxp, insn);
         if (ret != 0)
             break;
Index: target-arm/translate.c
===================================================================
RCS file: /sources/qemu/qemu/target-arm/translate.c,v
retrieving revision 1.57
diff -u -d -d -p -r1.57 translate.c
--- target-arm/translate.c	17 Sep 2007 08:09:51 -0000	1.57
+++ target-arm/translate.c	14 Oct 2007 11:35:54 -0000
@@ -38,6 +38,8 @@
 /* internal defines */
 typedef struct DisasContext {
     target_ulong pc;
+    unsigned long phys_pc;
+    unsigned long phys_pc_start;
     int is_jmp;
     /* Nonzero if this instruction has been conditionally skipped.  */
     int condjmp;
@@ -2206,8 +2208,9 @@ static void disas_arm_insn(CPUState * en
 {
     unsigned int cond, insn, val, op1, i, shift, rm, rs, rn, rd, sh;
 
-    insn = ldl_code(s->pc);
+    insn = ldl_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
     s->pc += 4;
+    s->phys_pc += 4;
 
     cond = insn >> 28;
     if (cond == 0xf){
@@ -2971,8 +2974,9 @@ static void disas_thumb_insn(DisasContex
     int32_t offset;
     int i;
 
-    insn = lduw_code(s->pc);
+    insn = lduw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
     s->pc += 2;
+    s->phys_pc += 2;
 
     switch (insn >> 12) {
     case 0: case 1:
@@ -3494,7 +3498,7 @@ static void disas_thumb_insn(DisasContex
             break;
         }
         offset = ((int32_t)insn << 21) >> 10;
-        insn = lduw_code(s->pc);
+        insn = lduw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
         offset |= insn & 0x7ff;
 
         val = (uint32_t)s->pc + 2;
@@ -3544,6 +3548,9 @@ static inline int gen_intermediate_code_
 
     dc->is_jmp = DISAS_NEXT;
     dc->pc = pc_start;
+    dc->phys_pc_start = (unsigned long)phys_ram_base + tb->page_addr[0] +
+        (pc_start & ~TARGET_PAGE_MASK);
+    dc->phys_pc = dc->phys_pc_start;
     dc->singlestep_enabled = env->singlestep_enabled;
     dc->condjmp = 0;
     dc->thumb = env->thumb;
Index: target-cris/cpu.h
===================================================================
RCS file: /sources/qemu/qemu/target-cris/cpu.h,v
retrieving revision 1.2
diff -u -d -d -p -r1.2 cpu.h
--- target-cris/cpu.h	14 Oct 2007 07:07:06 -0000	1.2
+++ target-cris/cpu.h	14 Oct 2007 11:35:54 -0000
@@ -22,6 +22,8 @@
 #define CPU_CRIS_H
 
 #define TARGET_LONG_BITS 32
+/* need explicit support for instructions spanning 2 pages */
+#define TARGET_HAS_VLE_INSNS 1
 
 #include "cpu-defs.h"
 
Index: target-cris/translate.c
===================================================================
RCS file: /sources/qemu/qemu/target-cris/translate.c,v
retrieving revision 1.1
diff -u -d -d -p -r1.1 translate.c
--- target-cris/translate.c	8 Oct 2007 12:49:08 -0000	1.1
+++ target-cris/translate.c	14 Oct 2007 11:35:55 -0000
@@ -100,6 +100,7 @@ enum {
 typedef struct DisasContext {
 	CPUState *env;
 	target_ulong pc, insn_pc;
+        unsigned long phys_pc, phys_pc_start;
 
 	/* Decoder.  */
 	uint32_t ir;
@@ -828,7 +829,8 @@ static int dec_prep_alu_m(DisasContext *
 		if (memsize == 1)
 			insn_len++;
 
-		imm = ldl_code(dc->pc + 2);
+                imm = ldl_code_p(&dc->phys_pc_start, dc->phys_pc + 2,
+                                 dc->pc + 2);
 		if (memsize != 4) {
 			if (s_ext) {
 				imm = sign_extend(imm, (memsize * 8) - 1);
@@ -1962,7 +1964,7 @@ static unsigned int dec_lapc_im(DisasCon
 	rd = dc->op2;
 
 	cris_cc_mask(dc, 0);
-	imm = ldl_code(dc->pc + 2);
+	imm = ldl_code_p(&dc->phys_pc_start, dc->phys_pc + 2, dc->pc + 2);
 	DIS(fprintf (logfile, "lapc 0x%x, $r%u\n", imm + dc->pc, dc->op2));
 	gen_op_movl_T0_im (dc->pc + imm);
 	gen_movl_reg_T0[rd] ();
@@ -1999,7 +2001,7 @@ static unsigned int dec_jas_im(DisasCont
 {
 	uint32_t imm;
 
-	imm = ldl_code(dc->pc + 2);
+	imm = ldl_code_p(&dc->phys_pc_start, dc->phys_pc + 2, dc->pc + 2);
 
 	DIS(fprintf (logfile, "jas 0x%x\n", imm));
 	cris_cc_mask(dc, 0);
@@ -2016,7 +2018,7 @@ static unsigned int dec_jasc_im(DisasCon
 {
 	uint32_t imm;
 
-	imm = ldl_code(dc->pc + 2);
+	imm = ldl_code_p(&dc->phys_pc_start, dc->phys_pc + 2, dc->pc + 2);
 
 	DIS(fprintf (logfile, "jasc 0x%x\n", imm));
 	cris_cc_mask(dc, 0);
@@ -2047,7 +2049,7 @@ static unsigned int dec_bcc_im(DisasCont
 	int32_t offset;
 	uint32_t cond = dc->op2;
 
-	offset = ldl_code(dc->pc + 2);
+	offset = ldl_code_p(&dc->phys_pc_start, dc->phys_pc + 2, dc->pc + 2);
 	offset = sign_extend(offset, 15);
 
 	DIS(fprintf (logfile, "b%s %d pc=%x dst=%x\n",
@@ -2065,7 +2067,7 @@ static unsigned int dec_bas_im(DisasCont
 	int32_t simm;
 
 
-	simm = ldl_code(dc->pc + 2);
+	simm = ldl_code_p(&dc->phys_pc_start, dc->phys_pc + 2, dc->pc + 2);
 
 	DIS(fprintf (logfile, "bas 0x%x, $p%u\n", dc->pc + simm, dc->op2));
 	cris_cc_mask(dc, 0);
@@ -2081,7 +2083,7 @@ static unsigned int dec_bas_im(DisasCont
 static unsigned int dec_basc_im(DisasContext *dc)
 {
 	int32_t simm;
-	simm = ldl_code(dc->pc + 2);
+	simm = ldl_code_p(&dc->phys_pc_start, dc->phys_pc + 2, dc->pc + 2);
 
 	DIS(fprintf (logfile, "basc 0x%x, $p%u\n", dc->pc + simm, dc->op2));
 	cris_cc_mask(dc, 0);
@@ -2259,7 +2261,7 @@ cris_decoder(DisasContext *dc)
 	int i;
 
 	/* Load a halfword onto the instruction register.  */
-	tmp = ldl_code(dc->pc);
+	tmp = ldl_code_p(&dc->phys_pc_start, dc->phys_pc, dc->pc);
 	dc->ir = tmp & 0xffff;
 
 	/* Now decode it.  */
@@ -2313,6 +2315,9 @@ gen_intermediate_code_internal(CPUState 
 	uint32_t next_page_start;
 
 	pc_start = tb->pc;
+        dc->phys_pc_start = (unsigned long)phys_ram_base + tb->page_addr[0] +
+            (pc_start & ~TARGET_PAGE_MASK);
+        dc->phys_pc = dc->phys_pc_start;
 	dc->env = env;
 	dc->tb = tb;
 
@@ -2347,6 +2352,7 @@ gen_intermediate_code_internal(CPUState 
 		insn_len = cris_decoder(dc);
 		STATS(gen_op_exec_insn());
 		dc->pc += insn_len;
+                dc->phys_pc += insn_len;
 		if (!dc->flagx_live
 		    || (dc->flagx_live &&
 			!(dc->cc_op == CC_OP_FLAGS && dc->flags_x))) {
Index: target-i386/cpu.h
===================================================================
RCS file: /sources/qemu/qemu/target-i386/cpu.h,v
retrieving revision 1.51
diff -u -d -d -p -r1.51 cpu.h
--- target-i386/cpu.h	14 Oct 2007 07:07:06 -0000	1.51
+++ target-i386/cpu.h	14 Oct 2007 11:35:55 -0000
@@ -33,6 +33,8 @@
 /* support for self modifying code even if the modified instruction is
    close to the modifying instruction */
 #define TARGET_HAS_PRECISE_SMC
+/* need explicit support for instructions spanning 2 pages */
+#define TARGET_HAS_VLE_INSNS 1
 
 #define TARGET_HAS_ICE 1
 
Index: target-i386/translate.c
===================================================================
RCS file: /sources/qemu/qemu/target-i386/translate.c,v
retrieving revision 1.72
diff -u -d -d -p -r1.72 translate.c
--- target-i386/translate.c	27 Sep 2007 01:52:00 -0000	1.72
+++ target-i386/translate.c	14 Oct 2007 11:35:55 -0000
@@ -73,6 +73,7 @@ typedef struct DisasContext {
     int prefix;
     int aflag, dflag;
     target_ulong pc; /* pc = eip + cs_base */
+    unsigned long phys_pc,phys_pc_start;
     int is_jmp; /* 1 = means jump (stop translation), 2 means CPU
                    static state change (stop translation) */
     /* current block context */
@@ -1451,7 +1452,7 @@ static void gen_lea_modrm(DisasContext *
 
         if (base == 4) {
             havesib = 1;
-            code = ldub_code(s->pc++);
+            code = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
             scale = (code >> 6) & 3;
             index = ((code >> 3) & 7) | REX_X(s);
             base = (code & 7);
@@ -1462,8 +1463,10 @@ static void gen_lea_modrm(DisasContext *
         case 0:
             if ((base & 7) == 5) {
                 base = -1;
-                disp = (int32_t)ldl_code(s->pc);
+                disp = (int32_t)ldl_code_p(&s->phys_pc_start, s->phys_pc,
+                                           s->pc);
                 s->pc += 4;
+                s->phys_pc += 4;
                 if (CODE64(s) && !havesib) {
                     disp += s->pc + s->rip_offset;
                 }
@@ -1472,12 +1475,14 @@ static void gen_lea_modrm(DisasContext *
             }
             break;
         case 1:
-            disp = (int8_t)ldub_code(s->pc++);
+            disp = (int8_t)ldub_code_p(&s->phys_pc_start, s->phys_pc++,
+                                       s->pc++);
             break;
         default:
         case 2:
-            disp = ldl_code(s->pc);
+            disp = ldl_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
             s->pc += 4;
+            s->phys_pc += 4;
             break;
         }
 
@@ -1545,8 +1550,9 @@ static void gen_lea_modrm(DisasContext *
         switch (mod) {
         case 0:
             if (rm == 6) {
-                disp = lduw_code(s->pc);
+                disp = lduw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
                 s->pc += 2;
+                s->phys_pc += 2;
                 gen_op_movl_A0_im(disp);
                 rm = 0; /* avoid SS override */
                 goto no_rm;
@@ -1555,12 +1561,14 @@ static void gen_lea_modrm(DisasContext *
             }
             break;
         case 1:
-            disp = (int8_t)ldub_code(s->pc++);
+            disp = (int8_t)ldub_code_p(&s->phys_pc_start, s->phys_pc++,
+                                       s->pc++);
             break;
         default:
         case 2:
-            disp = lduw_code(s->pc);
+            disp = lduw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
             s->pc += 2;
+            s->phys_pc += 2;
             break;
         }
         switch(rm) {
@@ -1629,7 +1637,7 @@ static void gen_nop_modrm(DisasContext *
         base = rm;
 
         if (base == 4) {
-            code = ldub_code(s->pc++);
+            code = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
             base = (code & 7);
         }
 
@@ -1637,14 +1645,17 @@ static void gen_nop_modrm(DisasContext *
         case 0:
             if (base == 5) {
                 s->pc += 4;
+                s->phys_pc += 4;
             }
             break;
         case 1:
             s->pc++;
+            s->phys_pc++;
             break;
         default:
         case 2:
             s->pc += 4;
+            s->phys_pc += 4;
             break;
         }
     } else {
@@ -1652,14 +1663,17 @@ static void gen_nop_modrm(DisasContext *
         case 0:
             if (rm == 6) {
                 s->pc += 2;
+                s->phys_pc += 2;
             }
             break;
         case 1:
             s->pc++;
+            s->phys_pc++;
             break;
         default:
         case 2:
             s->pc += 2;
+            s->phys_pc += 2;
             break;
         }
     }
@@ -1727,17 +1741,20 @@ static inline uint32_t insn_get(DisasCon
 
     switch(ot) {
     case OT_BYTE:
-        ret = ldub_code(s->pc);
+        ret = ldub_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
         s->pc++;
+        s->phys_pc++;
         break;
     case OT_WORD:
-        ret = lduw_code(s->pc);
+        ret = lduw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
         s->pc += 2;
+        s->phys_pc += 2;
         break;
     default:
     case OT_LONG:
-        ret = ldl_code(s->pc);
+        ret = ldl_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
         s->pc += 4;
+        s->phys_pc += 4;
         break;
     }
     return ret;
@@ -2689,7 +2706,7 @@ static void gen_sse(DisasContext *s, int
         gen_op_enter_mmx();
     }
 
-    modrm = ldub_code(s->pc++);
+    modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
     reg = ((modrm >> 3) & 7);
     if (is_xmm)
         reg |= rex_r;
@@ -2962,7 +2979,7 @@ static void gen_sse(DisasContext *s, int
         case 0x171: /* shift xmm, im */
         case 0x172:
         case 0x173:
-            val = ldub_code(s->pc++);
+            val = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
             if (is_xmm) {
                 gen_op_movl_T0_im(val);
                 gen_op_movl_env_T0(offsetof(CPUX86State,xmm_t0.XMM_L(0)));
@@ -3082,7 +3099,7 @@ static void gen_sse(DisasContext *s, int
         case 0x1c4:
             s->rip_offset = 1;
             gen_ldst_modrm(s, modrm, OT_WORD, OR_TMP0, 0);
-            val = ldub_code(s->pc++);
+            val = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
             if (b1) {
                 val &= 7;
                 gen_op_pinsrw_xmm(offsetof(CPUX86State,xmm_regs[reg]), val);
@@ -3095,7 +3112,7 @@ static void gen_sse(DisasContext *s, int
         case 0x1c5:
             if (mod != 3)
                 goto illegal_op;
-            val = ldub_code(s->pc++);
+            val = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
             if (b1) {
                 val &= 7;
                 rm = (modrm & 7) | REX_B(s);
@@ -3213,13 +3230,13 @@ static void gen_sse(DisasContext *s, int
         switch(b) {
         case 0x70: /* pshufx insn */
         case 0xc6: /* pshufx insn */
-            val = ldub_code(s->pc++);
+            val = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
             sse_op3 = (GenOpFunc3 *)sse_op2;
             sse_op3(op1_offset, op2_offset, val);
             break;
         case 0xc2:
             /* compare insns */
-            val = ldub_code(s->pc++);
+            val = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
             if (val >= 8)
                 goto illegal_op;
             sse_op2 = sse_op_table4[val][b1];
@@ -3260,8 +3277,9 @@ static target_ulong disas_insn(DisasCont
 #endif
     s->rip_offset = 0; /* for relative ip address */
  next_byte:
-    b = ldub_code(s->pc);
+    b = ldub_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
     s->pc++;
+    s->phys_pc++;
     /* check prefixes */
 #ifdef TARGET_X86_64
     if (CODE64(s)) {
@@ -3375,7 +3393,7 @@ static target_ulong disas_insn(DisasCont
     case 0x0f:
         /**************************/
         /* extended op code */
-        b = ldub_code(s->pc++) | 0x100;
+        b = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++) | 0x100;
         goto reswitch;
 
         /**************************/
@@ -3400,7 +3418,7 @@ static target_ulong disas_insn(DisasCont
 
             switch(f) {
             case 0: /* OP Ev, Gv */
-                modrm = ldub_code(s->pc++);
+                modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
                 reg = ((modrm >> 3) & 7) | rex_r;
                 mod = (modrm >> 6) & 3;
                 rm = (modrm & 7) | REX_B(s);
@@ -3422,7 +3440,7 @@ static target_ulong disas_insn(DisasCont
                 gen_op(s, op, ot, opreg);
                 break;
             case 1: /* OP Gv, Ev */
-                modrm = ldub_code(s->pc++);
+                modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
                 mod = (modrm >> 6) & 3;
                 reg = ((modrm >> 3) & 7) | rex_r;
                 rm = (modrm & 7) | REX_B(s);
@@ -3457,7 +3475,7 @@ static target_ulong disas_insn(DisasCont
             else
                 ot = dflag + OT_WORD;
 
-            modrm = ldub_code(s->pc++);
+            modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
             mod = (modrm >> 6) & 3;
             rm = (modrm & 7) | REX_B(s);
             op = (modrm >> 3) & 7;
@@ -3506,7 +3524,7 @@ static target_ulong disas_insn(DisasCont
         else
             ot = dflag + OT_WORD;
 
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         mod = (modrm >> 6) & 3;
         rm = (modrm & 7) | REX_B(s);
         op = (modrm >> 3) & 7;
@@ -3648,7 +3666,7 @@ static target_ulong disas_insn(DisasCont
         else
             ot = dflag + OT_WORD;
 
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         mod = (modrm >> 6) & 3;
         rm = (modrm & 7) | REX_B(s);
         op = (modrm >> 3) & 7;
@@ -3754,7 +3772,7 @@ static target_ulong disas_insn(DisasCont
         else
             ot = dflag + OT_WORD;
 
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         mod = (modrm >> 6) & 3;
         rm = (modrm & 7) | REX_B(s);
         reg = ((modrm >> 3) & 7) | rex_r;
@@ -3805,7 +3823,7 @@ static target_ulong disas_insn(DisasCont
     case 0x69: /* imul Gv, Ev, I */
     case 0x6b:
         ot = dflag + OT_WORD;
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         reg = ((modrm >> 3) & 7) | rex_r;
         if (b == 0x69)
             s->rip_offset = insn_const_size(ot);
@@ -3841,7 +3859,7 @@ static target_ulong disas_insn(DisasCont
             ot = OT_BYTE;
         else
             ot = dflag + OT_WORD;
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         reg = ((modrm >> 3) & 7) | rex_r;
         mod = (modrm >> 6) & 3;
         if (mod == 3) {
@@ -3868,7 +3886,7 @@ static target_ulong disas_insn(DisasCont
             ot = OT_BYTE;
         else
             ot = dflag + OT_WORD;
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         reg = ((modrm >> 3) & 7) | rex_r;
         mod = (modrm >> 6) & 3;
         gen_op_mov_TN_reg[ot][1][reg]();
@@ -3885,7 +3903,7 @@ static target_ulong disas_insn(DisasCont
         s->cc_op = CC_OP_SUBB + ot;
         break;
     case 0x1c7: /* cmpxchg8b */
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         mod = (modrm >> 6) & 3;
         if (mod == 3)
             goto illegal_op;
@@ -3944,7 +3962,7 @@ static target_ulong disas_insn(DisasCont
         } else {
             ot = dflag + OT_WORD;
         }
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         mod = (modrm >> 6) & 3;
         gen_pop_T0(s);
         if (mod == 3) {
@@ -3963,9 +3981,10 @@ static target_ulong disas_insn(DisasCont
     case 0xc8: /* enter */
         {
             int level;
-            val = lduw_code(s->pc);
+            val = lduw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
             s->pc += 2;
-            level = ldub_code(s->pc++);
+            s->phys_pc += 2;
+            level = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
             gen_enter(s, val, level);
         }
         break;
@@ -4045,7 +4064,7 @@ static target_ulong disas_insn(DisasCont
             ot = OT_BYTE;
         else
             ot = dflag + OT_WORD;
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         reg = ((modrm >> 3) & 7) | rex_r;
 
         /* generate a generic store */
@@ -4057,7 +4076,7 @@ static target_ulong disas_insn(DisasCont
             ot = OT_BYTE;
         else
             ot = dflag + OT_WORD;
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         mod = (modrm >> 6) & 3;
         if (mod != 3) {
             s->rip_offset = insn_const_size(ot);
@@ -4076,14 +4095,14 @@ static target_ulong disas_insn(DisasCont
             ot = OT_BYTE;
         else
             ot = OT_WORD + dflag;
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         reg = ((modrm >> 3) & 7) | rex_r;
 
         gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0);
         gen_op_mov_reg_T0[ot][reg]();
         break;
     case 0x8e: /* mov seg, Gv */
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         reg = (modrm >> 3) & 7;
         if (reg >= 6 || reg == R_CS)
             goto illegal_op;
@@ -4103,7 +4122,7 @@ static target_ulong disas_insn(DisasCont
         }
         break;
     case 0x8c: /* mov Gv, seg */
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         reg = (modrm >> 3) & 7;
         mod = (modrm >> 6) & 3;
         if (reg >= 6)
@@ -4126,7 +4145,7 @@ static target_ulong disas_insn(DisasCont
             d_ot = dflag + OT_WORD;
             /* ot is the size of source */
             ot = (b & 1) + OT_BYTE;
-            modrm = ldub_code(s->pc++);
+            modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
             reg = ((modrm >> 3) & 7) | rex_r;
             mod = (modrm >> 6) & 3;
             rm = (modrm & 7) | REX_B(s);
@@ -4163,7 +4182,7 @@ static target_ulong disas_insn(DisasCont
 
     case 0x8d: /* lea */
         ot = dflag + OT_WORD;
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         mod = (modrm >> 6) & 3;
         if (mod == 3)
             goto illegal_op;
@@ -4190,8 +4209,9 @@ static target_ulong disas_insn(DisasCont
                 ot = dflag + OT_WORD;
 #ifdef TARGET_X86_64
             if (s->aflag == 2) {
-                offset_addr = ldq_code(s->pc);
+                offset_addr = ldq_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
                 s->pc += 8;
+                s->phys_pc += 8;
                 if (offset_addr == (int32_t)offset_addr)
                     gen_op_movq_A0_im(offset_addr);
                 else
@@ -4243,8 +4263,9 @@ static target_ulong disas_insn(DisasCont
         if (dflag == 2) {
             uint64_t tmp;
             /* 64 bit case */
-            tmp = ldq_code(s->pc);
+            tmp = ldq_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
             s->pc += 8;
+            s->phys_pc += 8;
             reg = (b & 7) | REX_B(s);
             gen_movtl_T0_im(tmp);
             gen_op_mov_reg_T0[OT_QUAD][reg]();
@@ -4270,7 +4291,7 @@ static target_ulong disas_insn(DisasCont
             ot = OT_BYTE;
         else
             ot = dflag + OT_WORD;
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         reg = ((modrm >> 3) & 7) | rex_r;
         mod = (modrm >> 6) & 3;
         if (mod == 3) {
@@ -4313,7 +4334,7 @@ static target_ulong disas_insn(DisasCont
         op = R_GS;
     do_lxx:
         ot = dflag ? OT_LONG : OT_WORD;
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         reg = ((modrm >> 3) & 7) | rex_r;
         mod = (modrm >> 6) & 3;
         if (mod == 3)
@@ -4345,7 +4366,7 @@ static target_ulong disas_insn(DisasCont
             else
                 ot = dflag + OT_WORD;
 
-            modrm = ldub_code(s->pc++);
+            modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
             mod = (modrm >> 6) & 3;
             op = (modrm >> 3) & 7;
 
@@ -4364,7 +4385,8 @@ static target_ulong disas_insn(DisasCont
                 gen_shift(s, op, ot, opreg, OR_ECX);
             } else {
                 if (shift == 2) {
-                    shift = ldub_code(s->pc++);
+                    shift = ldub_code_p(&s->phys_pc_start, s->phys_pc++,
+                                        s->pc++);
                 }
                 gen_shifti(s, op, ot, opreg, shift);
             }
@@ -4398,7 +4420,7 @@ static target_ulong disas_insn(DisasCont
         shift = 0;
     do_shiftd:
         ot = dflag + OT_WORD;
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         mod = (modrm >> 6) & 3;
         rm = (modrm & 7) | REX_B(s);
         reg = ((modrm >> 3) & 7) | rex_r;
@@ -4412,7 +4434,7 @@ static target_ulong disas_insn(DisasCont
         gen_op_mov_TN_reg[ot][1][reg]();
 
         if (shift) {
-            val = ldub_code(s->pc++);
+            val = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
             if (ot == OT_QUAD)
                 val &= 0x3f;
             else
@@ -4450,7 +4472,7 @@ static target_ulong disas_insn(DisasCont
             gen_exception(s, EXCP07_PREX, pc_start - s->cs_base);
             break;
         }
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         mod = (modrm >> 6) & 3;
         rm = modrm & 7;
         op = ((b & 7) << 3) | ((modrm >> 3) & 7);
@@ -5013,7 +5035,7 @@ static target_ulong disas_insn(DisasCont
             ot = OT_BYTE;
         else
             ot = dflag ? OT_LONG : OT_WORD;
-        val = ldub_code(s->pc++);
+        val = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         gen_op_movl_T0_im(val);
         gen_check_io(s, ot, 0, pc_start - s->cs_base);
         if (gen_svm_check_io(s, pc_start,
@@ -5029,7 +5051,7 @@ static target_ulong disas_insn(DisasCont
             ot = OT_BYTE;
         else
             ot = dflag ? OT_LONG : OT_WORD;
-        val = ldub_code(s->pc++);
+        val = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         gen_op_movl_T0_im(val);
         gen_check_io(s, ot, 0, pc_start - s->cs_base);
         if (gen_svm_check_io(s, pc_start, svm_is_rep(prefixes) |
@@ -5073,8 +5095,9 @@ static target_ulong disas_insn(DisasCont
         /************************/
         /* control */
     case 0xc2: /* ret im */
-        val = ldsw_code(s->pc);
+        val = ldsw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
         s->pc += 2;
+        s->phys_pc += 2;
         gen_pop_T0(s);
         if (CODE64(s) && s->dflag)
             s->dflag = 2;
@@ -5093,8 +5116,9 @@ static target_ulong disas_insn(DisasCont
         gen_eob(s);
         break;
     case 0xca: /* lret im */
-        val = ldsw_code(s->pc);
+        val = ldsw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
         s->pc += 2;
+        s->phys_pc += 2;
     do_lret:
         if (s->pe && !s->vm86) {
             if (s->cc_op != CC_OP_DYNAMIC)
@@ -5223,13 +5247,13 @@ static target_ulong disas_insn(DisasCont
         break;
 
     case 0x190 ... 0x19f: /* setcc Gv */
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         gen_setcc(s, b);
         gen_ldst_modrm(s, modrm, OT_BYTE, OR_TMP0, 1);
         break;
     case 0x140 ... 0x14f: /* cmov Gv, Ev */
         ot = dflag + OT_WORD;
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         reg = ((modrm >> 3) & 7) | rex_r;
         mod = (modrm >> 6) & 3;
         gen_setcc(s, b);
@@ -5338,7 +5362,7 @@ static target_ulong disas_insn(DisasCont
         /* bit operations */
     case 0x1ba: /* bt/bts/btr/btc Gv, im */
         ot = dflag + OT_WORD;
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         op = (modrm >> 3) & 7;
         mod = (modrm >> 6) & 3;
         rm = (modrm & 7) | REX_B(s);
@@ -5350,7 +5374,7 @@ static target_ulong disas_insn(DisasCont
             gen_op_mov_TN_reg[ot][0][rm]();
         }
         /* load shift */
-        val = ldub_code(s->pc++);
+        val = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         gen_op_movl_T1_im(val);
         if (op < 4)
             goto illegal_op;
@@ -5378,7 +5402,7 @@ static target_ulong disas_insn(DisasCont
         op = 3;
     do_btx:
         ot = dflag + OT_WORD;
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         reg = ((modrm >> 3) & 7) | rex_r;
         mod = (modrm >> 6) & 3;
         rm = (modrm & 7) | REX_B(s);
@@ -5404,7 +5428,7 @@ static target_ulong disas_insn(DisasCont
     case 0x1bc: /* bsf */
     case 0x1bd: /* bsr */
         ot = dflag + OT_WORD;
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         reg = ((modrm >> 3) & 7) | rex_r;
         gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0);
         /* NOTE: in order to handle the 0 case, we must load the
@@ -5451,7 +5475,7 @@ static target_ulong disas_insn(DisasCont
     case 0xd4: /* aam */
         if (CODE64(s))
             goto illegal_op;
-        val = ldub_code(s->pc++);
+        val = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         if (val == 0) {
             gen_exception(s, EXCP00_DIVZ, pc_start - s->cs_base);
         } else {
@@ -5462,7 +5486,7 @@ static target_ulong disas_insn(DisasCont
     case 0xd5: /* aad */
         if (CODE64(s))
             goto illegal_op;
-        val = ldub_code(s->pc++);
+        val = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         gen_op_aad(val);
         s->cc_op = CC_OP_LOGICB;
         break;
@@ -5494,7 +5518,7 @@ static target_ulong disas_insn(DisasCont
         gen_interrupt(s, EXCP03_INT3, pc_start - s->cs_base, s->pc - s->cs_base);
         break;
     case 0xcd: /* int N */
-        val = ldub_code(s->pc++);
+        val = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         if (gen_svm_check_intercept(s, pc_start, SVM_EXIT_SWINT))
             break;
         if (s->vm86 && s->iopl != 3) {
@@ -5567,7 +5591,7 @@ static target_ulong disas_insn(DisasCont
         if (CODE64(s))
             goto illegal_op;
         ot = dflag ? OT_LONG : OT_WORD;
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         reg = (modrm >> 3) & 7;
         mod = (modrm >> 6) & 3;
         if (mod == 3)
@@ -5738,7 +5762,7 @@ static target_ulong disas_insn(DisasCont
         }
         break;
     case 0x100:
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         mod = (modrm >> 6) & 3;
         op = (modrm >> 3) & 7;
         switch(op) {
@@ -5808,7 +5832,7 @@ static target_ulong disas_insn(DisasCont
         }
         break;
     case 0x101:
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         mod = (modrm >> 6) & 3;
         op = (modrm >> 3) & 7;
         rm = modrm & 7;
@@ -6022,7 +6046,7 @@ static target_ulong disas_insn(DisasCont
             /* d_ot is the size of destination */
             d_ot = dflag + OT_WORD;
 
-            modrm = ldub_code(s->pc++);
+            modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
             reg = ((modrm >> 3) & 7) | rex_r;
             mod = (modrm >> 6) & 3;
             rm = (modrm & 7) | REX_B(s);
@@ -6048,7 +6072,7 @@ static target_ulong disas_insn(DisasCont
             if (!s->pe || s->vm86)
                 goto illegal_op;
             ot = dflag ? OT_LONG : OT_WORD;
-            modrm = ldub_code(s->pc++);
+            modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
             reg = (modrm >> 3) & 7;
             mod = (modrm >> 6) & 3;
             rm = modrm & 7;
@@ -6075,7 +6099,7 @@ static target_ulong disas_insn(DisasCont
         if (!s->pe || s->vm86)
             goto illegal_op;
         ot = dflag ? OT_LONG : OT_WORD;
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         reg = ((modrm >> 3) & 7) | rex_r;
         gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0);
         gen_op_mov_TN_reg[ot][1][reg]();
@@ -6089,7 +6113,7 @@ static target_ulong disas_insn(DisasCont
         gen_op_mov_reg_T1[ot][reg]();
         break;
     case 0x118:
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         mod = (modrm >> 6) & 3;
         op = (modrm >> 3) & 7;
         switch(op) {
@@ -6108,7 +6132,7 @@ static target_ulong disas_insn(DisasCont
         }
         break;
     case 0x119 ... 0x11f: /* nop (multi byte) */
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         gen_nop_modrm(s, modrm);
         break;
     case 0x120: /* mov reg, crN */
@@ -6116,7 +6140,7 @@ static target_ulong disas_insn(DisasCont
         if (s->cpl != 0) {
             gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base);
         } else {
-            modrm = ldub_code(s->pc++);
+            modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
             if ((modrm & 0xc0) != 0xc0)
                 goto illegal_op;
             rm = (modrm & 7) | REX_B(s);
@@ -6158,7 +6182,7 @@ static target_ulong disas_insn(DisasCont
         if (s->cpl != 0) {
             gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base);
         } else {
-            modrm = ldub_code(s->pc++);
+            modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
             if ((modrm & 0xc0) != 0xc0)
                 goto illegal_op;
             rm = (modrm & 7) | REX_B(s);
@@ -6199,7 +6223,7 @@ static target_ulong disas_insn(DisasCont
         if (!(s->cpuid_features & CPUID_SSE2))
             goto illegal_op;
         ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         mod = (modrm >> 6) & 3;
         if (mod == 3)
             goto illegal_op;
@@ -6208,7 +6232,7 @@ static target_ulong disas_insn(DisasCont
         gen_ldst_modrm(s, modrm, ot, reg, 1);
         break;
     case 0x1ae:
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         mod = (modrm >> 6) & 3;
         op = (modrm >> 3) & 7;
         switch(op) {
@@ -6274,7 +6298,7 @@ static target_ulong disas_insn(DisasCont
         }
         break;
     case 0x10d: /* prefetch */
-        modrm = ldub_code(s->pc++);
+        modrm = ldub_code_p(&s->phys_pc_start, s->phys_pc++, s->pc++);
         gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
         /* ignore for now */
         break;
@@ -6752,6 +6776,9 @@ static inline int gen_intermediate_code_
 
     dc->is_jmp = DISAS_NEXT;
     pc_ptr = pc_start;
+    dc->phys_pc_start = (unsigned long)phys_ram_base + tb->page_addr[0] +
+        (pc_start & ~TARGET_PAGE_MASK);
+    dc->phys_pc = dc->phys_pc_start;
     lj = -1;
 
     for(;;) {
Index: target-m68k/cpu.h
===================================================================
RCS file: /sources/qemu/qemu/target-m68k/cpu.h,v
retrieving revision 1.14
diff -u -d -d -p -r1.14 cpu.h
--- target-m68k/cpu.h	14 Oct 2007 07:07:06 -0000	1.14
+++ target-m68k/cpu.h	14 Oct 2007 11:35:55 -0000
@@ -22,6 +22,8 @@
 #define CPU_M68K_H
 
 #define TARGET_LONG_BITS 32
+/* need explicit support for instructions spanning 2 pages */
+#define TARGET_HAS_VLE_INSNS 1
 
 #include "cpu-defs.h"
 
Index: target-m68k/translate.c
===================================================================
RCS file: /sources/qemu/qemu/target-m68k/translate.c,v
retrieving revision 1.20
diff -u -d -d -p -r1.20 translate.c
--- target-m68k/translate.c	17 Sep 2007 08:09:53 -0000	1.20
+++ target-m68k/translate.c	14 Oct 2007 11:35:55 -0000
@@ -45,6 +45,8 @@ typedef struct DisasContext {
     CPUM68KState *env;
     target_ulong insn_pc; /* Start of the current instruction.  */
     target_ulong pc;
+    unsigned long phys_pc;
+    unsigned long phys_pc_start;
     int is_jmp;
     int cc_op;
     int user;
@@ -207,10 +209,12 @@ static int gen_ldst(DisasContext *s, int
 static inline uint32_t read_im32(DisasContext *s)
 {
     uint32_t im;
-    im = ((uint32_t)lduw_code(s->pc)) << 16;
+    im = ((uint32_t)lduw_code_p(&s->phys_pc_start, s->phys_pc, s->pc)) << 16;
     s->pc += 2;
-    im |= lduw_code(s->pc);
+    s->phys_pc += 2;
+    im |= lduw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
     s->pc += 2;
+    s->phys_pc += 2;
     return im;
 }
 
@@ -244,8 +248,9 @@ static int gen_lea_indexed(DisasContext 
     uint32_t bd, od;
 
     offset = s->pc;
-    ext = lduw_code(s->pc);
+    ext = lduw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
     s->pc += 2;
+    s->phys_pc += 2;
 
     if ((ext & 0x800) == 0 && !m68k_feature(s->env, M68K_FEATURE_WORD_INDEX))
         return -1;
@@ -258,8 +263,10 @@ static int gen_lea_indexed(DisasContext 
         if ((ext & 0x30) > 0x10) {
             /* base displacement */
             if ((ext & 0x30) == 0x20) {
-                bd = (int16_t)lduw_code(s->pc);
+                bd = (int16_t)lduw_code_p(&s->phys_pc_start, s->phys_pc,
+                                          s->pc);
                 s->pc += 2;
+                s->phys_pc += 2;
             } else {
                 bd = read_im32(s);
             }
@@ -307,8 +314,10 @@ static int gen_lea_indexed(DisasContext 
             if ((ext & 3) > 1) {
                 /* outer displacement */
                 if ((ext & 3) == 2) {
-                    od = (int16_t)lduw_code(s->pc);
+                    od = (int16_t)lduw_code_p(&s->phys_pc_start, s->phys_pc,
+                                              s->pc);
                     s->pc += 2;
+                    s->phys_pc += 2;
                 } else {
                     od = read_im32(s);
                 }
@@ -455,8 +464,9 @@ static int gen_lea(DisasContext *s, uint
     case 5: /* Indirect displacement.  */
         reg += QREG_A0;
         tmp = gen_new_qreg(QMODE_I32);
-        ext = lduw_code(s->pc);
+        ext = lduw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
         s->pc += 2;
+        s->phys_pc += 2;
         gen_op_add32(tmp, reg, gen_im32((int16_t)ext));
         return tmp;
     case 6: /* Indirect index + displacement.  */
@@ -465,8 +475,9 @@ static int gen_lea(DisasContext *s, uint
     case 7: /* Other */
         switch (reg) {
         case 0: /* Absolute short.  */
-            offset = ldsw_code(s->pc);
+            offset = ldsw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
             s->pc += 2;
+            s->phys_pc += 2;
             return gen_im32(offset);
         case 1: /* Absolute long.  */
             offset = read_im32(s);
@@ -474,8 +485,9 @@ static int gen_lea(DisasContext *s, uint
         case 2: /* pc displacement  */
             tmp = gen_new_qreg(QMODE_I32);
             offset = s->pc;
-            offset += ldsw_code(s->pc);
+            offset += ldsw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
             s->pc += 2;
+            s->phys_pc += 2;
             return gen_im32(offset);
         case 3: /* pc index+displacement.  */
             return gen_lea_indexed(s, opsize, -1);
@@ -581,18 +593,23 @@ static int gen_ea(DisasContext *s, uint1
             /* Sign extend values for consistency.  */
             switch (opsize) {
             case OS_BYTE:
-                if (val)
-                    offset = ldsb_code(s->pc + 1);
-                else
-                    offset = ldub_code(s->pc + 1);
+                if (val) {
+                    offset = ldsb_code_p(&s->phys_pc_start, s->phys_pc + 1,
+                                         s->pc + 1);
+                } else {
+                    offset = ldub_code_p(&s->phys_pc_start, s->phys_pc + 1,
+                                         s->pc + 1);
+                }
                 s->pc += 2;
+                s->phys_pc += 2;
                 break;
             case OS_WORD:
                 if (val)
-                    offset = ldsw_code(s->pc);
+                    offset = ldsw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
                 else
-                    offset = lduw_code(s->pc);
+                    offset = lduw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
                 s->pc += 2;
+                s->phys_pc += 2;
                 break;
             case OS_LONG:
                 offset = read_im32(s);
@@ -879,8 +896,9 @@ DISAS_INSN(divl)
     int reg;
     uint16_t ext;
 
-    ext = lduw_code(s->pc);
+    ext = lduw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
     s->pc += 2;
+    s->phys_pc += 2;
     if (ext & 0x87f8) {
         gen_exception(s, s->pc - 4, EXCP_UNSUPPORTED);
         return;
@@ -1066,8 +1084,9 @@ DISAS_INSN(movem)
     int tmp;
     int is_load;
 
-    mask = lduw_code(s->pc);
+    mask = lduw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
     s->pc += 2;
+    s->phys_pc += 2;
     tmp = gen_lea(s, insn, OS_LONG);
     if (tmp == -1) {
         gen_addr_fault(s);
@@ -1111,8 +1130,9 @@ DISAS_INSN(bitop_im)
         opsize = OS_LONG;
     op = (insn >> 6) & 3;
 
-    bitnum = lduw_code(s->pc);
+    bitnum = lduw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
     s->pc += 2;
+    s->phys_pc += 2;
     if (bitnum & 0xff00) {
         disas_undef(s, insn);
         return;
@@ -1375,8 +1395,9 @@ static void gen_set_sr(DisasContext *s, 
     else if ((insn & 0x3f) == 0x3c)
       {
         uint16_t val;
-        val = lduw_code(s->pc);
+        val = lduw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
         s->pc += 2;
+        s->phys_pc += 2;
         gen_set_sr_im(s, val, ccr_only);
       }
     else
@@ -1502,8 +1523,9 @@ DISAS_INSN(mull)
 
     /* The upper 32 bits of the product are discarded, so
        muls.l and mulu.l are functionally equivalent.  */
-    ext = lduw_code(s->pc);
+    ext = lduw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
     s->pc += 2;
+    s->phys_pc += 2;
     if (ext & 0x87ff) {
         gen_exception(s, s->pc - 4, EXCP_UNSUPPORTED);
         return;
@@ -1523,8 +1545,9 @@ DISAS_INSN(link)
     int reg;
     int tmp;
 
-    offset = ldsw_code(s->pc);
+    offset = ldsw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
     s->pc += 2;
+    s->phys_pc += 2;
     reg = AREG(insn, 0);
     tmp = gen_new_qreg(QMODE_I32);
     gen_op_sub32(tmp, QREG_SP, gen_im32(4));
@@ -1622,9 +1645,11 @@ DISAS_INSN(tpf)
     switch (insn & 7) {
     case 2: /* One extension word.  */
         s->pc += 2;
+        s->phys_pc += 2;
         break;
     case 3: /* Two extension words.  */
         s->pc += 4;
+        s->phys_pc += 4;
         break;
     case 4: /* No extension words.  */
         break;
@@ -1644,8 +1669,9 @@ DISAS_INSN(branch)
     op = (insn >> 8) & 0xf;
     offset = (int8_t)insn;
     if (offset == 0) {
-        offset = ldsw_code(s->pc);
+        offset = ldsw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
         s->pc += 2;
+        s->phys_pc += 2;
     } else if (offset == -1) {
         offset = read_im32(s);
     }
@@ -1957,14 +1983,16 @@ DISAS_INSN(strldsr)
     uint32_t addr;
 
     addr = s->pc - 2;
-    ext = lduw_code(s->pc);
+    ext = lduw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
     s->pc += 2;
+    s->phys_pc += 2;
     if (ext != 0x46FC) {
         gen_exception(s, addr, EXCP_UNSUPPORTED);
         return;
     }
-    ext = lduw_code(s->pc);
+    ext = lduw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
     s->pc += 2;
+    s->phys_pc += 2;
     if (IS_USER(s) || (ext & SR_S) == 0) {
         gen_exception(s, addr, EXCP_PRIVILEGE);
         return;
@@ -2032,8 +2060,9 @@ DISAS_INSN(stop)
         return;
     }
 
-    ext = lduw_code(s->pc);
+    ext = lduw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
     s->pc += 2;
+    s->phys_pc += 2;
 
     gen_set_sr_im(s, ext, 0);
     gen_jmp(s, gen_im32(s->pc));
@@ -2059,8 +2088,9 @@ DISAS_INSN(movec)
         return;
     }
 
-    ext = lduw_code(s->pc);
+    ext = lduw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
     s->pc += 2;
+    s->phys_pc += 2;
 
     if (ext & 0x8000) {
         reg = AREG(ext, 12);
@@ -2121,8 +2151,9 @@ DISAS_INSN(fpu)
     int round;
     int opsize;
 
-    ext = lduw_code(s->pc);
+    ext = lduw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
     s->pc += 2;
+    s->phys_pc += 2;
     opmode = ext & 0x7f;
     switch ((ext >> 13) & 7) {
     case 0: case 2:
@@ -2331,6 +2362,7 @@ DISAS_INSN(fpu)
     return;
 undef:
     s->pc -= 2;
+    s->phys_pc -= 2;
     disas_undef_fpu(s, insn);
 }
 
@@ -2343,11 +2375,14 @@ DISAS_INSN(fbcc)
     int l1;
 
     addr = s->pc;
-    offset = ldsw_code(s->pc);
+    offset = ldsw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
     s->pc += 2;
+    s->phys_pc += 2;
     if (insn & (1 << 6)) {
-        offset = (offset << 16) | lduw_code(s->pc);
+        offset = (offset << 16) |
+            lduw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
         s->pc += 2;
+        s->phys_pc += 2;
     }
 
     l1 = gen_new_label();
@@ -2473,8 +2508,9 @@ DISAS_INSN(mac)
     int dual;
     int saved_flags = -1;
 
-    ext = lduw_code(s->pc);
+    ext = lduw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
     s->pc += 2;
+    s->phys_pc += 2;
 
     acc = ((insn >> 7) & 1) | ((ext >> 3) & 2);
     dual = ((insn & 0x30) != 0 && (ext & 3) != 0);
@@ -2882,8 +2918,9 @@ static void disas_m68k_insn(CPUState * e
 {
     uint16_t insn;
 
-    insn = lduw_code(s->pc);
+    insn = lduw_code_p(&s->phys_pc_start, s->phys_pc, s->pc);
     s->pc += 2;
+    s->phys_pc += 2;
 
     opcode_table[insn](s, insn);
 }
@@ -3169,6 +3206,9 @@ gen_intermediate_code_internal(CPUState 
     dc->env = env;
     dc->is_jmp = DISAS_NEXT;
     dc->pc = pc_start;
+    dc->phys_pc_start = (unsigned long)phys_ram_base + tb->page_addr[0] +
+        (pc_start & ~TARGET_PAGE_MASK);
+    dc->phys_pc = dc->phys_pc_start;
     dc->cc_op = CC_OP_DYNAMIC;
     dc->singlestep_enabled = env->singlestep_enabled;
     dc->fpcr = env->fpcr;
Index: target-mips/translate.c
===================================================================
RCS file: /sources/qemu/qemu/target-mips/translate.c,v
retrieving revision 1.106
diff -u -d -d -p -r1.106 translate.c
--- target-mips/translate.c	9 Oct 2007 03:39:58 -0000	1.106
+++ target-mips/translate.c	14 Oct 2007 11:35:56 -0000
@@ -536,6 +536,7 @@ FOP_CONDS(abs, ps)
 typedef struct DisasContext {
     struct TranslationBlock *tb;
     target_ulong pc, saved_pc;
+    unsigned long phys_pc, phys_pc_start;
     uint32_t opcode;
     uint32_t fp_status;
     /* Routine used to access memory */
@@ -1764,6 +1765,7 @@ static void gen_compute_branch (DisasCon
             /* Skip the instruction in the delay slot */
             MIPS_DEBUG("bnever, link and skip");
             ctx->pc += 4;
+            ctx->phys_pc += 4;
             return;
         case OPC_BNEL:    /* rx != rx likely */
         case OPC_BGTZL:   /* 0 > 0 likely */
@@ -1771,6 +1773,7 @@ static void gen_compute_branch (DisasCon
             /* Skip the instruction in the delay slot */
             MIPS_DEBUG("bnever and skip");
             ctx->pc += 4;
+            ctx->phys_pc += 4;
             return;
         case OPC_J:
             ctx->hflags |= MIPS_HFLAG_B;
@@ -6495,6 +6498,9 @@ gen_intermediate_code_internal (CPUState
     gen_opparam_ptr = gen_opparam_buf;
     nb_gen_labels = 0;
     ctx.pc = pc_start;
+    ctx.phys_pc_start = (unsigned long)phys_ram_base + tb->page_addr[0] +
+        (pc_start & ~TARGET_PAGE_MASK);
+    ctx.phys_pc = ctx.phys_pc_start;
     ctx.saved_pc = -1;
     ctx.tb = tb;
     ctx.bstate = BS_NONE;
@@ -6544,9 +6550,10 @@ gen_intermediate_code_internal (CPUState
             gen_opc_hflags[lj] = ctx.hflags & MIPS_HFLAG_BMASK;
             gen_opc_instr_start[lj] = 1;
         }
-        ctx.opcode = ldl_code(ctx.pc);
+        ctx.opcode = ldl_code_p(&ctx.phys_pc_start, ctx.phys_pc, ctx.pc);
         decode_opc(env, &ctx);
         ctx.pc += 4;
+        ctx.phys_pc += 4;
 
         if (env->singlestep_enabled)
             break;
Index: target-ppc/cpu.h
===================================================================
RCS file: /sources/qemu/qemu/target-ppc/cpu.h,v
retrieving revision 1.83
diff -u -d -d -p -r1.83 cpu.h
--- target-ppc/cpu.h	14 Oct 2007 10:21:20 -0000	1.83
+++ target-ppc/cpu.h	14 Oct 2007 11:35:56 -0000
@@ -37,6 +37,8 @@ typedef uint64_t ppc_gpr_t;
 #define TARGET_GPR_BITS  64
 #define TARGET_LONG_BITS 32
 #define REGX "%016" PRIx64
+/* need explicit support for instructions spanning 2 pages for VLE code */
+#define TARGET_HAS_VLE_INSNS 1
 #if defined(CONFIG_USER_ONLY)
 /* It looks like a lot of Linux programs assume page size
  * is 4kB long. This is evil, but we have to deal with it...
Index: target-ppc/translate.c
===================================================================
RCS file: /sources/qemu/qemu/target-ppc/translate.c,v
retrieving revision 1.93
diff -u -d -d -p -r1.93 translate.c
--- target-ppc/translate.c	14 Oct 2007 07:07:07 -0000	1.93
+++ target-ppc/translate.c	14 Oct 2007 11:35:56 -0000
@@ -6678,6 +6678,7 @@ static always_inline int gen_intermediat
 {
     DisasContext ctx, *ctxp = &ctx;
     opc_handler_t **table, *handler;
+    unsigned long phys_pc, phys_pc_start;
     target_ulong pc_start;
     uint16_t *gen_opc_end;
     int supervisor;
@@ -6685,6 +6686,9 @@ static always_inline int gen_intermediat
     int j, lj = -1;
 
     pc_start = tb->pc;
+    phys_pc_start = (unsigned long)phys_ram_base + tb->page_addr[0] +
+        (pc_start & ~TARGET_PAGE_MASK);
+    phys_pc = phys_pc_start;
     gen_opc_ptr = gen_opc_buf;
     gen_opc_end = gen_opc_buf + OPC_MAX_SIZE;
     gen_opparam_ptr = gen_opparam_buf;
@@ -6756,7 +6760,7 @@ static always_inline int gen_intermediat
                     ctx.nip, 1 - msr_pr, msr_ir);
         }
 #endif
-        ctx.opcode = ldl_code(ctx.nip);
+        ctx.opcode = ldl_code_p(&phys_pc_start, phys_pc, ctx.nip);
         if (msr_le) {
             ctx.opcode = ((ctx.opcode & 0xFF000000) >> 24) |
                 ((ctx.opcode & 0x00FF0000) >> 8) |
@@ -6771,6 +6775,7 @@ static always_inline int gen_intermediat
         }
 #endif
         ctx.nip += 4;
+        phys_pc += 4;
         table = env->opcodes;
         handler = table[opc1(ctx.opcode)];
         if (is_indirect_opcode(handler)) {
Index: target-sh4/translate.c
===================================================================
RCS file: /sources/qemu/qemu/target-sh4/translate.c,v
retrieving revision 1.18
diff -u -d -d -p -r1.18 translate.c
--- target-sh4/translate.c	29 Sep 2007 19:52:22 -0000	1.18
+++ target-sh4/translate.c	14 Oct 2007 11:35:56 -0000
@@ -1150,11 +1150,15 @@ gen_intermediate_code_internal(CPUState 
 {
     DisasContext ctx;
     target_ulong pc_start;
+    unsigned long phys_pc, phys_pc_start;
     static uint16_t *gen_opc_end;
     uint32_t old_flags;
     int i, ii;
 
     pc_start = tb->pc;
+    phys_pc_start = (unsigned long)phys_ram_base + tb->page_addr[0] +
+        (pc_start & ~TARGET_PAGE_MASK);
+    phys_pc = phys_pc_start;
     gen_opc_ptr = gen_opc_buf;
     gen_opc_end = gen_opc_buf + OPC_MAX_SIZE;
     gen_opparam_ptr = gen_opparam_buf;
@@ -1210,9 +1214,10 @@ gen_intermediate_code_internal(CPUState 
 	fprintf(stderr, "Loading opcode at address 0x%08x\n", ctx.pc);
 	fflush(stderr);
 #endif
-	ctx.opcode = lduw_code(ctx.pc);
+	ctx.opcode = lduw_code_p(&phys_pc_start, phys_pc, ctx.pc);
 	decode_opc(&ctx);
 	ctx.pc += 2;
+        phys_pc += 2;
 	if ((ctx.pc & (TARGET_PAGE_SIZE - 1)) == 0)
 	    break;
 	if (env->singlestep_enabled)
Index: target-sparc/translate.c
===================================================================
RCS file: /sources/qemu/qemu/target-sparc/translate.c,v
retrieving revision 1.75
diff -u -d -d -p -r1.75 translate.c
--- target-sparc/translate.c	14 Oct 2007 07:07:08 -0000	1.75
+++ target-sparc/translate.c	14 Oct 2007 11:35:56 -0000
@@ -48,6 +48,8 @@ typedef struct DisasContext {
     target_ulong pc;    /* current Program Counter: integer or DYNAMIC_PC */
     target_ulong npc;   /* next PC: integer or DYNAMIC_PC or JUMP_PC */
     target_ulong jump_pc[2]; /* used when JUMP_PC pc value is used */
+    unsigned long phys_pc;
+    unsigned long phys_pc_start;
     int is_br;
     int mem_idx;
     int fpu_enabled;
@@ -1089,7 +1091,7 @@ static void disas_sparc_insn(DisasContex
 {
     unsigned int insn, opc, rs1, rs2, rd;
 
-    insn = ldl_code(dc->pc);
+    insn = ldl_code_p(&dc->phys_pc_start, dc->phys_pc, dc->pc);
     opc = GET_FIELD(insn, 0, 1);
 
     rd = GET_FIELD(insn, 2, 6);
@@ -3376,6 +3378,9 @@ static inline int gen_intermediate_code_
     dc->tb = tb;
     pc_start = tb->pc;
     dc->pc = pc_start;
+    dc->phys_pc_start = (unsigned long)phys_ram_base + tb->page_addr[0] +
+        (pc_start & ~TARGET_PAGE_MASK);
+    dc->phys_pc = dc->phys_pc_start;
     last_pc = dc->pc;
     dc->npc = (target_ulong) tb->cs_base;
 #if defined(CONFIG_USER_ONLY)
@@ -3422,6 +3427,7 @@ static inline int gen_intermediate_code_
             }
         }
         last_pc = dc->pc;
+        dc->phys_pc = dc->phys_pc_start + dc->pc - pc_start;
         disas_sparc_insn(dc);
 
         if (dc->is_br)

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] RFC: Code fetch optimisation
  2007-10-14 11:44 [Qemu-devel] RFC: Code fetch optimisation J. Mayer
@ 2007-10-15  2:30 ` Paul Brook
  2007-10-15 12:09   ` J. Mayer
  0 siblings, 1 reply; 17+ messages in thread
From: Paul Brook @ 2007-10-15  2:30 UTC (permalink / raw)
  To: qemu-devel; +Cc: J. Mayer

On Sunday 14 October 2007, J. Mayer wrote:
> Here's an updated version of the code fetch optimisation patch against
> current CVS.
> As a remainder, this patch avoid use of softmmu helpers to fetch the
> code in most case. A new target define TARGET_HAS_VLE_INSNS has been
> added which is used to handle the case of an instruction that span 2
> pages, when the target CPU uses a variable-length instructions encoding.
> For pure RISC, the code fetch is done using raw access routines.

> +    unsigned long phys_pc;
> +    unsigned long phys_pc_start;

These are ram offsets, not physical addresses. I recommend naming them as such 
to avoid confusion.

> +    opc = glue(glue(lds,SUFFIX),MEMSUFFIX)(virt_pc);
> +    /* Avoid softmmu access on next load */
> +    /* XXX: dont: phys PC is not correct anymore
> +     *      We could call get_phys_addr_code(env, pc); and remove the else
> +     *      condition, here. 
> +     */
> +    //*start_pc = phys_pc;

The commented out code is completely bogus, please remove it. The comment is 
also somewhat misleading/incorrect. The else would still be required for 
accesses that span a page boundary.

The code itself looks ok, though I'd be surprised if it made a significant 
difference. We're always going to hit the fast-path TLB lookup case anyway.

Paul

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] RFC: Code fetch optimisation
  2007-10-15  2:30 ` Paul Brook
@ 2007-10-15 12:09   ` J. Mayer
  2007-10-15 16:01     ` Paul Brook
  0 siblings, 1 reply; 17+ messages in thread
From: J. Mayer @ 2007-10-15 12:09 UTC (permalink / raw)
  To: Paul Brook; +Cc: qemu-devel

On Mon, 2007-10-15 at 03:30 +0100, Paul Brook wrote:
> On Sunday 14 October 2007, J. Mayer wrote:
> > Here's an updated version of the code fetch optimisation patch against
> > current CVS.
> > As a remainder, this patch avoid use of softmmu helpers to fetch the
> > code in most case. A new target define TARGET_HAS_VLE_INSNS has been
> > added which is used to handle the case of an instruction that span 2
> > pages, when the target CPU uses a variable-length instructions encoding.
> > For pure RISC, the code fetch is done using raw access routines.
> 
> > +    unsigned long phys_pc;
> > +    unsigned long phys_pc_start;
> 
> These are ram offsets, not physical addresses. I recommend naming them as such 
> to avoid confusion.

Well, those are host addresses. Fabrice even suggested me to replace
them with void * to prevent confusion, but I kept using unsigned long
because the _p functions API do not use pointers. As those values are
defined as phys_ram_base + offset, those are likely to be host address,
not RAM offset, and are used directly to dereference host pointers in
the ldxxx_p functions. Did I miss something ?

> > +    opc = glue(glue(lds,SUFFIX),MEMSUFFIX)(virt_pc);
> > +    /* Avoid softmmu access on next load */
> > +    /* XXX: dont: phys PC is not correct anymore
> > +     *      We could call get_phys_addr_code(env, pc); and remove the else
> > +     *      condition, here. 
> > +     */
> > +    //*start_pc = phys_pc;
> 
> The commented out code is completely bogus, please remove it. The comment is 
> also somewhat misleading/incorrect. The else would still be required for 
> accesses that span a page boundary.

I guess trying to optimize this case retrieving the physical address
would not bring any optimization as in fact only the last translated
instruction of a TB (then only a few code loads) may hit this case.
I'd like to keep a comment here to show that it may not be a good idea
(or may not be as simple as it seems at first sight) to try to do more
optimisation here, but you're right this comment is not correct.

> The code itself looks ok, though I'd be surprised if it made a significant 
> difference. We're always going to hit the fast-path TLB lookup case anyway.

It seems that the generated code for the code fetch is much more
efficient than the one generated when we get when using the softmmu
routines. But it's true we do not get any significant performance boost.
As it was previously mentioned, the idea of the patch is more a 'don't
do unneeded things during code translation' than a great performance
improvment.

-- 
J. Mayer <l_indien@magic.fr>
Never organized

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] RFC: Code fetch optimisation
  2007-10-15 12:09   ` J. Mayer
@ 2007-10-15 16:01     ` Paul Brook
  2007-10-15 16:19       ` Fabrice Bellard
  2007-10-15 21:30       ` J. Mayer
  0 siblings, 2 replies; 17+ messages in thread
From: Paul Brook @ 2007-10-15 16:01 UTC (permalink / raw)
  To: qemu-devel; +Cc: J. Mayer

> > > +    unsigned long phys_pc;
> > > +    unsigned long phys_pc_start;
> >
> > These are ram offsets, not physical addresses. I recommend naming them as
> > such to avoid confusion.
>
> Well, those are host addresses. Fabrice even suggested me to replace
> them with void * to prevent confusion, but I kept using unsigned long
> because the _p functions API do not use pointers. As those values are
> defined as phys_ram_base + offset, those are likely to be host address,
> not RAM offset, and are used directly to dereference host pointers in
> the ldxxx_p functions. Did I miss something ?

You are correct, they are host addresses. I still think calling them phys_pc 
is confusing. It took me a while to convince myself that "unsigned long" was 
an appropriate type (ignoring 64-bit windows hosts for now).

How about host_pc?

> > > +    /* Avoid softmmu access on next load */
> > > +    /* XXX: dont: phys PC is not correct anymore
> > > +     *      We could call get_phys_addr_code(env, pc); and remove the
> > > else +     *      condition, here.
> > > +     */
> > > +    //*start_pc = phys_pc;
> >
> > The commented out code is completely bogus, please remove it. The comment
> > is also somewhat misleading/incorrect. The else would still be required
> > for accesses that span a page boundary.
>
> I guess trying to optimize this case retrieving the physical address
> would not bring any optimization as in fact only the last translated
> instruction of a TB (then only a few code loads) may hit this case.

VLE targets (x86, m68k) can translate almost a full page of instructions, and 
a page boundary can be anywhere within that block. Once we've spanned 
multiple pages there's not point stopping translation immediately. We may as 
well translate as many instructions as we can on the second page.

I'd guess most TB are much smaller than a page, so on average only a few 
instructions are going to come after the page boundary.

> I'd like to keep a comment here to show that it may not be a good idea
> (or may not be as simple as it seems at first sight) to try to do more
> optimisation here, but you're right this comment is not correct.

Agreed.

> > The code itself looks ok, though I'd be surprised if it made a
> > significant difference. We're always going to hit the fast-path TLB
> > lookup case anyway.
>
> It seems that the generated code for the code fetch is much more
> efficient than the one generated when we get when using the softmmu
> routines. But it's true we do not get any significant performance boost.
> As it was previously mentioned, the idea of the patch is more a 'don't
> do unneeded things during code translation' than a great performance
> improvment.

OTOH it does make the the code more complicated. I'm agnostic about whether 
this patch should be applied.

Paul

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] RFC: Code fetch optimisation
  2007-10-15 16:01     ` Paul Brook
@ 2007-10-15 16:19       ` Fabrice Bellard
  2007-10-15 21:30       ` J. Mayer
  1 sibling, 0 replies; 17+ messages in thread
From: Fabrice Bellard @ 2007-10-15 16:19 UTC (permalink / raw)
  To: qemu-devel

Paul Brook wrote:
 > [...]
>>>The code itself looks ok, though I'd be surprised if it made a
>>>significant difference. We're always going to hit the fast-path TLB
>>>lookup case anyway.
>>
>>It seems that the generated code for the code fetch is much more
>>efficient than the one generated when we get when using the softmmu
>>routines. But it's true we do not get any significant performance boost.
>>As it was previously mentioned, the idea of the patch is more a 'don't
>>do unneeded things during code translation' than a great performance
>>improvment.
> 
> 
> OTOH it does make the the code more complicated. I'm agnostic about whether 
> this patch should be applied.

If it does not correct the existing x86 issues (no code segment limit 
tests and no explicit handling of code fetch exceptions in the 
translation phase in VLE case) I see no advantage in commiting it in its 
current form.

Regards,

Fabrice.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] RFC: Code fetch optimisation
  2007-10-15 16:01     ` Paul Brook
  2007-10-15 16:19       ` Fabrice Bellard
@ 2007-10-15 21:30       ` J. Mayer
  2007-10-15 22:42         ` Paul Brook
  1 sibling, 1 reply; 17+ messages in thread
From: J. Mayer @ 2007-10-15 21:30 UTC (permalink / raw)
  To: Paul Brook; +Cc: qemu-devel

On Mon, 2007-10-15 at 17:01 +0100, Paul Brook wrote:
> > > > +    unsigned long phys_pc;
> > > > +    unsigned long phys_pc_start;
> > >
> > > These are ram offsets, not physical addresses. I recommend naming them as
> > > such to avoid confusion.
> >
> > Well, those are host addresses. Fabrice even suggested me to replace
> > them with void * to prevent confusion, but I kept using unsigned long
> > because the _p functions API do not use pointers. As those values are
> > defined as phys_ram_base + offset, those are likely to be host address,
> > not RAM offset, and are used directly to dereference host pointers in
> > the ldxxx_p functions. Did I miss something ?
> 
> You are correct, they are host addresses. I still think calling them phys_pc 
> is confusing. It took me a while to convince myself that "unsigned long" was 
> an appropriate type (ignoring 64-bit windows hosts for now).
> 
> How about host_pc?

It's OK for me.

> > > > +    /* Avoid softmmu access on next load */
> > > > +    /* XXX: dont: phys PC is not correct anymore
> > > > +     *      We could call get_phys_addr_code(env, pc); and remove the
> > > > else +     *      condition, here.
> > > > +     */
> > > > +    //*start_pc = phys_pc;
> > >
> > > The commented out code is completely bogus, please remove it. The comment
> > > is also somewhat misleading/incorrect. The else would still be required
> > > for accesses that span a page boundary.
> >
> > I guess trying to optimize this case retrieving the physical address
> > would not bring any optimization as in fact only the last translated
> > instruction of a TB (then only a few code loads) may hit this case.
> 
> VLE targets (x86, m68k) can translate almost a full page of instructions, and 
> a page boundary can be anywhere within that block. Once we've spanned 
> multiple pages there's not point stopping translation immediately. We may as 
> well translate as many instructions as we can on the second page.
> 
> I'd guess most TB are much smaller than a page, so on average only a few 
> instructions are going to come after the page boundary.

This leads me to another reflexion. For fixed length encoding targets,
we always stop translation when reaching a page boundary. If we keep
using the current model and we optimize the slow case, it would be
possible to stop only if we cross 2 pages boundary during code
translation, and it seems that this case is not likely to happen. If we
keep the current behavior, we could remove the second page_addr element
in the tb structure and maybe optimize parts of the tb management and
invalidation code.

> > I'd like to keep a comment here to show that it may not be a good idea
> > (or may not be as simple as it seems at first sight) to try to do more
> > optimisation here, but you're right this comment is not correct.
> 
> Agreed.
> 
> > > The code itself looks ok, though I'd be surprised if it made a
> > > significant difference. We're always going to hit the fast-path TLB
> > > lookup case anyway.
> >
> > It seems that the generated code for the code fetch is much more
> > efficient than the one generated when we get when using the softmmu
> > routines. But it's true we do not get any significant performance boost.
> > As it was previously mentioned, the idea of the patch is more a 'don't
> > do unneeded things during code translation' than a great performance
> > improvment.
> 
> OTOH it does make the the code more complicated. I'm agnostic about whether 
> this patch should be applied.

I agree that this proposal was an answer to a challenging idea that I
received more than a real need.
The worst thing in this patch, imho, is that you need to increase 2
values each time you want to change the PC. This is likely to bring some
bug when one will forgot to increase one of the two. I was thinking of
hiding the pc, host_pc and host_pc_start (and maybe also pc_start) in a
structure and add inline helpers:
* get_pc would return the current virtual PC, as needed by the jump and
relative memory accesses functions.
* get_tb_len would return the difference between the virtual PC and the
virtual pc_start, as it is done at the end of the gen_intermediate_code
functions
* move_pc would add an offset to the virtual and the physical PC. This
has to be target dependant, due to the special case for Sparc
* update_phys_pc would be void for most targets, except for Sparc where
the phys_pc needs to be adjusted after the translation of each target
instruction.
and maybe more, if needed.
This structure could also contain target specific information. To
address the problem of segment limit check reported by Fabrice Bellard,
we could for example add the address of the next segment limit for x86
target and add a target specific check at the start of the ldx_code_p
function. But I don't know much about segmentation "subtilities" on x86,
then this idea may not be appropriate to solve this problem.

-- 
J. Mayer <l_indien@magic.fr>
Never organized

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] RFC: Code fetch optimisation
  2007-10-15 21:30       ` J. Mayer
@ 2007-10-15 22:42         ` Paul Brook
  2007-10-16 20:27           ` J. Mayer
  0 siblings, 1 reply; 17+ messages in thread
From: Paul Brook @ 2007-10-15 22:42 UTC (permalink / raw)
  To: qemu-devel; +Cc: J. Mayer

> > VLE targets (x86, m68k) can translate almost a full page of instructions,
> > and a page boundary can be anywhere within that block. Once we've spanned
> > multiple pages there's not point stopping translation immediately. We may
> > as well translate as many instructions as we can on the second page.
> >
> > I'd guess most TB are much smaller than a page, so on average only a few
> > instructions are going to come after the page boundary.
>
> This leads me to another reflexion. For fixed length encoding targets,
> we always stop translation when reaching a page boundary. If we keep
> using the current model and we optimize the slow case, it would be
> possible to stop only if we cross 2 pages boundary during code
> translation, and it seems that this case is not likely to happen. If we
> keep the current behavior, we could remove the second page_addr element
> in the tb structure and maybe optimize parts of the tb management and
> invalidation code.

The latter may be the only feasible option.

Some targets (ARMv5, maybe others) do not have an explicit fault address for 
MMU instruction faults. The faulting address is the address of the current 
instruction when the fault occurs. Prefetch aborts are generated at 
translation time, which effectively means the faulting instruction must be at 
the start of a TB. Terminating the TB on a page boundary guarantees this 
behavior.

For VLE targets we already get this wrong (the prefetch abort occurs some time 
before the faulting instruction executes). I don't know if this behavior is 
permitted by the ISA, but it's definitely possible to construct cases where 
it has visible effect.

Paul

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] RFC: Code fetch optimisation
  2007-10-15 22:42         ` Paul Brook
@ 2007-10-16 20:27           ` J. Mayer
  2007-10-16 22:00             ` Paul Brook
  2007-10-16 22:26             ` Paul Brook
  0 siblings, 2 replies; 17+ messages in thread
From: J. Mayer @ 2007-10-16 20:27 UTC (permalink / raw)
  To: Paul Brook; +Cc: qemu-devel

On Mon, 2007-10-15 at 23:42 +0100, Paul Brook wrote:
> > > VLE targets (x86, m68k) can translate almost a full page of instructions,
> > > and a page boundary can be anywhere within that block. Once we've spanned
> > > multiple pages there's not point stopping translation immediately. We may
> > > as well translate as many instructions as we can on the second page.
> > >
> > > I'd guess most TB are much smaller than a page, so on average only a few
> > > instructions are going to come after the page boundary.
> >
> > This leads me to another reflexion. For fixed length encoding targets,
> > we always stop translation when reaching a page boundary. If we keep
> > using the current model and we optimize the slow case, it would be
> > possible to stop only if we cross 2 pages boundary during code
> > translation, and it seems that this case is not likely to happen. If we
> > keep the current behavior, we could remove the second page_addr element
> > in the tb structure and maybe optimize parts of the tb management and
> > invalidation code.
> 
> The latter may be the only feasible option.
> 
> Some targets (ARMv5, maybe others) do not have an explicit fault address for 
> MMU instruction faults. The faulting address is the address of the current 
> instruction when the fault occurs. Prefetch aborts are generated at 
> translation time, which effectively means the faulting instruction must be at 
> the start of a TB. Terminating the TB on a page boundary guarantees this 
> behavior.

Well, we got the same behavior on PowerPC. What I was thinking of is
that if we fix the VLE problems, the fix, if done in a proper way, could
also allow benefit to RISC targets. What I don't know is; would we
really have a benefit not stopping translation on page boundaries ?

> For VLE targets we already get this wrong (the prefetch abort occurs some time 
> before the faulting instruction executes). I don't know if this behavior is 
> permitted by the ISA, but it's definitely possible to construct cases where 
> it has visible effect.

I think that it would be possible to do things properly. I'm not really
sure what is the best solution to implement it but if, in the "slow
case" path of the code fetch low-level routine, we call the
get_physical_address or the cpu_get_phys_page_debug function, we then
have a way to know if the code fetch is allowed. If it, we would just
have to adjust our host_pc and host_pc_start for the next fetch to be
optimized. If not, we could stop the translation and generate a
"gen_op_raise_excp_error" to raise the exception at the right place,
respecting the ISA insns execution ordering. Generating the exception
from inside the TB won't be OK, as it may not be necessary on a second
execution of the same TB, then the solution may be to link the TB with a
special other TB that would just raise this exception and would be
unlink once the exception has been treated. Or maybe the solution would
just be to stop the translation knowing that the exception will be
raised when trying to translate the first instruction in the next page.
There still may be specific problems for instructions spanning 2 pages,
using those solutions...

-- 
J. Mayer <l_indien@magic.fr>
Never organized

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] RFC: Code fetch optimisation
  2007-10-16 20:27           ` J. Mayer
@ 2007-10-16 22:00             ` Paul Brook
  2007-10-16 23:38               ` J. Mayer
  2007-10-16 22:26             ` Paul Brook
  1 sibling, 1 reply; 17+ messages in thread
From: Paul Brook @ 2007-10-16 22:00 UTC (permalink / raw)
  To: qemu-devel; +Cc: J. Mayer

> Well, we got the same behavior on PowerPC. What I was thinking of is
> that if we fix the VLE problems, the fix, if done in a proper way, could
> also allow benefit to RISC targets. What I don't know is; would we
> really have a benefit not stopping translation on page boundaries ?

> > For VLE targets we already get this wrong (the prefetch abort occurs some
> > time before the faulting instruction executes). I don't know if this
> > behavior is permitted by the ISA, but it's definitely possible to
> > construct cases where it has visible effect.
>
> I think that it would be possible to do things properly.
> [...] Or maybe the solution would
> just be to stop the translation knowing that the exception will be
> raised when trying to translate the first instruction in the next page.

I'd go for this one. It's approximately the same method currently used for 
RISC targets.

In general think this will require target specific support. For RISC targets 
this is trivial. For x86/m68k figuring out the length of an insn is trickier.

Detecting crossing a page boundary on subsequent insns in the load/mmu 
routines is problematic because it happens relatively late. In particular it 
may theoretically happen after we've output ops that change CPU state.

I suspect the best solution is to backtrack (remove the generated ops) after 
decoding the insn if we discover we've passed a page boundary. The ld*_code 
routines can simply return garbage (e.g. zero) if the read is not on the 
first page.

Trying to generate prefetch aborts at runtime sounds too hairy for my liking.

Paul

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] RFC: Code fetch optimisation
  2007-10-16 22:00             ` Paul Brook
@ 2007-10-16 23:38               ` J. Mayer
  2007-10-17  0:43                 ` Paul Brook
  0 siblings, 1 reply; 17+ messages in thread
From: J. Mayer @ 2007-10-16 23:38 UTC (permalink / raw)
  To: Paul Brook; +Cc: qemu-devel

On Tue, 2007-10-16 at 23:00 +0100, Paul Brook wrote:
> > Well, we got the same behavior on PowerPC. What I was thinking of is
> > that if we fix the VLE problems, the fix, if done in a proper way, could
> > also allow benefit to RISC targets. What I don't know is; would we
> > really have a benefit not stopping translation on page boundaries ?

> I suspect that we're going to want/need to break the TB to get the exception 
> semantics right, so for RISC targets there's no point having TBs that
> span a 
> page boundary.

My opinion is that this an optimisation that may be tried later, if it
really give an advantage in terms of translation efficiency, which is
far from being evident. Then, let's keep what works well and just try to
solve the VLE problems for now...

> > > For VLE targets we already get this wrong (the prefetch abort occurs some
> > > time before the faulting instruction executes). I don't know if this
> > > behavior is permitted by the ISA, but it's definitely possible to
> > > construct cases where it has visible effect.
> >
> > I think that it would be possible to do things properly.
> > [...] Or maybe the solution would
> > just be to stop the translation knowing that the exception will be
> > raised when trying to translate the first instruction in the next page.
> 
> I'd go for this one. It's approximately the same method currently used for 
> RISC targets.

> In general think this will require target specific support. For RISC targets 
> this is trivial. For x86/m68k figuring out the length of an insn is trickier.
> 
> Detecting crossing a page boundary on subsequent insns in the load/mmu 
> routines is problematic because it happens relatively late. In particular it 
> may theoretically happen after we've output ops that change CPU state.
> 
> I suspect the best solution is to backtrack (remove the generated ops) after 
> decoding the insn if we discover we've passed a page boundary. The ld*_code 
> routines can simply return garbage (e.g. zero) if the read is not on the 
> first page.

The "incorrect" returned value may be target specific to be sure it's
always an invalid opcode.
Backtracking should not be hard if we register the last cc pointer each
time we finish translating an insn. I'll think about this solution,
which really seems feasible to me.

> Trying to generate prefetch aborts at runtime sounds too hairy for my liking.

It might be really tricky and is likely to be bugged, I agree.

-- 
J. Mayer <l_indien@magic.fr>
Never organized

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] RFC: Code fetch optimisation
  2007-10-16 23:38               ` J. Mayer
@ 2007-10-17  0:43                 ` Paul Brook
  0 siblings, 0 replies; 17+ messages in thread
From: Paul Brook @ 2007-10-17  0:43 UTC (permalink / raw)
  To: J. Mayer; +Cc: qemu-devel

> > I suspect the best solution is to backtrack (remove the generated ops)
> > after decoding the insn if we discover we've passed a page boundary. The
> > ld*_code routines can simply return garbage (e.g. zero) if the read is
> > not on the first page.
>
> The "incorrect" returned value may be target specific to be sure it's
> always an invalid opcode.

It doesn't matter whether it's valid or not, and we've no way of guaranteeing 
that anyway. We just have to make sure we don't generate an infinitely long 
instruction.

On a related note, I notice that we don't enforce x86 instruction length 
limits.

> Backtracking should not be hard if we register the last cc pointer each
> time we finish translating an insn. I'll think about this solution,
> which really seems feasible to me.

Right. You only have to worry about backtracking the state that's lives across 
insns and is not constant within a TB. For x86 I think this is dc->pc, 
dc->cc_op, gen_opc_ptr and nb_gen_labels. Plus you need to reset dc->is_jmp 
to zero.  gen_opparam_ptr is not used after disassembly, so can be ignored.

Paul

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] RFC: Code fetch optimisation
  2007-10-16 20:27           ` J. Mayer
  2007-10-16 22:00             ` Paul Brook
@ 2007-10-16 22:26             ` Paul Brook
  1 sibling, 0 replies; 17+ messages in thread
From: Paul Brook @ 2007-10-16 22:26 UTC (permalink / raw)
  To: qemu-devel; +Cc: J. Mayer

> Well, we got the same behavior on PowerPC. What I was thinking of is
> that if we fix the VLE problems, the fix, if done in a proper way, could
> also allow benefit to RISC targets. What I don't know is; would we
> really have a benefit not stopping translation on page boundaries ?

[ I meant to say in my previous mail, but got cut during editing ]

I suspect that we're going to want/need to break the TB to get the exception 
semantics right, so for RISC targets there's no point having TBs that span a 
page boundary.

Paul

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [Qemu-devel] RFC: Code fetch optimisation
@ 2007-10-12  8:33 J. Mayer
  2007-10-12 15:21 ` Blue Swirl
  0 siblings, 1 reply; 17+ messages in thread
From: J. Mayer @ 2007-10-12  8:33 UTC (permalink / raw)
  To: qemu-devel

[-- Attachment #1: Type: text/plain, Size: 783 bytes --]

Here's a small patch that allow an optimisation for code fetch, at least
for RISC CPU targets, as suggested by Fabrice Bellard.
The main idea is that a translated block is never to span over a page
boundary. As the tb_find_slow routine already gets the physical address
of the page of code to be translated, the code translator could then
fetch the code using raw host memory accesses instead of doing it
through the softmmu routines.
This patch could also be adapted to RISC CPU targets, with care for the
last instruction of a page. For now, I did implement it for alpha, arm,
mips, PowerPC and SH4.
I don't actually know if the optimsation would bring a sensible speed
gain or if it will be absolutelly marginal.

Please comment.

-- 
J. Mayer <l_indien@magic.fr>
Never organized

[-- Attachment #2: code_raw_optim.diff --]
[-- Type: text/x-patch, Size: 7621 bytes --]

Index: cpu-exec.c
===================================================================
RCS file: /sources/qemu/qemu/cpu-exec.c,v
retrieving revision 1.119
diff -u -d -d -p -r1.119 cpu-exec.c
--- cpu-exec.c	8 Oct 2007 13:16:13 -0000	1.119
+++ cpu-exec.c	12 Oct 2007 07:14:43 -0000
@@ -133,6 +133,7 @@ static TranslationBlock *tb_find_slow(ta
     tb->tc_ptr = tc_ptr;
     tb->cs_base = cs_base;
     tb->flags = flags;
+    tb->page_addr[0] = phys_page1;
     cpu_gen_code(env, tb, CODE_GEN_MAX_SIZE, &code_gen_size);
     code_gen_ptr = (void *)(((unsigned long)code_gen_ptr + code_gen_size + CODE_GEN_ALIGN - 1) & ~(CODE_GEN_ALIGN - 1));
 
Index: target-alpha/translate.c
===================================================================
RCS file: /sources/qemu/qemu/target-alpha/translate.c,v
retrieving revision 1.5
diff -u -d -d -p -r1.5 translate.c
--- target-alpha/translate.c	16 Sep 2007 21:08:01 -0000	1.5
+++ target-alpha/translate.c	12 Oct 2007 07:14:47 -0000
@@ -1966,12 +1966,15 @@ int gen_intermediate_code_internal (CPUS
 #endif
     DisasContext ctx, *ctxp = &ctx;
     target_ulong pc_start;
+    unsigned long phys_pc;
     uint32_t insn;
     uint16_t *gen_opc_end;
     int j, lj = -1;
     int ret;
 
     pc_start = tb->pc;
+    phys_pc = (unsigned long)phys_ram_base + tb->page_addr[0] +
+        (pc_start & ~TARGET_PAGE_MASK);
     gen_opc_ptr = gen_opc_buf;
     gen_opc_end = gen_opc_buf + OPC_MAX_SIZE;
     gen_opparam_ptr = gen_opparam_buf;
@@ -2010,7 +2013,7 @@ int gen_intermediate_code_internal (CPUS
                     ctx.pc, ctx.mem_idx);
         }
 #endif
-        insn = ldl_code(ctx.pc);
+        insn = ldl_raw(phys_pc);
 #if defined ALPHA_DEBUG_DISAS
         insn_count++;
         if (logfile != NULL) {
@@ -2018,6 +2021,7 @@ int gen_intermediate_code_internal (CPUS
         }
 #endif
         ctx.pc += 4;
+        phys_pc += 4;
         ret = translate_one(ctxp, insn);
         if (ret != 0)
             break;
Index: target-arm/translate.c
===================================================================
RCS file: /sources/qemu/qemu/target-arm/translate.c,v
retrieving revision 1.57
diff -u -d -d -p -r1.57 translate.c
--- target-arm/translate.c	17 Sep 2007 08:09:51 -0000	1.57
+++ target-arm/translate.c	12 Oct 2007 07:14:47 -0000
@@ -38,6 +38,7 @@
 /* internal defines */
 typedef struct DisasContext {
     target_ulong pc;
+    unsigned long phys_pc;
     int is_jmp;
     /* Nonzero if this instruction has been conditionally skipped.  */
     int condjmp;
@@ -2206,8 +2207,9 @@ static void disas_arm_insn(CPUState * en
 {
     unsigned int cond, insn, val, op1, i, shift, rm, rs, rn, rd, sh;
 
-    insn = ldl_code(s->pc);
+    insn = ldl_raw(s->phys_pc);
     s->pc += 4;
+    s->phys_pc += 4;
 
     cond = insn >> 28;
     if (cond == 0xf){
@@ -2971,8 +2973,9 @@ static void disas_thumb_insn(DisasContex
     int32_t offset;
     int i;
 
-    insn = lduw_code(s->pc);
+    insn = lduw_raw(s->phys_pc);
     s->pc += 2;
+    s->phys_pc += 2;
 
     switch (insn >> 12) {
     case 0: case 1:
@@ -3494,7 +3497,7 @@ static void disas_thumb_insn(DisasContex
             break;
         }
         offset = ((int32_t)insn << 21) >> 10;
-        insn = lduw_code(s->pc);
+        insn = lduw_raw(s->phys_pc);
         offset |= insn & 0x7ff;
 
         val = (uint32_t)s->pc + 2;
@@ -3544,6 +3547,8 @@ static inline int gen_intermediate_code_
 
     dc->is_jmp = DISAS_NEXT;
     dc->pc = pc_start;
+    dc->phys_pc = (unsigned long)phys_ram_base + tb->page_addr[0] +
+        (pc_start & ~TARGET_PAGE_MASK);
     dc->singlestep_enabled = env->singlestep_enabled;
     dc->condjmp = 0;
     dc->thumb = env->thumb;
Index: target-mips/translate.c
===================================================================
RCS file: /sources/qemu/qemu/target-mips/translate.c,v
retrieving revision 1.106
diff -u -d -d -p -r1.106 translate.c
--- target-mips/translate.c	9 Oct 2007 03:39:58 -0000	1.106
+++ target-mips/translate.c	12 Oct 2007 07:14:48 -0000
@@ -6483,6 +6483,7 @@ gen_intermediate_code_internal (CPUState
 {
     DisasContext ctx;
     target_ulong pc_start;
+    unsigned long phys_pc;
     uint16_t *gen_opc_end;
     int j, lj = -1;
 
@@ -6490,6 +6491,8 @@ gen_intermediate_code_internal (CPUState
         fprintf (logfile, "search pc %d\n", search_pc);
 
     pc_start = tb->pc;
+    phys_pc = (unsigned long)phys_ram_base + tb->page_addr[0] +
+        (pc_start & ~TARGET_PAGE_MASK);
     gen_opc_ptr = gen_opc_buf;
     gen_opc_end = gen_opc_buf + OPC_MAX_SIZE;
     gen_opparam_ptr = gen_opparam_buf;
@@ -6544,9 +6547,10 @@ gen_intermediate_code_internal (CPUState
             gen_opc_hflags[lj] = ctx.hflags & MIPS_HFLAG_BMASK;
             gen_opc_instr_start[lj] = 1;
         }
-        ctx.opcode = ldl_code(ctx.pc);
+        ctx.opcode = ldl_raw(phys_pc);
         decode_opc(env, &ctx);
         ctx.pc += 4;
+        phys_pc += 4;
 
         if (env->singlestep_enabled)
             break;
Index: target-ppc/translate.c
===================================================================
RCS file: /sources/qemu/qemu/target-ppc/translate.c,v
retrieving revision 1.92
diff -u -d -d -p -r1.92 translate.c
--- target-ppc/translate.c	7 Oct 2007 23:10:08 -0000	1.92
+++ target-ppc/translate.c	12 Oct 2007 07:14:49 -0000
@@ -6679,12 +7569,15 @@ static always_inline int gen_intermediat
     DisasContext ctx, *ctxp = &ctx;
     opc_handler_t **table, *handler;
     target_ulong pc_start;
+    unsigned long phys_pc;
     uint16_t *gen_opc_end;
     int supervisor;
     int single_step, branch_step;
     int j, lj = -1;
 
     pc_start = tb->pc;
+    phys_pc = (unsigned long)phys_ram_base + tb->page_addr[0] +
+        (pc_start & ~TARGET_PAGE_MASK);
     gen_opc_ptr = gen_opc_buf;
     gen_opc_end = gen_opc_buf + OPC_MAX_SIZE;
     gen_opparam_ptr = gen_opparam_buf;
@@ -6763,7 +7649,7 @@ static always_inline int gen_intermediat
                     ctx.nip, 1 - msr_pr, msr_ir);
         }
 #endif
-        ctx.opcode = ldl_code(ctx.nip);
+        ctx.opcode = ldl_raw(phys_pc);
         if (msr_le) {
             ctx.opcode = ((ctx.opcode & 0xFF000000) >> 24) |
                 ((ctx.opcode & 0x00FF0000) >> 8) |
@@ -6778,6 +7664,7 @@ static always_inline int gen_intermediat
         }
 #endif
         ctx.nip += 4;
+        phys_pc += 4;
         table = env->opcodes;
         handler = table[opc1(ctx.opcode)];
         if (is_indirect_opcode(handler)) {
Index: target-sh4/translate.c
===================================================================
RCS file: /sources/qemu/qemu/target-sh4/translate.c,v
retrieving revision 1.18
diff -u -d -d -p -r1.18 translate.c
--- target-sh4/translate.c	29 Sep 2007 19:52:22 -0000	1.18
+++ target-sh4/translate.c	12 Oct 2007 07:14:50 -0000
@@ -1150,11 +1150,14 @@ gen_intermediate_code_internal(CPUState 
 {
     DisasContext ctx;
     target_ulong pc_start;
+    unsigned long phys_pc;
     static uint16_t *gen_opc_end;
     uint32_t old_flags;
     int i, ii;
 
     pc_start = tb->pc;
+    phys_pc = (unsigned long)phys_ram_base + tb->page_addr[0] +
+        (pc_start & ~TARGET_PAGE_MASK);
     gen_opc_ptr = gen_opc_buf;
     gen_opc_end = gen_opc_buf + OPC_MAX_SIZE;
     gen_opparam_ptr = gen_opparam_buf;
@@ -1210,9 +1213,10 @@ gen_intermediate_code_internal(CPUState 
 	fprintf(stderr, "Loading opcode at address 0x%08x\n", ctx.pc);
 	fflush(stderr);
 #endif
-	ctx.opcode = lduw_code(ctx.pc);
+	ctx.opcode = lduw_raw(phys_pc);
 	decode_opc(&ctx);
 	ctx.pc += 2;
+        phys_pc += 2;
 	if ((ctx.pc & (TARGET_PAGE_SIZE - 1)) == 0)
 	    break;
 	if (env->singlestep_enabled)

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] RFC: Code fetch optimisation
  2007-10-12  8:33 J. Mayer
@ 2007-10-12 15:21 ` Blue Swirl
  2007-10-12 18:24   ` Jocelyn Mayer
                     ` (2 more replies)
  0 siblings, 3 replies; 17+ messages in thread
From: Blue Swirl @ 2007-10-12 15:21 UTC (permalink / raw)
  To: qemu-devel

On 10/12/07, J. Mayer <l_indien@magic.fr> wrote:
> Here's a small patch that allow an optimisation for code fetch, at least
> for RISC CPU targets, as suggested by Fabrice Bellard.
> The main idea is that a translated block is never to span over a page
> boundary. As the tb_find_slow routine already gets the physical address
> of the page of code to be translated, the code translator could then
> fetch the code using raw host memory accesses instead of doing it
> through the softmmu routines.
> This patch could also be adapted to RISC CPU targets, with care for the
> last instruction of a page. For now, I did implement it for alpha, arm,
> mips, PowerPC and SH4.
> I don't actually know if the optimsation would bring a sensible speed
> gain or if it will be absolutelly marginal.
>
> Please comment.

This will not work correctly for execution of MMIO registers, but
maybe that won't work on real hardware either. Who cares.

Wouldn't it be even more efficient if you moved most of this calculation:
+    phys_pc = (unsigned long)phys_ram_base + tb->page_addr[0] +
+        (pc_start & ~TARGET_PAGE_MASK);
here:
+    tb->page_addr[0] = phys_page1;
?

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] RFC: Code fetch optimisation
  2007-10-12 15:21 ` Blue Swirl
@ 2007-10-12 18:24   ` Jocelyn Mayer
  2007-10-12 18:36   ` Fabrice Bellard
  2007-10-12 18:39   ` Fabrice Bellard
  2 siblings, 0 replies; 17+ messages in thread
From: Jocelyn Mayer @ 2007-10-12 18:24 UTC (permalink / raw)
  To: qemu-devel

On Fri, 2007-10-12 at 18:21 +0300, Blue Swirl wrote:
> On 10/12/07, J. Mayer <l_indien@magic.fr> wrote:
> > Here's a small patch that allow an optimisation for code fetch, at least
> > for RISC CPU targets, as suggested by Fabrice Bellard.
> > The main idea is that a translated block is never to span over a page
> > boundary. As the tb_find_slow routine already gets the physical address
> > of the page of code to be translated, the code translator could then
> > fetch the code using raw host memory accesses instead of doing it
> > through the softmmu routines.
> > This patch could also be adapted to RISC CPU targets, with care for the
> > last instruction of a page. For now, I did implement it for alpha, arm,
> > mips, PowerPC and SH4.
> > I don't actually know if the optimsation would bring a sensible speed
> > gain or if it will be absolutelly marginal.
> >
> > Please comment.
> 
> This will not work correctly for execution of MMIO registers, but
> maybe that won't work on real hardware either. Who cares.

I wonder if this is important or not... But maybe, when retrieving the
physical address we could check if it is inside ROM/RAM or an I/O area
and in the last case do not give the phys_addr information to the
translator. In that case, it would go on using the ldxx_code. I guess if
we want to do that, a set of helpers would be appreciated to avoid
adding code like:
if (phys_pc == 0)
  opc = ldul_code(virt_pc)
else
  opc = ldul_raw(phys_pc)
everywhere... I could also add another check so this set of macro would
automatically use ldxx_code if we reach a page boundary, which would
then make easy to use this optimisation for CISC/VLE architectures too.

I'm not sure of the proper solution to allow executing code from mmio
devices. But adding specific accessors to handle the CISC/VLE case is to
be done. Something like this might be OK:

static inline target_ulong ldl_code_p(unsigned long *start_pc, unsigned
long *phys_pc, target_ulong *virt_pc)
{
    target_ulong opc;
    if ((*start_pc ^ *phys_pc) & TARGET_PAGE_MASK) {
        /* slow path that may raise an exception */
        opc = ldul_code(virt_pc);
	*start_pc = phys_pc; /* Avoid softmmu call on next load */
    } else {
	/* Optimized path */
        opc = ldul_raw(phys_pc);
    }
    *phys_pc += sizeof(target_ulong);
    *virt_pc += sizeof(target_ulong);
    return opc;
}

Of course, 8, 16 and 64 (why not ?) bits accessors would be also
provided the same way.

> Wouldn't it be even more efficient if you moved most of this calculation:
> +    phys_pc = (unsigned long)phys_ram_base + tb->page_addr[0] +
> +        (pc_start & ~TARGET_PAGE_MASK);
> here:
> +    tb->page_addr[0] = phys_page1;
> ?

Maybe. I choosed to do this way because it's exactly the same assignment
that is done in tb_link_phys after the gen_intermediate_code function
returns. I then though that the safer thing to do was to store the same
value so, whatever might happen, the value in the tb structure is never
inconsistent. I also guess that it's not so important as the tb is not
linked at this point...

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] RFC: Code fetch optimisation
  2007-10-12 15:21 ` Blue Swirl
  2007-10-12 18:24   ` Jocelyn Mayer
@ 2007-10-12 18:36   ` Fabrice Bellard
  2007-10-12 18:39   ` Fabrice Bellard
  2 siblings, 0 replies; 17+ messages in thread
From: Fabrice Bellard @ 2007-10-12 18:36 UTC (permalink / raw)
  To: qemu-devel

Blue Swirl wrote:
> On 10/12/07, J. Mayer <l_indien@magic.fr> wrote:
>> Here's a small patch that allow an optimisation for code fetch, at least
>> for RISC CPU targets, as suggested by Fabrice Bellard.
>> The main idea is that a translated block is never to span over a page
>> boundary. As the tb_find_slow routine already gets the physical address
>> of the page of code to be translated, the code translator could then
>> fetch the code using raw host memory accesses instead of doing it
>> through the softmmu routines.
>> This patch could also be adapted to RISC CPU targets, with care for the
>> last instruction of a page. For now, I did implement it for alpha, arm,
>> mips, PowerPC and SH4.
>> I don't actually know if the optimsation would bring a sensible speed
>> gain or if it will be absolutelly marginal.
>>
>> Please comment.
> 
> This will not work correctly for execution of MMIO registers, but
> maybe that won't work on real hardware either. Who cares.

It can never happen because QEMU currently does not support it (see 
get_phys_addr_code()). I started to implement it but never really 
finished it (real hardware can do it so QEMU should support it). The 
idea consist in using a reserved ram page to store the code. Another 
point is that the TB must be discarded once executed as the MMIO data 
can change.

Regards,

Fabrice.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] RFC: Code fetch optimisation
  2007-10-12 15:21 ` Blue Swirl
  2007-10-12 18:24   ` Jocelyn Mayer
  2007-10-12 18:36   ` Fabrice Bellard
@ 2007-10-12 18:39   ` Fabrice Bellard
  2 siblings, 0 replies; 17+ messages in thread
From: Fabrice Bellard @ 2007-10-12 18:39 UTC (permalink / raw)
  To: qemu-devel

Blue Swirl wrote:
> On 10/12/07, J. Mayer <l_indien@magic.fr> wrote:
>> Here's a small patch that allow an optimisation for code fetch, at least
>> for RISC CPU targets, as suggested by Fabrice Bellard.
>> The main idea is that a translated block is never to span over a page
>> boundary. As the tb_find_slow routine already gets the physical address
>> of the page of code to be translated, the code translator could then
>> fetch the code using raw host memory accesses instead of doing it
>> through the softmmu routines.
>> This patch could also be adapted to RISC CPU targets, with care for the
>> last instruction of a page. For now, I did implement it for alpha, arm,
>> mips, PowerPC and SH4.
>> I don't actually know if the optimsation would bring a sensible speed
>> gain or if it will be absolutelly marginal.
>>
>> Please comment.
> 
> This will not work correctly for execution of MMIO registers, but
> maybe that won't work on real hardware either. Who cares.

It can never happen because QEMU currently does not support it (see 
get_phys_addr_code()). I started to implement it but never really 
finished it (real hardware can do it so QEMU should support it). The 
idea consist in using a reserved ram page to store the code. Another 
point is that the TB must be discarded once executed as the MMIO data 
can change.

Regards,

Fabrice.

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2007-10-17  0:43 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-10-14 11:44 [Qemu-devel] RFC: Code fetch optimisation J. Mayer
2007-10-15  2:30 ` Paul Brook
2007-10-15 12:09   ` J. Mayer
2007-10-15 16:01     ` Paul Brook
2007-10-15 16:19       ` Fabrice Bellard
2007-10-15 21:30       ` J. Mayer
2007-10-15 22:42         ` Paul Brook
2007-10-16 20:27           ` J. Mayer
2007-10-16 22:00             ` Paul Brook
2007-10-16 23:38               ` J. Mayer
2007-10-17  0:43                 ` Paul Brook
2007-10-16 22:26             ` Paul Brook
  -- strict thread matches above, loose matches on Subject: below --
2007-10-12  8:33 J. Mayer
2007-10-12 15:21 ` Blue Swirl
2007-10-12 18:24   ` Jocelyn Mayer
2007-10-12 18:36   ` Fabrice Bellard
2007-10-12 18:39   ` Fabrice Bellard

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).