[PATCH 14/14] target/i386: use + to put flags together

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

* [PATCH 14/14] target/i386: use + to put flags together
  2024-10-20 15:53 [PATCH 00/14] target/i386: miscellaneous flags improvements Paolo Bonzini
@ 2024-10-20 15:53 ` Paolo Bonzini
  0 siblings, 0 replies; 21+ messages in thread
From: Paolo Bonzini @ 2024-10-20 15:53 UTC (permalink / raw)
  To: qemu-devel; +Cc: richard.henderson

This gives greater opportunity for reassociation on x86 targets,
since addition can use the LEA instruction.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/cc_helper_template.h.inc | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/target/i386/tcg/cc_helper_template.h.inc b/target/i386/tcg/cc_helper_template.h.inc
index f29a6dfb77c..d7672c8840a 100644
--- a/target/i386/tcg/cc_helper_template.h.inc
+++ b/target/i386/tcg/cc_helper_template.h.inc
@@ -55,7 +55,7 @@ static uint32_t glue(compute_all_add, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     zf = (dst == 0) * CC_Z;
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
     of = lshift((src1 ^ src2 ^ -1) & (src1 ^ dst), 12 - DATA_BITS) & CC_O;
-    return cf | pf | af | zf | sf | of;
+    return cf + pf + af + zf + sf + of;
 }
 
 static int glue(compute_c_add, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
@@ -84,7 +84,7 @@ static uint32_t glue(compute_all_adc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1,
     zf = (dst == 0) << 6;
     sf = lshift(dst, 8 - DATA_BITS) & 0x80;
     of = lshift((src1 ^ src2 ^ -1) & (src1 ^ dst), 12 - DATA_BITS) & CC_O;
-    return cf | pf | af | zf | sf | of;
+    return cf + pf + af + zf + sf + of;
 }
 
 static int glue(compute_c_adc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1,
@@ -110,7 +110,7 @@ static uint32_t glue(compute_all_sub, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2)
     zf = (dst == 0) * CC_Z;
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
     of = lshift((src1 ^ src2) & (src1 ^ dst), 12 - DATA_BITS) & CC_O;
-    return cf | pf | af | zf | sf | of;
+    return cf + pf + af + zf + sf + of;
 }
 
 static int glue(compute_c_sub, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2)
@@ -141,7 +141,7 @@ static uint32_t glue(compute_all_sbb, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2,
     zf = (dst == 0) << 6;
     sf = lshift(dst, 8 - DATA_BITS) & 0x80;
     of = lshift((src1 ^ src2) & (src1 ^ dst), 12 - DATA_BITS) & CC_O;
-    return cf | pf | af | zf | sf | of;
+    return cf + pf + af + zf + sf + of;
 }
 
 static int glue(compute_c_sbb, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2,
@@ -169,7 +169,7 @@ static uint32_t glue(compute_all_logic, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     zf = (dst == 0) * CC_Z;
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
     of = 0;
-    return cf | pf | af | zf | sf | of;
+    return cf + pf + af + zf + sf + of;
 }
 
 static uint32_t glue(compute_all_inc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
@@ -185,7 +185,7 @@ static uint32_t glue(compute_all_inc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     zf = (dst == 0) * CC_Z;
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
     of = (dst == SIGN_MASK) * CC_O;
-    return cf | pf | af | zf | sf | of;
+    return cf + pf + af + zf + sf + of;
 }
 
 static uint32_t glue(compute_all_dec, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
@@ -201,7 +201,7 @@ static uint32_t glue(compute_all_dec, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     zf = (dst == 0) * CC_Z;
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
     of = (dst == SIGN_MASK - 1) * CC_O;
-    return cf | pf | af | zf | sf | of;
+    return cf + pf + af + zf + sf + of;
 }
 
 static uint32_t glue(compute_all_shl, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
@@ -215,7 +215,7 @@ static uint32_t glue(compute_all_shl, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
     /* of is defined iff shift count == 1 */
     of = lshift(src1 ^ dst, 12 - DATA_BITS) & CC_O;
-    return cf | pf | af | zf | sf | of;
+    return cf + pf + af + zf + sf + of;
 }
 
 static int glue(compute_c_shl, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
@@ -234,7 +234,7 @@ static uint32_t glue(compute_all_sar, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
     /* of is defined iff shift count == 1 */
     of = lshift(src1 ^ dst, 12 - DATA_BITS) & CC_O;
-    return cf | pf | af | zf | sf | of;
+    return cf + pf + af + zf + sf + of;
 }
 
 /* NOTE: we compute the flags like the P4. On olders CPUs, only OF and
@@ -250,7 +250,7 @@ static uint32_t glue(compute_all_mul, SUFFIX)(DATA_TYPE dst, target_long src1)
     zf = (dst == 0) * CC_Z;
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
     of = cf * CC_O;
-    return cf | pf | af | zf | sf | of;
+    return cf + pf + af + zf + sf + of;
 }
 
 static uint32_t glue(compute_all_bmilg, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
@@ -263,7 +263,7 @@ static uint32_t glue(compute_all_bmilg, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     zf = (dst == 0) * CC_Z;
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
     of = 0;
-    return cf | pf | af | zf | sf | of;
+    return cf + pf + af + zf + sf + of;
 }
 
 static int glue(compute_c_bmilg, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
@@ -281,7 +281,7 @@ static int glue(compute_all_blsi, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     zf = (dst == 0) * CC_Z;
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
     of = 0;
-    return cf | pf | af | zf | sf | of;
+    return cf + pf + af + zf + sf + of;
 }
 
 static int glue(compute_c_blsi, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
-- 
2.46.2



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH v2 00/14] target/i386: miscellaneous flags improvements
@ 2024-10-28 15:18 Paolo Bonzini
  2024-10-28 15:18 ` [PATCH 01/14] target/i386: use tcg_gen_ext_tl when applicable Paolo Bonzini
                   ` (13 more replies)
  0 siblings, 14 replies; 21+ messages in thread
From: Paolo Bonzini @ 2024-10-28 15:18 UTC (permalink / raw)
  To: qemu-devel; +Cc: richard.henderson

This includes improved translation of checks, microoptimization of the
helpers, and improvements to the cc_op_* functions from Richard.

Unlike his original patches[1] I didn't convert cc_op_live() to a switch
statement, instead keeping the array but making sure that all of its
entries are nonzero.  The only zero entry was CC_OP_CLR, which is now
changed to spill the constant value of EFLAGS to cc_op_src.  While this
has a 0.2% cost in number of TCG ops, getting rid of the special
case for CC_OP_CLR makes it even easier to optimize computation of ZF
from CC_OP_DYNAMIC; this is quite common, for example in switch
statements that have CMP/JG/JE sequences (JE followed JL/JG/JA/JB seems
less common than the opposite, though that's not universal).

On a quick-and-dirty run of "ls -lR", the changes add ~750 spills of
0x44 to cc_op_src; but it also reduces to one half the calls to
cc_compute_all (most of them are completely eliminated), and that
is a lot more expensive.

One thing I noticed is that those spills are really huge (11 bytes).
It might help to move cc_* at the very beginning of CPUX86State, because
the number of accesses to cc_* is comparable to the number of accesses
to registers (despite cc_* being mostly written, while registers are
both read and written).

Thanks,

Paolo

[1] https://patchew.org/QEMU/20240701025115.1265117-1-richard.henderson@linaro.org/

v1->v2:
- use MAKE_64BIT_MASK in helper_cc_compute_nz
- get rid completely of parity_table

Paolo Bonzini (10):
  target/i386: use tcg_gen_ext_tl when applicable
  target/i386: remove CC_OP_CLR
  target/i386: optimize computation of ZF from CC_OP_DYNAMIC
  target/i386: optimize TEST+Jxx sequences
  target/i386: add a few more trivial CCPrepare cases
  target/i386: add a note about gen_jcc1
  target/i386: make flag variables unsigned
  target/i386: use compiler builtin to compute PF
  target/i386: use higher-precision arithmetic to compute CF
  target/i386: use + to put flags together

Richard Henderson (4):
  target/i386: Tidy cc_op_str usage
  target/i386: Rearrange CCOp
  target/i386: Introduce cc_op_size
  target/i386: Wrap cc_op_live with a validity check

 include/qemu/host-utils.h                |   9 ++
 target/i386/cpu.h                        |  33 ++++--
 target/i386/helper.h                     |   1 +
 target/i386/tcg/helper-tcg.h             |   6 +-
 target/i386/tcg/cc_helper_template.h.inc | 127 +++++++++++++++--------
 target/i386/cpu-dump.c                   |  18 ++--
 target/i386/tcg/cc_helper.c              |  51 +++------
 target/i386/tcg/int_helper.c             |   4 +-
 target/i386/tcg/translate.c              | 103 ++++++++++++------
 target/i386/tcg/decode-new.c.inc         |   2 +-
 target/i386/tcg/emit.c.inc               |  24 ++---
 11 files changed, 226 insertions(+), 152 deletions(-)

-- 
2.47.0

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH 01/14] target/i386: use tcg_gen_ext_tl when applicable
  2024-10-28 15:18 [PATCH v2 00/14] target/i386: miscellaneous flags improvements Paolo Bonzini
@ 2024-10-28 15:18 ` Paolo Bonzini
  2024-10-28 15:18 ` [PATCH 02/14] target/i386: Tidy cc_op_str usage Paolo Bonzini
                   ` (12 subsequent siblings)
  13 siblings, 0 replies; 21+ messages in thread
From: Paolo Bonzini @ 2024-10-28 15:18 UTC (permalink / raw)
  To: qemu-devel; +Cc: richard.henderson

Prefer it to gen_ext_tl in the common case where the destination is known.

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/translate.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index ef190416b49..dc308f31041 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -883,16 +883,16 @@ static CCPrepare gen_prepare_eflags_c(DisasContext *s, TCGv reg)
     case CC_OP_SUBB ... CC_OP_SUBQ:
         /* (DATA_TYPE)CC_SRCT < (DATA_TYPE)CC_SRC */
         size = s->cc_op - CC_OP_SUBB;
-        gen_ext_tl(s->cc_srcT, s->cc_srcT, size, false);
-        gen_ext_tl(cpu_cc_src, cpu_cc_src, size, false);
+        tcg_gen_ext_tl(s->cc_srcT, s->cc_srcT, size);
+        tcg_gen_ext_tl(cpu_cc_src, cpu_cc_src, size);
         return (CCPrepare) { .cond = TCG_COND_LTU, .reg = s->cc_srcT,
                              .reg2 = cpu_cc_src, .use_reg2 = true };
 
     case CC_OP_ADDB ... CC_OP_ADDQ:
         /* (DATA_TYPE)CC_DST < (DATA_TYPE)CC_SRC */
         size = s->cc_op - CC_OP_ADDB;
-        gen_ext_tl(cpu_cc_dst, cpu_cc_dst, size, false);
-        gen_ext_tl(cpu_cc_src, cpu_cc_src, size, false);
+        tcg_gen_ext_tl(cpu_cc_dst, cpu_cc_dst, size);
+        tcg_gen_ext_tl(cpu_cc_src, cpu_cc_src, size);
         return (CCPrepare) { .cond = TCG_COND_LTU, .reg = cpu_cc_dst,
                              .reg2 = cpu_cc_src, .use_reg2 = true };
 
@@ -1041,8 +1041,8 @@ static CCPrepare gen_prepare_cc(DisasContext *s, int b, TCGv reg)
         size = s->cc_op - CC_OP_SUBB;
         switch (jcc_op) {
         case JCC_BE:
-            gen_ext_tl(s->cc_srcT, s->cc_srcT, size, false);
-            gen_ext_tl(cpu_cc_src, cpu_cc_src, size, false);
+            tcg_gen_ext_tl(s->cc_srcT, s->cc_srcT, size);
+            tcg_gen_ext_tl(cpu_cc_src, cpu_cc_src, size);
             cc = (CCPrepare) { .cond = TCG_COND_LEU, .reg = s->cc_srcT,
                                .reg2 = cpu_cc_src, .use_reg2 = true };
             break;
@@ -1052,8 +1052,8 @@ static CCPrepare gen_prepare_cc(DisasContext *s, int b, TCGv reg)
         case JCC_LE:
             cond = TCG_COND_LE;
         fast_jcc_l:
-            gen_ext_tl(s->cc_srcT, s->cc_srcT, size, true);
-            gen_ext_tl(cpu_cc_src, cpu_cc_src, size, true);
+            tcg_gen_ext_tl(s->cc_srcT, s->cc_srcT, size | MO_SIGN);
+            tcg_gen_ext_tl(cpu_cc_src, cpu_cc_src, size | MO_SIGN);
             cc = (CCPrepare) { .cond = cond, .reg = s->cc_srcT,
                                .reg2 = cpu_cc_src, .use_reg2 = true };
             break;
-- 
2.47.0



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 02/14] target/i386: Tidy cc_op_str usage
  2024-10-28 15:18 [PATCH v2 00/14] target/i386: miscellaneous flags improvements Paolo Bonzini
  2024-10-28 15:18 ` [PATCH 01/14] target/i386: use tcg_gen_ext_tl when applicable Paolo Bonzini
@ 2024-10-28 15:18 ` Paolo Bonzini
  2024-10-28 15:18 ` [PATCH 03/14] target/i386: remove CC_OP_CLR Paolo Bonzini
                   ` (11 subsequent siblings)
  13 siblings, 0 replies; 21+ messages in thread
From: Paolo Bonzini @ 2024-10-28 15:18 UTC (permalink / raw)
  To: qemu-devel; +Cc: richard.henderson

From: Richard Henderson <richard.henderson@linaro.org>

Make const.  Use the read-only strings directly; do not copy
them into an on-stack buffer with snprintf.  Allow for holes
in the cc_op_str array, now present with CC_OP_POPCNT.

Fixes: 460231ad369 ("target/i386: give CC_OP_POPCNT low bits corresponding to MO_TL")
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Link: https://lore.kernel.org/r/20240701025115.1265117-2-richard.henderson@linaro.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/cpu-dump.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/target/i386/cpu-dump.c b/target/i386/cpu-dump.c
index 3bb8e440916..dc6723aedee 100644
--- a/target/i386/cpu-dump.c
+++ b/target/i386/cpu-dump.c
@@ -27,7 +27,7 @@
 /***********************************************************/
 /* x86 debug */
 
-static const char *cc_op_str[CC_OP_NB] = {
+static const char * const cc_op_str[] = {
     [CC_OP_DYNAMIC] = "DYNAMIC",
 
     [CC_OP_EFLAGS] = "EFLAGS",
@@ -347,7 +347,6 @@ void x86_cpu_dump_state(CPUState *cs, FILE *f, int flags)
     X86CPU *cpu = X86_CPU(cs);
     CPUX86State *env = &cpu->env;
     int eflags, i, nb;
-    char cc_op_name[32];
     static const char *seg_name[6] = { "ES", "CS", "SS", "DS", "FS", "GS" };
 
     eflags = cpu_compute_eflags(env);
@@ -456,10 +455,16 @@ void x86_cpu_dump_state(CPUState *cs, FILE *f, int flags)
                      env->dr[6], env->dr[7]);
     }
     if (flags & CPU_DUMP_CCOP) {
-        if ((unsigned)env->cc_op < CC_OP_NB)
-            snprintf(cc_op_name, sizeof(cc_op_name), "%s", cc_op_str[env->cc_op]);
-        else
-            snprintf(cc_op_name, sizeof(cc_op_name), "[%d]", env->cc_op);
+        const char *cc_op_name = NULL;
+        char cc_op_buf[32];
+
+        if ((unsigned)env->cc_op < ARRAY_SIZE(cc_op_str)) {
+            cc_op_name = cc_op_str[env->cc_op];
+        }
+        if (cc_op_name == NULL) {
+            snprintf(cc_op_buf, sizeof(cc_op_buf), "[%d]", env->cc_op);
+            cc_op_name = cc_op_buf;
+        }
 #ifdef TARGET_X86_64
         if (env->hflags & HF_CS64_MASK) {
             qemu_fprintf(f, "CCS=%016" PRIx64 " CCD=%016" PRIx64 " CCO=%s\n",
-- 
2.47.0



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 03/14] target/i386: remove CC_OP_CLR
  2024-10-28 15:18 [PATCH v2 00/14] target/i386: miscellaneous flags improvements Paolo Bonzini
  2024-10-28 15:18 ` [PATCH 01/14] target/i386: use tcg_gen_ext_tl when applicable Paolo Bonzini
  2024-10-28 15:18 ` [PATCH 02/14] target/i386: Tidy cc_op_str usage Paolo Bonzini
@ 2024-10-28 15:18 ` Paolo Bonzini
  2024-10-28 15:18 ` [PATCH 04/14] target/i386: Rearrange CCOp Paolo Bonzini
                   ` (10 subsequent siblings)
  13 siblings, 0 replies; 21+ messages in thread
From: Paolo Bonzini @ 2024-10-28 15:18 UTC (permalink / raw)
  To: qemu-devel; +Cc: richard.henderson

Just use CC_OP_EFLAGS; it is not that likely that the flags computed by
CC_OP_CLR survive the end of the basic block, in which case there is no
need to spill cc_op_src.

cc_op_src now does need spilling if the XOR is followed by a memory
operation, but this only costs 0.2% extra TCG ops.  They will be recouped
by simplifications in how QEMU evaluates ZF at runtime, which are even
greater with this change.

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/cpu.h           |  1 -
 target/i386/cpu-dump.c      |  1 -
 target/i386/tcg/cc_helper.c |  3 ---
 target/i386/tcg/translate.c | 10 ----------
 target/i386/tcg/emit.c.inc  | 15 ++++-----------
 5 files changed, 4 insertions(+), 26 deletions(-)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index e4c947b478b..c89db50eddc 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1297,7 +1297,6 @@ typedef enum {
     CC_OP_ADCX, /* CC_DST = C, CC_SRC = rest.  */
     CC_OP_ADOX, /* CC_SRC2 = O, CC_SRC = rest.  */
     CC_OP_ADCOX, /* CC_DST = C, CC_SRC2 = O, CC_SRC = rest.  */
-    CC_OP_CLR, /* Z and P set, all other flags clear.  */
 
     CC_OP_MULB, /* modify all flags, C, O = (CC_SRC != 0) */
     CC_OP_MULW,
diff --git a/target/i386/cpu-dump.c b/target/i386/cpu-dump.c
index dc6723aedee..a72ed93bd2f 100644
--- a/target/i386/cpu-dump.c
+++ b/target/i386/cpu-dump.c
@@ -91,7 +91,6 @@ static const char * const cc_op_str[] = {
     [CC_OP_BMILGQ] = "BMILGQ",
 
     [CC_OP_POPCNT] = "POPCNT",
-    [CC_OP_CLR] = "CLR",
 };
 
 static void
diff --git a/target/i386/tcg/cc_helper.c b/target/i386/tcg/cc_helper.c
index dbddaa2fcb3..40583c04cf9 100644
--- a/target/i386/tcg/cc_helper.c
+++ b/target/i386/tcg/cc_helper.c
@@ -104,8 +104,6 @@ target_ulong helper_cc_compute_all(target_ulong dst, target_ulong src1,
 
     case CC_OP_EFLAGS:
         return src1;
-    case CC_OP_CLR:
-        return CC_Z | CC_P;
     case CC_OP_POPCNT:
         return dst ? 0 : CC_Z;
 
@@ -243,7 +241,6 @@ target_ulong helper_cc_compute_c(target_ulong dst, target_ulong src1,
     case CC_OP_LOGICW:
     case CC_OP_LOGICL:
     case CC_OP_LOGICQ:
-    case CC_OP_CLR:
     case CC_OP_POPCNT:
         return 0;
 
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index dc308f31041..a20fbb019c8 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -309,7 +309,6 @@ static const uint8_t cc_op_live[CC_OP_NB] = {
     [CC_OP_ADCX] = USES_CC_DST | USES_CC_SRC,
     [CC_OP_ADOX] = USES_CC_SRC | USES_CC_SRC2,
     [CC_OP_ADCOX] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2,
-    [CC_OP_CLR] = 0,
     [CC_OP_POPCNT] = USES_CC_DST,
 };
 
@@ -803,10 +802,6 @@ static void gen_mov_eflags(DisasContext *s, TCGv reg)
         tcg_gen_mov_tl(reg, cpu_cc_src);
         return;
     }
-    if (s->cc_op == CC_OP_CLR) {
-        tcg_gen_movi_tl(reg, CC_Z | CC_P);
-        return;
-    }
 
     dst = cpu_cc_dst;
     src1 = cpu_cc_src;
@@ -897,7 +892,6 @@ static CCPrepare gen_prepare_eflags_c(DisasContext *s, TCGv reg)
                              .reg2 = cpu_cc_src, .use_reg2 = true };
 
     case CC_OP_LOGICB ... CC_OP_LOGICQ:
-    case CC_OP_CLR:
     case CC_OP_POPCNT:
         return (CCPrepare) { .cond = TCG_COND_NEVER };
 
@@ -969,7 +963,6 @@ static CCPrepare gen_prepare_eflags_s(DisasContext *s, TCGv reg)
     case CC_OP_ADCOX:
         return (CCPrepare) { .cond = TCG_COND_TSTNE, .reg = cpu_cc_src,
                              .imm = CC_S };
-    case CC_OP_CLR:
     case CC_OP_POPCNT:
         return (CCPrepare) { .cond = TCG_COND_NEVER };
     default:
@@ -988,7 +981,6 @@ static CCPrepare gen_prepare_eflags_o(DisasContext *s, TCGv reg)
     case CC_OP_ADCOX:
         return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src2,
                              .no_setcond = true };
-    case CC_OP_CLR:
     case CC_OP_POPCNT:
         return (CCPrepare) { .cond = TCG_COND_NEVER };
     case CC_OP_MULB ... CC_OP_MULQ:
@@ -1013,8 +1005,6 @@ static CCPrepare gen_prepare_eflags_z(DisasContext *s, TCGv reg)
     case CC_OP_ADCOX:
         return (CCPrepare) { .cond = TCG_COND_TSTNE, .reg = cpu_cc_src,
                              .imm = CC_Z };
-    case CC_OP_CLR:
-        return (CCPrepare) { .cond = TCG_COND_ALWAYS };
     default:
         {
             MemOp size = (s->cc_op - CC_OP_ADDB) & 3;
diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
index fd17a9b1eca..790307dbba8 100644
--- a/target/i386/tcg/emit.c.inc
+++ b/target/i386/tcg/emit.c.inc
@@ -1452,19 +1452,12 @@ static void gen_bt_flags(DisasContext *s, X86DecodedInsn *decode, TCGv src, TCGv
      * C is the result of the test, Z is unchanged, and the others
      * are all undefined.
      */
-    switch (s->cc_op) {
-    case CC_OP_DYNAMIC:
-    case CC_OP_CLR:
-    case CC_OP_EFLAGS:
-    case CC_OP_ADCX:
-    case CC_OP_ADOX:
-    case CC_OP_ADCOX:
+    if (s->cc_op == CC_OP_DYNAMIC || CC_OP_HAS_EFLAGS(s->cc_op)) {
         /* Generate EFLAGS and replace the C bit.  */
         cf = tcg_temp_new();
         tcg_gen_setcond_tl(TCG_COND_TSTNE, cf, src, mask);
         prepare_update_cf(decode, s, cf);
-        break;
-    default:
+    } else {
         /*
          * Z was going to be computed from the non-zero status of CC_DST.
          * We can get that same Z value (and the new C value) by leaving
@@ -1475,7 +1468,6 @@ static void gen_bt_flags(DisasContext *s, X86DecodedInsn *decode, TCGv src, TCGv
         decode->cc_dst = cpu_cc_dst;
         decode->cc_op = ((s->cc_op - CC_OP_MULB) & 3) + CC_OP_SARB;
         tcg_gen_shr_tl(decode->cc_src, src, s->T1);
-        break;
     }
 }
 
@@ -4724,7 +4716,8 @@ static void gen_XOR(DisasContext *s, X86DecodedInsn *decode)
         decode->op[2].unit == X86_OP_INT &&
         decode->op[1].n == decode->op[2].n) {
         tcg_gen_movi_tl(s->T0, 0);
-        decode->cc_op = CC_OP_CLR;
+        decode->cc_op = CC_OP_EFLAGS;
+        decode->cc_src = tcg_constant_tl(CC_Z | CC_P);
     } else {
         MemOp ot = decode->op[1].ot;
 
-- 
2.47.0



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 04/14] target/i386: Rearrange CCOp
  2024-10-28 15:18 [PATCH v2 00/14] target/i386: miscellaneous flags improvements Paolo Bonzini
                   ` (2 preceding siblings ...)
  2024-10-28 15:18 ` [PATCH 03/14] target/i386: remove CC_OP_CLR Paolo Bonzini
@ 2024-10-28 15:18 ` Paolo Bonzini
  2024-10-28 15:18 ` [PATCH 05/14] target/i386: Introduce cc_op_size Paolo Bonzini
                   ` (9 subsequent siblings)
  13 siblings, 0 replies; 21+ messages in thread
From: Paolo Bonzini @ 2024-10-28 15:18 UTC (permalink / raw)
  To: qemu-devel; +Cc: richard.henderson

From: Richard Henderson <richard.henderson@linaro.org>

Give the first few enumerators explicit integer constants,
align the BWLQ enumerators.

This will be used to simplify ((op - CC_OP_*B) & 3).

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Link: https://lore.kernel.org/r/20240701025115.1265117-4-richard.henderson@linaro.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/cpu.h | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index c89db50eddc..b605f592521 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1292,11 +1292,10 @@ uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w);
  * are only needed for conditional branches.
  */
 typedef enum {
-    CC_OP_DYNAMIC, /* must use dynamic code to get cc_op */
-    CC_OP_EFLAGS,  /* all cc are explicitly computed, CC_SRC = flags */
-    CC_OP_ADCX, /* CC_DST = C, CC_SRC = rest.  */
-    CC_OP_ADOX, /* CC_SRC2 = O, CC_SRC = rest.  */
-    CC_OP_ADCOX, /* CC_DST = C, CC_SRC2 = O, CC_SRC = rest.  */
+    CC_OP_EFLAGS = 0,  /* all cc are explicitly computed, CC_SRC = flags */
+    CC_OP_ADCX = 1,    /* CC_DST = C, CC_SRC = rest.  */
+    CC_OP_ADOX = 2,    /* CC_SRC2 = O, CC_SRC = rest.  */
+    CC_OP_ADCOX = 3,   /* CC_DST = C, CC_SRC2 = O, CC_SRC = rest.  */
 
     CC_OP_MULB, /* modify all flags, C, O = (CC_SRC != 0) */
     CC_OP_MULW,
@@ -1369,9 +1368,12 @@ typedef enum {
     CC_OP_POPCNTQ__,
     CC_OP_POPCNT = sizeof(target_ulong) == 8 ? CC_OP_POPCNTQ__ : CC_OP_POPCNTL__,
 
+    CC_OP_DYNAMIC, /* must use dynamic code to get cc_op */
     CC_OP_NB,
 } CCOp;
-QEMU_BUILD_BUG_ON(CC_OP_NB >= 128);
+
+/* See X86DecodedInsn.cc_op, using int8_t. */
+QEMU_BUILD_BUG_ON(CC_OP_DYNAMIC > INT8_MAX);
 
 typedef struct SegmentCache {
     uint32_t selector;
-- 
2.47.0



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 05/14] target/i386: Introduce cc_op_size
  2024-10-28 15:18 [PATCH v2 00/14] target/i386: miscellaneous flags improvements Paolo Bonzini
                   ` (3 preceding siblings ...)
  2024-10-28 15:18 ` [PATCH 04/14] target/i386: Rearrange CCOp Paolo Bonzini
@ 2024-10-28 15:18 ` Paolo Bonzini
  2024-10-28 15:18 ` [PATCH 06/14] target/i386: Wrap cc_op_live with a validity check Paolo Bonzini
                   ` (8 subsequent siblings)
  13 siblings, 0 replies; 21+ messages in thread
From: Paolo Bonzini @ 2024-10-28 15:18 UTC (permalink / raw)
  To: qemu-devel; +Cc: richard.henderson

From: Richard Henderson <richard.henderson@linaro.org>

Replace arithmetic on cc_op with a helper function.
Assert that the op has a size and that it is valid
for the configuration.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Link: https://lore.kernel.org/r/20240701025115.1265117-6-richard.henderson@linaro.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/cpu.h           | 17 ++++++++++++++++-
 target/i386/tcg/translate.c | 17 +++++++----------
 target/i386/tcg/emit.c.inc  |  5 +++--
 3 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index b605f592521..e598fe3de9a 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -24,6 +24,7 @@
 #include "cpu-qom.h"
 #include "kvm/hyperv-proto.h"
 #include "exec/cpu-defs.h"
+#include "exec/memop.h"
 #include "hw/i386/topology.h"
 #include "qapi/qapi-types-common.h"
 #include "qemu/cpu-float.h"
@@ -1297,7 +1298,9 @@ typedef enum {
     CC_OP_ADOX = 2,    /* CC_SRC2 = O, CC_SRC = rest.  */
     CC_OP_ADCOX = 3,   /* CC_DST = C, CC_SRC2 = O, CC_SRC = rest.  */
 
-    CC_OP_MULB, /* modify all flags, C, O = (CC_SRC != 0) */
+    /* Low 2 bits = MemOp constant for the size */
+#define CC_OP_FIRST_BWLQ CC_OP_MULB
+    CC_OP_MULB = 4, /* modify all flags, C, O = (CC_SRC != 0) */
     CC_OP_MULW,
     CC_OP_MULL,
     CC_OP_MULQ,
@@ -1367,6 +1370,7 @@ typedef enum {
     CC_OP_POPCNTL__,
     CC_OP_POPCNTQ__,
     CC_OP_POPCNT = sizeof(target_ulong) == 8 ? CC_OP_POPCNTQ__ : CC_OP_POPCNTL__,
+#define CC_OP_LAST_BWLQ CC_OP_POPCNTQ__
 
     CC_OP_DYNAMIC, /* must use dynamic code to get cc_op */
     CC_OP_NB,
@@ -1375,6 +1379,17 @@ typedef enum {
 /* See X86DecodedInsn.cc_op, using int8_t. */
 QEMU_BUILD_BUG_ON(CC_OP_DYNAMIC > INT8_MAX);
 
+static inline MemOp cc_op_size(CCOp op)
+{
+    MemOp size = op & 3;
+
+    QEMU_BUILD_BUG_ON(CC_OP_FIRST_BWLQ & 3);
+    assert(op >= CC_OP_FIRST_BWLQ && op <= CC_OP_LAST_BWLQ);
+    assert(size <= MO_TL);
+
+    return size;
+}
+
 typedef struct SegmentCache {
     uint32_t selector;
     target_ulong base;
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index a20fbb019c8..46062002c02 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -885,7 +885,7 @@ static CCPrepare gen_prepare_eflags_c(DisasContext *s, TCGv reg)
 
     case CC_OP_ADDB ... CC_OP_ADDQ:
         /* (DATA_TYPE)CC_DST < (DATA_TYPE)CC_SRC */
-        size = s->cc_op - CC_OP_ADDB;
+        size = cc_op_size(s->cc_op);
         tcg_gen_ext_tl(cpu_cc_dst, cpu_cc_dst, size);
         tcg_gen_ext_tl(cpu_cc_src, cpu_cc_src, size);
         return (CCPrepare) { .cond = TCG_COND_LTU, .reg = cpu_cc_dst,
@@ -902,7 +902,7 @@ static CCPrepare gen_prepare_eflags_c(DisasContext *s, TCGv reg)
 
     case CC_OP_SHLB ... CC_OP_SHLQ:
         /* (CC_SRC >> (DATA_BITS - 1)) & 1 */
-        size = s->cc_op - CC_OP_SHLB;
+        size = cc_op_size(s->cc_op);
         return gen_prepare_sign_nz(cpu_cc_src, size);
 
     case CC_OP_MULB ... CC_OP_MULQ:
@@ -910,11 +910,11 @@ static CCPrepare gen_prepare_eflags_c(DisasContext *s, TCGv reg)
                              .reg = cpu_cc_src };
 
     case CC_OP_BMILGB ... CC_OP_BMILGQ:
-        size = s->cc_op - CC_OP_BMILGB;
+        size = cc_op_size(s->cc_op);
         return gen_prepare_val_nz(cpu_cc_src, size, true);
 
     case CC_OP_BLSIB ... CC_OP_BLSIQ:
-        size = s->cc_op - CC_OP_BLSIB;
+        size = cc_op_size(s->cc_op);
         return gen_prepare_val_nz(cpu_cc_src, size, false);
 
     case CC_OP_ADCX:
@@ -966,10 +966,7 @@ static CCPrepare gen_prepare_eflags_s(DisasContext *s, TCGv reg)
     case CC_OP_POPCNT:
         return (CCPrepare) { .cond = TCG_COND_NEVER };
     default:
-        {
-            MemOp size = (s->cc_op - CC_OP_ADDB) & 3;
-            return gen_prepare_sign_nz(cpu_cc_dst, size);
-        }
+        return gen_prepare_sign_nz(cpu_cc_dst, cc_op_size(s->cc_op));
     }
 }
 
@@ -1007,7 +1004,7 @@ static CCPrepare gen_prepare_eflags_z(DisasContext *s, TCGv reg)
                              .imm = CC_Z };
     default:
         {
-            MemOp size = (s->cc_op - CC_OP_ADDB) & 3;
+            MemOp size = cc_op_size(s->cc_op);
             return gen_prepare_val_nz(cpu_cc_dst, size, true);
         }
     }
@@ -1028,7 +1025,7 @@ static CCPrepare gen_prepare_cc(DisasContext *s, int b, TCGv reg)
     switch (s->cc_op) {
     case CC_OP_SUBB ... CC_OP_SUBQ:
         /* We optimize relational operators for the cmp/jcc case.  */
-        size = s->cc_op - CC_OP_SUBB;
+        size = cc_op_size(s->cc_op);
         switch (jcc_op) {
         case JCC_BE:
             tcg_gen_ext_tl(s->cc_srcT, s->cc_srcT, size);
diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
index 790307dbba8..45ac5edb1ae 100644
--- a/target/i386/tcg/emit.c.inc
+++ b/target/i386/tcg/emit.c.inc
@@ -1466,7 +1466,7 @@ static void gen_bt_flags(DisasContext *s, X86DecodedInsn *decode, TCGv src, TCGv
          */
         decode->cc_src = tcg_temp_new();
         decode->cc_dst = cpu_cc_dst;
-        decode->cc_op = ((s->cc_op - CC_OP_MULB) & 3) + CC_OP_SARB;
+        decode->cc_op = CC_OP_SARB + cc_op_size(s->cc_op);
         tcg_gen_shr_tl(decode->cc_src, src, s->T1);
     }
 }
@@ -3346,7 +3346,8 @@ static bool gen_eflags_adcox(DisasContext *s, X86DecodedInsn *decode, bool want_
          * bit, we might as well fish CF out of EFLAGS and save a shift.
          */
         if (want_carry && (!need_flags || s->cc_op == CC_OP_SHLB + MO_TL)) {
-            tcg_gen_shri_tl(decode->cc_dst, cpu_cc_src, (8 << (s->cc_op - CC_OP_SHLB)) - 1);
+            MemOp size = cc_op_size(s->cc_op);
+            tcg_gen_shri_tl(decode->cc_dst, cpu_cc_src, (8 << size) - 1);
             got_cf = true;
         }
         gen_mov_eflags(s, decode->cc_src);
-- 
2.47.0



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 06/14] target/i386: Wrap cc_op_live with a validity check
  2024-10-28 15:18 [PATCH v2 00/14] target/i386: miscellaneous flags improvements Paolo Bonzini
                   ` (4 preceding siblings ...)
  2024-10-28 15:18 ` [PATCH 05/14] target/i386: Introduce cc_op_size Paolo Bonzini
@ 2024-10-28 15:18 ` Paolo Bonzini
  2024-10-28 15:18 ` [PATCH 07/14] target/i386: optimize computation of ZF from CC_OP_DYNAMIC Paolo Bonzini
                   ` (7 subsequent siblings)
  13 siblings, 0 replies; 21+ messages in thread
From: Paolo Bonzini @ 2024-10-28 15:18 UTC (permalink / raw)
  To: qemu-devel; +Cc: richard.henderson

From: Richard Henderson <richard.henderson@linaro.org>

Assert that op is known and that cc_op_live_ is populated.

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/cpu.h                |  1 -
 target/i386/tcg/translate.c      | 21 ++++++++++++++++++---
 target/i386/tcg/decode-new.c.inc |  2 +-
 target/i386/tcg/emit.c.inc       |  4 ++--
 4 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index e598fe3de9a..afe6643db83 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1373,7 +1373,6 @@ typedef enum {
 #define CC_OP_LAST_BWLQ CC_OP_POPCNTQ__
 
     CC_OP_DYNAMIC, /* must use dynamic code to get cc_op */
-    CC_OP_NB,
 } CCOp;
 
 /* See X86DecodedInsn.cc_op, using int8_t. */
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 46062002c02..1a9a2fe709e 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -291,7 +291,7 @@ enum {
 };
 
 /* Bit set if the global variable is live after setting CC_OP to X.  */
-static const uint8_t cc_op_live[CC_OP_NB] = {
+static const uint8_t cc_op_live_[] = {
     [CC_OP_DYNAMIC] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2,
     [CC_OP_EFLAGS] = USES_CC_SRC,
     [CC_OP_MULB ... CC_OP_MULQ] = USES_CC_DST | USES_CC_SRC,
@@ -312,6 +312,21 @@ static const uint8_t cc_op_live[CC_OP_NB] = {
     [CC_OP_POPCNT] = USES_CC_DST,
 };
 
+static uint8_t cc_op_live(CCOp op)
+{
+    uint8_t result;
+    assert(op >= 0 && op < ARRAY_SIZE(cc_op_live_));
+
+    /*
+     * Check that the array is fully populated.  A zero entry would correspond
+     * to a fixed value of EFLAGS, which can be obtained with CC_OP_EFLAGS
+     * as well.
+     */
+    result = cc_op_live_[op];
+    assert(result);
+    return result;
+}
+
 static void set_cc_op_1(DisasContext *s, CCOp op, bool dirty)
 {
     int dead;
@@ -321,7 +336,7 @@ static void set_cc_op_1(DisasContext *s, CCOp op, bool dirty)
     }
 
     /* Discard CC computation that will no longer be used.  */
-    dead = cc_op_live[s->cc_op] & ~cc_op_live[op];
+    dead = cc_op_live(s->cc_op) & ~cc_op_live(op);
     if (dead & USES_CC_DST) {
         tcg_gen_discard_tl(cpu_cc_dst);
     }
@@ -808,7 +823,7 @@ static void gen_mov_eflags(DisasContext *s, TCGv reg)
     src2 = cpu_cc_src2;
 
     /* Take care to not read values that are not live.  */
-    live = cc_op_live[s->cc_op] & ~USES_CC_SRCT;
+    live = cc_op_live(s->cc_op) & ~USES_CC_SRCT;
     dead = live ^ (USES_CC_DST | USES_CC_SRC | USES_CC_SRC2);
     if (dead) {
         TCGv zero = tcg_constant_tl(0);
diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
index 48bf730cd3e..cda32ee6784 100644
--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -2865,7 +2865,7 @@ static void disas_insn(DisasContext *s, CPUState *cpu)
             tcg_gen_mov_i32(cpu_cc_op, decode.cc_op_dynamic);
         }
         set_cc_op(s, decode.cc_op);
-        cc_live = cc_op_live[decode.cc_op];
+        cc_live = cc_op_live(decode.cc_op);
     } else {
         cc_live = 0;
     }
diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
index 45ac5edb1ae..785ff63f2ac 100644
--- a/target/i386/tcg/emit.c.inc
+++ b/target/i386/tcg/emit.c.inc
@@ -3777,13 +3777,13 @@ static void gen_shift_dynamic_flags(DisasContext *s, X86DecodedInsn *decode, TCG
     decode->cc_op_dynamic = tcg_temp_new_i32();
 
     assert(decode->cc_dst == s->T0);
-    if (cc_op_live[s->cc_op] & USES_CC_DST) {
+    if (cc_op_live(s->cc_op) & USES_CC_DST) {
         decode->cc_dst = tcg_temp_new();
         tcg_gen_movcond_tl(TCG_COND_EQ, decode->cc_dst, count, tcg_constant_tl(0),
                            cpu_cc_dst, s->T0);
     }
 
-    if (cc_op_live[s->cc_op] & USES_CC_SRC) {
+    if (cc_op_live(s->cc_op) & USES_CC_SRC) {
         tcg_gen_movcond_tl(TCG_COND_EQ, decode->cc_src, count, tcg_constant_tl(0),
                            cpu_cc_src, decode->cc_src);
     }
-- 
2.47.0



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 07/14] target/i386: optimize computation of ZF from CC_OP_DYNAMIC
  2024-10-28 15:18 [PATCH v2 00/14] target/i386: miscellaneous flags improvements Paolo Bonzini
                   ` (5 preceding siblings ...)
  2024-10-28 15:18 ` [PATCH 06/14] target/i386: Wrap cc_op_live with a validity check Paolo Bonzini
@ 2024-10-28 15:18 ` Paolo Bonzini
  2024-10-29 15:43   ` Richard Henderson
  2024-10-28 15:18 ` [PATCH 08/14] target/i386: optimize TEST+Jxx sequences Paolo Bonzini
                   ` (6 subsequent siblings)
  13 siblings, 1 reply; 21+ messages in thread
From: Paolo Bonzini @ 2024-10-28 15:18 UTC (permalink / raw)
  To: qemu-devel; +Cc: richard.henderson

Most uses of CC_OP_DYNAMIC are for CMP/JB/JE or similar sequences.
We can optimize many of them to avoid computation of the flags.
This eliminates both TCG ops to set up the new cc_op, and helper
instructions because evaluating just ZF is much cheaper.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/helper.h        |  1 +
 target/i386/tcg/cc_helper.c | 13 +++++++++++++
 target/i386/tcg/translate.c | 10 +++++++---
 3 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/target/i386/helper.h b/target/i386/helper.h
index eeb8df56eaa..3f67098f11f 100644
--- a/target/i386/helper.h
+++ b/target/i386/helper.h
@@ -1,5 +1,6 @@
 DEF_HELPER_FLAGS_4(cc_compute_all, TCG_CALL_NO_RWG_SE, tl, tl, tl, tl, int)
 DEF_HELPER_FLAGS_4(cc_compute_c, TCG_CALL_NO_RWG_SE, tl, tl, tl, tl, int)
+DEF_HELPER_FLAGS_3(cc_compute_nz, TCG_CALL_NO_RWG_SE, tl, tl, tl, int)
 
 DEF_HELPER_3(write_eflags, void, env, tl, i32)
 DEF_HELPER_1(read_eflags, tl, env)
diff --git a/target/i386/tcg/cc_helper.c b/target/i386/tcg/cc_helper.c
index 40583c04cf9..1b83775a914 100644
--- a/target/i386/tcg/cc_helper.c
+++ b/target/i386/tcg/cc_helper.c
@@ -95,6 +95,19 @@ static target_ulong compute_all_adcox(target_ulong dst, target_ulong src1,
     return (src1 & ~(CC_C | CC_O)) | (dst * CC_C) | (src2 * CC_O);
 }
 
+target_ulong helper_cc_compute_nz(target_ulong dst, target_ulong src1,
+                                  int op)
+{
+    if (CC_OP_HAS_EFLAGS(op)) {
+        return ~src1 & CC_Z;
+    } else {
+        MemOp size = cc_op_size(op);
+        target_ulong mask = MAKE_64BIT_MASK(0, 8 << size);
+
+        return dst & mask;
+    }
+}
+
 target_ulong helper_cc_compute_all(target_ulong dst, target_ulong src1,
                                    target_ulong src2, int op)
 {
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 1a9a2fe709e..5e326ab1aff 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -1008,15 +1008,19 @@ static CCPrepare gen_prepare_eflags_o(DisasContext *s, TCGv reg)
 static CCPrepare gen_prepare_eflags_z(DisasContext *s, TCGv reg)
 {
     switch (s->cc_op) {
-    case CC_OP_DYNAMIC:
-        gen_compute_eflags(s);
-        /* FALLTHRU */
     case CC_OP_EFLAGS:
     case CC_OP_ADCX:
     case CC_OP_ADOX:
     case CC_OP_ADCOX:
         return (CCPrepare) { .cond = TCG_COND_TSTNE, .reg = cpu_cc_src,
                              .imm = CC_Z };
+    case CC_OP_DYNAMIC:
+        gen_update_cc_op(s);
+        if (!reg) {
+            reg = tcg_temp_new();
+        }
+        gen_helper_cc_compute_nz(reg, cpu_cc_dst, cpu_cc_src, cpu_cc_op);
+        return (CCPrepare) { .cond = TCG_COND_EQ, .reg = reg, .imm = 0 };
     default:
         {
             MemOp size = cc_op_size(s->cc_op);
-- 
2.47.0



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 08/14] target/i386: optimize TEST+Jxx sequences
  2024-10-28 15:18 [PATCH v2 00/14] target/i386: miscellaneous flags improvements Paolo Bonzini
                   ` (6 preceding siblings ...)
  2024-10-28 15:18 ` [PATCH 07/14] target/i386: optimize computation of ZF from CC_OP_DYNAMIC Paolo Bonzini
@ 2024-10-28 15:18 ` Paolo Bonzini
  2024-10-28 15:18 ` [PATCH 09/14] target/i386: add a few more trivial CCPrepare cases Paolo Bonzini
                   ` (5 subsequent siblings)
  13 siblings, 0 replies; 21+ messages in thread
From: Paolo Bonzini @ 2024-10-28 15:18 UTC (permalink / raw)
  To: qemu-devel; +Cc: richard.henderson

Mostly used for TEST+JG and TEST+JLE, but it is easy to cover
also JBE/JA and JL/JGE; shaves about 0.5% TCG ops.

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/translate.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 5e326ab1aff..d3bbcf7317c 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -1069,6 +1069,28 @@ static CCPrepare gen_prepare_cc(DisasContext *s, int b, TCGv reg)
         }
         break;
 
+    case CC_OP_LOGICB ... CC_OP_LOGICQ:
+        /* Mostly used for test+jump */
+        size = s->cc_op - CC_OP_LOGICB;
+        switch (jcc_op) {
+        case JCC_BE:
+            /* CF = 0, becomes jz/je */
+            jcc_op = JCC_Z;
+            goto slow_jcc;
+        case JCC_L:
+            /* OF = 0, becomes js/jns */
+            jcc_op = JCC_S;
+            goto slow_jcc;
+        case JCC_LE:
+            /* SF or ZF, becomes signed <= 0 */
+            tcg_gen_ext_tl(cpu_cc_dst, cpu_cc_dst, size | MO_SIGN);
+            cc = (CCPrepare) { .cond = TCG_COND_LE, .reg = cpu_cc_dst };
+            break;
+        default:
+            goto slow_jcc;
+        }
+        break;
+
     default:
     slow_jcc:
         /* This actually generates good code for JC, JZ and JS.  */
-- 
2.47.0



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 09/14] target/i386: add a few more trivial CCPrepare cases
  2024-10-28 15:18 [PATCH v2 00/14] target/i386: miscellaneous flags improvements Paolo Bonzini
                   ` (7 preceding siblings ...)
  2024-10-28 15:18 ` [PATCH 08/14] target/i386: optimize TEST+Jxx sequences Paolo Bonzini
@ 2024-10-28 15:18 ` Paolo Bonzini
  2024-10-28 15:18 ` [PATCH 10/14] target/i386: add a note about gen_jcc1 Paolo Bonzini
                   ` (4 subsequent siblings)
  13 siblings, 0 replies; 21+ messages in thread
From: Paolo Bonzini @ 2024-10-28 15:18 UTC (permalink / raw)
  To: qemu-devel; +Cc: richard.henderson

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/translate.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index d3bbcf7317c..6e89d4faef1 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -993,6 +993,7 @@ static CCPrepare gen_prepare_eflags_o(DisasContext *s, TCGv reg)
     case CC_OP_ADCOX:
         return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src2,
                              .no_setcond = true };
+    case CC_OP_LOGICB ... CC_OP_LOGICQ:
     case CC_OP_POPCNT:
         return (CCPrepare) { .cond = TCG_COND_NEVER };
     case CC_OP_MULB ... CC_OP_MULQ:
@@ -1021,6 +1022,8 @@ static CCPrepare gen_prepare_eflags_z(DisasContext *s, TCGv reg)
         }
         gen_helper_cc_compute_nz(reg, cpu_cc_dst, cpu_cc_src, cpu_cc_op);
         return (CCPrepare) { .cond = TCG_COND_EQ, .reg = reg, .imm = 0 };
+    case CC_OP_POPCNT:
+        return (CCPrepare) { .cond = TCG_COND_EQ, .reg = cpu_cc_dst };
     default:
         {
             MemOp size = cc_op_size(s->cc_op);
-- 
2.47.0



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 10/14] target/i386: add a note about gen_jcc1
  2024-10-28 15:18 [PATCH v2 00/14] target/i386: miscellaneous flags improvements Paolo Bonzini
                   ` (8 preceding siblings ...)
  2024-10-28 15:18 ` [PATCH 09/14] target/i386: add a few more trivial CCPrepare cases Paolo Bonzini
@ 2024-10-28 15:18 ` Paolo Bonzini
  2024-10-29 15:44   ` Richard Henderson
  2024-10-28 15:18 ` [PATCH 11/14] target/i386: make flag variables unsigned Paolo Bonzini
                   ` (3 subsequent siblings)
  13 siblings, 1 reply; 21+ messages in thread
From: Paolo Bonzini @ 2024-10-28 15:18 UTC (permalink / raw)
  To: qemu-devel; +Cc: richard.henderson

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/translate.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 6e89d4faef1..5d729e68c98 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -1193,6 +1193,10 @@ static inline void gen_jcc1(DisasContext *s, int b, TCGLabel *l1)
 {
     CCPrepare cc = gen_prepare_cc(s, b, NULL);
 
+    /*
+     * Note that this must be _after_ gen_prepare_cc, because it
+     * can change the cc_op from CC_OP_DYNAMIC to CC_OP_EFLAGS!
+     */
     gen_update_cc_op(s);
     if (cc.use_reg2) {
         tcg_gen_brcond_tl(cc.cond, cc.reg, cc.reg2, l1);
-- 
2.47.0



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 11/14] target/i386: make flag variables unsigned
  2024-10-28 15:18 [PATCH v2 00/14] target/i386: miscellaneous flags improvements Paolo Bonzini
                   ` (9 preceding siblings ...)
  2024-10-28 15:18 ` [PATCH 10/14] target/i386: add a note about gen_jcc1 Paolo Bonzini
@ 2024-10-28 15:18 ` Paolo Bonzini
  2024-10-28 15:18 ` [PATCH 12/14] target/i386: use compiler builtin to compute PF Paolo Bonzini
                   ` (2 subsequent siblings)
  13 siblings, 0 replies; 21+ messages in thread
From: Paolo Bonzini @ 2024-10-28 15:18 UTC (permalink / raw)
  To: qemu-devel; +Cc: richard.henderson

This makes it easier for the compiler to understand which bits are set,
and it also removes "cltq" instructions to canonicalize the output value
as 32-bit signed.

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/cc_helper_template.h.inc | 46 ++++++++++++------------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/target/i386/tcg/cc_helper_template.h.inc b/target/i386/tcg/cc_helper_template.h.inc
index c5425e57cfb..4cbbc73c3cd 100644
--- a/target/i386/tcg/cc_helper_template.h.inc
+++ b/target/i386/tcg/cc_helper_template.h.inc
@@ -39,9 +39,9 @@
 
 /* dynamic flags computation */
 
-static int glue(compute_all_add, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
+static uint32_t glue(compute_all_add, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 {
-    int cf, pf, af, zf, sf, of;
+    uint32_t cf, pf, af, zf, sf, of;
     DATA_TYPE src2 = dst - src1;
 
     cf = dst < src1;
@@ -58,10 +58,10 @@ static int glue(compute_c_add, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     return dst < src1;
 }
 
-static int glue(compute_all_adc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1,
+static uint32_t glue(compute_all_adc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1,
                                          DATA_TYPE src3)
 {
-    int cf, pf, af, zf, sf, of;
+    uint32_t cf, pf, af, zf, sf, of;
     DATA_TYPE src2 = dst - src1 - src3;
 
     cf = (src3 ? dst <= src1 : dst < src1);
@@ -79,9 +79,9 @@ static int glue(compute_c_adc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1,
     return src3 ? dst <= src1 : dst < src1;
 }
 
-static int glue(compute_all_sub, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2)
+static uint32_t glue(compute_all_sub, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2)
 {
-    int cf, pf, af, zf, sf, of;
+    uint32_t cf, pf, af, zf, sf, of;
     DATA_TYPE src1 = dst + src2;
 
     cf = src1 < src2;
@@ -100,10 +100,10 @@ static int glue(compute_c_sub, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2)
     return src1 < src2;
 }
 
-static int glue(compute_all_sbb, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2,
+static uint32_t glue(compute_all_sbb, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2,
                                          DATA_TYPE src3)
 {
-    int cf, pf, af, zf, sf, of;
+    uint32_t cf, pf, af, zf, sf, of;
     DATA_TYPE src1 = dst + src2 + src3;
 
     cf = (src3 ? src1 <= src2 : src1 < src2);
@@ -123,9 +123,9 @@ static int glue(compute_c_sbb, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2,
     return (src3 ? src1 <= src2 : src1 < src2);
 }
 
-static int glue(compute_all_logic, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
+static uint32_t glue(compute_all_logic, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 {
-    int cf, pf, af, zf, sf, of;
+    uint32_t cf, pf, af, zf, sf, of;
 
     cf = 0;
     pf = parity_table[(uint8_t)dst];
@@ -136,9 +136,9 @@ static int glue(compute_all_logic, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     return cf | pf | af | zf | sf | of;
 }
 
-static int glue(compute_all_inc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
+static uint32_t glue(compute_all_inc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 {
-    int cf, pf, af, zf, sf, of;
+    uint32_t cf, pf, af, zf, sf, of;
     DATA_TYPE src2;
 
     cf = src1;
@@ -152,9 +152,9 @@ static int glue(compute_all_inc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     return cf | pf | af | zf | sf | of;
 }
 
-static int glue(compute_all_dec, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
+static uint32_t glue(compute_all_dec, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 {
-    int cf, pf, af, zf, sf, of;
+    uint32_t cf, pf, af, zf, sf, of;
     DATA_TYPE src2;
 
     cf = src1;
@@ -168,9 +168,9 @@ static int glue(compute_all_dec, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     return cf | pf | af | zf | sf | of;
 }
 
-static int glue(compute_all_shl, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
+static uint32_t glue(compute_all_shl, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 {
-    int cf, pf, af, zf, sf, of;
+    uint32_t cf, pf, af, zf, sf, of;
 
     cf = (src1 >> (DATA_BITS - 1)) & CC_C;
     pf = parity_table[(uint8_t)dst];
@@ -187,9 +187,9 @@ static int glue(compute_c_shl, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     return (src1 >> (DATA_BITS - 1)) & CC_C;
 }
 
-static int glue(compute_all_sar, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
+static uint32_t glue(compute_all_sar, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 {
-    int cf, pf, af, zf, sf, of;
+    uint32_t cf, pf, af, zf, sf, of;
 
     cf = src1 & 1;
     pf = parity_table[(uint8_t)dst];
@@ -204,9 +204,9 @@ static int glue(compute_all_sar, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 /* NOTE: we compute the flags like the P4. On olders CPUs, only OF and
    CF are modified and it is slower to do that.  Note as well that we
    don't truncate SRC1 for computing carry to DATA_TYPE.  */
-static int glue(compute_all_mul, SUFFIX)(DATA_TYPE dst, target_long src1)
+static uint32_t glue(compute_all_mul, SUFFIX)(DATA_TYPE dst, target_long src1)
 {
-    int cf, pf, af, zf, sf, of;
+    uint32_t cf, pf, af, zf, sf, of;
 
     cf = (src1 != 0);
     pf = parity_table[(uint8_t)dst];
@@ -217,9 +217,9 @@ static int glue(compute_all_mul, SUFFIX)(DATA_TYPE dst, target_long src1)
     return cf | pf | af | zf | sf | of;
 }
 
-static int glue(compute_all_bmilg, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
+static uint32_t glue(compute_all_bmilg, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 {
-    int cf, pf, af, zf, sf, of;
+    uint32_t cf, pf, af, zf, sf, of;
 
     cf = (src1 == 0);
     pf = 0; /* undefined */
@@ -237,7 +237,7 @@ static int glue(compute_c_bmilg, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 
 static int glue(compute_all_blsi, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 {
-    int cf, pf, af, zf, sf, of;
+    uint32_t cf, pf, af, zf, sf, of;
 
     cf = (src1 != 0);
     pf = 0; /* undefined */
-- 
2.47.0



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 12/14] target/i386: use compiler builtin to compute PF
  2024-10-28 15:18 [PATCH v2 00/14] target/i386: miscellaneous flags improvements Paolo Bonzini
                   ` (10 preceding siblings ...)
  2024-10-28 15:18 ` [PATCH 11/14] target/i386: make flag variables unsigned Paolo Bonzini
@ 2024-10-28 15:18 ` Paolo Bonzini
  2024-10-29 15:45   ` Richard Henderson
  2024-10-28 15:18 ` [PATCH 13/14] target/i386: use higher-precision arithmetic to compute CF Paolo Bonzini
  2024-10-28 15:18 ` [PATCH 14/14] target/i386: use + to put flags together Paolo Bonzini
  13 siblings, 1 reply; 21+ messages in thread
From: Paolo Bonzini @ 2024-10-28 15:18 UTC (permalink / raw)
  To: qemu-devel; +Cc: richard.henderson

This removes the 256 byte parity table from the executable.

Suggested-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/qemu/host-utils.h                |  9 ++++++
 target/i386/tcg/helper-tcg.h             |  6 +++-
 target/i386/tcg/cc_helper_template.h.inc | 20 +++++++-------
 target/i386/tcg/cc_helper.c              | 35 ------------------------
 target/i386/tcg/int_helper.c             |  4 +--
 5 files changed, 26 insertions(+), 48 deletions(-)

diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index ead97d354d6..4d28fa22cfa 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -313,6 +313,15 @@ static inline int ctpop8(uint8_t val)
     return __builtin_popcount(val);
 }
 
+/*
+ * parity8 - return the parity (1 = odd) of an 8-bit value.
+ * @val: The value to search
+ */
+static inline int parity8(uint8_t val)
+{
+    return __builtin_parity(val);
+}
+
 /**
  * ctpop16 - count the population of one bits in a 16-bit value.
  * @val: The value to search
diff --git a/target/i386/tcg/helper-tcg.h b/target/i386/tcg/helper-tcg.h
index 15d6c6f8b4f..696d6ef016f 100644
--- a/target/i386/tcg/helper-tcg.h
+++ b/target/i386/tcg/helper-tcg.h
@@ -21,6 +21,7 @@
 #define I386_HELPER_TCG_H
 
 #include "exec/exec-all.h"
+#include "qemu/host-utils.h"
 
 /* Maximum instruction code size */
 #define TARGET_MAX_INSN_SIZE 16
@@ -87,7 +88,10 @@ G_NORETURN void x86_cpu_do_unaligned_access(CPUState *cs, vaddr vaddr,
 #endif
 
 /* cc_helper.c */
-extern const uint8_t parity_table[256];
+static inline unsigned int compute_pf(uint8_t x)
+{
+    return !parity8(x) * CC_P;
+}
 
 /* misc_helper.c */
 void cpu_load_eflags(CPUX86State *env, int eflags, int update_mask);
diff --git a/target/i386/tcg/cc_helper_template.h.inc b/target/i386/tcg/cc_helper_template.h.inc
index 4cbbc73c3cd..8af8b8539f9 100644
--- a/target/i386/tcg/cc_helper_template.h.inc
+++ b/target/i386/tcg/cc_helper_template.h.inc
@@ -45,7 +45,7 @@ static uint32_t glue(compute_all_add, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     DATA_TYPE src2 = dst - src1;
 
     cf = dst < src1;
-    pf = parity_table[(uint8_t)dst];
+    pf = compute_pf(dst);
     af = (dst ^ src1 ^ src2) & CC_A;
     zf = (dst == 0) * CC_Z;
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
@@ -65,7 +65,7 @@ static uint32_t glue(compute_all_adc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1,
     DATA_TYPE src2 = dst - src1 - src3;
 
     cf = (src3 ? dst <= src1 : dst < src1);
-    pf = parity_table[(uint8_t)dst];
+    pf = compute_pf(dst);
     af = (dst ^ src1 ^ src2) & 0x10;
     zf = (dst == 0) << 6;
     sf = lshift(dst, 8 - DATA_BITS) & 0x80;
@@ -85,7 +85,7 @@ static uint32_t glue(compute_all_sub, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2)
     DATA_TYPE src1 = dst + src2;
 
     cf = src1 < src2;
-    pf = parity_table[(uint8_t)dst];
+    pf = compute_pf(dst);
     af = (dst ^ src1 ^ src2) & CC_A;
     zf = (dst == 0) * CC_Z;
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
@@ -107,7 +107,7 @@ static uint32_t glue(compute_all_sbb, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2,
     DATA_TYPE src1 = dst + src2 + src3;
 
     cf = (src3 ? src1 <= src2 : src1 < src2);
-    pf = parity_table[(uint8_t)dst];
+    pf = compute_pf(dst);
     af = (dst ^ src1 ^ src2) & 0x10;
     zf = (dst == 0) << 6;
     sf = lshift(dst, 8 - DATA_BITS) & 0x80;
@@ -128,7 +128,7 @@ static uint32_t glue(compute_all_logic, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     uint32_t cf, pf, af, zf, sf, of;
 
     cf = 0;
-    pf = parity_table[(uint8_t)dst];
+    pf = compute_pf(dst);
     af = 0;
     zf = (dst == 0) * CC_Z;
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
@@ -144,7 +144,7 @@ static uint32_t glue(compute_all_inc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     cf = src1;
     src1 = dst - 1;
     src2 = 1;
-    pf = parity_table[(uint8_t)dst];
+    pf = compute_pf(dst);
     af = (dst ^ src1 ^ src2) & CC_A;
     zf = (dst == 0) * CC_Z;
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
@@ -160,7 +160,7 @@ static uint32_t glue(compute_all_dec, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     cf = src1;
     src1 = dst + 1;
     src2 = 1;
-    pf = parity_table[(uint8_t)dst];
+    pf = compute_pf(dst);
     af = (dst ^ src1 ^ src2) & CC_A;
     zf = (dst == 0) * CC_Z;
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
@@ -173,7 +173,7 @@ static uint32_t glue(compute_all_shl, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     uint32_t cf, pf, af, zf, sf, of;
 
     cf = (src1 >> (DATA_BITS - 1)) & CC_C;
-    pf = parity_table[(uint8_t)dst];
+    pf = compute_pf(dst);
     af = 0; /* undefined */
     zf = (dst == 0) * CC_Z;
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
@@ -192,7 +192,7 @@ static uint32_t glue(compute_all_sar, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     uint32_t cf, pf, af, zf, sf, of;
 
     cf = src1 & 1;
-    pf = parity_table[(uint8_t)dst];
+    pf = compute_pf(dst);
     af = 0; /* undefined */
     zf = (dst == 0) * CC_Z;
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
@@ -209,7 +209,7 @@ static uint32_t glue(compute_all_mul, SUFFIX)(DATA_TYPE dst, target_long src1)
     uint32_t cf, pf, af, zf, sf, of;
 
     cf = (src1 != 0);
-    pf = parity_table[(uint8_t)dst];
+    pf = compute_pf(dst);
     af = 0; /* undefined */
     zf = (dst == 0) * CC_Z;
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
diff --git a/target/i386/tcg/cc_helper.c b/target/i386/tcg/cc_helper.c
index 1b83775a914..f1940b40927 100644
--- a/target/i386/tcg/cc_helper.c
+++ b/target/i386/tcg/cc_helper.c
@@ -22,41 +22,6 @@
 #include "exec/helper-proto.h"
 #include "helper-tcg.h"
 
-const uint8_t parity_table[256] = {
-    CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0,
-    0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P,
-    0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P,
-    CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0,
-    0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P,
-    CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0,
-    CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0,
-    0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P,
-    0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P,
-    CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0,
-    CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0,
-    0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P,
-    CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0,
-    0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P,
-    0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P,
-    CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0,
-    0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P,
-    CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0,
-    CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0,
-    0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P,
-    CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0,
-    0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P,
-    0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P,
-    CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0,
-    CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0,
-    0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P,
-    0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P,
-    CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0,
-    0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P,
-    CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0,
-    CC_P, 0, 0, CC_P, 0, CC_P, CC_P, 0,
-    0, CC_P, CC_P, 0, CC_P, 0, 0, CC_P,
-};
-
 #define SHIFT 0
 #include "cc_helper_template.h.inc"
 #undef SHIFT
diff --git a/target/i386/tcg/int_helper.c b/target/i386/tcg/int_helper.c
index e1f92405282..1a02e9d8434 100644
--- a/target/i386/tcg/int_helper.c
+++ b/target/i386/tcg/int_helper.c
@@ -237,7 +237,7 @@ void helper_daa(CPUX86State *env)
     env->regs[R_EAX] = (env->regs[R_EAX] & ~0xff) | al;
     /* well, speed is not an issue here, so we compute the flags by hand */
     eflags |= (al == 0) << 6; /* zf */
-    eflags |= parity_table[al]; /* pf */
+    eflags |= compute_pf(al);
     eflags |= (al & 0x80); /* sf */
     CC_SRC = eflags;
     CC_OP = CC_OP_EFLAGS;
@@ -269,7 +269,7 @@ void helper_das(CPUX86State *env)
     env->regs[R_EAX] = (env->regs[R_EAX] & ~0xff) | al;
     /* well, speed is not an issue here, so we compute the flags by hand */
     eflags |= (al == 0) << 6; /* zf */
-    eflags |= parity_table[al]; /* pf */
+    eflags |= compute_pf(al);
     eflags |= (al & 0x80); /* sf */
     CC_SRC = eflags;
     CC_OP = CC_OP_EFLAGS;
-- 
2.47.0



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 13/14] target/i386: use higher-precision arithmetic to compute CF
  2024-10-28 15:18 [PATCH v2 00/14] target/i386: miscellaneous flags improvements Paolo Bonzini
                   ` (11 preceding siblings ...)
  2024-10-28 15:18 ` [PATCH 12/14] target/i386: use compiler builtin to compute PF Paolo Bonzini
@ 2024-10-28 15:18 ` Paolo Bonzini
  2024-10-29 15:46   ` Richard Henderson
  2024-10-28 15:18 ` [PATCH 14/14] target/i386: use + to put flags together Paolo Bonzini
  13 siblings, 1 reply; 21+ messages in thread
From: Paolo Bonzini @ 2024-10-28 15:18 UTC (permalink / raw)
  To: qemu-devel; +Cc: richard.henderson

If the operands of the arithmetic instruction fit within a half-register,
it's easiest to use a comparison instruction to compute the carry.
`
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/cc_helper_template.h.inc | 37 ++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/target/i386/tcg/cc_helper_template.h.inc b/target/i386/tcg/cc_helper_template.h.inc
index 8af8b8539f9..f29a6dfb77c 100644
--- a/target/i386/tcg/cc_helper_template.h.inc
+++ b/target/i386/tcg/cc_helper_template.h.inc
@@ -22,12 +22,17 @@
 #if DATA_BITS == 8
 #define SUFFIX b
 #define DATA_TYPE uint8_t
+#define WIDER_TYPE uint32_t
 #elif DATA_BITS == 16
 #define SUFFIX w
 #define DATA_TYPE uint16_t
+#define WIDER_TYPE uint32_t
 #elif DATA_BITS == 32
 #define SUFFIX l
 #define DATA_TYPE uint32_t
+#if HOST_LONG_BITS <= 64
+#define WIDER_TYPE uint64_t
+#endif
 #elif DATA_BITS == 64
 #define SUFFIX q
 #define DATA_TYPE uint64_t
@@ -62,9 +67,18 @@ static uint32_t glue(compute_all_adc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1,
                                          DATA_TYPE src3)
 {
     uint32_t cf, pf, af, zf, sf, of;
+
+#ifdef WIDER_TYPE
+    WIDER_TYPE src13 = (WIDER_TYPE) src1 + (WIDER_TYPE) src3;
+    DATA_TYPE src2 = dst - src13;
+
+    cf = dst < src13;
+#else
     DATA_TYPE src2 = dst - src1 - src3;
 
     cf = (src3 ? dst <= src1 : dst < src1);
+#endif
+
     pf = compute_pf(dst);
     af = (dst ^ src1 ^ src2) & 0x10;
     zf = (dst == 0) << 6;
@@ -76,7 +90,13 @@ static uint32_t glue(compute_all_adc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1,
 static int glue(compute_c_adc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1,
                                        DATA_TYPE src3)
 {
+#ifdef WIDER_TYPE
+    WIDER_TYPE src13 = (WIDER_TYPE) src1 + (WIDER_TYPE) src3;
+
+    return dst < src13;
+#else
     return src3 ? dst <= src1 : dst < src1;
+#endif
 }
 
 static uint32_t glue(compute_all_sub, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2)
@@ -104,9 +124,18 @@ static uint32_t glue(compute_all_sbb, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2,
                                          DATA_TYPE src3)
 {
     uint32_t cf, pf, af, zf, sf, of;
+
+#ifdef WIDER_TYPE
+    WIDER_TYPE src23 = (WIDER_TYPE) src2 + (WIDER_TYPE) src3;
+    DATA_TYPE src1 = dst + src23;
+
+    cf = src1 < src23;
+#else
     DATA_TYPE src1 = dst + src2 + src3;
 
     cf = (src3 ? src1 <= src2 : src1 < src2);
+#endif
+
     pf = compute_pf(dst);
     af = (dst ^ src1 ^ src2) & 0x10;
     zf = (dst == 0) << 6;
@@ -118,9 +147,16 @@ static uint32_t glue(compute_all_sbb, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2,
 static int glue(compute_c_sbb, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2,
                                        DATA_TYPE src3)
 {
+#ifdef WIDER_TYPE
+    WIDER_TYPE src23 = (WIDER_TYPE) src2 + (WIDER_TYPE) src3;
+    DATA_TYPE src1 = dst + src23;
+
+    return src1 < src23;
+#else
     DATA_TYPE src1 = dst + src2 + src3;
 
     return (src3 ? src1 <= src2 : src1 < src2);
+#endif
 }
 
 static uint32_t glue(compute_all_logic, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
@@ -258,3 +294,4 @@ static int glue(compute_c_blsi, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 #undef DATA_TYPE
 #undef DATA_MASK
 #undef SUFFIX
+#undef WIDER_TYPE
-- 
2.47.0



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 14/14] target/i386: use + to put flags together
  2024-10-28 15:18 [PATCH v2 00/14] target/i386: miscellaneous flags improvements Paolo Bonzini
                   ` (12 preceding siblings ...)
  2024-10-28 15:18 ` [PATCH 13/14] target/i386: use higher-precision arithmetic to compute CF Paolo Bonzini
@ 2024-10-28 15:18 ` Paolo Bonzini
  2024-10-29 15:46   ` Richard Henderson
  13 siblings, 1 reply; 21+ messages in thread
From: Paolo Bonzini @ 2024-10-28 15:18 UTC (permalink / raw)
  To: qemu-devel; +Cc: richard.henderson

This gives greater opportunity for reassociation on x86 targets,
since addition can use the LEA instruction.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/cc_helper_template.h.inc | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/target/i386/tcg/cc_helper_template.h.inc b/target/i386/tcg/cc_helper_template.h.inc
index f29a6dfb77c..d7672c8840a 100644
--- a/target/i386/tcg/cc_helper_template.h.inc
+++ b/target/i386/tcg/cc_helper_template.h.inc
@@ -55,7 +55,7 @@ static uint32_t glue(compute_all_add, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     zf = (dst == 0) * CC_Z;
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
     of = lshift((src1 ^ src2 ^ -1) & (src1 ^ dst), 12 - DATA_BITS) & CC_O;
-    return cf | pf | af | zf | sf | of;
+    return cf + pf + af + zf + sf + of;
 }
 
 static int glue(compute_c_add, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
@@ -84,7 +84,7 @@ static uint32_t glue(compute_all_adc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1,
     zf = (dst == 0) << 6;
     sf = lshift(dst, 8 - DATA_BITS) & 0x80;
     of = lshift((src1 ^ src2 ^ -1) & (src1 ^ dst), 12 - DATA_BITS) & CC_O;
-    return cf | pf | af | zf | sf | of;
+    return cf + pf + af + zf + sf + of;
 }
 
 static int glue(compute_c_adc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1,
@@ -110,7 +110,7 @@ static uint32_t glue(compute_all_sub, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2)
     zf = (dst == 0) * CC_Z;
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
     of = lshift((src1 ^ src2) & (src1 ^ dst), 12 - DATA_BITS) & CC_O;
-    return cf | pf | af | zf | sf | of;
+    return cf + pf + af + zf + sf + of;
 }
 
 static int glue(compute_c_sub, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2)
@@ -141,7 +141,7 @@ static uint32_t glue(compute_all_sbb, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2,
     zf = (dst == 0) << 6;
     sf = lshift(dst, 8 - DATA_BITS) & 0x80;
     of = lshift((src1 ^ src2) & (src1 ^ dst), 12 - DATA_BITS) & CC_O;
-    return cf | pf | af | zf | sf | of;
+    return cf + pf + af + zf + sf + of;
 }
 
 static int glue(compute_c_sbb, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2,
@@ -169,7 +169,7 @@ static uint32_t glue(compute_all_logic, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     zf = (dst == 0) * CC_Z;
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
     of = 0;
-    return cf | pf | af | zf | sf | of;
+    return cf + pf + af + zf + sf + of;
 }
 
 static uint32_t glue(compute_all_inc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
@@ -185,7 +185,7 @@ static uint32_t glue(compute_all_inc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     zf = (dst == 0) * CC_Z;
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
     of = (dst == SIGN_MASK) * CC_O;
-    return cf | pf | af | zf | sf | of;
+    return cf + pf + af + zf + sf + of;
 }
 
 static uint32_t glue(compute_all_dec, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
@@ -201,7 +201,7 @@ static uint32_t glue(compute_all_dec, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     zf = (dst == 0) * CC_Z;
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
     of = (dst == SIGN_MASK - 1) * CC_O;
-    return cf | pf | af | zf | sf | of;
+    return cf + pf + af + zf + sf + of;
 }
 
 static uint32_t glue(compute_all_shl, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
@@ -215,7 +215,7 @@ static uint32_t glue(compute_all_shl, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
     /* of is defined iff shift count == 1 */
     of = lshift(src1 ^ dst, 12 - DATA_BITS) & CC_O;
-    return cf | pf | af | zf | sf | of;
+    return cf + pf + af + zf + sf + of;
 }
 
 static int glue(compute_c_shl, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
@@ -234,7 +234,7 @@ static uint32_t glue(compute_all_sar, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
     /* of is defined iff shift count == 1 */
     of = lshift(src1 ^ dst, 12 - DATA_BITS) & CC_O;
-    return cf | pf | af | zf | sf | of;
+    return cf + pf + af + zf + sf + of;
 }
 
 /* NOTE: we compute the flags like the P4. On olders CPUs, only OF and
@@ -250,7 +250,7 @@ static uint32_t glue(compute_all_mul, SUFFIX)(DATA_TYPE dst, target_long src1)
     zf = (dst == 0) * CC_Z;
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
     of = cf * CC_O;
-    return cf | pf | af | zf | sf | of;
+    return cf + pf + af + zf + sf + of;
 }
 
 static uint32_t glue(compute_all_bmilg, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
@@ -263,7 +263,7 @@ static uint32_t glue(compute_all_bmilg, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     zf = (dst == 0) * CC_Z;
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
     of = 0;
-    return cf | pf | af | zf | sf | of;
+    return cf + pf + af + zf + sf + of;
 }
 
 static int glue(compute_c_bmilg, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
@@ -281,7 +281,7 @@ static int glue(compute_all_blsi, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     zf = (dst == 0) * CC_Z;
     sf = lshift(dst, 8 - DATA_BITS) & CC_S;
     of = 0;
-    return cf | pf | af | zf | sf | of;
+    return cf + pf + af + zf + sf + of;
 }
 
 static int glue(compute_c_blsi, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
-- 
2.47.0



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH 07/14] target/i386: optimize computation of ZF from CC_OP_DYNAMIC
  2024-10-28 15:18 ` [PATCH 07/14] target/i386: optimize computation of ZF from CC_OP_DYNAMIC Paolo Bonzini
@ 2024-10-29 15:43   ` Richard Henderson
  0 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2024-10-29 15:43 UTC (permalink / raw)
  To: Paolo Bonzini, qemu-devel

On 10/28/24 15:18, Paolo Bonzini wrote:
> Most uses of CC_OP_DYNAMIC are for CMP/JB/JE or similar sequences.
> We can optimize many of them to avoid computation of the flags.
> This eliminates both TCG ops to set up the new cc_op, and helper
> instructions because evaluating just ZF is much cheaper.
> 
> Signed-off-by: Paolo Bonzini<pbonzini@redhat.com>
> ---
>   target/i386/helper.h        |  1 +
>   target/i386/tcg/cc_helper.c | 13 +++++++++++++
>   target/i386/tcg/translate.c | 10 +++++++---
>   3 files changed, 21 insertions(+), 3 deletions(-)

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>

r~


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 10/14] target/i386: add a note about gen_jcc1
  2024-10-28 15:18 ` [PATCH 10/14] target/i386: add a note about gen_jcc1 Paolo Bonzini
@ 2024-10-29 15:44   ` Richard Henderson
  0 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2024-10-29 15:44 UTC (permalink / raw)
  To: Paolo Bonzini, qemu-devel

On 10/28/24 15:18, Paolo Bonzini wrote:
> Signed-off-by: Paolo Bonzini<pbonzini@redhat.com>
> ---
>   target/i386/tcg/translate.c | 4 ++++
>   1 file changed, 4 insertions(+)

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>

r~



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 12/14] target/i386: use compiler builtin to compute PF
  2024-10-28 15:18 ` [PATCH 12/14] target/i386: use compiler builtin to compute PF Paolo Bonzini
@ 2024-10-29 15:45   ` Richard Henderson
  0 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2024-10-29 15:45 UTC (permalink / raw)
  To: Paolo Bonzini, qemu-devel

On 10/28/24 15:18, Paolo Bonzini wrote:
> This removes the 256 byte parity table from the executable.
> 
> Suggested-by: Richard Henderson<richard.henderson@linaro.org>
> Signed-off-by: Paolo Bonzini<pbonzini@redhat.com>
> ---
>   include/qemu/host-utils.h                |  9 ++++++
>   target/i386/tcg/helper-tcg.h             |  6 +++-
>   target/i386/tcg/cc_helper_template.h.inc | 20 +++++++-------
>   target/i386/tcg/cc_helper.c              | 35 ------------------------
>   target/i386/tcg/int_helper.c             |  4 +--
>   5 files changed, 26 insertions(+), 48 deletions(-)

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>

r~


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 13/14] target/i386: use higher-precision arithmetic to compute CF
  2024-10-28 15:18 ` [PATCH 13/14] target/i386: use higher-precision arithmetic to compute CF Paolo Bonzini
@ 2024-10-29 15:46   ` Richard Henderson
  0 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2024-10-29 15:46 UTC (permalink / raw)
  To: Paolo Bonzini, qemu-devel

On 10/28/24 15:18, Paolo Bonzini wrote:
> If the operands of the arithmetic instruction fit within a half-register,
> it's easiest to use a comparison instruction to compute the carry.
> `
> Signed-off-by: Paolo Bonzini<pbonzini@redhat.com>
> ---
>   target/i386/tcg/cc_helper_template.h.inc | 37 ++++++++++++++++++++++++
>   1 file changed, 37 insertions(+)

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>

r~


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 14/14] target/i386: use + to put flags together
  2024-10-28 15:18 ` [PATCH 14/14] target/i386: use + to put flags together Paolo Bonzini
@ 2024-10-29 15:46   ` Richard Henderson
  0 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2024-10-29 15:46 UTC (permalink / raw)
  To: Paolo Bonzini, qemu-devel

On 10/28/24 15:18, Paolo Bonzini wrote:
> This gives greater opportunity for reassociation on x86 targets,
> since addition can use the LEA instruction.
> 
> Signed-off-by: Paolo Bonzini<pbonzini@redhat.com>
> ---
>   target/i386/tcg/cc_helper_template.h.inc | 24 ++++++++++++------------
>   1 file changed, 12 insertions(+), 12 deletions(-)

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>


r~


^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2024-10-29 15:46 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-10-28 15:18 [PATCH v2 00/14] target/i386: miscellaneous flags improvements Paolo Bonzini
2024-10-28 15:18 ` [PATCH 01/14] target/i386: use tcg_gen_ext_tl when applicable Paolo Bonzini
2024-10-28 15:18 ` [PATCH 02/14] target/i386: Tidy cc_op_str usage Paolo Bonzini
2024-10-28 15:18 ` [PATCH 03/14] target/i386: remove CC_OP_CLR Paolo Bonzini
2024-10-28 15:18 ` [PATCH 04/14] target/i386: Rearrange CCOp Paolo Bonzini
2024-10-28 15:18 ` [PATCH 05/14] target/i386: Introduce cc_op_size Paolo Bonzini
2024-10-28 15:18 ` [PATCH 06/14] target/i386: Wrap cc_op_live with a validity check Paolo Bonzini
2024-10-28 15:18 ` [PATCH 07/14] target/i386: optimize computation of ZF from CC_OP_DYNAMIC Paolo Bonzini
2024-10-29 15:43   ` Richard Henderson
2024-10-28 15:18 ` [PATCH 08/14] target/i386: optimize TEST+Jxx sequences Paolo Bonzini
2024-10-28 15:18 ` [PATCH 09/14] target/i386: add a few more trivial CCPrepare cases Paolo Bonzini
2024-10-28 15:18 ` [PATCH 10/14] target/i386: add a note about gen_jcc1 Paolo Bonzini
2024-10-29 15:44   ` Richard Henderson
2024-10-28 15:18 ` [PATCH 11/14] target/i386: make flag variables unsigned Paolo Bonzini
2024-10-28 15:18 ` [PATCH 12/14] target/i386: use compiler builtin to compute PF Paolo Bonzini
2024-10-29 15:45   ` Richard Henderson
2024-10-28 15:18 ` [PATCH 13/14] target/i386: use higher-precision arithmetic to compute CF Paolo Bonzini
2024-10-29 15:46   ` Richard Henderson
2024-10-28 15:18 ` [PATCH 14/14] target/i386: use + to put flags together Paolo Bonzini
2024-10-29 15:46   ` Richard Henderson
  -- strict thread matches above, loose matches on Subject: below --
2024-10-20 15:53 [PATCH 00/14] target/i386: miscellaneous flags improvements Paolo Bonzini
2024-10-20 15:53 ` [PATCH 14/14] target/i386: use + to put flags together Paolo Bonzini

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).