* [PATCH 1/4] tcg: Don't free vector results
From: Richard Henderson @ 2023-08-31 2:57 UTC
To: qemu-devel
Stop freeing the vector temporaries that hold results, and stop reusing
an input temporary as the output in expand_2sh_vec and expand_cmp_vec.
Each stored value then stays live in its own temp, which the optimizer
can re-use when propagating stores to loads.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
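Illustration, not part of the patch: a minimal sketch of the expansion
pattern after this change, mirroring the expand_2sh_vec hunk below. The
function name and the use of tcg_gen_add_vec are invented for the example;
the other calls are the ones tcg-op-gvec.c already uses. Writing the
result into a separate temp that is never freed keeps each stored value
live, so a later pass can forward it to loads of the same env offset.

static void expand_demo(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t oprsz, uint32_t tysz, TCGType type)
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);   /* separate result temp */
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        /* Previously fni(vece, t0, t0, ...) overwrote the input. */
        tcg_gen_add_vec(vece, t1, t0, t0);
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    /* Previously tcg_temp_free_vec() here returned the temps to the
     * pool, so the value just stored no longer lived in any temp. */
}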
tcg/tcg-op-gvec.c | 39 ++++++---------------------------------
1 file changed, 6 insertions(+), 33 deletions(-)
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index e260a07c61..f5cfd9bf99 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -561,7 +561,6 @@ static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
tcg_gen_dupi_vec(vece, t_vec, in_c);
}
do_dup_store(type, dofs, oprsz, maxsz, t_vec);
- tcg_temp_free_vec(t_vec);
return;
}
@@ -1036,8 +1035,6 @@ static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
fni(vece, t1, t0);
tcg_gen_st_vec(t1, cpu_env, dofs + i);
}
- tcg_temp_free_vec(t0);
- tcg_temp_free_vec(t1);
}
/* Expand OPSZ bytes worth of two-vector operands and an immediate operand
@@ -1059,8 +1056,6 @@ static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
fni(vece, t1, t0, c);
tcg_gen_st_vec(t1, cpu_env, dofs + i);
}
- tcg_temp_free_vec(t0);
- tcg_temp_free_vec(t1);
}
static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
@@ -1081,8 +1076,6 @@ static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
}
tcg_gen_st_vec(t1, cpu_env, dofs + i);
}
- tcg_temp_free_vec(t0);
- tcg_temp_free_vec(t1);
}
/* Expand OPSZ bytes worth of three-operand operations using host vectors. */
@@ -1105,9 +1098,6 @@ static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
fni(vece, t2, t0, t1);
tcg_gen_st_vec(t2, cpu_env, dofs + i);
}
- tcg_temp_free_vec(t2);
- tcg_temp_free_vec(t1);
- tcg_temp_free_vec(t0);
}
/*
@@ -1134,9 +1124,6 @@ static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
fni(vece, t2, t0, t1, c);
tcg_gen_st_vec(t2, cpu_env, dofs + i);
}
- tcg_temp_free_vec(t0);
- tcg_temp_free_vec(t1);
- tcg_temp_free_vec(t2);
}
/* Expand OPSZ bytes worth of four-operand operations using host vectors. */
@@ -1162,10 +1149,6 @@ static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
tcg_gen_st_vec(t1, cpu_env, aofs + i);
}
}
- tcg_temp_free_vec(t3);
- tcg_temp_free_vec(t2);
- tcg_temp_free_vec(t1);
- tcg_temp_free_vec(t0);
}
/*
@@ -1191,10 +1174,6 @@ static void expand_4i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
fni(vece, t0, t1, t2, t3, c);
tcg_gen_st_vec(t0, cpu_env, dofs + i);
}
- tcg_temp_free_vec(t3);
- tcg_temp_free_vec(t2);
- tcg_temp_free_vec(t1);
- tcg_temp_free_vec(t0);
}
/* Expand a vector two-operand operation. */
@@ -1732,7 +1711,6 @@ void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
TCGv_vec t_vec = tcg_temp_new_vec(type);
tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs);
do_dup_store(type, dofs, oprsz, maxsz, t_vec);
- tcg_temp_free_vec(t_vec);
} else if (vece <= MO_32) {
TCGv_i32 in = tcg_temp_ebb_new_i32();
switch (vece) {
@@ -1766,7 +1744,6 @@ void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
tcg_gen_st_vec(in, cpu_env, dofs + i);
}
- tcg_temp_free_vec(in);
} else {
TCGv_i64 in0 = tcg_temp_ebb_new_i64();
TCGv_i64 in1 = tcg_temp_ebb_new_i64();
@@ -1796,7 +1773,6 @@ void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
tcg_gen_st_vec(in, cpu_env, dofs + i);
}
- tcg_temp_free_vec(in);
} else if (TCG_TARGET_HAS_v128) {
TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128);
TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128);
@@ -1807,8 +1783,6 @@ void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
tcg_gen_st_vec(in0, cpu_env, dofs + i);
tcg_gen_st_vec(in1, cpu_env, dofs + i + 16);
}
- tcg_temp_free_vec(in0);
- tcg_temp_free_vec(in1);
} else {
TCGv_i64 in[4];
int j;
@@ -3137,14 +3111,14 @@ static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
{
TCGv_vec t0 = tcg_temp_new_vec(type);
+ TCGv_vec t1 = tcg_temp_new_vec(type);
uint32_t i;
for (i = 0; i < oprsz; i += tysz) {
tcg_gen_ld_vec(t0, cpu_env, aofs + i);
- fni(vece, t0, t0, shift);
- tcg_gen_st_vec(t0, cpu_env, dofs + i);
+ fni(vece, t1, t0, shift);
+ tcg_gen_st_vec(t1, cpu_env, dofs + i);
}
- tcg_temp_free_vec(t0);
}
static void
@@ -3722,16 +3696,15 @@ static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
{
TCGv_vec t0 = tcg_temp_new_vec(type);
TCGv_vec t1 = tcg_temp_new_vec(type);
+ TCGv_vec t2 = tcg_temp_new_vec(type);
uint32_t i;
for (i = 0; i < oprsz; i += tysz) {
tcg_gen_ld_vec(t0, cpu_env, aofs + i);
tcg_gen_ld_vec(t1, cpu_env, bofs + i);
- tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
- tcg_gen_st_vec(t0, cpu_env, dofs + i);
+ tcg_gen_cmp_vec(cond, vece, t2, t0, t1);
+ tcg_gen_st_vec(t2, cpu_env, dofs + i);
}
- tcg_temp_free_vec(t1);
- tcg_temp_free_vec(t0);
}
void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
--
2.34.1
* [PATCH 2/4] tcg/optimize: Pipe OptContext into reset_ts
From: Richard Henderson @ 2023-08-31 2:57 UTC
To: qemu-devel
The OptContext argument will be needed in the next patch.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/optimize.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/tcg/optimize.c b/tcg/optimize.c
index 3013eb04e6..f00db3aa38 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -123,7 +123,7 @@ static inline bool ts_is_copy(TCGTemp *ts)
}
/* Reset TEMP's state, possibly removing the temp for the list of copies. */
-static void reset_ts(TCGTemp *ts)
+static void reset_ts(OptContext *ctx, TCGTemp *ts)
{
TempOptInfo *ti = ts_info(ts);
TempOptInfo *pi = ts_info(ti->prev_copy);
@@ -138,9 +138,9 @@ static void reset_ts(TCGTemp *ts)
ti->s_mask = 0;
}
-static void reset_temp(TCGArg arg)
+static void reset_temp(OptContext *ctx, TCGArg arg)
{
- reset_ts(arg_temp(arg));
+ reset_ts(ctx, arg_temp(arg));
}
/* Initialize and activate a temporary. */
@@ -239,7 +239,7 @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
return true;
}
- reset_ts(dst_ts);
+ reset_ts(ctx, dst_ts);
di = ts_info(dst_ts);
si = ts_info(src_ts);
@@ -700,7 +700,7 @@ static void finish_folding(OptContext *ctx, TCGOp *op)
nb_oargs = def->nb_oargs;
for (i = 0; i < nb_oargs; i++) {
TCGTemp *ts = arg_temp(op->args[i]);
- reset_ts(ts);
+ reset_ts(ctx, ts);
/*
* Save the corresponding known-zero/sign bits mask for the
* first output argument (only one supported so far).
@@ -1213,14 +1213,14 @@ static bool fold_call(OptContext *ctx, TCGOp *op)
for (i = 0; i < nb_globals; i++) {
if (test_bit(i, ctx->temps_used.l)) {
- reset_ts(&ctx->tcg->temps[i]);
+ reset_ts(ctx, &ctx->tcg->temps[i]);
}
}
}
/* Reset temp data for outputs. */
for (i = 0; i < nb_oargs; i++) {
- reset_temp(op->args[i]);
+ reset_temp(ctx, op->args[i]);
}
/* Stop optimizing MB across calls. */
--
2.34.1
* [PATCH 3/4] tcg: Optimize env memory operations
From: Richard Henderson @ 2023-08-31 2:57 UTC
To: qemu-devel
Propagate stored values to later loads of the same env slot, and loaded
values to later loads.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
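Illustration, not part of the patch: a self-contained toy model of the
bookkeeping added below. A flat array of env slots stands in for the
interval tree, temps are plain integers, and all offsets are invented.

#include <stdio.h>
#include <string.h>

#define NSLOT 16
static int slot_temp[NSLOT];           /* temp known to hold the slot, or -1 */

static void opt_st(int ofs, int temp)  /* cf. fold_tcg_st_memcopy() */
{
    slot_temp[ofs] = temp;             /* drop the old copy, record the new */
}

static int opt_ld(int ofs, int temp)   /* cf. fold_tcg_ld_memcopy() */
{
    if (slot_temp[ofs] >= 0) {         /* cf. find_mem_copy_for() */
        printf("ld @%d folded to mov t%d, t%d\n", ofs, temp, slot_temp[ofs]);
        return slot_temp[ofs];         /* cf. tcg_opt_gen_mov() */
    }
    slot_temp[ofs] = temp;             /* the load itself defines a copy */
    return temp;
}

int main(void)
{
    memset(slot_temp, -1, sizeof(slot_temp));
    opt_st(3, 7);    /* st t7, env+3 */
    opt_ld(3, 8);    /* store-to-load: folded to mov t8, t7 */
    opt_ld(5, 9);    /* first load of env+5 records a copy */
    opt_ld(5, 10);   /* load-to-load: folded to mov t10, t9 */
    return 0;
}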
tcg/optimize.c | 199 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 199 insertions(+)
diff --git a/tcg/optimize.c b/tcg/optimize.c
index f00db3aa38..51c4c61b9f 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -25,6 +25,7 @@
#include "qemu/osdep.h"
#include "qemu/int128.h"
+#include "qemu/interval-tree.h"
#include "tcg/tcg-op-common.h"
#include "tcg-internal.h"
@@ -37,10 +38,18 @@
glue(glue(case INDEX_op_, x), _i64): \
glue(glue(case INDEX_op_, x), _vec)
+typedef struct MemCopyInfo {
+ IntervalTreeNode itree;
+ QSIMPLEQ_ENTRY (MemCopyInfo) next;
+ TCGTemp *ts;
+ TCGType type;
+} MemCopyInfo;
+
typedef struct TempOptInfo {
bool is_const;
TCGTemp *prev_copy;
TCGTemp *next_copy;
+ QSIMPLEQ_HEAD(, MemCopyInfo) mem_copy;
uint64_t val;
uint64_t z_mask; /* mask bit is 0 if and only if value bit is 0 */
uint64_t s_mask; /* a left-aligned mask of clrsb(value) bits. */
@@ -51,6 +60,9 @@ typedef struct OptContext {
TCGOp *prev_mb;
TCGTempSet temps_used;
+ IntervalTreeRoot mem_copy;
+ QSIMPLEQ_HEAD(, MemCopyInfo) mem_free;
+
/* In flight values from optimization. */
uint64_t a_mask; /* mask bit is 0 iff value identical to first input */
uint64_t z_mask; /* mask bit is 0 iff value bit is 0 */
@@ -128,6 +140,7 @@ static void reset_ts(OptContext *ctx, TCGTemp *ts)
TempOptInfo *ti = ts_info(ts);
TempOptInfo *pi = ts_info(ti->prev_copy);
TempOptInfo *ni = ts_info(ti->next_copy);
+ MemCopyInfo *mc;
ni->prev_copy = ti->prev_copy;
pi->next_copy = ti->next_copy;
@@ -136,6 +149,11 @@ static void reset_ts(OptContext *ctx, TCGTemp *ts)
ti->is_const = false;
ti->z_mask = -1;
ti->s_mask = 0;
+
+ QSIMPLEQ_FOREACH(mc, &ti->mem_copy, next) {
+ interval_tree_remove(&mc->itree, &ctx->mem_copy);
+ }
+ QSIMPLEQ_CONCAT(&ctx->mem_free, &ti->mem_copy);
}
static void reset_temp(OptContext *ctx, TCGArg arg)
@@ -162,6 +180,7 @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
ti->next_copy = ts;
ti->prev_copy = ts;
+ QSIMPLEQ_INIT(&ti->mem_copy);
if (ts->kind == TEMP_CONST) {
ti->is_const = true;
ti->val = ts->val;
@@ -174,6 +193,68 @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
}
}
+static MemCopyInfo *mem_copy_first(OptContext *ctx, intptr_t s, intptr_t l)
+{
+ IntervalTreeNode *r = interval_tree_iter_first(&ctx->mem_copy, s, l);
+ return r ? container_of(r, MemCopyInfo, itree) : NULL;
+}
+
+static MemCopyInfo *mem_copy_next(MemCopyInfo *mem, intptr_t s, intptr_t l)
+{
+ IntervalTreeNode *r = interval_tree_iter_next(&mem->itree, s, l);
+ return r ? container_of(r, MemCopyInfo, itree) : NULL;
+}
+
+static void remove_mem_copy(OptContext *ctx, MemCopyInfo *mc)
+{
+ TCGTemp *ts = mc->ts;
+ TempOptInfo *ti = ts_info(ts);
+
+ interval_tree_remove(&mc->itree, &ctx->mem_copy);
+ QSIMPLEQ_REMOVE(&ti->mem_copy, mc, MemCopyInfo, next);
+ QSIMPLEQ_INSERT_TAIL(&ctx->mem_free, mc, next);
+}
+
+static void remove_mem_copy_in(OptContext *ctx, intptr_t s, intptr_t l)
+{
+ while (true) {
+ MemCopyInfo *mc = mem_copy_first(ctx, s, l);
+ if (!mc) {
+ break;
+ }
+ remove_mem_copy(ctx, mc);
+ }
+}
+
+static void remove_mem_copy_all(OptContext *ctx)
+{
+ remove_mem_copy_in(ctx, 0, -1);
+ tcg_debug_assert(interval_tree_is_empty(&ctx->mem_copy));
+}
+
+static void record_mem_copy(OptContext *ctx, TCGType type,
+ TCGTemp *ts, intptr_t start, intptr_t last)
+{
+ MemCopyInfo *mc;
+ TempOptInfo *ti = ts_info(ts);
+
+ mc = QSIMPLEQ_FIRST(&ctx->mem_free);
+ if (mc) {
+ QSIMPLEQ_REMOVE_HEAD(&ctx->mem_free, next);
+ } else {
+ mc = tcg_malloc(sizeof(*mc));
+ }
+
+ memset(mc, 0, sizeof(*mc));
+ mc->type = type;
+ mc->ts = ts;
+ mc->itree.start = start;
+ mc->itree.last = last;
+
+ interval_tree_insert(&mc->itree, &ctx->mem_copy);
+ QSIMPLEQ_INSERT_TAIL(&ti->mem_copy, mc, next);
+}
+
static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
{
TCGTemp *i, *g, *l;
@@ -226,6 +307,18 @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
}
+static TCGTemp *find_mem_copy_for(OptContext *ctx, TCGType type, intptr_t s)
+{
+ MemCopyInfo *mc;
+
+ for (mc = mem_copy_first(ctx, s, s); mc; mc = mem_copy_next(mc, s, s)) {
+ if (mc->itree.start == s && mc->type == type) {
+ return find_better_copy(ctx->tcg, mc->ts);
+ }
+ }
+ return NULL;
+}
+
static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
{
TCGTemp *dst_ts = arg_temp(dst);
@@ -268,6 +361,7 @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
if (src_ts->type == dst_ts->type) {
TempOptInfo *ni = ts_info(si->next_copy);
+ MemCopyInfo *mc;
di->next_copy = si->next_copy;
di->prev_copy = src_ts;
@@ -275,6 +369,11 @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
si->next_copy = dst_ts;
di->is_const = si->is_const;
di->val = si->val;
+
+ QSIMPLEQ_FOREACH(mc, &si->mem_copy, next) {
+ record_mem_copy(ctx, mc->type, dst_ts,
+ mc->itree.start, mc->itree.last);
+ }
}
return true;
}
@@ -693,6 +792,7 @@ static void finish_folding(OptContext *ctx, TCGOp *op)
*/
if (def->flags & TCG_OPF_BB_END) {
memset(&ctx->temps_used, 0, sizeof(ctx->temps_used));
+ remove_mem_copy_all(ctx);
ctx->prev_mb = NULL;
return;
}
@@ -1218,6 +1318,11 @@ static bool fold_call(OptContext *ctx, TCGOp *op)
}
}
+ /* If the function has side effects, reset mem data. */
+ if (!(flags & TCG_CALL_NO_SIDE_EFFECTS)) {
+ remove_mem_copy_all(ctx);
+ }
+
/* Reset temp data for outputs. */
for (i = 0; i < nb_oargs; i++) {
reset_temp(ctx, op->args[i]);
@@ -2075,6 +2180,83 @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
return false;
}
+static bool fold_tcg_ld_memcopy(OptContext *ctx, TCGOp *op)
+{
+ TCGTemp *dst, *src;
+ intptr_t ofs;
+ TCGType type;
+
+ if (op->args[1] != tcgv_ptr_arg(cpu_env)) {
+ return false;
+ }
+
+ type = ctx->type;
+ ofs = op->args[2];
+ dst = arg_temp(op->args[0]);
+ src = find_mem_copy_for(ctx, type, ofs);
+ if (src && src->base_type == type) {
+ return tcg_opt_gen_mov(ctx, op, temp_arg(dst), temp_arg(src));
+ }
+
+ reset_ts(ctx, dst);
+ record_mem_copy(ctx, type, dst, ofs, ofs + tcg_type_size(type) - 1);
+ return true;
+}
+
+static bool fold_tcg_st(OptContext *ctx, TCGOp *op)
+{
+ intptr_t ofs = op->args[2];
+ intptr_t lm1;
+
+ if (op->args[1] != tcgv_ptr_arg(cpu_env)) {
+ remove_mem_copy_all(ctx);
+ return false;
+ }
+
+ switch (op->opc) {
+ CASE_OP_32_64(st8):
+ lm1 = 0;
+ break;
+ CASE_OP_32_64(st16):
+ lm1 = 1;
+ break;
+ case INDEX_op_st32_i64:
+ case INDEX_op_st_i32:
+ lm1 = 3;
+ break;
+ case INDEX_op_st_i64:
+ lm1 = 7;
+ break;
+ case INDEX_op_st_vec:
+ lm1 = tcg_type_size(ctx->type) - 1;
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ remove_mem_copy_in(ctx, ofs, ofs + lm1);
+ return false;
+}
+
+static bool fold_tcg_st_memcopy(OptContext *ctx, TCGOp *op)
+{
+ TCGTemp *src;
+ intptr_t ofs, last;
+ TCGType type;
+
+ if (op->args[1] != tcgv_ptr_arg(cpu_env)) {
+ fold_tcg_st(ctx, op);
+ return false;
+ }
+
+ src = arg_temp(op->args[0]);
+ ofs = op->args[2];
+ type = ctx->type;
+ last = ofs + tcg_type_size(type) - 1;
+ remove_mem_copy_in(ctx, ofs, last);
+ record_mem_copy(ctx, type, src, ofs, last);
+ return false;
+}
+
static bool fold_xor(OptContext *ctx, TCGOp *op)
{
if (fold_const2_commutative(ctx, op) ||
@@ -2098,6 +2280,8 @@ void tcg_optimize(TCGContext *s)
TCGOp *op, *op_next;
OptContext ctx = { .tcg = s };
+ QSIMPLEQ_INIT(&ctx.mem_free);
+
/* Array VALS has an element for each temp.
If this temp holds a constant then its value is kept in VALS' element.
If this temp is a copy of other ones then the other copies are
@@ -2219,6 +2403,21 @@ void tcg_optimize(TCGContext *s)
case INDEX_op_ld32u_i64:
done = fold_tcg_ld(&ctx, op);
break;
+ case INDEX_op_ld_i32:
+ case INDEX_op_ld_i64:
+ case INDEX_op_ld_vec:
+ done = fold_tcg_ld_memcopy(&ctx, op);
+ break;
+ CASE_OP_32_64(st8):
+ CASE_OP_32_64(st16):
+ case INDEX_op_st32_i64:
+ done = fold_tcg_st(&ctx, op);
+ break;
+ case INDEX_op_st_i32:
+ case INDEX_op_st_i64:
+ case INDEX_op_st_vec:
+ done = fold_tcg_st_memcopy(&ctx, op);
+ break;
case INDEX_op_mb:
done = fold_mb(&ctx, op);
break;
--
2.34.1
* [PATCH 4/4] tcg: Eliminate duplicate env store operations
From: Richard Henderson @ 2023-08-31 2:57 UTC
To: qemu-devel
Notice when a constant is stored to the same location twice, and remove
the second store.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
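Illustration, not part of the patch: the zero-extension case called out
in the comment below, with invented offsets. When e.g. a VEX-encoded
128-bit operation writes a 256-bit register, the high half is zeroed;
back-to-back instructions then store the same constant zero to the same
env slot, and the second store is redundant. A self-contained toy model
of the check:

#include <stdio.h>
#include <stdbool.h>

#define NSLOT 8
typedef struct { bool known_const; long val; } Slot;
static Slot slots[NSLOT];

/* Returns true if the store duplicates a constant already known to be
 * in the slot, cf. the ts_is_const()/find_mem_copy_for() test below. */
static bool store_const_redundant(int ofs, long val)
{
    Slot *s = &slots[ofs];
    if (s->known_const && s->val == val) {
        return true;                   /* cf. tcg_op_remove() */
    }
    *s = (Slot){ .known_const = true, .val = val };
    return false;
}

int main(void)
{
    /* two vector ops in a row, each zeroing the same high half */
    printf("%d\n", store_const_redundant(4, 0));   /* 0: first store kept */
    printf("%d\n", store_const_redundant(4, 0));   /* 1: duplicate dropped */
    return 0;
}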
tcg/optimize.c | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/tcg/optimize.c b/tcg/optimize.c
index 51c4c61b9f..6efc08f593 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -2251,6 +2251,19 @@ static bool fold_tcg_st_memcopy(OptContext *ctx, TCGOp *op)
src = arg_temp(op->args[0]);
ofs = op->args[2];
type = ctx->type;
+
+ /*
+ * Eliminate duplicate stores of a constant.
+ * This happens frequently when the target ISA zero-extends.
+ */
+ if (ts_is_const(src)) {
+ TCGTemp *prev = find_mem_copy_for(ctx, type, ofs);
+ if (src == prev) {
+ tcg_op_remove(ctx->tcg, op);
+ return true;
+ }
+ }
+
last = ofs + tcg_type_size(type) - 1;
remove_mem_copy_in(ctx, ofs, last);
record_mem_copy(ctx, type, src, ofs, last);
--
2.34.1
* Re: [PATCH 0/4] tcg: Optimize loads and stores to env
From: Richard Henderson @ 2023-09-28 22:45 UTC
To: qemu-devel
Ping.
r~
On 8/30/23 22:57, Richard Henderson wrote:
> This is aimed at improving gvec generated code, which involves large
> numbers of loads and stores to the env slots of the guest cpu vector
> registers. The final patch helps eliminate redundant zero-extensions
> that can appear with e.g. avx2 and sve.
>
> From the small amount of timing that I have done, there is no change.
> But of course as we all know, x86 is very good with redundant memory.
> And frankly, I haven't found a good test case for measuring.
> What I need is an algorithm with lots of integer vector code that can
> be expanded with gvec. Most of what I've found is either fp (out of
> line) or too simple (small translation blocks with little scope for
> optimization).
>
> That said, it appears to be simple enough, and does eliminate some
> redundant operations, even in places that I didn't expect.
>
>
> r~
>
>
> Richard Henderson (4):
> tcg: Don't free vector results
> tcg/optimize: Pipe OptContext into reset_ts
> tcg: Optimize env memory operations
> tcg: Eliminate duplicate env store operations
>
> tcg/optimize.c | 226 ++++++++++++++++++++++++++++++++++++++++++++--
> tcg/tcg-op-gvec.c | 39 ++------
> 2 files changed, 225 insertions(+), 40 deletions(-)
>
* Re: [PATCH 0/4] tcg: Optimize loads and stores to env
From: Richard Henderson @ 2023-10-13 17:40 UTC
To: qemu-devel
Ping 2.
On 9/28/23 15:45, Richard Henderson wrote:
> Ping.
>
> r~
>
> On 8/30/23 22:57, Richard Henderson wrote:
>> This is aimed at improving gvec generated code, which involves large
>> numbers of loads and stores to the env slots of the guest cpu vector
>> registers. The final patch helps eliminate redundant zero-extensions
>> that can appear with e.g. avx2 and sve.
>>
>> From the small amount of timing that I have done, there is no change.
>> But of course as we all know, x86 is very good with redundant memory.
>> And frankly, I haven't found a good test case for measuring.
>> What I need is an algorithm with lots of integer vector code that can
>> be expanded with gvec. Most of what I've found is either fp (out of
>> line) or too simple (small translation blocks with little scope for
>> optimization).
>>
>> That said, it appears to be simple enough, and does eliminate some
>> redundant operations, even in places that I didn't expect.
>>
>>
>> r~
>>
>>
>> Richard Henderson (4):
>> tcg: Don't free vector results
>> tcg/optimize: Pipe OptContext into reset_ts
>> tcg: Optimize env memory operations
>> tcg: Eliminate duplicate env store operations
>>
>> tcg/optimize.c | 226 ++++++++++++++++++++++++++++++++++++++++++++--
>> tcg/tcg-op-gvec.c | 39 ++------
>> 2 files changed, 225 insertions(+), 40 deletions(-)
>>
>
* Re: [PATCH 0/4] tcg: Optimize loads and stores to env
From: gaosong @ 2023-10-16 3:01 UTC
To: Richard Henderson, qemu-devel
On 2023/8/31 10:57 AM, Richard Henderson wrote:
> This is aimed at improving gvec generated code, which involves large
> numbers of loads and stores to the env slots of the guest cpu vector
> registers. The final patch helps eliminate redundant zero-extensions
> that can appear with e.g. avx2 and sve.
>
> From the small amount of timing that I have done, there is no change.
> But of course as we all know, x86 is very good with redundant memory.
> And frankly, I haven't found a good test case for measuring.
> What I need is an algorithm with lots of integer vector code that can
> be expanded with gvec. Most of what I've found is either fp (out of
> line) or too simple (small translation blocks with little scope for
> optimization).
>
> That said, it appears to be simple enough, and does eliminate some
> redundant operations, even in places that I didn't expect.
>
>
> r~
>
>
> Richard Henderson (4):
> tcg: Don't free vector results
> tcg/optimize: Pipe OptContext into reset_ts
> tcg: Optimize env memory operations
> tcg: Eliminate duplicate env store operations
>
> tcg/optimize.c | 226 ++++++++++++++++++++++++++++++++++++++++++++--
> tcg/tcg-op-gvec.c | 39 ++------
> 2 files changed, 225 insertions(+), 40 deletions(-)
>
Patch 1 and Patch 3 need sed -i "s/cpu_env/tcg_env/g".
Reviewed-by: Song Gao <gaosong@loongson.cn>
Thanks.
Song Gao