From: Richard Henderson <rth@twiddle.net>
To: qemu-devel@nongnu.org
Cc: Paolo Bonzini <pbonzini@redhat.com>,
Aurelien Jarno <aurelien@aurel32.net>
Subject: [Qemu-devel] [PATCH 3/3] optimize: optimize using nonzero bits
Date: Fri, 11 Jan 2013 15:42:53 -0800
Message-ID: <1357947773-31051-4-git-send-email-rth@twiddle.net>
In-Reply-To: <1357947773-31051-1-git-send-email-rth@twiddle.net>
From: Paolo Bonzini <pbonzini@redhat.com>
This adds two optimizations using the non-zero bit mask. First, in some
cases involving shifts or ANDs the value can become zero, and the
operation can thus be optimized to a move of zero. Second, a useless
zero-extension, or an AND with a constant, can be detected when it would
only clear bits that are already zero.
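(Illustrative sketch, not part of the patch: with a per-temp mask of the
bits that may still be nonzero, both cases reduce to the decision below.
The helper and enum names are invented for this example; "mask" is the set
of bits that can still be set in the result, "affected" the set of input
bits the operation would actually clear.)

#include <stdint.h>

enum action { EMIT_MOVI_ZERO, EMIT_MOV, KEEP_OP };

/* Decide what "dst = src & c" (a zero-extension is just an AND with a
   fixed constant) should be lowered to, given the may-be-nonzero mask
   of the source operand.  Sketch only, not QEMU's internal API.  */
enum action simplify_and(uint64_t src_mask, uint64_t c)
{
    uint64_t mask     = src_mask & c;   /* bits possibly set in the result */
    uint64_t affected = src_mask & ~c;  /* bits the AND would really clear */

    if (mask == 0) {
        return EMIT_MOVI_ZERO;          /* result is provably zero */
    }
    if (affected == 0) {
        return EMIT_MOV;                /* the AND clears nothing: a copy */
    }
    return KEEP_OP;                     /* keep the original operation */
}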
The main advantage of this optimization is that it turns zero-extensions
into moves, thus enabling much better copy propagation (around 1% code
reduction). For example, here is a "test $0xff0000,%ecx + je" before
optimization:
mov_i64 tmp0,rcx
movi_i64 tmp1,$0xff0000
discard cc_src
and_i64 cc_dst,tmp0,tmp1
movi_i32 cc_op,$0x1c
ext32u_i64 tmp0,cc_dst
movi_i64 tmp12,$0x0
brcond_i64 tmp0,tmp12,eq,$0x0
and after (without the patch on the left, with the patch on the right):
movi_i64 tmp1,$0xff0000              movi_i64 tmp1,$0xff0000
discard cc_src                       discard cc_src
and_i64 cc_dst,rcx,tmp1              and_i64 cc_dst,rcx,tmp1
movi_i32 cc_op,$0x1c                 movi_i32 cc_op,$0x1c
ext32u_i64 tmp0,cc_dst
movi_i64 tmp12,$0x0                  movi_i64 tmp12,$0x0
brcond_i64 tmp0,tmp12,eq,$0x0        brcond_i64 cc_dst,tmp12,eq,$0x0
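To spell out the arithmetic in this example: after the and_i64 with
$0xff0000, the only bits that can be nonzero in cc_dst are 0xff0000.
The ext32u_i64 is effectively an AND with 0xffffffff, so the bits it
would clear are 0xff0000 & ~0xffffffff = 0; with nothing affected, the
extension degenerates into a plain copy, which copy propagation then
folds into the brcond.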
Other similar cases: "test %eax, %eax + jne", where %eax is already 32-bit,
i.e. its upper 32 bits are known to be zero (after optimization; without the
patch on the left, with the patch on the right):
discard cc_src                       discard cc_src
mov_i64 cc_dst,rax                   mov_i64 cc_dst,rax
movi_i32 cc_op,$0x1c                 movi_i32 cc_op,$0x1c
ext32u_i64 tmp0,cc_dst
movi_i64 tmp12,$0x0                  movi_i64 tmp12,$0x0
brcond_i64 tmp0,tmp12,ne,$0x0        brcond_i64 rax,tmp12,ne,$0x0
"test $0x1, %dl + je":
movi_i64 tmp1,$0x1                   movi_i64 tmp1,$0x1
discard cc_src                       discard cc_src
and_i64 cc_dst,rdx,tmp1              and_i64 cc_dst,rdx,tmp1
movi_i32 cc_op,$0x1a                 movi_i32 cc_op,$0x1a
ext8u_i64 tmp0,cc_dst
movi_i64 tmp12,$0x0                  movi_i64 tmp12,$0x0
brcond_i64 tmp0,tmp12,eq,$0x0        brcond_i64 cc_dst,tmp12,eq,$0x0
In some cases TCG even outsmarts GCC. :) Here the input code has
"and $0x2,%eax + movslq %eax,%rbx + test %rbx, %rbx" and the optimizer,
thanks to copy propagation, does the following:
movi_i64 tmp12,$0x2                  movi_i64 tmp12,$0x2
and_i64 rax,rax,tmp12                and_i64 rax,rax,tmp12
mov_i64 cc_dst,rax                   mov_i64 cc_dst,rax
ext32s_i64 tmp0,rax                  -> nop
mov_i64 rbx,tmp0                     -> mov_i64 rbx,cc_dst
and_i64 cc_dst,rbx,rbx               -> nop
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
tcg/optimize.c | 30 ++++++++++++++++++++++++++++--
1 file changed, 28 insertions(+), 2 deletions(-)
diff --git a/tcg/optimize.c b/tcg/optimize.c
index 090efbc..973d2d6 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -484,7 +484,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
                                     TCGArg *args, TCGOpDef *tcg_op_defs)
 {
     int i, nb_ops, op_index, nb_temps, nb_globals, nb_call_args;
-    tcg_target_ulong mask;
+    tcg_target_ulong mask, affected;
     TCGOpcode op;
     const TCGOpDef *def;
     TCGArg *gen_args;
@@ -629,6 +629,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
 
         /* Simplify using known-zero bits */
         mask = -1;
+        affected = -1;
         switch (op) {
         CASE_OP_32_64(ext8s):
             if ((temps[args[1]].mask & 0x80) != 0) {
@@ -656,7 +657,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
             mask = temps[args[2]].mask;
             if (temps[args[2]].state == TCG_TEMP_CONST) {
         and_const:
-                ;
+                affected = temps[args[1]].mask & ~mask;
             }
             mask = temps[args[1]].mask & mask;
             break;
@@ -708,6 +709,31 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
             break;
         }
 
+        if (mask == 0) {
+            assert(def->nb_oargs == 1);
+            s->gen_opc_buf[op_index] = op_to_movi(op);
+            tcg_opt_gen_movi(gen_args, args[0], 0);
+            args += def->nb_oargs + def->nb_iargs + def->nb_cargs;
+            gen_args += 2;
+            continue;
+        }
+        if (affected == 0) {
+            assert(def->nb_oargs == 1);
+            if (temps_are_copies(args[0], args[1])) {
+                s->gen_opc_buf[op_index] = INDEX_op_nop;
+            } else if (temps[args[1]].state != TCG_TEMP_CONST) {
+                s->gen_opc_buf[op_index] = op_to_mov(op);
+                tcg_opt_gen_mov(s, gen_args, args[0], args[1]);
+                gen_args += 2;
+            } else {
+                s->gen_opc_buf[op_index] = op_to_movi(op);
+                tcg_opt_gen_movi(gen_args, args[0], temps[args[1]].val);
+                gen_args += 2;
+            }
+            args += def->nb_iargs + 1;
+            continue;
+        }
+
         /* Simplify expression for "op r, a, 0 => movi r, 0" cases */
         switch (op) {
         CASE_OP_32_64(and):
--
1.7.11.7
Thread overview:
2013-01-11 23:42 [Qemu-devel] [PATCH 0/3] tcg-optimize with known-zero bits Richard Henderson
2013-01-11 23:42 ` [Qemu-devel] [PATCH 1/3] optimize: only write to state when clearing optimizer data Richard Henderson
2013-01-11 23:42 ` [Qemu-devel] [PATCH 2/3] optimize: track nonzero bits of registers Richard Henderson
2013-01-11 23:42 ` Richard Henderson [this message]
2013-01-12 7:55 ` [Qemu-devel] [PATCH 0/3] tcg-optimize with known-zero bits Paolo Bonzini
2013-01-19 13:58 ` Blue Swirl