* [Qemu-devel] [PATCH for-2.11 1/6] tcg: Add tcg_reg_alloc_new
2017-08-04 6:23 [Qemu-devel] [PATCH for-2.11 0/6] tcg/i386 haswell improvements Richard Henderson
@ 2017-08-04 6:23 ` Richard Henderson
2017-08-04 6:23 ` [Qemu-devel] [PATCH for-2.11 2/6] disas/i386: Fix disassembly of two-byte vex prefixes Richard Henderson
` (4 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Richard Henderson @ 2017-08-04 6:23 UTC (permalink / raw)
To: qemu-devel
This allows the backend to allocate an otherwise unused register
on demand, so that it does not have to reserve a full-time
temporary register.
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
tcg/tcg.h | 1 +
tcg/tcg.c | 58 +++++++++++++++++++++++++++++++++++++++++++++-------------
2 files changed, 46 insertions(+), 13 deletions(-)
diff --git a/tcg/tcg.h b/tcg/tcg.h
index ac94133870..dd97095af5 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -655,6 +655,7 @@ struct TCGContext {
uintptr_t *tb_jmp_insn_offset; /* tb->jmp_target_arg if direct_jump */
uintptr_t *tb_jmp_target_addr; /* tb->jmp_target_arg if !direct_jump */
+ TCGRegSet regs_in_use;
TCGRegSet reserved_regs;
intptr_t current_frame_offset;
intptr_t frame_start;
diff --git a/tcg/tcg.c b/tcg/tcg.c
index fd8a3dfe93..787c8ba0f7 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -112,6 +112,8 @@ static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
static void tcg_out_call(TCGContext *s, tcg_insn_unit *target);
static int tcg_target_const_match(tcg_target_long val, TCGType type,
const TCGArgConstraint *arg_ct);
+static TCGReg tcg_reg_alloc_new(TCGContext *s, TCGType t)
+ __attribute__((unused));
#ifdef TCG_TARGET_NEED_LDST_LABELS
static bool tcg_out_ldst_finalize(TCGContext *s);
#endif
@@ -1947,16 +1949,19 @@ static void temp_sync(TCGContext *s, TCGTemp *ts,
/* If we're going to free the temp immediately, then we won't
require it later in a register, so attempt to store the
constant to memory directly. */
- if (free_or_dead
- && tcg_out_sti(s, ts->type, ts->val,
- ts->mem_base->reg, ts->mem_offset)) {
- break;
+ if (free_or_dead) {
+ s->regs_in_use = -1;
+ if (tcg_out_sti(s, ts->type, ts->val,
+ ts->mem_base->reg, ts->mem_offset)) {
+ break;
+ }
}
temp_load(s, ts, tcg_target_available_regs[ts->type],
allocated_regs);
/* fallthrough */
case TEMP_VAL_REG:
+ s->regs_in_use = -1;
tcg_out_st(s, ts->type, ts->reg,
ts->mem_base->reg, ts->mem_offset);
break;
@@ -2015,6 +2020,14 @@ static TCGReg tcg_reg_alloc(TCGContext *s, TCGRegSet desired_regs,
tcg_abort();
}
+static TCGReg tcg_reg_alloc_new(TCGContext *s, TCGType t)
+{
+ TCGReg r;
+ r = tcg_reg_alloc(s, tcg_target_available_regs[t], s->regs_in_use, 0);
+ tcg_regset_set_reg(s->regs_in_use, r);
+ return r;
+}
+
/* Make sure the temporary is in a register. If needed, allocate the register
from DESIRED while avoiding ALLOCATED. */
static void temp_load(TCGContext *s, TCGTemp *ts, TCGRegSet desired_regs,
@@ -2027,11 +2040,13 @@ static void temp_load(TCGContext *s, TCGTemp *ts, TCGRegSet desired_regs,
return;
case TEMP_VAL_CONST:
reg = tcg_reg_alloc(s, desired_regs, allocated_regs, ts->indirect_base);
+ s->regs_in_use = allocated_regs;
tcg_out_movi(s, ts->type, reg, ts->val);
ts->mem_coherent = 0;
break;
case TEMP_VAL_MEM:
reg = tcg_reg_alloc(s, desired_regs, allocated_regs, ts->indirect_base);
+ s->regs_in_use = -1;
tcg_out_ld(s, ts->type, reg, ts->mem_base->reg, ts->mem_offset);
ts->mem_coherent = 1;
break;
@@ -2105,6 +2120,7 @@ static void tcg_reg_alloc_do_movi(TCGContext *s, TCGTemp *ots,
{
if (ots->fixed_reg) {
/* For fixed registers, we do not do any constant propagation. */
+ s->regs_in_use = s->reserved_regs;
tcg_out_movi(s, ots->type, ots->reg, val);
return;
}
@@ -2129,17 +2145,16 @@ static void tcg_reg_alloc_movi(TCGContext *s, const TCGArg *args,
TCGTemp *ots = &s->temps[args[0]];
tcg_target_ulong val = args[1];
+ s->regs_in_use = s->reserved_regs;
tcg_reg_alloc_do_movi(s, ots, val, arg_life);
}
static void tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def,
const TCGArg *args, TCGLifeData arg_life)
{
- TCGRegSet allocated_regs;
TCGTemp *ts, *ots;
TCGType otype, itype;
- tcg_regset_set(allocated_regs, s->reserved_regs);
ots = &s->temps[args[0]];
ts = &s->temps[args[1]];
@@ -2153,6 +2168,7 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def,
if (IS_DEAD_ARG(1)) {
temp_dead(s, ts);
}
+ s->regs_in_use = s->reserved_regs;
tcg_reg_alloc_do_movi(s, ots, val, arg_life);
return;
}
@@ -2162,7 +2178,7 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def,
the SOURCE value into its own register first, that way we
don't have to reload SOURCE the next time it is used. */
if (ts->val_type == TEMP_VAL_MEM) {
- temp_load(s, ts, tcg_target_available_regs[itype], allocated_regs);
+ temp_load(s, ts, tcg_target_available_regs[itype], s->reserved_regs);
}
tcg_debug_assert(ts->val_type == TEMP_VAL_REG);
@@ -2173,12 +2189,14 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def,
if (!ots->mem_allocated) {
temp_allocate_frame(s, args[0]);
}
+ s->regs_in_use = -1;
tcg_out_st(s, otype, ts->reg, ots->mem_base->reg, ots->mem_offset);
if (IS_DEAD_ARG(1)) {
temp_dead(s, ts);
}
temp_dead(s, ots);
} else {
+ TCGRegSet allocated_regs;
if (IS_DEAD_ARG(1) && !ts->fixed_reg && !ots->fixed_reg) {
/* the mov can be suppressed */
if (ots->val_type == TEMP_VAL_REG) {
@@ -2188,19 +2206,21 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def,
temp_dead(s, ts);
} else {
if (ots->val_type != TEMP_VAL_REG) {
- /* When allocating a new register, make sure to not spill the
- input one. */
+ /* When allocating a new register, make sure to not
+ spill the input one. */
+ allocated_regs = s->reserved_regs;
tcg_regset_set_reg(allocated_regs, ts->reg);
ots->reg = tcg_reg_alloc(s, tcg_target_available_regs[otype],
allocated_regs, ots->indirect_base);
}
+ s->regs_in_use = -1;
tcg_out_mov(s, otype, ots->reg, ts->reg);
}
ots->val_type = TEMP_VAL_REG;
ots->mem_coherent = 0;
s->reg_to_temp[ots->reg] = ots;
if (NEED_SYNC_ARG(0)) {
- temp_sync(s, ots, allocated_regs, 0);
+ temp_sync(s, ots, s->reserved_regs, 0);
}
}
}
@@ -2281,6 +2301,7 @@ static void tcg_reg_alloc_op(TCGContext *s,
and move the temporary register into it */
reg = tcg_reg_alloc(s, arg_ct->u.regs, i_allocated_regs,
ts->indirect_base);
+ s->regs_in_use = -1;
tcg_out_mov(s, ts->type, reg, ts->reg);
}
new_args[i] = reg;
@@ -2355,6 +2376,7 @@ static void tcg_reg_alloc_op(TCGContext *s,
}
/* emit instruction */
+ s->regs_in_use = i_allocated_regs | o_allocated_regs;
tcg_out_op(s, opc, new_args, const_args);
/* move the outputs in the correct register if needed */
@@ -2362,6 +2384,7 @@ static void tcg_reg_alloc_op(TCGContext *s,
ts = &s->temps[args[i]];
reg = new_args[i];
if (ts->fixed_reg && ts->reg != reg) {
+ s->regs_in_use = -1;
tcg_out_mov(s, ts->type, ts->reg, reg);
}
if (NEED_SYNC_ARG(i)) {
@@ -2420,6 +2443,7 @@ static void tcg_reg_alloc_call(TCGContext *s, int nb_oargs, int nb_iargs,
ts = &s->temps[arg];
temp_load(s, ts, tcg_target_available_regs[ts->type],
s->reserved_regs);
+ s->regs_in_use = -1;
tcg_out_st(s, ts->type, ts->reg, TCG_REG_CALL_STACK, stack_offset);
}
#ifndef TCG_TARGET_STACK_GROWSUP
@@ -2428,7 +2452,7 @@ static void tcg_reg_alloc_call(TCGContext *s, int nb_oargs, int nb_iargs,
}
/* assign input registers */
- tcg_regset_set(allocated_regs, s->reserved_regs);
+ allocated_regs = s->reserved_regs;
for(i = 0; i < nb_regs; i++) {
arg = args[nb_oargs + i];
if (arg != TCG_CALL_DUMMY_ARG) {
@@ -2438,6 +2462,7 @@ static void tcg_reg_alloc_call(TCGContext *s, int nb_oargs, int nb_iargs,
if (ts->val_type == TEMP_VAL_REG) {
if (ts->reg != reg) {
+ s->regs_in_use = -1;
tcg_out_mov(s, ts->type, reg, ts->reg);
}
} else {
@@ -2458,7 +2483,7 @@ static void tcg_reg_alloc_call(TCGContext *s, int nb_oargs, int nb_iargs,
temp_dead(s, &s->temps[args[i]]);
}
}
-
+
/* clobber call registers */
for (i = 0; i < TCG_TARGET_NB_REGS; i++) {
if (tcg_regset_test_reg(tcg_target_call_clobber_regs, i)) {
@@ -2476,10 +2501,16 @@ static void tcg_reg_alloc_call(TCGContext *s, int nb_oargs, int nb_iargs,
save_globals(s, allocated_regs);
}
+ s->regs_in_use = allocated_regs;
tcg_out_call(s, func_addr);
/* assign output registers and emit moves if needed */
- for(i = 0; i < nb_oargs; i++) {
+ allocated_regs = s->reserved_regs;
+ for (i = 0; i < nb_oargs; i++) {
+ reg = tcg_target_call_oarg_regs[i];
+ tcg_regset_set_reg(allocated_regs, reg);
+ }
+ for (i = 0; i < nb_oargs; i++) {
arg = args[i];
ts = &s->temps[arg];
reg = tcg_target_call_oarg_regs[i];
@@ -2487,6 +2518,7 @@ static void tcg_reg_alloc_call(TCGContext *s, int nb_oargs, int nb_iargs,
if (ts->fixed_reg) {
if (ts->reg != reg) {
+ s->regs_in_use = -1;
tcg_out_mov(s, ts->type, ts->reg, reg);
}
} else {
--
2.13.3
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [Qemu-devel] [PATCH for-2.11 2/6] disas/i386: Fix disassembly of two-byte vex prefixes
2017-08-04 6:23 [Qemu-devel] [PATCH for-2.11 0/6] tcg/i386 haswell improvements Richard Henderson
2017-08-04 6:23 ` [Qemu-devel] [PATCH for-2.11 1/6] tcg: Add tcg_reg_alloc_new Richard Henderson
@ 2017-08-04 6:23 ` Richard Henderson
2017-08-04 6:23 ` [Qemu-devel] [PATCH for-2.11 3/6] disas/i386: Add disassembly of vex.0f38.f5 Richard Henderson
` (3 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Richard Henderson @ 2017-08-04 6:23 UTC (permalink / raw)
To: qemu-devel
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
disas/i386.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/disas/i386.c b/disas/i386.c
index f1e376ca4a..7a238b203b 100644
--- a/disas/i386.c
+++ b/disas/i386.c
@@ -3559,6 +3559,7 @@ ckvexprefix (void)
} else {
/* Two byte VEX prefix. */
newrex |= (vex2 & 0x80 ? 0 : REX_R);
+ newpfx |= PREFIX_VEX_0F;
codep += 2;
}
--
2.13.3
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [Qemu-devel] [PATCH for-2.11 3/6] disas/i386: Add disassembly of vex.0f38.f5
2017-08-04 6:23 [Qemu-devel] [PATCH for-2.11 0/6] tcg/i386 haswell improvements Richard Henderson
2017-08-04 6:23 ` [Qemu-devel] [PATCH for-2.11 1/6] tcg: Add tcg_reg_alloc_new Richard Henderson
2017-08-04 6:23 ` [Qemu-devel] [PATCH for-2.11 2/6] disas/i386: Fix disassembly of two-byte vex prefixes Richard Henderson
@ 2017-08-04 6:23 ` Richard Henderson
2017-08-04 6:23 ` [Qemu-devel] [PATCH for-2.11 4/6] disas/i386: Add disassembly of rorx Richard Henderson
` (2 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Richard Henderson @ 2017-08-04 6:23 UTC (permalink / raw)
To: qemu-devel
This opcode covers pext, pdep and bzhi, which are distinguished by prefix.
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
disas/i386.c | 15 ++++++++++++---
1 file changed, 12 insertions(+), 3 deletions(-)
diff --git a/disas/i386.c b/disas/i386.c
index 7a238b203b..7eaa378a10 100644
--- a/disas/i386.c
+++ b/disas/i386.c
@@ -683,6 +683,7 @@ fetch_data(struct disassemble_info *info, bfd_byte *addr)
#define PREGRP105 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 105 } }
#define PREGRP106 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 106 } }
#define PREGRP107 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 107 } }
+#define PREGRP108 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 108 } }
#define X86_64_0 NULL, { { NULL, X86_64_SPECIAL }, { NULL, 0 } }
#define X86_64_1 NULL, { { NULL, X86_64_SPECIAL }, { NULL, 1 } }
@@ -1484,7 +1485,7 @@ static const unsigned char threebyte_0x38_uses_REPNZ_prefix[256] = {
/* c0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* cf */
/* d0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* df */
/* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ef */
- /* f0 */ 1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0, /* ff */
+ /* f0 */ 1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0, /* ff */
/* ------------------------------- */
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
};
@@ -1508,7 +1509,7 @@ static const unsigned char threebyte_0x38_uses_REPZ_prefix[256] = {
/* c0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* cf */
/* d0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* df */
/* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ef */
- /* f0 */ 0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, /* ff */
+ /* f0 */ 0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0, /* ff */
/* ------------------------------- */
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
};
@@ -2808,6 +2809,14 @@ static const struct dis386 prefix_user_table[][4] = {
{ "bsfS", { Gv, Ev } },
{ "(bad)", { XX } },
},
+
+ /* PREGRP108 */
+ {
+ { "bzhi", { Gv, Ev, Bv } },
+ { "pext", { Gv, Bv, Ev } },
+ { "(bad)", { XX } },
+ { "pdep", { Gv, Bv, Ev } },
+ },
};
static const struct dis386 x86_64_table[][2] = {
@@ -3108,7 +3117,7 @@ static const struct dis386 three_byte_table[][256] = {
{ PREGRP105 },
{ "(bad)", { XX } },
{ "(bad)", { XX } },
- { "(bad)", { XX } },
+ { PREGRP108 },
{ "(bad)", { XX } },
{ PREGRP106 },
/* f8 */
--
2.13.3
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [Qemu-devel] [PATCH for-2.11 4/6] disas/i386: Add disassembly of rorx
2017-08-04 6:23 [Qemu-devel] [PATCH for-2.11 0/6] tcg/i386 haswell improvements Richard Henderson
` (2 preceding siblings ...)
2017-08-04 6:23 ` [Qemu-devel] [PATCH for-2.11 3/6] disas/i386: Add disassembly of vex.0f38.f5 Richard Henderson
@ 2017-08-04 6:23 ` Richard Henderson
2017-08-04 6:23 ` [Qemu-devel] [PATCH for-2.11 5/6] tcg/i386: Use pext for extract Richard Henderson
2017-08-04 6:23 ` [Qemu-devel] [PATCH for-2.11 6/6] tcg/i386: Use pdep for deposit Richard Henderson
5 siblings, 0 replies; 7+ messages in thread
From: Richard Henderson @ 2017-08-04 6:23 UTC (permalink / raw)
To: qemu-devel
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
disas/i386.c | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)
diff --git a/disas/i386.c b/disas/i386.c
index 7eaa378a10..a557e678ec 100644
--- a/disas/i386.c
+++ b/disas/i386.c
@@ -684,6 +684,7 @@ fetch_data(struct disassemble_info *info, bfd_byte *addr)
#define PREGRP106 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 106 } }
#define PREGRP107 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 107 } }
#define PREGRP108 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 108 } }
+#define PREGRP109 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 109 } }
#define X86_64_0 NULL, { { NULL, X86_64_SPECIAL }, { NULL, 0 } }
#define X86_64_1 NULL, { { NULL, X86_64_SPECIAL }, { NULL, 1 } }
@@ -1557,7 +1558,7 @@ static const unsigned char threebyte_0x3a_uses_REPNZ_prefix[256] = {
/* c0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* cf */
/* d0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* df */
/* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ef */
- /* f0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ff */
+ /* f0 */ 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ff */
/* ------------------------------- */
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
};
@@ -2817,6 +2818,14 @@ static const struct dis386 prefix_user_table[][4] = {
{ "(bad)", { XX } },
{ "pdep", { Gv, Bv, Ev } },
},
+
+ /* PREGRP109 */
+ {
+ { "(bad)", { XX } },
+ { "(bad)", { XX } },
+ { "(bad)", { XX } },
+ { "rorx", { Gv, Ev, Ib } },
+ },
};
static const struct dis386 x86_64_table[][2] = {
@@ -3403,7 +3412,7 @@ static const struct dis386 three_byte_table[][256] = {
{ "(bad)", { XX } },
{ "(bad)", { XX } },
/* f0 */
- { "(bad)", { XX } },
+ { PREGRP109 },
{ "(bad)", { XX } },
{ "(bad)", { XX } },
{ "(bad)", { XX } },
--
2.13.3
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [Qemu-devel] [PATCH for-2.11 5/6] tcg/i386: Use pext for extract
2017-08-04 6:23 [Qemu-devel] [PATCH for-2.11 0/6] tcg/i386 haswell improvements Richard Henderson
` (3 preceding siblings ...)
2017-08-04 6:23 ` [Qemu-devel] [PATCH for-2.11 4/6] disas/i386: Add disassembly of rorx Richard Henderson
@ 2017-08-04 6:23 ` Richard Henderson
2017-08-04 6:23 ` [Qemu-devel] [PATCH for-2.11 6/6] tcg/i386: Use pdep for deposit Richard Henderson
5 siblings, 0 replies; 7+ messages in thread
From: Richard Henderson @ 2017-08-04 6:23 UTC (permalink / raw)
To: qemu-devel
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
tcg/i386/tcg-target.h | 6 +-
tcg/i386/tcg-target.inc.c | 147 +++++++++++++++++++++++++++++++++-------------
2 files changed, 109 insertions(+), 44 deletions(-)
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index b89dababf4..85b0ccd98c 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -76,6 +76,7 @@ typedef enum {
#endif
extern bool have_bmi1;
+extern bool have_bmi2;
extern bool have_popcnt;
/* optional instructions */
@@ -153,9 +154,10 @@ extern bool have_popcnt;
/* Check for the possibility of high-byte extraction and, for 64-bit,
zero-extending 32-bit right-shift. */
-#define TCG_TARGET_extract_i32_valid(ofs, len) ((ofs) == 8 && (len) == 8)
+#define TCG_TARGET_extract_i32_valid(ofs, len) \
+ (have_bmi2 || ((ofs) == 8 && (len) == 8))
#define TCG_TARGET_extract_i64_valid(ofs, len) \
- (((ofs) == 8 && (len) == 8) || ((ofs) + (len)) == 32)
+ (have_bmi2 || ((ofs) == 8 && (len) == 8) || ((ofs) + (len)) == 32)
#if TCG_TARGET_REG_BITS == 64
# define TCG_AREG0 TCG_REG_R14
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 5231056fd3..69587c82de 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -124,11 +124,11 @@ static bool have_cmov;
/* We need these symbols in tcg-target.h, and we can't properly conditionalize
it there. Therefore we always define the variable. */
bool have_bmi1;
+bool have_bmi2;
bool have_popcnt;
#ifdef CONFIG_CPUID_H
static bool have_movbe;
-static bool have_bmi2;
static bool have_lzcnt;
#else
# define have_movbe 0
@@ -275,13 +275,14 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
#define P_EXT 0x100 /* 0x0f opcode prefix */
#define P_EXT38 0x200 /* 0x0f 0x38 opcode prefix */
-#define P_DATA16 0x400 /* 0x66 opcode prefix */
+#define P_EXT3A 0x400 /* 0x0f 0x3a opcode prefix */
+#define P_DATA16 0x800 /* 0x66 opcode prefix */
#if TCG_TARGET_REG_BITS == 64
-# define P_ADDR32 0x800 /* 0x67 opcode prefix */
-# define P_REXW 0x1000 /* Set REX.W = 1 */
-# define P_REXB_R 0x2000 /* REG field as byte register */
-# define P_REXB_RM 0x4000 /* R/M field as byte register */
-# define P_GS 0x8000 /* gs segment override */
+# define P_ADDR32 0x1000 /* 0x67 opcode prefix */
+# define P_REXW 0x2000 /* Set REX.W = 1 */
+# define P_REXB_R 0x4000 /* REG field as byte register */
+# define P_REXB_RM 0x8000 /* R/M field as byte register */
+# define P_GS 0x10000 /* gs segment override */
#else
# define P_ADDR32 0
# define P_REXW 0
@@ -289,14 +290,15 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
# define P_REXB_RM 0
# define P_GS 0
#endif
-#define P_SIMDF3 0x10000 /* 0xf3 opcode prefix */
-#define P_SIMDF2 0x20000 /* 0xf2 opcode prefix */
+#define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */
+#define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */
#define OPC_ARITH_EvIz (0x81)
#define OPC_ARITH_EvIb (0x83)
#define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN (0xf2 | P_EXT38)
#define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3))
+#define OPC_BEXTR (0xf7 | P_EXT38)
#define OPC_BSF (0xbc | P_EXT)
#define OPC_BSR (0xbd | P_EXT)
#define OPC_BSWAP (0xc8 | P_EXT)
@@ -327,12 +329,14 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
#define OPC_MOVSLQ (0x63 | P_REXW)
#define OPC_MOVZBL (0xb6 | P_EXT)
#define OPC_MOVZWL (0xb7 | P_EXT)
+#define OPC_PEXT (0xf5 | P_EXT38 | P_SIMDF3)
#define OPC_POP_r32 (0x58)
#define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32 (0x50)
#define OPC_PUSH_Iv (0x68)
#define OPC_PUSH_Ib (0x6a)
#define OPC_RET (0xc3)
+#define OPC_RORX (0xf0 | P_EXT3A | P_SIMDF2)
#define OPC_SETCC (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1 (0xd1)
#define OPC_SHIFT_Ib (0xc1)
@@ -455,6 +459,8 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
tcg_out8(s, 0x0f);
if (opc & P_EXT38) {
tcg_out8(s, 0x38);
+ } else if (opc & P_EXT3A) {
+ tcg_out8(s, 0x3a);
}
}
@@ -475,6 +481,8 @@ static void tcg_out_opc(TCGContext *s, int opc)
tcg_out8(s, 0x0f);
if (opc & P_EXT38) {
tcg_out8(s, 0x38);
+ } else if (opc & P_EXT3A) {
+ tcg_out8(s, 0x3a);
}
}
tcg_out8(s, opc);
@@ -491,34 +499,29 @@ static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}
-static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
+static void tcg_out_vex_pfx_opc(TCGContext *s, int opc, int r, int v, int rm)
{
int tmp;
- if ((opc & (P_REXW | P_EXT | P_EXT38)) || (rm & 8)) {
- /* Three byte VEX prefix. */
- tcg_out8(s, 0xc4);
-
- /* VEX.m-mmmm */
- if (opc & P_EXT38) {
- tmp = 2;
- } else if (opc & P_EXT) {
- tmp = 1;
- } else {
- tcg_abort();
- }
- tmp |= 0x40; /* VEX.X */
- tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */
- tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */
- tcg_out8(s, tmp);
+ /* Three byte VEX prefix. */
+ tcg_out8(s, 0xc4);
- tmp = (opc & P_REXW ? 0x80 : 0); /* VEX.W */
+ /* VEX.m-mmmm */
+ if (opc & P_EXT3A) {
+ tmp = 3;
+ } else if (opc & P_EXT38) {
+ tmp = 2;
+ } else if (opc & P_EXT) {
+ tmp = 1;
} else {
- /* Two byte VEX prefix. */
- tcg_out8(s, 0xc5);
-
- tmp = (r & 8 ? 0 : 0x80); /* VEX.R */
+ tcg_abort();
}
+ tmp |= 0x40; /* VEX.X */
+ tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */
+ tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */
+ tcg_out8(s, tmp);
+
+ tmp = (opc & P_REXW ? 0x80 : 0); /* VEX.W */
/* VEX.pp */
if (opc & P_DATA16) {
tmp |= 1; /* 0x66 */
@@ -530,9 +533,43 @@ static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
tmp |= (~v & 15) << 3; /* VEX.vvvv */
tcg_out8(s, tmp);
tcg_out8(s, opc);
+}
+
+static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
+{
+ tcg_out_vex_pfx_opc(s, opc, r, v, rm);
tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}
+static void tcg_out_sfx_pool_imm(TCGContext *s, int r, tcg_target_ulong data)
+{
+ /* modrm for 64-bit rip-relative, or 32-bit absolute addressing. */
+ tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
+
+ if (TCG_TARGET_REG_BITS == 64) {
+ new_pool_label(s, data, R_386_PC32, s->code_ptr, -4);
+ } else {
+ new_pool_label(s, data, R_386_32, s->code_ptr, 0);
+ }
+ tcg_out32(s, 0);
+}
+
+#if 0
+static void tcg_out_opc_pool_imm(TCGContext *s, int opc, int r,
+ tcg_target_ulong data)
+{
+ tcg_out_opc(s, opc, r, 0, 0);
+ tcg_out_sfx_pool_imm(s, r, data);
+}
+#endif
+
+static void tcg_out_vex_pool_imm(TCGContext *s, int opc, int r, int v,
+ tcg_target_ulong data)
+{
+ tcg_out_vex_pfx_opc(s, opc, r, v, 0);
+ tcg_out_sfx_pool_imm(s, r, data);
+}
+
/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
We handle either RM and INDEX missing with a negative value. In 64-bit
mode for absolute addresses, ~RM is the size of the immediate operand
@@ -877,6 +914,13 @@ static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
}
}
+static void tcg_out_rorx(TCGContext *s, int rexw,
+ TCGReg dst, TCGReg src, int c)
+{
+ tcg_out_vex_modrm(s, OPC_RORX + rexw, dst, 0, src);
+ tcg_out8(s, c);
+}
+
/* Use SMALL != 0 to force a short forward branch. */
static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
{
@@ -1858,7 +1902,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
const TCGArg *args, const int *const_args)
{
- TCGArg a0, a1, a2;
+ TCGArg a0, a1, a2, a3;
int c, const_a2, vexop, rexw = 0;
#if TCG_TARGET_REG_BITS == 64
@@ -2244,12 +2288,18 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
/* On the off-chance that we can use the high-byte registers.
Otherwise we emit the same ext16 + shift pattern that we
would have gotten from the normal tcg-op.c expansion. */
- tcg_debug_assert(a2 == 8 && args[3] == 8);
- if (a1 < 4 && a0 < 8) {
- tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
+ a3 = args[3];
+ if (a2 == 8 && a3 == 8) {
+ if (a1 < 4 && a0 < 8) {
+ tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
+ } else {
+ tcg_out_ext16u(s, a0, a1);
+ tcg_out_shifti(s, SHIFT_SHR, a0, 8);
+ }
} else {
- tcg_out_ext16u(s, a0, a1);
- tcg_out_shifti(s, SHIFT_SHR, a0, 8);
+ tcg_debug_assert(have_bmi2);
+ tcg_out_vex_pool_imm(s, OPC_PEXT + (a2 + a3 > 32) * P_REXW,
+ a0, a1, deposit64(0, a2, a3, -1));
}
break;
@@ -2257,12 +2307,25 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
/* We don't implement sextract_i64, as we cannot sign-extend to
64-bits without using the REX prefix that explicitly excludes
access to the high-byte registers. */
- tcg_debug_assert(a2 == 8 && args[3] == 8);
- if (a1 < 4 && a0 < 8) {
- tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
+ a3 = args[3];
+ if (a2 == 8 && a3 == 8) {
+ if (a1 < 4 && a0 < 8) {
+ tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
+ } else {
+ tcg_out_ext16s(s, a0, a1, 0);
+ tcg_out_shifti(s, SHIFT_SAR, a0, 8);
+ }
} else {
- tcg_out_ext16s(s, a0, a1, 0);
- tcg_out_shifti(s, SHIFT_SAR, a0, 8);
+ /* ??? We only have one extract_i32_valid macro. But as it
+ happens we can perform a useful 3-operand shift. */
+ tcg_debug_assert(have_bmi2);
+ if (a2 + a3 < 32) {
+ /* Rotate the field in A1 to the MSB of A0. */
+ tcg_out_rorx(s, 0, a0, a1, a2 + a3);
+ } else {
+ tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
+ }
+ tcg_out_shifti(s, SHIFT_SAR, a0, 32 - a3);
}
break;
--
2.13.3
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [Qemu-devel] [PATCH for-2.11 6/6] tcg/i386: Use pdep for deposit
2017-08-04 6:23 [Qemu-devel] [PATCH for-2.11 0/6] tcg/i386 haswell improvements Richard Henderson
` (4 preceding siblings ...)
2017-08-04 6:23 ` [Qemu-devel] [PATCH for-2.11 5/6] tcg/i386: Use pext for extract Richard Henderson
@ 2017-08-04 6:23 ` Richard Henderson
5 siblings, 0 replies; 7+ messages in thread
From: Richard Henderson @ 2017-08-04 6:23 UTC (permalink / raw)
To: qemu-devel
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
tcg/i386/tcg-target.h | 4 ++-
tcg/i386/tcg-target.inc.c | 82 ++++++++++++++++++++++++++++++++++++++---------
2 files changed, 70 insertions(+), 16 deletions(-)
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 85b0ccd98c..e512648c95 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -148,7 +148,9 @@ extern bool have_popcnt;
#endif
#define TCG_TARGET_deposit_i32_valid(ofs, len) \
- (((ofs) == 0 && (len) == 8) || ((ofs) == 8 && (len) == 8) || \
+ (have_bmi2 || \
+ ((ofs) == 0 && (len) == 8) || \
+ ((ofs) == 8 && (len) == 8) || \
((ofs) == 0 && (len) == 16))
#define TCG_TARGET_deposit_i64_valid TCG_TARGET_deposit_i32_valid
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 69587c82de..aeefb72aa0 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -329,6 +329,7 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
#define OPC_MOVSLQ (0x63 | P_REXW)
#define OPC_MOVZBL (0xb6 | P_EXT)
#define OPC_MOVZWL (0xb7 | P_EXT)
+#define OPC_PDEP (0xf5 | P_EXT38 | P_SIMDF2)
#define OPC_PEXT (0xf5 | P_EXT38 | P_SIMDF3)
#define OPC_POP_r32 (0x58)
#define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3)
@@ -554,14 +555,12 @@ static void tcg_out_sfx_pool_imm(TCGContext *s, int r, tcg_target_ulong data)
tcg_out32(s, 0);
}
-#if 0
static void tcg_out_opc_pool_imm(TCGContext *s, int opc, int r,
tcg_target_ulong data)
{
tcg_out_opc(s, opc, r, 0, 0);
tcg_out_sfx_pool_imm(s, r, data);
}
-#endif
static void tcg_out_vex_pool_imm(TCGContext *s, int opc, int r, int v,
tcg_target_ulong data)
@@ -1902,7 +1901,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
const TCGArg *args, const int *const_args)
{
- TCGArg a0, a1, a2, a3;
+ TCGArg a0, a1, a2, a3, a4;
int c, const_a2, vexop, rexw = 0;
#if TCG_TARGET_REG_BITS == 64
@@ -2262,17 +2261,68 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
#endif
OP_32_64(deposit):
- if (args[3] == 0 && args[4] == 8) {
- /* load bits 0..7 */
- tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
- } else if (args[3] == 8 && args[4] == 8) {
- /* load bits 8..15 */
- tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
- } else if (args[3] == 0 && args[4] == 16) {
- /* load bits 0..15 */
- tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
- } else {
- tcg_abort();
+ a3 = args[3];
+ a4 = args[4];
+ {
+ tcg_target_ulong mask = deposit64(0, a3, a4, -1);
+
+ if (const_args[1]) {
+ tcg_debug_assert(have_bmi2);
+ if (a3 == 0 && a0 == a2) {
+ if (a4 <= 32) {
+ tgen_arithi(s, ARITH_AND, a0, mask, 0);
+ } else {
+ tcg_out_opc_pool_imm(s, OPC_ARITH_GvEv + P_REXW
+ + ARITH_AND * 8, a0, mask);
+ }
+ } else {
+ tcg_out_vex_pool_imm(s, OPC_PDEP
+ + (a3 + a4 > 32) * P_REXW,
+ a0, a2, mask);
+ }
+ a1 &= ~mask;
+ if (a1 != 0) {
+ if (!rexw || a1 == (int)a1) {
+ tgen_arithi(s, ARITH_OR + rexw, a0, a1, 0);
+ } else {
+ tcg_out_opc_pool_imm(s, OPC_ARITH_GvEv + P_REXW
+ + ARITH_OR * 8, a0, a1);
+ }
+ }
+ } else if (a0 == a1 && a3 == 0 && a4 == 8) {
+ /* load bits 0..7 */
+ tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
+ } else if (a0 == a1 && a3 == 8 && a4 == 8 && a0 < 4 && a2 < 8) {
+ /* load bits 8..15 */
+ tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
+ } else if (a0 == a1 && a3 == 0 && a4 == 16) {
+ /* load bits 0..15 */
+ tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
+ } else {
+ TCGType type = rexw ? TCG_TYPE_I64 : TCG_TYPE_I32;
+ TCGReg t1 = tcg_reg_alloc_new(s, type);
+ TCGReg t2 = t1;
+
+ tcg_debug_assert(have_bmi2);
+ tcg_out_movi(s, type, t1, mask);
+ if (a0 == a2) {
+ t2 = tcg_reg_alloc_new(s, type);
+ tcg_out_vex_modrm(s, OPC_ANDN + rexw, t2, t1, a1);
+ if (a3 == 0) {
+ tgen_arithr(s, ARITH_AND + rexw, a0, t1);
+ } else {
+ tcg_out_vex_modrm(s, OPC_PDEP + rexw, a0, a2, t1);
+ }
+ } else {
+ tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, t1, a1);
+ if (a3 == 0) {
+ tgen_arithr(s, ARITH_AND + rexw, t1, a2);
+ } else {
+ tcg_out_vex_modrm(s, OPC_PDEP + rexw, t1, a2, t1);
+ }
+ }
+ tgen_arithr(s, ARITH_OR + rexw, a0, t2);
+ }
}
break;
@@ -2480,7 +2530,9 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
{
static const TCGTargetOpDef dep
= { .args_ct_str = { "Q", "0", "Q" } };
- return &dep;
+ static const TCGTargetOpDef pdep
+ = { .args_ct_str = { "r", "ri", "r" } };
+ return have_bmi2 ? &pdep : &dep;
}
case INDEX_op_setcond_i32:
case INDEX_op_setcond_i64:
--
2.13.3
^ permalink raw reply related [flat|nested] 7+ messages in thread