[PATCH v2 3/5] tcg/i386: Use canonical operand ordering in expand_vec_sari
From: Richard Henderson @ 2025-08-30 3:39 UTC
To: qemu-devel
The optimizer prefers to have constants as the second operand,
so expand LT x,0 instead of GT 0,x. This will not affect the
generated code at all.
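For reference, a scalar sketch of what the cmp/shri/shli/or sequence
computes for one 64-bit lane. This is illustrative only (sari64_model is
not a name from the patch) and assumes imm is in 1..63 so both shifts are
well defined; LT x,0 and GT 0,x yield the same all-ones/all-zeroes lane
mask, which is why the reordering cannot change the generated code:

    #include <stdint.h>

    static int64_t sari64_model(int64_t x, unsigned imm)
    {
        /* t1 = (x < 0) ? -1 : 0, i.e. TCG_COND_LT x,0 */
        uint64_t sign = -(uint64_t)(x < 0);
        /* shri, then merge the sign bits into the top imm positions */
        return (int64_t)(((uint64_t)x >> imm) | (sign << (64 - imm)));
    }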
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/i386/tcg-target.c.inc | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 4cd5d4276c..8260c35edd 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -4399,8 +4399,8 @@ static void expand_vec_sari(TCGType type, unsigned vece,
/* Otherwise we will need to use a compare vs 0 to produce
* the sign-extend, shift and merge.
*/
- tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
- tcg_constant_vec(type, MO_64, 0), v1);
+ tcg_gen_cmp_vec(TCG_COND_LT, MO_64, t1, v1,
+ tcg_constant_vec(type, MO_64, 0));
tcg_gen_shri_vec(MO_64, v0, v1, imm);
tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
tcg_gen_or_vec(MO_64, v0, v0, t1);
--
2.43.0
[PATCH v2 4/5] tcg/i386: Add INDEX_op_x86_vgf2p8affineqb_vec
From: Richard Henderson @ 2025-08-30 3:39 UTC
To: qemu-devel
Add a backend-specific opcode for emitting the GFNI
vgf2p8affineqb instruction, which we can use to expand
8-bit immediate shifts and rotates.
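For context, vgf2p8affineqb computes an affine transform over GF(2): each
destination byte is an 8x8 bit-matrix product of one qword of the second
source (one matrix row per byte, row 0 in the most significant byte) with
the corresponding byte of the first source, XORed with the immediate. A
rough C model of the per-byte operation, following the SDM pseudocode;
gf2p8affine_byte is a hypothetical name, and __builtin_parity is the
GCC/Clang builtin:

    #include <stdint.h>

    /* dst.bit[i] = parity(matrix.byte[7 - i] & src) ^ imm8.bit[i] */
    static uint8_t gf2p8affine_byte(uint64_t matrix, uint8_t src, uint8_t imm8)
    {
        uint8_t ret = 0;
        for (int i = 0; i < 8; i++) {
            uint8_t row = matrix >> (8 * (7 - i));
            ret |= (uint8_t)((__builtin_parity(row & src)
                              ^ ((imm8 >> i) & 1)) << i);
        }
        return ret;
    }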
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/i386/tcg-target-opc.h.inc | 1 +
tcg/i386/tcg-target.c.inc | 6 ++++++
2 files changed, 7 insertions(+)
diff --git a/tcg/i386/tcg-target-opc.h.inc b/tcg/i386/tcg-target-opc.h.inc
index 8cc0dbaeaf..8a5cb34dbe 100644
--- a/tcg/i386/tcg-target-opc.h.inc
+++ b/tcg/i386/tcg-target-opc.h.inc
@@ -35,3 +35,4 @@ DEF(x86_punpckh_vec, 1, 2, 0, TCG_OPF_VECTOR)
DEF(x86_vpshldi_vec, 1, 2, 1, TCG_OPF_VECTOR)
DEF(x86_vpshldv_vec, 1, 3, 0, TCG_OPF_VECTOR)
DEF(x86_vpshrdv_vec, 1, 3, 0, TCG_OPF_VECTOR)
+DEF(x86_vgf2p8affineqb_vec, 1, 2, 1, TCG_OPF_VECTOR)
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 8260c35edd..efaca0ca67 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -451,6 +451,7 @@ static bool tcg_target_const_match(int64_t val, int ct,
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
+#define OPC_VGF2P8AFFINEQB (0xce | P_EXT3A | P_DATA16 | P_VEXW)
#define OPC_VPMOVM2B (0x28 | P_EXT38 | P_SIMDF3 | P_EVEX)
#define OPC_VPMOVM2W (0x28 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
#define OPC_VPMOVM2D (0x38 | P_EXT38 | P_SIMDF3 | P_EVEX)
@@ -4084,6 +4085,10 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
insn = vpshldi_insn[vece];
sub = args[3];
goto gen_simd_imm8;
+ case INDEX_op_x86_vgf2p8affineqb_vec:
+ insn = OPC_VGF2P8AFFINEQB;
+ sub = args[3];
+ goto gen_simd_imm8;
case INDEX_op_not_vec:
insn = OPC_VPTERNLOGQ;
@@ -4188,6 +4193,7 @@ tcg_target_op_def(TCGOpcode op, TCGType type, unsigned flags)
case INDEX_op_x86_punpckl_vec:
case INDEX_op_x86_punpckh_vec:
case INDEX_op_x86_vpshldi_vec:
+ case INDEX_op_x86_vgf2p8affineqb_vec:
#if TCG_TARGET_REG_BITS == 32
case INDEX_op_dup2_vec:
#endif
--
2.43.0
[PATCH v2 5/5] tcg/i386: Use vgf2p8affineqb for MO_8 vector shifts
From: Richard Henderson @ 2025-08-30 3:39 UTC
To: qemu-devel
A constant matrix can describe the movement of the 8 bits
within each byte, so these shifts can be performed with a
single instruction.
Logic courtesy of Andi Kleen <ak@linux.intel.com>:
https://gcc.gnu.org/pipermail/gcc-patches/2025-August/691624.html
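As a sanity check on the tables below, here is a small self-contained
program (not part of the patch; affine_byte mirrors the per-byte model
sketched under patch 4, with imm8 == 0) that verifies the shift-by-1
matrices against plain C shifts. It assumes the usual arithmetic
behaviour of >> on negative signed values:

    #include <stdint.h>
    #include <assert.h>

    static uint8_t affine_byte(uint64_t m, uint8_t b)
    {
        uint8_t r = 0;
        for (int i = 0; i < 8; i++) {
            /* dst.bit[i] = parity(m.byte[7 - i] & b) */
            r |= (uint8_t)(__builtin_parity((uint8_t)(m >> (8 * (7 - i))) & b)
                           << i);
        }
        return r;
    }

    int main(void)
    {
        const uint64_t shl1 = 0x0001020408102040ull;  /* gf2_shi[0][1] */
        const uint64_t sar1 = 0x0204081020408080ull;  /* gf2_sar[1] */

        for (int b = 0; b < 256; b++) {
            assert(affine_byte(shl1, b) == (uint8_t)(b << 1));
            assert(affine_byte(sar1, b) == (uint8_t)((int8_t)b >> 1));
        }
        return 0;
    }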
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/i386/tcg-target.c.inc | 75 ++++++++++++++++++++++++++++++++++++---
1 file changed, 71 insertions(+), 4 deletions(-)
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index efaca0ca67..ee27266861 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -4342,12 +4342,46 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
}
}
+static void gen_vgf2p8affineqb0(TCGType type, TCGv_vec v0,
+ TCGv_vec v1, uint64_t matrix)
+{
+ vec_gen_4(INDEX_op_x86_vgf2p8affineqb_vec, type, MO_8,
+ tcgv_vec_arg(v0), tcgv_vec_arg(v1),
+ tcgv_vec_arg(tcg_constant_vec(type, MO_64, matrix)), 0);
+}
+
static void expand_vec_shi(TCGType type, unsigned vece, bool right,
TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
+ static const uint64_t gf2_shi[2][8] = {
+ /* left shift */
+ { 0,
+ 0x0001020408102040ull,
+ 0x0000010204081020ull,
+ 0x0000000102040810ull,
+ 0x0000000001020408ull,
+ 0x0000000000010204ull,
+ 0x0000000000000102ull,
+ 0x0000000000000001ull },
+ /* right shift */
+ { 0,
+ 0x0204081020408000ull,
+ 0x0408102040800000ull,
+ 0x0810204080000000ull,
+ 0x1020408000000000ull,
+ 0x2040800000000000ull,
+ 0x4080000000000000ull,
+ 0x8000000000000000ull }
+ };
uint8_t mask;
tcg_debug_assert(vece == MO_8);
+
+ if (cpuinfo & CPUINFO_GFNI) {
+ gen_vgf2p8affineqb0(type, v0, v1, gf2_shi[right][imm]);
+ return;
+ }
+
if (right) {
mask = 0xff >> imm;
tcg_gen_shri_vec(MO_16, v0, v1, imm);
@@ -4361,6 +4395,16 @@ static void expand_vec_shi(TCGType type, unsigned vece, bool right,
static void expand_vec_sari(TCGType type, unsigned vece,
TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
+ static const uint64_t gf2_sar[8] = {
+ 0,
+ 0x0204081020408080ull,
+ 0x0408102040808080ull,
+ 0x0810204080808080ull,
+ 0x1020408080808080ull,
+ 0x2040808080808080ull,
+ 0x4080808080808080ull,
+ 0x8080808080808080ull,
+ };
TCGv_vec t1, t2;
if (imm >= (8 << vece) - 1) {
@@ -4371,6 +4415,11 @@ static void expand_vec_sari(TCGType type, unsigned vece,
switch (vece) {
case MO_8:
+ if (cpuinfo & CPUINFO_GFNI) {
+ gen_vgf2p8affineqb0(type, v0, v1, gf2_sar[imm]);
+ break;
+ }
+
/* Unpack to 16-bit, shift, and repack. */
t1 = tcg_temp_new_vec(type);
t2 = tcg_temp_new_vec(type);
@@ -4422,12 +4471,30 @@ static void expand_vec_sari(TCGType type, unsigned vece,
static void expand_vec_rotli(TCGType type, unsigned vece,
TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
+ static const uint64_t gf2_rol[8] = {
+ 0,
+ 0x8001020408102040ull,
+ 0x4080010204081020ull,
+ 0x2040800102040810ull,
+ 0x1020408001020408ull,
+ 0x0810204080010204ull,
+ 0x0408102040800102ull,
+ 0x0204081020408001ull,
+ };
TCGv_vec t;
- if (vece != MO_8 && have_avx512vbmi2) {
- vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
- tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
- return;
+ if (vece == MO_8) {
+ if (cpuinfo & CPUINFO_GFNI) {
+ gen_vgf2p8affineqb0(type, v0, v1, gf2_rol[imm]);
+ return;
+ }
+ } else {
+ if (have_avx512vbmi2) {
+ vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
+ tcgv_vec_arg(v0), tcgv_vec_arg(v1),
+ tcgv_vec_arg(v1), imm);
+ return;
+ }
}
t = tcg_temp_new_vec(type);
--
2.43.0