* [PATCH v4 01/13] tcg/ppc: Untabify tcg-target.c.inc
2023-10-13 17:09 [PATCH v4 00/13] tcg/ppc: direct branching, power9, power10 Richard Henderson
@ 2023-10-13 17:10 ` Richard Henderson
2023-10-13 17:10 ` [PATCH v4 02/13] tcg/ppc: Enable direct branching tcg_out_goto_tb with TCG_REG_TB Richard Henderson
` (11 subsequent siblings)
12 siblings, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2023-10-13 17:10 UTC (permalink / raw)
To: qemu-devel; +Cc: jniethe5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/ppc/tcg-target.c.inc | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index 5c873b2161..5cecc6ed95 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -221,7 +221,7 @@ static inline bool in_range_b(tcg_target_long target)
}
static uint32_t reloc_pc24_val(const tcg_insn_unit *pc,
- const tcg_insn_unit *target)
+ const tcg_insn_unit *target)
{
ptrdiff_t disp = tcg_ptr_byte_diff(target, pc);
tcg_debug_assert(in_range_b(disp));
@@ -241,7 +241,7 @@ static bool reloc_pc24(tcg_insn_unit *src_rw, const tcg_insn_unit *target)
}
static uint16_t reloc_pc14_val(const tcg_insn_unit *pc,
- const tcg_insn_unit *target)
+ const tcg_insn_unit *target)
{
ptrdiff_t disp = tcg_ptr_byte_diff(target, pc);
tcg_debug_assert(disp == (int16_t) disp);
@@ -3645,7 +3645,7 @@ static void expand_vec_mul(TCGType type, unsigned vece, TCGv_vec v0,
tcgv_vec_arg(t1), tcgv_vec_arg(t2));
vec_gen_3(INDEX_op_ppc_pkum_vec, type, vece, tcgv_vec_arg(v0),
tcgv_vec_arg(v0), tcgv_vec_arg(t1));
- break;
+ break;
case MO_32:
tcg_debug_assert(!have_isa_2_07);
--
2.34.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v4 02/13] tcg/ppc: Enable direct branching tcg_out_goto_tb with TCG_REG_TB
2023-10-13 17:09 [PATCH v4 00/13] tcg/ppc: direct branching, power9, power10 Richard Henderson
2023-10-13 17:10 ` [PATCH v4 01/13] tcg/ppc: Untabify tcg-target.c.inc Richard Henderson
@ 2023-10-13 17:10 ` Richard Henderson
2023-10-13 17:10 ` [PATCH v4 03/13] tcg/ppc: Reinterpret tb-relative to TB+4 Richard Henderson
` (10 subsequent siblings)
12 siblings, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2023-10-13 17:10 UTC (permalink / raw)
To: qemu-devel; +Cc: jniethe5
From: Jordan Niethe <jniethe5@gmail.com>
Direct branch patching was disabled when using TCG_REG_TB in commit
736a1588c1 ("tcg/ppc: Fix race in goto_tb implementation").
The issue with direct branch patching with TCG_REG_TB is the lack of
synchronization between the new TCG_REG_TB being established and the
direct branch being patched in.
If each translation block is responsible for establishing its own
TCG_REG_TB then there can be no synchronization issue.
Make each translation block begin by setting up its own TCG_REG_TB.
Use the preferred 'bcl 20,31,$+4' sequence.
Signed-off-by: Jordan Niethe <jniethe5@gmail.com>
[rth: Split out tcg_out_tb_start, power9 addpcis]
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/ppc/tcg-target.c.inc | 48 ++++++++++++++--------------------------
1 file changed, 17 insertions(+), 31 deletions(-)
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index 5cecc6ed95..9197cfd6c6 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -2509,9 +2509,6 @@ static void tcg_target_qemu_prologue(TCGContext *s)
tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
tcg_out32(s, MTSPR | RS(tcg_target_call_iarg_regs[1]) | CTR);
- if (USE_REG_TB) {
- tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_TB, tcg_target_call_iarg_regs[1]);
- }
tcg_out32(s, BCCTR | BO_ALWAYS);
/* Epilogue */
@@ -2529,7 +2526,13 @@ static void tcg_target_qemu_prologue(TCGContext *s)
static void tcg_out_tb_start(TCGContext *s)
{
- /* nothing to do */
+ /* Load TCG_REG_TB. */
+ if (USE_REG_TB) {
+ /* bcl 20,31,$+4 (preferred form for getting nia) */
+ tcg_out32(s, BC | BO_ALWAYS | BI(7, CR_SO) | 0x4 | LK);
+ tcg_out32(s, MFSPR | RT(TCG_REG_TB) | LR);
+ tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, -4));
+ }
}
static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
@@ -2542,32 +2545,22 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
{
uintptr_t ptr = get_jmp_target_addr(s, which);
+ /* Direct branch will be patched by tb_target_set_jmp_target. */
+ set_jmp_insn_offset(s, which);
+ tcg_out32(s, NOP);
+
+ /* When branch is out of range, fall through to indirect. */
if (USE_REG_TB) {
ptrdiff_t offset = tcg_tbrel_diff(s, (void *)ptr);
- tcg_out_mem_long(s, LD, LDX, TCG_REG_TB, TCG_REG_TB, offset);
-
- /* TODO: Use direct branches when possible. */
- set_jmp_insn_offset(s, which);
- tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
-
- tcg_out32(s, BCCTR | BO_ALWAYS);
-
- /* For the unlinked case, need to reset TCG_REG_TB. */
- set_jmp_reset_offset(s, which);
- tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
- -tcg_current_code_size(s));
+ tcg_out_mem_long(s, LD, LDX, TCG_REG_TMP1, TCG_REG_TB, offset);
} else {
- /* Direct branch will be patched by tb_target_set_jmp_target. */
- set_jmp_insn_offset(s, which);
- tcg_out32(s, NOP);
-
- /* When branch is out of range, fall through to indirect. */
tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - (int16_t)ptr);
tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, (int16_t)ptr);
- tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
- tcg_out32(s, BCCTR | BO_ALWAYS);
- set_jmp_reset_offset(s, which);
}
+
+ tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
+ tcg_out32(s, BCCTR | BO_ALWAYS);
+ set_jmp_reset_offset(s, which);
}
void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
@@ -2577,10 +2570,6 @@ void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
intptr_t diff = addr - jmp_rx;
tcg_insn_unit insn;
- if (USE_REG_TB) {
- return;
- }
-
if (in_range_b(diff)) {
insn = B | (diff & 0x3fffffc);
} else {
@@ -2600,9 +2589,6 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
switch (opc) {
case INDEX_op_goto_ptr:
tcg_out32(s, MTSPR | RS(args[0]) | CTR);
- if (USE_REG_TB) {
- tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_TB, args[0]);
- }
tcg_out32(s, ADDI | TAI(TCG_REG_R3, 0, 0));
tcg_out32(s, BCCTR | BO_ALWAYS);
break;
--
2.34.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v4 03/13] tcg/ppc: Reinterpret tb-relative to TB+4
2023-10-13 17:09 [PATCH v4 00/13] tcg/ppc: direct branching, power9, power10 Richard Henderson
2023-10-13 17:10 ` [PATCH v4 01/13] tcg/ppc: Untabify tcg-target.c.inc Richard Henderson
2023-10-13 17:10 ` [PATCH v4 02/13] tcg/ppc: Enable direct branching tcg_out_goto_tb with TCG_REG_TB Richard Henderson
@ 2023-10-13 17:10 ` Richard Henderson
2023-10-13 17:10 ` [PATCH v4 04/13] tcg/ppc: Use ADDPCIS in tcg_out_tb_start Richard Henderson
` (9 subsequent siblings)
12 siblings, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2023-10-13 17:10 UTC (permalink / raw)
To: qemu-devel; +Cc: jniethe5
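Loading the address of TB+4 instead of TB saves one insn: the bcl/mflr sequence at the start of the TB already yields TB+4, so the following addi of -4 can be dropped.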
Adjust all of the indexing to match.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/ppc/tcg-target.c.inc | 15 ++++++++++-----
1 file changed, 10 insertions(+), 5 deletions(-)
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index 9197cfd6c6..aafbf2db4e 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -215,6 +215,12 @@ static const int tcg_target_callee_save_regs[] = {
TCG_REG_R31
};
+/* For PPC, we use TB+4 instead of TB as the base. */
+static inline ptrdiff_t ppc_tbrel_diff(TCGContext *s, const void *target)
+{
+ return tcg_tbrel_diff(s, target) - 4;
+}
+
static inline bool in_range_b(tcg_target_long target)
{
return target == sextract64(target, 0, 26);
@@ -991,7 +997,7 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
}
/* Load addresses within the TB with one insn. */
- tb_diff = tcg_tbrel_diff(s, (void *)arg);
+ tb_diff = ppc_tbrel_diff(s, (void *)arg);
if (!in_prologue && USE_REG_TB && tb_diff == (int16_t)tb_diff) {
tcg_out32(s, ADDI | TAI(ret, TCG_REG_TB, tb_diff));
return;
@@ -1044,7 +1050,7 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
/* Use the constant pool, if possible. */
if (!in_prologue && USE_REG_TB) {
new_pool_label(s, arg, R_PPC_ADDR16, s->code_ptr,
- tcg_tbrel_diff(s, NULL));
+ ppc_tbrel_diff(s, NULL));
tcg_out32(s, LD | TAI(ret, TCG_REG_TB, 0));
return;
}
@@ -1104,7 +1110,7 @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
*/
if (USE_REG_TB) {
rel = R_PPC_ADDR16;
- add = tcg_tbrel_diff(s, NULL);
+ add = ppc_tbrel_diff(s, NULL);
} else {
rel = R_PPC_ADDR32;
add = 0;
@@ -2531,7 +2537,6 @@ static void tcg_out_tb_start(TCGContext *s)
/* bcl 20,31,$+4 (preferred form for getting nia) */
tcg_out32(s, BC | BO_ALWAYS | BI(7, CR_SO) | 0x4 | LK);
tcg_out32(s, MFSPR | RT(TCG_REG_TB) | LR);
- tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, -4));
}
}
@@ -2551,7 +2556,7 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
/* When branch is out of range, fall through to indirect. */
if (USE_REG_TB) {
- ptrdiff_t offset = tcg_tbrel_diff(s, (void *)ptr);
+ ptrdiff_t offset = ppc_tbrel_diff(s, (void *)ptr);
tcg_out_mem_long(s, LD, LDX, TCG_REG_TMP1, TCG_REG_TB, offset);
} else {
tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - (int16_t)ptr);
--
2.34.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v4 04/13] tcg/ppc: Use ADDPCIS in tcg_out_tb_start
2023-10-13 17:09 [PATCH v4 00/13] tcg/ppc: direct branching, power9, power10 Richard Henderson
` (2 preceding siblings ...)
2023-10-13 17:10 ` [PATCH v4 03/13] tcg/ppc: Reinterpret tb-relative to TB+4 Richard Henderson
@ 2023-10-13 17:10 ` Richard Henderson
2023-10-13 17:10 ` [PATCH v4 05/13] tcg/ppc: Use ADDPCIS in tcg_out_movi_int Richard Henderson
` (8 subsequent siblings)
12 siblings, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2023-10-13 17:10 UTC (permalink / raw)
To: qemu-devel; +Cc: jniethe5
With ISA v3.0, we can use ADDPCIS instead of BCL+MFLR to load NIA.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/ppc/tcg-target.c.inc | 25 ++++++++++++++++++++++---
1 file changed, 22 insertions(+), 3 deletions(-)
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index aafbf2db4e..b0b8cd2390 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -362,6 +362,7 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece)
#define CRNAND XO19(225)
#define CROR XO19(449)
#define CRNOR XO19( 33)
+#define ADDPCIS XO19( 2)
#define EXTSB XO31(954)
#define EXTSH XO31(922)
@@ -859,6 +860,19 @@ static inline void tcg_out_sari64(TCGContext *s, TCGReg dst, TCGReg src, int c)
tcg_out32(s, SRADI | RA(dst) | RS(src) | SH(c & 0x1f) | ((c >> 4) & 2));
}
+static void tcg_out_addpcis(TCGContext *s, TCGReg dst, intptr_t imm)
+{
+ uint32_t d0, d1, d2;
+
+ tcg_debug_assert((imm & 0xffff) == 0);
+ tcg_debug_assert(imm == (int32_t)imm);
+
+ d2 = extract32(imm, 16, 1);
+ d1 = extract32(imm, 17, 5);
+ d0 = extract32(imm, 22, 10);
+ tcg_out32(s, ADDPCIS | RT(dst) | (d1 << 16) | (d0 << 6) | d2);
+}
+
static void tcg_out_bswap16(TCGContext *s, TCGReg dst, TCGReg src, int flags)
{
TCGReg tmp = dst == src ? TCG_REG_R0 : dst;
@@ -2534,9 +2548,14 @@ static void tcg_out_tb_start(TCGContext *s)
{
/* Load TCG_REG_TB. */
if (USE_REG_TB) {
- /* bcl 20,31,$+4 (preferred form for getting nia) */
- tcg_out32(s, BC | BO_ALWAYS | BI(7, CR_SO) | 0x4 | LK);
- tcg_out32(s, MFSPR | RT(TCG_REG_TB) | LR);
+ if (have_isa_3_00) {
+ /* lnia REG_TB */
+ tcg_out_addpcis(s, TCG_REG_TB, 0);
+ } else {
+ /* bcl 20,31,$+4 (preferred form for getting nia) */
+ tcg_out32(s, BC | BO_ALWAYS | BI(7, CR_SO) | 0x4 | LK);
+ tcg_out32(s, MFSPR | RT(TCG_REG_TB) | LR);
+ }
}
}
--
2.34.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v4 05/13] tcg/ppc: Use ADDPCIS in tcg_out_movi_int
2023-10-13 17:09 [PATCH v4 00/13] tcg/ppc: direct branching, power9, power10 Richard Henderson
` (3 preceding siblings ...)
2023-10-13 17:10 ` [PATCH v4 04/13] tcg/ppc: Use ADDPCIS in tcg_out_tb_start Richard Henderson
@ 2023-10-13 17:10 ` Richard Henderson
2023-10-13 17:10 ` [PATCH v4 06/13] tcg/ppc: Use ADDPCIS for the constant pool Richard Henderson
` (7 subsequent siblings)
12 siblings, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2023-10-13 17:10 UTC (permalink / raw)
To: qemu-devel; +Cc: jniethe5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/ppc/tcg-target.c.inc | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index b0b8cd2390..226b5598ac 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -1055,6 +1055,19 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
return;
}
+ /* Load addresses within 2GB with 2 insns. */
+ if (have_isa_3_00) {
+ intptr_t hi = tcg_pcrel_diff(s, (void *)arg) - 4;
+ int16_t lo = hi;
+
+ hi -= lo;
+ if (hi == (int32_t)hi) {
+ tcg_out_addpcis(s, TCG_REG_TMP2, hi);
+ tcg_out32(s, ADDI | TAI(ret, TCG_REG_TMP2, lo));
+ return;
+ }
+ }
+
/* Load addresses within 2GB of TB with 2 (or rarely 3) insns. */
if (!in_prologue && USE_REG_TB && tb_diff == (int32_t)tb_diff) {
tcg_out_mem_long(s, ADDI, ADD, ret, TCG_REG_TB, tb_diff);
--
2.34.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v4 06/13] tcg/ppc: Use ADDPCIS for the constant pool
2023-10-13 17:09 [PATCH v4 00/13] tcg/ppc: direct branching, power9, power10 Richard Henderson
` (4 preceding siblings ...)
2023-10-13 17:10 ` [PATCH v4 05/13] tcg/ppc: Use ADDPCIS in tcg_out_movi_int Richard Henderson
@ 2023-10-13 17:10 ` Richard Henderson
2023-10-13 17:10 ` [PATCH v4 07/13] tcg/ppc: Use ADDPCIS in tcg_out_goto_tb Richard Henderson
` (6 subsequent siblings)
12 siblings, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2023-10-13 17:10 UTC (permalink / raw)
To: qemu-devel; +Cc: jniethe5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/ppc/tcg-target.c.inc | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index 226b5598ac..720f92ff33 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -1081,6 +1081,12 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
tcg_out32(s, LD | TAI(ret, TCG_REG_TB, 0));
return;
}
+ if (have_isa_3_00) {
+ tcg_out_addpcis(s, TCG_REG_TMP2, 0);
+ new_pool_label(s, arg, R_PPC_REL14, s->code_ptr, 0);
+ tcg_out32(s, LD | TAI(ret, TCG_REG_TMP2, 0));
+ return;
+ }
tmp = arg >> 31 >> 1;
tcg_out_movi(s, TCG_TYPE_I32, ret, tmp);
@@ -1138,6 +1144,10 @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
if (USE_REG_TB) {
rel = R_PPC_ADDR16;
add = ppc_tbrel_diff(s, NULL);
+ } else if (have_isa_3_00) {
+ tcg_out_addpcis(s, TCG_REG_TMP1, 0);
+ rel = R_PPC_REL14;
+ add = 0;
} else {
rel = R_PPC_ADDR32;
add = 0;
@@ -1164,6 +1174,8 @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
if (USE_REG_TB) {
tcg_out32(s, ADDI | TAI(TCG_REG_TMP1, 0, 0));
load_insn |= RA(TCG_REG_TB);
+ } else if (have_isa_3_00) {
+ tcg_out32(s, ADDI | TAI(TCG_REG_TMP1, TCG_REG_TMP1, 0));
} else {
tcg_out32(s, ADDIS | TAI(TCG_REG_TMP1, 0, 0));
tcg_out32(s, ADDI | TAI(TCG_REG_TMP1, TCG_REG_TMP1, 0));
--
2.34.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v4 07/13] tcg/ppc: Use ADDPCIS in tcg_out_goto_tb
2023-10-13 17:09 [PATCH v4 00/13] tcg/ppc: direct branching, power9, power10 Richard Henderson
` (5 preceding siblings ...)
2023-10-13 17:10 ` [PATCH v4 06/13] tcg/ppc: Use ADDPCIS for the constant pool Richard Henderson
@ 2023-10-13 17:10 ` Richard Henderson
2023-10-13 17:10 ` [PATCH v4 08/13] tcg/ppc: Use PADDI in tcg_out_movi Richard Henderson
` (5 subsequent siblings)
12 siblings, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2023-10-13 17:10 UTC (permalink / raw)
To: qemu-devel; +Cc: jniethe5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/ppc/tcg-target.c.inc | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index 720f92ff33..6337b1e8be 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -2593,6 +2593,7 @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
static void tcg_out_goto_tb(TCGContext *s, int which)
{
uintptr_t ptr = get_jmp_target_addr(s, which);
+ int16_t lo;
/* Direct branch will be patched by tb_target_set_jmp_target. */
set_jmp_insn_offset(s, which);
@@ -2602,9 +2603,15 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
if (USE_REG_TB) {
ptrdiff_t offset = ppc_tbrel_diff(s, (void *)ptr);
tcg_out_mem_long(s, LD, LDX, TCG_REG_TMP1, TCG_REG_TB, offset);
+ } else if (have_isa_3_00) {
+ ptrdiff_t offset = tcg_pcrel_diff(s, (void *)ptr) - 4;
+ lo = offset;
+ tcg_out_addpcis(s, TCG_REG_TMP1, offset - lo);
+ tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, lo);
} else {
- tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - (int16_t)ptr);
- tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, (int16_t)ptr);
+ lo = ptr;
+ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - lo);
+ tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, lo);
}
tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
--
2.34.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v4 08/13] tcg/ppc: Use PADDI in tcg_out_movi
2023-10-13 17:09 [PATCH v4 00/13] tcg/ppc: direct branching, power9, power10 Richard Henderson
` (6 preceding siblings ...)
2023-10-13 17:10 ` [PATCH v4 07/13] tcg/ppc: Use ADDPCIS in tcg_out_goto_tb Richard Henderson
@ 2023-10-13 17:10 ` Richard Henderson
2023-10-13 17:10 ` [PATCH v4 09/13] tcg/ppc: Use prefixed instructions in tcg_out_mem_long Richard Henderson
` (4 subsequent siblings)
12 siblings, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2023-10-13 17:10 UTC (permalink / raw)
To: qemu-devel; +Cc: jniethe5
PADDI can load 34-bit immediates and 34-bit pc-relative addresses.
Reviewed-by: Jordan Niethe <jniethe5@gmail.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/ppc/tcg-target.c.inc | 51 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 51 insertions(+)
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index 6337b1e8be..f4235383c6 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -719,6 +719,38 @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
return true;
}
+/* Ensure that the prefixed instruction does not cross a 64-byte boundary. */
+static bool tcg_out_need_prefix_align(TCGContext *s)
+{
+ return ((uintptr_t)s->code_ptr & 0x3f) == 0x3c;
+}
+
+static void tcg_out_prefix_align(TCGContext *s)
+{
+ if (tcg_out_need_prefix_align(s)) {
+ tcg_out32(s, NOP);
+ }
+}
+
+static ptrdiff_t tcg_pcrel_diff_for_prefix(TCGContext *s, const void *target)
+{
+ return tcg_pcrel_diff(s, target) - (tcg_out_need_prefix_align(s) ? 4 : 0);
+}
+
+/* Output Type 10 Prefix - Modified Load/Store Form (MLS:D) */
+static void tcg_out_mls_d(TCGContext *s, tcg_insn_unit opc, unsigned rt,
+ unsigned ra, tcg_target_long imm, bool r)
+{
+ tcg_insn_unit p, i;
+
+ p = OPCD(1) | (2 << 24) | (r << 20) | ((imm >> 16) & 0x3ffff);
+ i = opc | TAI(rt, ra, imm);
+
+ tcg_out_prefix_align(s);
+ tcg_out32(s, p);
+ tcg_out32(s, i);
+}
+
static void tcg_out_mem_long(TCGContext *s, int opi, int opx, TCGReg rt,
TCGReg base, tcg_target_long offset);
@@ -1017,6 +1049,25 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
return;
}
+ /*
+ * Load values up to 34 bits, and pc-relative addresses,
+ * with one prefixed insn.
+ */
+ if (have_isa_3_10) {
+ if (arg == sextract64(arg, 0, 34)) {
+ /* pli ret,value = paddi ret,0,value,0 */
+ tcg_out_mls_d(s, ADDI, ret, 0, arg, 0);
+ return;
+ }
+
+ tmp = tcg_pcrel_diff_for_prefix(s, (void *)arg);
+ if (tmp == sextract64(tmp, 0, 34)) {
+ /* pla ret,value = paddi ret,0,value,1 */
+ tcg_out_mls_d(s, ADDI, ret, 0, tmp, 1);
+ return;
+ }
+ }
+
/* Load 32-bit immediates with two insns. Note that we've already
eliminated bare ADDIS, so we know both insns are required. */
if (TCG_TARGET_REG_BITS == 32 || arg == (int32_t)arg) {
--
2.34.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v4 09/13] tcg/ppc: Use prefixed instructions in tcg_out_mem_long
2023-10-13 17:09 [PATCH v4 00/13] tcg/ppc: direct branching, power9, power10 Richard Henderson
` (7 preceding siblings ...)
2023-10-13 17:10 ` [PATCH v4 08/13] tcg/ppc: Use PADDI in tcg_out_movi Richard Henderson
@ 2023-10-13 17:10 ` Richard Henderson
2023-10-13 17:10 ` [PATCH v4 10/13] tcg/ppc: Use PLD in tcg_out_movi for constant pool Richard Henderson
` (3 subsequent siblings)
12 siblings, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2023-10-13 17:10 UTC (permalink / raw)
To: qemu-devel; +Cc: jniethe5
When the offset is out of range of the non-prefixed insn, but
fits the 34-bit immediate of the prefixed insn, use that.
Reviewed-by: Jordan Niethe <jniethe5@gmail.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/ppc/tcg-target.c.inc | 66 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 66 insertions(+)
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index f4235383c6..34df9144cc 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -329,6 +329,15 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece)
#define STDX XO31(149)
#define STQ XO62( 2)
+#define PLWA OPCD( 41)
+#define PLD OPCD( 57)
+#define PLXSD OPCD( 42)
+#define PLXV OPCD(25 * 2 + 1) /* force tx=1 */
+
+#define PSTD OPCD( 61)
+#define PSTXSD OPCD( 46)
+#define PSTXV OPCD(27 * 2 + 1) /* force sx=1 */
+
#define ADDIC OPCD( 12)
#define ADDI OPCD( 14)
#define ADDIS OPCD( 15)
@@ -737,6 +746,20 @@ static ptrdiff_t tcg_pcrel_diff_for_prefix(TCGContext *s, const void *target)
return tcg_pcrel_diff(s, target) - (tcg_out_need_prefix_align(s) ? 4 : 0);
}
+/* Output Type 00 Prefix - 8-Byte Load/Store Form (8LS:D) */
+static void tcg_out_8ls_d(TCGContext *s, tcg_insn_unit opc, unsigned rt,
+ unsigned ra, tcg_target_long imm, bool r)
+{
+ tcg_insn_unit p, i;
+
+ p = OPCD(1) | (r << 20) | ((imm >> 16) & 0x3ffff);
+ i = opc | TAI(rt, ra, imm);
+
+ tcg_out_prefix_align(s);
+ tcg_out32(s, p);
+ tcg_out32(s, i);
+}
+
/* Output Type 10 Prefix - Modified Load/Store Form (MLS:D) */
static void tcg_out_mls_d(TCGContext *s, tcg_insn_unit opc, unsigned rt,
unsigned ra, tcg_target_long imm, bool r)
@@ -1418,6 +1441,49 @@ static void tcg_out_mem_long(TCGContext *s, int opi, int opx, TCGReg rt,
break;
}
+ /* For unaligned or large offsets, use the prefixed form. */
+ if (have_isa_3_10
+ && (offset != (int16_t)offset || (offset & align))
+ && offset == sextract64(offset, 0, 34)) {
+ /*
+ * Note that the MLS:D insns retain their un-prefixed opcode,
+ * while the 8LS:D insns use a different opcode space.
+ */
+ switch (opi) {
+ case LBZ:
+ case LHZ:
+ case LHA:
+ case LWZ:
+ case STB:
+ case STH:
+ case STW:
+ case ADDI:
+ tcg_out_mls_d(s, opi, rt, base, offset, 0);
+ return;
+ case LWA:
+ tcg_out_8ls_d(s, PLWA, rt, base, offset, 0);
+ return;
+ case LD:
+ tcg_out_8ls_d(s, PLD, rt, base, offset, 0);
+ return;
+ case STD:
+ tcg_out_8ls_d(s, PSTD, rt, base, offset, 0);
+ return;
+ case LXSD:
+ tcg_out_8ls_d(s, PLXSD, rt & 31, base, offset, 0);
+ return;
+ case STXSD:
+ tcg_out_8ls_d(s, PSTXSD, rt & 31, base, offset, 0);
+ return;
+ case LXV:
+ tcg_out_8ls_d(s, PLXV, rt & 31, base, offset, 0);
+ return;
+ case STXV:
+ tcg_out_8ls_d(s, PSTXV, rt & 31, base, offset, 0);
+ return;
+ }
+ }
+
/* For unaligned, or very large offsets, use the indexed form. */
if (offset & align || offset != (int32_t)offset || opi == 0) {
if (rs == base) {
--
2.34.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v4 10/13] tcg/ppc: Use PLD in tcg_out_movi for constant pool
2023-10-13 17:09 [PATCH v4 00/13] tcg/ppc: direct branching, power9, power10 Richard Henderson
` (8 preceding siblings ...)
2023-10-13 17:10 ` [PATCH v4 09/13] tcg/ppc: Use prefixed instructions in tcg_out_mem_long Richard Henderson
@ 2023-10-13 17:10 ` Richard Henderson
2023-10-13 17:10 ` [PATCH v4 11/13] tcg/ppc: Use prefixed instructions in tcg_out_dupi_vec Richard Henderson
` (2 subsequent siblings)
12 siblings, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2023-10-13 17:10 UTC (permalink / raw)
To: qemu-devel; +Cc: jniethe5
The prefixed instruction has a pc-relative form to use here.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/ppc/tcg-target.c.inc | 24 ++++++++++++++++++++++++
1 file changed, 24 insertions(+)
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index 34df9144cc..79e82d2f94 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -101,6 +101,10 @@
#define ALL_GENERAL_REGS 0xffffffffu
#define ALL_VECTOR_REGS 0xffffffff00000000ull
+#ifndef R_PPC64_PCREL34
+#define R_PPC64_PCREL34 132
+#endif
+
#define have_isel (cpuinfo & CPUINFO_ISEL)
#ifndef CONFIG_SOFTMMU
@@ -266,6 +270,19 @@ static bool reloc_pc14(tcg_insn_unit *src_rw, const tcg_insn_unit *target)
return false;
}
+static bool reloc_pc34(tcg_insn_unit *src_rw, const tcg_insn_unit *target)
+{
+ const tcg_insn_unit *src_rx = tcg_splitwx_to_rx(src_rw);
+ ptrdiff_t disp = tcg_ptr_byte_diff(target, src_rx);
+
+ if (disp == sextract64(disp, 0, 34)) {
+ src_rw[0] = (src_rw[0] & ~0x3ffff) | ((disp >> 16) & 0x3ffff);
+ src_rw[1] = (src_rw[1] & ~0xffff) | (disp & 0xffff);
+ return true;
+ }
+ return false;
+}
+
/* test if a constant matches the constraint */
static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece)
{
@@ -696,6 +713,8 @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
return reloc_pc14(code_ptr, target);
case R_PPC_REL24:
return reloc_pc24(code_ptr, target);
+ case R_PPC64_PCREL34:
+ return reloc_pc34(code_ptr, target);
case R_PPC_ADDR16:
/*
* We are (slightly) abusing this relocation type. In particular,
@@ -1155,6 +1174,11 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
tcg_out32(s, LD | TAI(ret, TCG_REG_TB, 0));
return;
}
+ if (have_isa_3_10) {
+ tcg_out_8ls_d(s, PLD, ret, 0, 0, 1);
+ new_pool_label(s, arg, R_PPC64_PCREL34, s->code_ptr - 2, 0);
+ return;
+ }
if (have_isa_3_00) {
tcg_out_addpcis(s, TCG_REG_TMP2, 0);
new_pool_label(s, arg, R_PPC_REL14, s->code_ptr, 0);
--
2.34.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v4 11/13] tcg/ppc: Use prefixed instructions in tcg_out_dupi_vec
2023-10-13 17:09 [PATCH v4 00/13] tcg/ppc: direct branching, power9, power10 Richard Henderson
` (9 preceding siblings ...)
2023-10-13 17:10 ` [PATCH v4 10/13] tcg/ppc: Use PLD in tcg_out_movi for constant pool Richard Henderson
@ 2023-10-13 17:10 ` Richard Henderson
2023-10-13 17:10 ` [PATCH v4 12/13] tcg/ppc: Use PLD in tcg_out_goto_tb Richard Henderson
2023-10-13 17:10 ` [PATCH v4 13/13] tcg/ppc: Disable TCG_REG_TB for Power9/Power10 Richard Henderson
12 siblings, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2023-10-13 17:10 UTC (permalink / raw)
To: qemu-devel; +Cc: jniethe5
The prefixed instructions have a pc-relative form to use here.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/ppc/tcg-target.c.inc | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index 79e82d2f94..db3212083b 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -1242,6 +1242,15 @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
if (USE_REG_TB) {
rel = R_PPC_ADDR16;
add = ppc_tbrel_diff(s, NULL);
+ } else if (have_isa_3_10) {
+ if (type == TCG_TYPE_V64) {
+ tcg_out_8ls_d(s, PLXSD, ret & 31, 0, 0, 1);
+ new_pool_label(s, val, R_PPC64_PCREL34, s->code_ptr - 2, 0);
+ } else {
+ tcg_out_8ls_d(s, PLXV, ret & 31, 0, 0, 1);
+ new_pool_l2(s, R_PPC64_PCREL34, s->code_ptr - 2, 0, val, val);
+ }
+ return;
} else if (have_isa_3_00) {
tcg_out_addpcis(s, TCG_REG_TMP1, 0);
rel = R_PPC_REL14;
--
2.34.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v4 12/13] tcg/ppc: Use PLD in tcg_out_goto_tb
2023-10-13 17:09 [PATCH v4 00/13] tcg/ppc: direct branching, power9, power10 Richard Henderson
` (10 preceding siblings ...)
2023-10-13 17:10 ` [PATCH v4 11/13] tcg/ppc: Use prefixed instructions in tcg_out_dupi_vec Richard Henderson
@ 2023-10-13 17:10 ` Richard Henderson
2023-10-13 17:10 ` [PATCH v4 13/13] tcg/ppc: Disable TCG_REG_TB for Power9/Power10 Richard Henderson
12 siblings, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2023-10-13 17:10 UTC (permalink / raw)
To: qemu-devel; +Cc: jniethe5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/ppc/tcg-target.c.inc | 3 +++
1 file changed, 3 insertions(+)
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index db3212083b..6496f76e41 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -2753,6 +2753,9 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
if (USE_REG_TB) {
ptrdiff_t offset = ppc_tbrel_diff(s, (void *)ptr);
tcg_out_mem_long(s, LD, LDX, TCG_REG_TMP1, TCG_REG_TB, offset);
+ } else if (have_isa_3_10) {
+ ptrdiff_t offset = tcg_pcrel_diff_for_prefix(s, (void *)ptr);
+ tcg_out_8ls_d(s, PLD, TCG_REG_TMP1, 0, offset, 1);
} else if (have_isa_3_00) {
ptrdiff_t offset = tcg_pcrel_diff(s, (void *)ptr) - 4;
lo = offset;
--
2.34.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v4 13/13] tcg/ppc: Disable TCG_REG_TB for Power9/Power10
2023-10-13 17:09 [PATCH v4 00/13] tcg/ppc: direct branching, power9, power10 Richard Henderson
` (11 preceding siblings ...)
2023-10-13 17:10 ` [PATCH v4 12/13] tcg/ppc: Use PLD in tcg_out_goto_tb Richard Henderson
@ 2023-10-13 17:10 ` Richard Henderson
12 siblings, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2023-10-13 17:10 UTC (permalink / raw)
To: qemu-devel; +Cc: jniethe5
Not using TCG_REG_TB when ISA v3.0 is available appears to slightly improve performance on power9/10.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/ppc/tcg-target.c.inc | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index 6496f76e41..c31da4da9d 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -83,7 +83,7 @@
#define TCG_VEC_TMP2 TCG_REG_V1
#define TCG_REG_TB TCG_REG_R31
-#define USE_REG_TB (TCG_TARGET_REG_BITS == 64)
+#define USE_REG_TB (TCG_TARGET_REG_BITS == 64 && !have_isa_3_00)
/* Shorthand for size of a pointer. Avoid promotion to unsigned. */
#define SZP ((int)sizeof(void *))
--
2.34.1
^ permalink raw reply related [flat|nested] 14+ messages in thread