* [RFC PATCH 1/3] accel/tcg: Option to permit incoherent translation block cache vs stores
2025-03-31 15:54 [RFC PATCH 0/3] translation performance improvements Nicholas Piggin
@ 2025-03-31 15:54 ` Nicholas Piggin
2025-03-31 19:51 ` Richard Henderson
2025-03-31 15:54 ` [RFC PATCH 2/3] target/ppc: define TARGET_HAS_LAZY_ICACHE Nicholas Piggin
` (2 subsequent siblings)
3 siblings, 1 reply; 9+ messages in thread
From: Nicholas Piggin @ 2025-03-31 15:54 UTC (permalink / raw)
To: qemu-ppc; +Cc: Nicholas Piggin, qemu-devel, Richard Henderson
Add an option TARGET_HAS_LAZY_ICACHE that does not invalidate TBs upon
store, but instead tracks that the icache has become incoherent, and
provides a tb_flush_incoherent() function that the target may call to
bring the TB back to coherency.
XXX: docs/devel/tcg.rst says that this is not permitted because TB must
be coherent with memory to handle exceptions correctly... I'm not sure
where this is, maybe it can be worked around and is not a showstopper?
---
accel/tcg/tb-internal.h | 10 ++++++
include/exec/tb-flush.h | 3 ++
accel/tcg/cputlb.c | 15 +++++++--
accel/tcg/tb-maint.c | 73 ++++++++++++++++++++++++++++++++++++++++
system/memory_ldst.c.inc | 2 +-
5 files changed, 99 insertions(+), 4 deletions(-)
diff --git a/accel/tcg/tb-internal.h b/accel/tcg/tb-internal.h
index 68aa8d17f41..ccc59eac3f5 100644
--- a/accel/tcg/tb-internal.h
+++ b/accel/tcg/tb-internal.h
@@ -82,6 +82,16 @@ void tb_unlock_pages(TranslationBlock *);
void tb_invalidate_phys_range_fast(ram_addr_t ram_addr,
unsigned size,
uintptr_t retaddr);
+#ifdef TARGET_HAS_LAZY_ICACHE
+void tb_store_to_phys_range(ram_addr_t ram_addr,
+ unsigned size, uintptr_t retaddr);
+#else
+static inline void tb_store_to_phys_range(ram_addr_t ram_addr,
+ unsigned size, uintptr_t retaddr)
+{
+ tb_invalidate_phys_range_fast(ram_addr, size, retaddr);
+}
+#endif
#endif /* CONFIG_SOFTMMU */
bool tb_invalidate_phys_page_unwind(tb_page_addr_t addr, uintptr_t pc);
diff --git a/include/exec/tb-flush.h b/include/exec/tb-flush.h
index 142c240d94c..1f8b718be57 100644
--- a/include/exec/tb-flush.h
+++ b/include/exec/tb-flush.h
@@ -23,6 +23,9 @@
*/
void tb_flush(CPUState *cs);
+/* like tb_flush() but only flush incoherent blocks */
+void tb_flush_incoherent(CPUState *cpu);
+
void tcg_flush_jmp_cache(CPUState *cs);
#endif /* _TB_FLUSH_H_ */
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 500c8d3adc0..2a19c82e5d4 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -1338,18 +1338,27 @@ static void notdirty_write(CPUState *cpu, vaddr mem_vaddr, unsigned size,
CPUTLBEntryFull *full, uintptr_t retaddr)
{
ram_addr_t ram_addr = mem_vaddr + full->xlat_section;
+ uint8_t mask;
trace_memory_notdirty_write_access(mem_vaddr, ram_addr, size);
if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
- tb_invalidate_phys_range_fast(ram_addr, size, retaddr);
+ tb_store_to_phys_range(ram_addr, size, retaddr);
}
/*
* Set both VGA and migration bits for simplicity and to remove
- * the notdirty callback faster.
+ * the notdirty callback faster. Incoherent icache also sets the
+ * code bit because incoherency is tracked and resolved in the TB
+ * code.
*/
- cpu_physical_memory_set_dirty_range(ram_addr, size, DIRTY_CLIENTS_NOCODE);
+#ifdef TARGET_HAS_LAZY_ICACHE
+ mask = DIRTY_CLIENTS_ALL;
+#else
+ mask = DIRTY_CLIENTS_NOCODE;
+#endif
+
+ cpu_physical_memory_set_dirty_range(ram_addr, size, mask);
/* We remove the notdirty callback only if the code has been flushed. */
if (!cpu_physical_memory_is_clean(ram_addr)) {
diff --git a/accel/tcg/tb-maint.c b/accel/tcg/tb-maint.c
index 3f1bebf6ab5..4f634469456 100644
--- a/accel/tcg/tb-maint.c
+++ b/accel/tcg/tb-maint.c
@@ -60,10 +60,20 @@ static bool tb_cmp(const void *ap, const void *bp)
tb_page_addr1(a) == tb_page_addr1(b));
}
+#ifdef TARGET_HAS_LAZY_ICACHE
+static QemuSpin icache_incoherent_lock;
+static bool icache_incoherent;
+static ram_addr_t icache_incoherent_start;
+static ram_addr_t icache_incoherent_end;
+#endif
+
void tb_htable_init(void)
{
unsigned int mode = QHT_MODE_AUTO_RESIZE;
+#ifdef TARGET_HAS_LAZY_ICACHE
+ qemu_spin_init(&icache_incoherent_lock);
+#endif
qht_init(&tb_ctx.htable, tb_cmp, CODE_GEN_HTABLE_SIZE, mode);
}
@@ -803,6 +813,35 @@ void tb_flush(CPUState *cpu)
}
}
+#ifdef TARGET_HAS_LAZY_ICACHE
+static void do_tb_flush_incoherent(CPUState *cpu, run_on_cpu_data data)
+{
+ /* Should be no other CPUs running */
+ assert(!qemu_spin_trylock(&icache_incoherent_lock));
+ if (icache_incoherent) {
+ unsigned tb_flush_count = qatomic_read(&tb_ctx.tb_flush_count);
+
+ tb_invalidate_phys_range(icache_incoherent_start,
+ icache_incoherent_end);
+ icache_incoherent = false;
+ icache_incoherent_start = 0;
+ icache_incoherent_end = 0;
+ qemu_spin_unlock(&icache_incoherent_lock);
+
+ do_tb_flush(cpu, RUN_ON_CPU_HOST_INT(tb_flush_count));
+ } else {
+ qemu_spin_unlock(&icache_incoherent_lock);
+ }
+}
+
+void tb_flush_incoherent(CPUState *cpu)
+{
+ if (tcg_enabled() && icache_incoherent) {
+ async_safe_run_on_cpu(cpu, do_tb_flush_incoherent, RUN_ON_CPU_NULL);
+ }
+}
+#endif
+
/* remove @orig from its @n_orig-th jump list */
static inline void tb_remove_from_jmp_list(TranslationBlock *orig, int n_orig)
{
@@ -1231,4 +1270,38 @@ void tb_invalidate_phys_range_fast(ram_addr_t ram_addr,
page_collection_unlock(pages);
}
+
+void tb_store_to_phys_range(ram_addr_t ram_addr,
+ unsigned size, uintptr_t retaddr)
+{
+#ifndef TARGET_HAS_LAZY_ICACHE
+ tb_invalidate_phys_range_fast(ram_addr, size, retaddr);
+#else
+ ram_addr_t start, end;
+
+ /*
+ * Address comes in as byte-wise, but the cputlb dirty tracking operates
+ * on a page basis and will not be called a second time on the same page,
+ * so must cover a full page here.
+ */
+ start = ROUND_DOWN(ram_addr, TARGET_PAGE_SIZE);
+ end = ROUND_UP(ram_addr + size, TARGET_PAGE_SIZE) - 1;
+
+ qemu_spin_lock(&icache_incoherent_lock);
+ if (icache_incoherent) {
+ if (start < icache_incoherent_start) {
+ icache_incoherent_start = start;
+ }
+ if (end > icache_incoherent_end) {
+ icache_incoherent_end = end;
+ }
+ } else {
+ icache_incoherent = true;
+ icache_incoherent_start = start;
+ icache_incoherent_end = end;
+ }
+ qemu_spin_unlock(&icache_incoherent_lock);
+#endif
+}
+
#endif /* CONFIG_USER_ONLY */
diff --git a/system/memory_ldst.c.inc b/system/memory_ldst.c.inc
index 7f32d3d9ff3..029d9749d57 100644
--- a/system/memory_ldst.c.inc
+++ b/system/memory_ldst.c.inc
@@ -286,7 +286,7 @@ void glue(address_space_stl_notdirty, SUFFIX)(ARG1_DECL,
stl_p(ptr, val);
dirty_log_mask = memory_region_get_dirty_log_mask(mr);
- dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
+ dirty_log_mask &= DIRTY_CLIENTS_NOCODE;
cpu_physical_memory_set_dirty_range(memory_region_get_ram_addr(mr) + addr,
4, dirty_log_mask);
r = MEMTX_OK;
--
2.47.1
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [RFC PATCH 2/3] target/ppc: define TARGET_HAS_LAZY_ICACHE
2025-03-31 15:54 [RFC PATCH 0/3] translation performance improvements Nicholas Piggin
2025-03-31 15:54 ` [RFC PATCH 1/3] accel/tcg: Option to permit incoherent translation block cache vs stores Nicholas Piggin
@ 2025-03-31 15:54 ` Nicholas Piggin
2025-03-31 15:54 ` [RFC PATCH 3/3] target/ppc: Allow goto-tb on fixed real mode translations Nicholas Piggin
2025-03-31 19:40 ` [RFC PATCH 0/3] translation performance improvements Richard Henderson
3 siblings, 0 replies; 9+ messages in thread
From: Nicholas Piggin @ 2025-03-31 15:54 UTC (permalink / raw)
To: qemu-ppc; +Cc: Nicholas Piggin, qemu-devel, Richard Henderson
Use the new incoherent icache (incoherent TB) feature in the ppc target.
Performance problems with notdirty write accesses have been encountered
in two places now. One is where a large number of executable pages have
been freed (typically in KVM when a guest exits) and are being cleared
for reuse, most stores in a page will take the notdirty slowpath, which
can cause such a slowdown that the OS reports lockups. The other case is
PowerVM boot firmware which has real-mode interrupt handler code that
stores to memory in the same page-sized region as interrupt handler code,
which causes significant slowdowns.
ppc implements TARGET_HAS_LAZY_ICACHE by calling tb_flush_incoherent()
from the ICBI instruction, which should conform to the ISA's CMODX (aka
SMC) requirement.
---
target/ppc/cpu.h | 16 ++++++++++++++++
target/ppc/mem_helper.c | 2 ++
target/ppc/translate.c | 1 +
3 files changed, 19 insertions(+)
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 74ed28c8dac..de274d29637 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -34,6 +34,22 @@
#define TARGET_PAGE_BITS_64K 16
#define TARGET_PAGE_BITS_16M 24
+/* icaches are not kept coherent with dcaches. target is to call
+ * tb_flush_incoherent() to bring them into coherency */
+#define TARGET_HAS_LAZY_ICACHE
+/*
+ * Note that this does not model implementation specific behaviour of all
+ * CPUs, notably recent Power CPUs do keep i/d coherent, and only require
+ * context synchronization after code modification to ensure CPU pipeline
+ * is coherent. The ISA and User Manuals do say that icbi (to any address) ;
+ * isync should be used even for these CPUs, so tb_flush_incoherent() in
+ * icbi should work reasonably. The ppc target should continue to work without
+ * TARGET_HAS_LAZY_ICACHE, but some performance corner cases benefit (e.g.,
+ * KVM when clearing a lot of memory freed from a guest that has a lot of exec
+ * pages; PowerVM PFW/boot firmware that stores to globals in the same page as
+ * it executes from).
+ */
+
#if defined(TARGET_PPC64)
#define PPC_ELF_MACHINE EM_PPC64
#else
diff --git a/target/ppc/mem_helper.c b/target/ppc/mem_helper.c
index 51b137febd6..647d37195dd 100644
--- a/target/ppc/mem_helper.c
+++ b/target/ppc/mem_helper.c
@@ -24,6 +24,7 @@
#include "exec/helper-proto.h"
#include "helper_regs.h"
#include "exec/cpu_ldst.h"
+#include "exec/tb-flush.h"
#include "internal.h"
#include "qemu/atomic128.h"
@@ -335,6 +336,7 @@ void helper_icbi(CPUPPCState *env, target_ulong addr)
* do the load "by hand".
*/
cpu_ldl_data_ra(env, addr, GETPC());
+ tb_flush_incoherent(env_cpu(env));
}
void helper_icbiep(CPUPPCState *env, target_ulong addr)
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index 7f933537aaa..5e610bf29a5 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -4565,6 +4565,7 @@ static void gen_dss(DisasContext *ctx)
static void gen_icbi(DisasContext *ctx)
{
TCGv t0;
+ translator_io_start(&ctx->base);
gen_set_access_type(ctx, ACCESS_CACHE);
t0 = tcg_temp_new();
gen_addr_reg_index(ctx, t0);
--
2.47.1
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [RFC PATCH 3/3] target/ppc: Allow goto-tb on fixed real mode translations
2025-03-31 15:54 [RFC PATCH 0/3] translation performance improvements Nicholas Piggin
2025-03-31 15:54 ` [RFC PATCH 1/3] accel/tcg: Option to permit incoherent translation block cache vs stores Nicholas Piggin
2025-03-31 15:54 ` [RFC PATCH 2/3] target/ppc: define TARGET_HAS_LAZY_ICACHE Nicholas Piggin
@ 2025-03-31 15:54 ` Nicholas Piggin
2025-03-31 19:40 ` [RFC PATCH 0/3] translation performance improvements Richard Henderson
3 siblings, 0 replies; 9+ messages in thread
From: Nicholas Piggin @ 2025-03-31 15:54 UTC (permalink / raw)
To: qemu-ppc; +Cc: Nicholas Piggin, qemu-devel, Richard Henderson
Fixed translations (mapping and protections unchanged) do not have
to restrict TB chaining to within a target page.
Hypervisor-real mode is a fixed translation.
TODO: Supervisor-real mode in spapr should also be a fixed translation.
---
target/ppc/translate.c | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index 5e610bf29a5..4b4440c8a16 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -183,6 +183,7 @@ struct DisasContext {
bool sf_mode;
bool has_cfar;
bool has_bhrb;
+ bool ifetch_fixed_xlate;
#endif
bool fpu_enabled;
bool altivec_enabled;
@@ -3656,6 +3657,18 @@ static inline bool use_goto_tb(DisasContext *ctx, target_ulong dest)
if (unlikely(ctx->singlestep_enabled)) {
return false;
}
+
+#if defined(TARGET_PPC64)
+ /* XXX: make translator_use_goto_tb take a 'fixed map' bool */
+ /* Suppress goto_tb if requested. */
+ if (ctx->ifetch_fixed_xlate) {
+ if (tb_cflags(ctx->base.tb) & CF_NO_GOTO_TB) {
+ return false;
+ }
+ return true;
+ }
+#endif
+
return translator_use_goto_tb(&ctx->base, dest);
}
@@ -6545,6 +6558,7 @@ static void ppc_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs)
ctx->sf_mode = (hflags >> HFLAGS_64) & 1;
ctx->has_cfar = !!(env->flags & POWERPC_FLAG_CFAR);
ctx->has_bhrb = !!(env->flags & POWERPC_FLAG_BHRB);
+ ctx->ifetch_fixed_xlate = ((hflags >> HFLAGS_IMMU_IDX) & 7) == 3;
#endif
ctx->lazy_tlb_flush = env->mmu_model == POWERPC_MMU_32B
|| env->mmu_model & POWERPC_MMU_64;
@@ -6627,6 +6641,12 @@ static void ppc_tr_translate_insn(DisasContextBase *dcbase, CPUState *cs)
gen_invalid(ctx);
}
+#if defined(TARGET_PPC64)
+ if (ctx->ifetch_fixed_xlate) {
+ return;
+ }
+#endif
+
/* End the TB when crossing a page boundary. */
if (ctx->base.is_jmp == DISAS_NEXT && !(pc & ~TARGET_PAGE_MASK)) {
ctx->base.is_jmp = DISAS_TOO_MANY;
--
2.47.1
^ permalink raw reply related [flat|nested] 9+ messages in thread