From: Pierrick Bouvier <pierrick.bouvier@linaro.org>
To: "Alex Bennée" <alex.bennee@linaro.org>, qemu-devel@nongnu.org
Cc: Richard Henderson <richard.henderson@linaro.org>,
Paolo Bonzini <pbonzini@redhat.com>,
Riku Voipio <riku.voipio@iki.fi>
Subject: Re: [RFC PATCH] accel/tcg: add tracepoints for cpu_loop_exit_atomic
Date: Fri, 4 Oct 2024 17:09:13 -0700 [thread overview]
Message-ID: <c6df1bc2-f725-4f34-9beb-81a370173d00@linaro.org> (raw)
In-Reply-To: <20241004135226.903570-1-alex.bennee@linaro.org>
On 10/4/24 06:52, Alex Bennée wrote:
> We try to avoid using cpu_loop_exit_atomic as it brings in an all-core
> sync point. However on some cpu/kernel/benchmark combinations it is
> starting to show up in the performance profile. To make it easier to
> see whats going on add tracepoints for the slow path so we can see
> what is triggering the wait.
>
> It seems for a modern CPU it can be quite a bit, for example:
>
> ./qemu-system-aarch64 \
> -machine type=virt,virtualization=on,pflash0=rom,pflash1=efivars,gic-version=max \
> -smp 4 \
> -accel tcg \
> -device virtio-net-pci,netdev=unet \
> -device virtio-scsi-pci \
> -device scsi-hd,drive=hd \
> -netdev user,id=unet,hostfwd=tcp::2222-:22 \
> -blockdev driver=raw,node-name=hd,file.driver=host_device,file.filename=/dev/zen-ssd2/trixie-arm64,discard=unmap \
> -serial mon:stdio \
> -blockdev node-name=rom,driver=file,filename=(pwd)/pc-bios/edk2-aarch64-code.fd,read-only=true \
> -blockdev node-name=efivars,driver=file,filename=$HOME/images/qemu-arm64-efivars \
> -m 8192 \
> -object memory-backend-memfd,id=mem,size=8G,share=on \
> -kernel /home/alex/lsrc/linux.git/builds/arm64/arch/arm64/boot/Image -append "root=/dev/sda2 console=ttyAMA0 systemd.unit=benchmark-stress-ng.service" \
> -display none -d trace:load_atom\*_fallback,trace:store_atom\*_fallback
>
> With:
>
> -cpu neoverse-v1,pauth-impdef=on => 2203343
>
> With:
>
> -cpu cortex-a76 => 0
>
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> Cc: Pierrick Bouvier <pierrick.bouvier@linaro.org>
> ---
> accel/tcg/user-exec.c | 2 +-
> accel/tcg/ldst_atomicity.c.inc | 9 +++++++++
> accel/tcg/trace-events | 12 ++++++++++++
> 3 files changed, 22 insertions(+), 1 deletion(-)
>
> diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
> index 7ddc47b0ba..f3a440ca29 100644
> --- a/accel/tcg/user-exec.c
> +++ b/accel/tcg/user-exec.c
> @@ -29,7 +29,7 @@
> #include "exec/page-protection.h"
> #include "exec/helper-proto.h"
> #include "qemu/atomic128.h"
> -#include "trace/trace-root.h"
> +#include "trace.h"
> #include "tcg/tcg-ldst.h"
> #include "internal-common.h"
> #include "internal-target.h"
> diff --git a/accel/tcg/ldst_atomicity.c.inc b/accel/tcg/ldst_atomicity.c.inc
> index 134da3c1da..c735add261 100644
> --- a/accel/tcg/ldst_atomicity.c.inc
> +++ b/accel/tcg/ldst_atomicity.c.inc
> @@ -168,6 +168,7 @@ static uint64_t load_atomic8_or_exit(CPUState *cpu, uintptr_t ra, void *pv)
> #endif
>
> /* Ultimate fallback: re-execute in serial context. */
> + trace_load_atom8_or_exit_fallback(ra);
> cpu_loop_exit_atomic(cpu, ra);
> }
>
> @@ -212,6 +213,7 @@ static Int128 load_atomic16_or_exit(CPUState *cpu, uintptr_t ra, void *pv)
> }
>
> /* Ultimate fallback: re-execute in serial context. */
> + trace_load_atom16_or_exit_fallback(ra);
> cpu_loop_exit_atomic(cpu, ra);
> }
>
> @@ -519,6 +521,7 @@ static uint64_t load_atom_8(CPUState *cpu, uintptr_t ra,
> if (HAVE_al8) {
> return load_atom_extract_al8x2(pv);
> }
> + trace_load_atom8_fallback(memop, ra);
> cpu_loop_exit_atomic(cpu, ra);
> default:
> g_assert_not_reached();
> @@ -563,6 +566,7 @@ static Int128 load_atom_16(CPUState *cpu, uintptr_t ra,
> break;
> case MO_64:
> if (!HAVE_al8) {
> + trace_load_atom16_fallback(memop, ra);
> cpu_loop_exit_atomic(cpu, ra);
> }
> a = load_atomic8(pv);
> @@ -570,6 +574,7 @@ static Int128 load_atom_16(CPUState *cpu, uintptr_t ra,
> break;
> case -MO_64:
> if (!HAVE_al8) {
> + trace_load_atom16_fallback(memop, ra);
> cpu_loop_exit_atomic(cpu, ra);
> }
> a = load_atom_extract_al8x2(pv);
> @@ -897,6 +902,7 @@ static void store_atom_2(CPUState *cpu, uintptr_t ra,
> g_assert_not_reached();
> }
>
> + trace_store_atom2_fallback(memop, ra);
> cpu_loop_exit_atomic(cpu, ra);
> }
>
> @@ -961,6 +967,7 @@ static void store_atom_4(CPUState *cpu, uintptr_t ra,
> return;
> }
> }
> + trace_store_atom4_fallback(memop, ra);
> cpu_loop_exit_atomic(cpu, ra);
> default:
> g_assert_not_reached();
> @@ -1029,6 +1036,7 @@ static void store_atom_8(CPUState *cpu, uintptr_t ra,
> default:
> g_assert_not_reached();
> }
> + trace_store_atom8_fallback(memop, ra);
> cpu_loop_exit_atomic(cpu, ra);
> }
>
> @@ -1107,5 +1115,6 @@ static void store_atom_16(CPUState *cpu, uintptr_t ra,
> default:
> g_assert_not_reached();
> }
> + trace_store_atom16_fallback(memop, ra);
> cpu_loop_exit_atomic(cpu, ra);
> }
> diff --git a/accel/tcg/trace-events b/accel/tcg/trace-events
> index 4e9b450520..0ce69d744f 100644
> --- a/accel/tcg/trace-events
> +++ b/accel/tcg/trace-events
> @@ -12,3 +12,15 @@ memory_notdirty_set_dirty(uint64_t vaddr) "0x%" PRIx64
>
> # translate-all.c
> translate_block(void *tb, uintptr_t pc, const void *tb_code) "tb:%p, pc:0x%"PRIxPTR", tb_code:%p"
> +
> +# ldst_atomicity
> +load_atom2_fallback(uint32_t memop, uintptr_t ra) "mop:0x%"PRIx32", ra:%"PRIxPTR""
> +load_atom4_fallback(uint32_t memop, uintptr_t ra) "mop:0x%"PRIx32", ra:%"PRIxPTR""
> +load_atom8_or_exit_fallback(uintptr_t ra) "ra:%"PRIxPTR""
> +load_atom8_fallback(uint32_t memop, uintptr_t ra) "mop:0x%"PRIx32", ra:%"PRIxPTR""
> +load_atom16_fallback(uint32_t memop, uintptr_t ra) "mop:0x%"PRIx32", ra:%"PRIxPTR""
> +load_atom16_or_exit_fallback(uintptr_t ra) "ra:%"PRIxPTR""
> +store_atom2_fallback(uint32_t memop, uintptr_t ra) "mop:0x%"PRIx32", ra:%"PRIxPTR""
> +store_atom4_fallback(uint32_t memop, uintptr_t ra) "mop:0x%"PRIx32", ra:%"PRIxPTR""
> +store_atom8_fallback(uint32_t memop, uintptr_t ra) "mop:0x%"PRIx32", ra:%"PRIxPTR""
> +store_atom16_fallback(uint32_t memop, uintptr_t ra) "mop:0x%"PRIx32", ra:%"PRIxPTR""
Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
prev parent reply other threads:[~2024-10-05 0:10 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-10-04 13:52 [RFC PATCH] accel/tcg: add tracepoints for cpu_loop_exit_atomic Alex Bennée
2024-10-04 14:34 ` Richard Henderson
2024-10-05 0:09 ` Pierrick Bouvier [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=c6df1bc2-f725-4f34-9beb-81a370173d00@linaro.org \
--to=pierrick.bouvier@linaro.org \
--cc=alex.bennee@linaro.org \
--cc=pbonzini@redhat.com \
--cc=qemu-devel@nongnu.org \
--cc=richard.henderson@linaro.org \
--cc=riku.voipio@iki.fi \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).