From: James Clark <james.clark@linaro.org>
To: Leo Yan <leo.yan@arm.com>
Cc: linux-arm-kernel@lists.infradead.org, coresight@lists.linaro.org,
linux-perf-users@vger.kernel.org,
Arnaldo Carvalho de Melo <acme@kernel.org>,
John Garry <john.g.garry@oracle.com>,
Will Deacon <will@kernel.org>, Mike Leach <mike.leach@arm.com>,
Suzuki K Poulose <suzuki.poulose@arm.com>,
Namhyung Kim <namhyung@kernel.org>,
Mark Rutland <mark.rutland@arm.com>,
Alexander Shishkin <alexander.shishkin@linux.intel.com>,
Jiri Olsa <jolsa@kernel.org>, Ian Rogers <irogers@google.com>,
Adrian Hunter <adrian.hunter@intel.com>,
Al Grant <al.grant@arm.com>,
Paschalis Mpeis <paschalis.mpeis@arm.com>,
Amir Ayupov <aaupov@fb.com>
Subject: Re: [PATCH v7 4/8] perf cs-etm: Use thread-stack for last branch entries
Date: Thu, 11 Jun 2026 10:01:33 +0100 [thread overview]
Message-ID: <896279e2-eded-4388-afb9-0a633404928b@linaro.org> (raw)
In-Reply-To: <20260611-b4-arm_cs_callchain_support_v1-v7-4-1ba770c862ae@arm.com>
On 11/06/2026 8:56 am, Leo Yan wrote:
> CS ETM maintains its own circular array for last branch entries, with
> local helpers to update, copy and reset the branch stack. This
> duplicates logic already provided by the common code.
>
> Record taken branches with thread_stack__event() and synthesize
> PERF_SAMPLE_BRANCH_STACK data with thread_stack__br_sample(). This
> removes the private last_branch_rb buffer and its position tracking.
>
> This also makes the branch history state belong to the thread rather
> than the trace queue. That is a better fit for CoreSight traces where
> a trace queue can effectively be CPU scoped, while call/return history
> is per thread.
>
> Keep the buffer number updated via thread_stack__set_trace_nr(), which
> is used when exporting samples to Python scripts. Pass callstack=false
> for now; synthesized callchains are added by a later patch.
>
> The output should remain the same, except that be->flags.predicted is no
> longer set. Since CoreSight trace does not provide branch prediction
> information, clearing the flag avoids confusion.
>
> Signed-off-by: Leo Yan <leo.yan@arm.com>
Reviewed-by: James Clark <james.clark@linaro.org>
> ---
> tools/perf/util/cs-etm.c | 159 ++++++++++++++---------------------------------
> 1 file changed, 46 insertions(+), 113 deletions(-)
>
> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
> index 4127120459418389ca7aabb9a49dead2b50e7533..8798bf0471faf3b1813780b45c588263ff6b4416 100644
> --- a/tools/perf/util/cs-etm.c
> +++ b/tools/perf/util/cs-etm.c
> @@ -84,10 +84,9 @@ struct cs_etm_auxtrace {
> struct cs_etm_traceid_queue {
> u8 trace_chan_id;
> u64 period_instructions;
> - size_t last_branch_pos;
> union perf_event *event_buf;
> + unsigned int br_stack_sz;
> struct branch_stack *last_branch;
> - struct branch_stack *last_branch_rb;
> struct cs_etm_packet *prev_packet;
> struct cs_etm_packet *packet;
> struct cs_etm_packet_queue packet_queue;
> @@ -644,9 +643,8 @@ static int cs_etm__init_traceid_queue(struct cs_etm_queue *etmq,
> tidq->last_branch = zalloc(sz);
> if (!tidq->last_branch)
> goto out_free;
> - tidq->last_branch_rb = zalloc(sz);
> - if (!tidq->last_branch_rb)
> - goto out_free;
> +
> + tidq->br_stack_sz = etm->synth_opts.last_branch_sz;
> }
>
> tidq->event_buf = malloc(PERF_SAMPLE_MAX_SIZE);
> @@ -656,7 +654,6 @@ static int cs_etm__init_traceid_queue(struct cs_etm_queue *etmq,
> return 0;
>
> out_free:
> - zfree(&tidq->last_branch_rb);
> zfree(&tidq->last_branch);
> zfree(&tidq->prev_packet);
> zfree(&tidq->packet);
> @@ -939,7 +936,6 @@ static void cs_etm__free_traceid_queues(struct cs_etm_queue *etmq)
> thread__zput(tidq->decode_thread);
> zfree(&tidq->event_buf);
> zfree(&tidq->last_branch);
> - zfree(&tidq->last_branch_rb);
> zfree(&tidq->prev_packet);
> zfree(&tidq->packet);
> zfree(&tidq);
> @@ -1299,57 +1295,6 @@ static int cs_etm__queue_first_cs_timestamp(struct cs_etm_auxtrace *etm,
> return ret;
> }
>
> -static inline
> -void cs_etm__copy_last_branch_rb(struct cs_etm_queue *etmq,
> - struct cs_etm_traceid_queue *tidq)
> -{
> - struct branch_stack *bs_src = tidq->last_branch_rb;
> - struct branch_stack *bs_dst = tidq->last_branch;
> - size_t nr = 0;
> -
> - /*
> - * Set the number of records before early exit: ->nr is used to
> - * determine how many branches to copy from ->entries.
> - */
> - bs_dst->nr = bs_src->nr;
> -
> - /*
> - * Early exit when there is nothing to copy.
> - */
> - if (!bs_src->nr)
> - return;
> -
> - /*
> - * As bs_src->entries is a circular buffer, we need to copy from it in
> - * two steps. First, copy the branches from the most recently inserted
> - * branch ->last_branch_pos until the end of bs_src->entries buffer.
> - */
> - nr = etmq->etm->synth_opts.last_branch_sz - tidq->last_branch_pos;
> - memcpy(&bs_dst->entries[0],
> - &bs_src->entries[tidq->last_branch_pos],
> - sizeof(struct branch_entry) * nr);
> -
> - /*
> - * If we wrapped around at least once, the branches from the beginning
> - * of the bs_src->entries buffer and until the ->last_branch_pos element
> - * are older valid branches: copy them over. The total number of
> - * branches copied over will be equal to the number of branches asked by
> - * the user in last_branch_sz.
> - */
> - if (bs_src->nr >= etmq->etm->synth_opts.last_branch_sz) {
> - memcpy(&bs_dst->entries[nr],
> - &bs_src->entries[0],
> - sizeof(struct branch_entry) * tidq->last_branch_pos);
> - }
> -}
> -
> -static inline
> -void cs_etm__reset_last_branch_rb(struct cs_etm_traceid_queue *tidq)
> -{
> - tidq->last_branch_pos = 0;
> - tidq->last_branch_rb->nr = 0;
> -}
> -
> static inline int cs_etm__t32_instr_size(struct cs_etm_queue *etmq,
> struct cs_etm_traceid_queue *tidq,
> struct cs_etm_packet *packet, u64 addr)
> @@ -1419,38 +1364,6 @@ static inline u64 cs_etm__instr_addr(struct cs_etm_queue *etmq,
> return addr;
> }
>
> -static void cs_etm__update_last_branch_rb(struct cs_etm_queue *etmq,
> - struct cs_etm_traceid_queue *tidq)
> -{
> - struct branch_stack *bs = tidq->last_branch_rb;
> - struct branch_entry *be;
> -
> - /*
> - * The branches are recorded in a circular buffer in reverse
> - * chronological order: we start recording from the last element of the
> - * buffer down. After writing the first element of the stack, move the
> - * insert position back to the end of the buffer.
> - */
> - if (!tidq->last_branch_pos)
> - tidq->last_branch_pos = etmq->etm->synth_opts.last_branch_sz;
> -
> - tidq->last_branch_pos -= 1;
> -
> - be = &bs->entries[tidq->last_branch_pos];
> - be->from = cs_etm__last_executed_instr(tidq->prev_packet);
> - be->to = cs_etm__first_executed_instr(tidq->packet);
> - /* No support for mispredict */
> - be->flags.mispred = 0;
> - be->flags.predicted = 1;
> -
> - /*
> - * Increment bs->nr until reaching the number of last branches asked by
> - * the user on the command line.
> - */
> - if (bs->nr < etmq->etm->synth_opts.last_branch_sz)
> - bs->nr += 1;
> -}
> -
> static int cs_etm__inject_event(struct cs_etm_auxtrace *etm, union perf_event *event,
> struct perf_sample *sample, u64 type)
> {
> @@ -1614,6 +1527,42 @@ static inline u64 cs_etm__resolve_sample_time(struct cs_etm_queue *etmq,
> return etm->latest_kernel_timestamp;
> }
>
> +static bool cs_etm__packet_has_taken_branch(struct cs_etm_packet *packet)
> +{
> + if (packet->sample_type == CS_ETM_RANGE &&
> + packet->last_instr_taken_branch)
> + return true;
> +
> + return false;
> +}
> +
> +static void cs_etm__add_stack_event(struct cs_etm_queue *etmq,
> + struct cs_etm_traceid_queue *tidq)
> +{
> + u64 from, to;
> + int size;
> +
> + if (!cs_etm__packet_has_taken_branch(tidq->prev_packet))
> + return;
> +
> + if (etmq->etm->synth_opts.last_branch) {
> + from = cs_etm__last_executed_instr(tidq->prev_packet);
> + to = cs_etm__first_executed_instr(tidq->packet);
> +
> + size = cs_etm__instr_size(etmq, tidq, tidq->prev_packet, from);
> +
> + /* Enable callchain so thread stack entry can be allocated */
> + thread_stack__event(tidq->frontend_thread, tidq->prev_packet->cpu,
> + tidq->prev_packet->flags, from, to, size,
> + etmq->buffer->buffer_nr + 1, false,
> + tidq->br_stack_sz, 0);
> + } else {
> + thread_stack__set_trace_nr(tidq->frontend_thread,
> + tidq->prev_packet->cpu,
> + etmq->buffer->buffer_nr + 1);
> + }
> +}
> +
> static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq,
> struct cs_etm_traceid_queue *tidq,
> struct cs_etm_packet *packet,
> @@ -1644,8 +1593,11 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq,
>
> cs_etm__copy_insn(etmq, tidq, packet, &sample);
>
> - if (etm->synth_opts.last_branch)
> + if (etm->synth_opts.last_branch) {
> + thread_stack__br_sample(tidq->frontend_thread, tidq->packet->cpu,
> + tidq->last_branch, tidq->br_stack_sz);
> sample.branch_stack = tidq->last_branch;
> + }
>
> if (etm->synth_opts.inject) {
> ret = cs_etm__inject_event(etm, event, &sample,
> @@ -1836,14 +1788,7 @@ static int cs_etm__sample(struct cs_etm_queue *etmq,
>
> tidq->period_instructions += tidq->packet->instr_count;
>
> - /*
> - * Record a branch when the last instruction in
> - * PREV_PACKET is a branch.
> - */
> - if (etm->synth_opts.last_branch &&
> - tidq->prev_packet->sample_type == CS_ETM_RANGE &&
> - tidq->prev_packet->last_instr_taken_branch)
> - cs_etm__update_last_branch_rb(etmq, tidq);
> + cs_etm__add_stack_event(etmq, tidq);
>
> if (etm->synth_opts.instructions &&
> tidq->period_instructions >= etm->instructions_sample_period) {
> @@ -1902,10 +1847,6 @@ static int cs_etm__sample(struct cs_etm_queue *etmq,
> u64 offset = etm->instructions_sample_period - instrs_prev;
> u64 addr;
>
> - /* Prepare last branches for instruction sample */
> - if (etm->synth_opts.last_branch)
> - cs_etm__copy_last_branch_rb(etmq, tidq);
> -
> while (tidq->period_instructions >=
> etm->instructions_sample_period) {
> /*
> @@ -1936,8 +1877,7 @@ static int cs_etm__sample(struct cs_etm_queue *etmq,
> generate_sample = true;
>
> /* Generate sample for branch taken packet */
> - if (tidq->prev_packet->sample_type == CS_ETM_RANGE &&
> - tidq->prev_packet->last_instr_taken_branch)
> + if (cs_etm__packet_has_taken_branch(tidq->prev_packet))
> generate_sample = true;
>
> if (generate_sample) {
> @@ -1985,10 +1925,6 @@ static int cs_etm__flush(struct cs_etm_queue *etmq,
> etmq->etm->synth_opts.instructions &&
> tidq->prev_packet->sample_type == CS_ETM_RANGE) {
> u64 addr;
> -
> - /* Prepare last branches for instruction sample */
> - cs_etm__copy_last_branch_rb(etmq, tidq);
> -
> /*
> * Generate a last branch event for the branches left in the
> * circular buffer at the end of the trace.
> @@ -2020,7 +1956,7 @@ static int cs_etm__flush(struct cs_etm_queue *etmq,
>
> /* Reset last branches after flush the trace */
> if (etm->synth_opts.last_branch)
> - cs_etm__reset_last_branch_rb(tidq);
> + thread_stack__flush(tidq->frontend_thread);
>
> return err;
> }
> @@ -2044,9 +1980,6 @@ static int cs_etm__end_block(struct cs_etm_queue *etmq,
> tidq->prev_packet->sample_type == CS_ETM_RANGE) {
> u64 addr;
>
> - /* Prepare last branches for instruction sample */
> - cs_etm__copy_last_branch_rb(etmq, tidq);
> -
> /*
> * Use the address of the end of the last reported execution
> * range.
>
next prev parent reply other threads:[~2026-06-11 9:01 UTC|newest]
Thread overview: 13+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-11 7:56 [PATCH v7 0/8] perf cs-etm: Support thread stack and callchain Leo Yan
2026-06-11 7:56 ` [PATCH v7 1/8] perf cs-etm: Filter synthesized branch samples Leo Yan
2026-06-11 8:58 ` James Clark
2026-06-11 7:56 ` [PATCH v7 2/8] perf cs-etm: Decode ETE exception packets Leo Yan
2026-06-11 7:56 ` [PATCH v7 3/8] perf cs-etm: Refactor instruction size handling Leo Yan
2026-06-11 7:56 ` [PATCH v7 4/8] perf cs-etm: Use thread-stack for last branch entries Leo Yan
2026-06-11 9:01 ` James Clark [this message]
2026-06-11 7:56 ` [PATCH v7 5/8] perf cs-etm: Flush thread stacks after decoder reset Leo Yan
2026-06-11 7:57 ` [PATCH v7 6/8] perf cs-etm: Support call indentation Leo Yan
2026-06-11 7:57 ` [PATCH v7 7/8] perf cs-etm: Synthesize callchains for instruction samples Leo Yan
2026-06-11 7:57 ` [PATCH v7 8/8] perf test: Add Arm CoreSight callchain test Leo Yan
2026-06-11 9:11 ` James Clark
2026-06-11 12:42 ` Leo Yan
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=896279e2-eded-4388-afb9-0a633404928b@linaro.org \
--to=james.clark@linaro.org \
--cc=aaupov@fb.com \
--cc=acme@kernel.org \
--cc=adrian.hunter@intel.com \
--cc=al.grant@arm.com \
--cc=alexander.shishkin@linux.intel.com \
--cc=coresight@lists.linaro.org \
--cc=irogers@google.com \
--cc=john.g.garry@oracle.com \
--cc=jolsa@kernel.org \
--cc=leo.yan@arm.com \
--cc=linux-arm-kernel@lists.infradead.org \
--cc=linux-perf-users@vger.kernel.org \
--cc=mark.rutland@arm.com \
--cc=mike.leach@arm.com \
--cc=namhyung@kernel.org \
--cc=paschalis.mpeis@arm.com \
--cc=suzuki.poulose@arm.com \
--cc=will@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox