From: Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com>
To: Adrian Hunter <adrian.hunter@intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>, linux-kernel@vger.kernel.org
Subject: Re: [PATCH 6/6] perf thread-stack: Hide x86 retpolines
Date: Fri, 22 Feb 2019 16:42:44 -0300 [thread overview]
Message-ID: <20190222194244.GF26132@kernel.org> (raw)
In-Reply-To: <20190109091835.5570-7-adrian.hunter@intel.com>
Em Wed, Jan 09, 2019 at 11:18:35AM +0200, Adrian Hunter escreveu:
> x86 retpoline functions pollute the call graph by showing up everywhere
> there is an indirect branch, but they do not really mean anything. Make
> changes so that the default retpoline functions will no longer appear in
> the call graph. Note this only affects the call graph, since all the
> original branches are left unchanged.
>
> This does not handle function return thunks, nor is there any improvement
> for the handling of inline thunks or extern thunks.
>
> Example:
>
> $ cat simple-retpoline.c
> __attribute__((noinline)) int bar(void)
> {
> return -1;
> }
>
> int foo(void)
> {
> return bar() + 1;
> }
>
> __attribute__((indirect_branch("thunk"))) int main()
> {
> int (*volatile fn)(void) = foo;
>
> fn();
> return fn();
> }
> $ gcc -ggdb3 -Wall -Wextra -O2 -o simple-retpoline simple-retpoline.c
> $ objdump -d simple-retpoline
> <SNIP>
> 0000000000001040 <main>:
> 1040: 48 83 ec 18 sub $0x18,%rsp
> 1044: 48 8d 05 25 01 00 00 lea 0x125(%rip),%rax # 1170 <foo>
> 104b: 48 89 44 24 08 mov %rax,0x8(%rsp)
> 1050: 48 8b 44 24 08 mov 0x8(%rsp),%rax
> 1055: e8 1f 01 00 00 callq 1179 <__x86_indirect_thunk_rax>
> 105a: 48 8b 44 24 08 mov 0x8(%rsp),%rax
> 105f: 48 83 c4 18 add $0x18,%rsp
> 1063: e9 11 01 00 00 jmpq 1179 <__x86_indirect_thunk_rax>
> <SNIP>
> 0000000000001160 <bar>:
> 1160: b8 ff ff ff ff mov $0xffffffff,%eax
> 1165: c3 retq
> <SNIP>
> 0000000000001170 <foo>:
> 1170: e8 eb ff ff ff callq 1160 <bar>
> 1175: 83 c0 01 add $0x1,%eax
> 1178: c3 retq
> 0000000000001179 <__x86_indirect_thunk_rax>:
> 1179: e8 07 00 00 00 callq 1185 <__x86_indirect_thunk_rax+0xc>
> 117e: f3 90 pause
> 1180: 0f ae e8 lfence
> 1183: eb f9 jmp 117e <__x86_indirect_thunk_rax+0x5>
> 1185: 48 89 04 24 mov %rax,(%rsp)
> 1189: c3 retq
> <SNIP>
> $ perf record -o simple-retpoline.perf.data -e intel_pt/cyc/u ./simple-retpoline
> [ perf record: Woken up 1 times to write data ]
> [ perf record: Captured and wrote 0,017 MB simple-retpoline.perf.data ]
> $ perf script -i simple-retpoline.perf.data --itrace=be -s ~/libexec/perf-core/scripts/python/export-to-sqlite.py simple-retpoline.db branches calls
> 2019-01-08 14:03:37.851655 Creating database...
> 2019-01-08 14:03:37.863256 Writing records...
> 2019-01-08 14:03:38.069750 Adding indexes
> 2019-01-08 14:03:38.078799 Done
> $ ~/libexec/perf-core/scripts/python/exported-sql-viewer.py simple-retpoline.db
>
> Before:
>
> main
> -> __x86_indirect_thunk_rax
> -> __x86_indirect_thunk_rax
> -> foo
> -> bar
>
> After:
>
> main
> -> foo
> -> bar
>
> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
> ---
> tools/perf/util/thread-stack.c | 112 ++++++++++++++++++++++++++++++++-
> 1 file changed, 109 insertions(+), 3 deletions(-)
>
> diff --git a/tools/perf/util/thread-stack.c b/tools/perf/util/thread-stack.c
> index 632c07a125ab..805e30836460 100644
> --- a/tools/perf/util/thread-stack.c
> +++ b/tools/perf/util/thread-stack.c
> @@ -20,6 +20,7 @@
> #include "thread.h"
> #include "event.h"
> #include "machine.h"
> +#include "env.h"
> #include "util.h"
> #include "debug.h"
> #include "symbol.h"
> @@ -29,6 +30,19 @@
>
> #define STACK_GROWTH 2048
>
> +/*
> + * State of retpoline detection.
> + *
> + * RETPOLINE_NONE: no retpoline detection
> + * X86_RETPOLINE_POSSIBLE: x86 retpoline possible
> + * X86_RETPOLINE_DETECTED: x86 retpoline detected
> + */
> +enum retpoline_state_t {
> + RETPOLINE_NONE,
> + X86_RETPOLINE_POSSIBLE,
> + X86_RETPOLINE_DETECTED,
> +};
> +
> /**
> * struct thread_stack_entry - thread stack entry.
> * @ret_addr: return address
> @@ -64,6 +78,7 @@ struct thread_stack_entry {
> * @crp: call/return processor
> * @comm: current comm
> * @arr_sz: size of array if this is the first element of an array
> + * @rstate: used to detect retpolines
> */
> struct thread_stack {
> struct thread_stack_entry *stack;
> @@ -76,6 +91,7 @@ struct thread_stack {
> struct call_return_processor *crp;
> struct comm *comm;
> unsigned int arr_sz;
> + enum retpoline_state_t rstate;
> };
>
> /*
> @@ -115,10 +131,16 @@ static int thread_stack__init(struct thread_stack *ts, struct thread *thread,
> if (err)
> return err;
>
> - if (thread->mg && thread->mg->machine)
> - ts->kernel_start = machine__kernel_start(thread->mg->machine);
> - else
> + if (thread->mg && thread->mg->machine) {
> + struct machine *machine = thread->mg->machine;
> + const char *arch = perf_env__arch(machine->env);
> +
> + ts->kernel_start = machine__kernel_start(machine);
> + if (!strcmp(arch, "x86"))
> + ts->rstate = X86_RETPOLINE_POSSIBLE;
> + } else {
> ts->kernel_start = 1ULL << 63;
> + }
> ts->crp = crp;
>
> return 0;
> @@ -733,6 +755,70 @@ static int thread_stack__trace_end(struct thread_stack *ts,
> false, true);
> }
>
> +static bool is_x86_retpoline(const char *name)
> +{
> + const char *p = strstr(name, "__x86_indirect_thunk_");
> +
> + return p == name || !strcmp(name, "__indirect_thunk_start");
> +}
> +
> +/*
> + * x86 retpoline functions pollute the call graph. This function removes them.
> + * This does not handle function return thunks, nor is there any improvement
> + * for the handling of inline thunks or extern thunks.
> + */
> +static int thread_stack__x86_retpoline(struct thread_stack *ts,
> + struct perf_sample *sample,
> + struct addr_location *to_al)
> +{
> + struct thread_stack_entry *tse = &ts->stack[ts->cnt - 1];
> + struct call_path_root *cpr = ts->crp->cpr;
> + struct symbol *sym = tse->cp->sym;
> + struct symbol *tsym = to_al->sym;
> + struct call_path *cp;
> +
> + if (sym && sym->name && is_x86_retpoline(sym->name)) {
CC /tmp/build/perf/util/scripting-engines/trace-event-perl.o
CC /tmp/build/perf/util/intel-pt.o
CC /tmp/build/perf/util/intel-pt-decoder/intel-pt-log.o
util/thread-stack.c:780:18: error: address of array 'sym->name' will always evaluate to 'true' [-Werror,-Wpointer-bool-conversion]
if (sym && sym->name && is_x86_retpoline(sym->name)) {
~~ ~~~~~^~~~
1 error generated.
mv: cannot stat '/tmp/build/perf/util/.thread-stack.o.tmp': No such file or directory
make[4]: *** [/git/linux/tools/build/Makefile.build:96: /tmp/build/perf/util/thread-stack.o] Error 1
[acme@quaco perf]$ pahole -C symbol ~/bin/perf
struct symbol {
struct rb_node rb_node; /* 0 24 */
u64 start; /* 24 8 */
u64 end; /* 32 8 */
u16 namelen; /* 40 2 */
u8 type:4; /* 42: 4 1 */
u8 binding:4; /* 42: 0 1 */
u8 idle:1; /* 43: 7 1 */
u8 ignore:1; /* 43: 6 1 */
u8 inlined:1; /* 43: 5 1 */
/* XXX 5 bits hole, try to pack */
u8 arch_sym; /* 44 1 */
_Bool annotate2; /* 45 1 */
char name[0]; /* 46 0 */
/* size: 48, cachelines: 1, members: 12 */
/* bit holes: 1, sum bit holes: 5 bits */
/* padding: 2 */
/* last cacheline: 48 bytes */
};
[acme@quaco perf]$
I'm removing that sym->name test.
> + /*
> + * This is a x86 retpoline fn. It pollutes the call graph by
> + * showing up everywhere there is an indirect branch, but does
> + * not itself mean anything. Here the top-of-stack is removed,
> + * by decrementing the stack count, and then further down, the
> + * resulting top-of-stack is replaced with the actual target.
> + * The result is that the retpoline functions will no longer
> + * appear in the call graph. Note this only affects the call
> + * graph, since all the original branches are left unchanged.
> + */
> + ts->cnt -= 1;
> + sym = ts->stack[ts->cnt - 2].cp->sym;
> + if (sym && sym == tsym && to_al->addr != tsym->start) {
> + /*
> + * Target is back to the middle of the symbol we came
> + * from so assume it is an indirect jmp and forget it
> + * altogether.
> + */
> + ts->cnt -= 1;
> + return 0;
> + }
> + } else if (sym && sym == tsym) {
> + /*
> + * Target is back to the symbol we came from so assume it is an
> + * indirect jmp and forget it altogether.
> + */
> + ts->cnt -= 1;
> + return 0;
> + }
> +
> + cp = call_path__findnew(cpr, ts->stack[ts->cnt - 2].cp, tsym,
> + sample->addr, ts->kernel_start);
> + if (!cp)
> + return -ENOMEM;
> +
> + /* Replace the top-of-stack with the actual target */
> + ts->stack[ts->cnt - 1].cp = cp;
> +
> + return 0;
> +}
> +
> int thread_stack__process(struct thread *thread, struct comm *comm,
> struct perf_sample *sample,
> struct addr_location *from_al,
> @@ -740,6 +826,7 @@ int thread_stack__process(struct thread *thread, struct comm *comm,
> struct call_return_processor *crp)
> {
> struct thread_stack *ts = thread__stack(thread, sample->cpu);
> + enum retpoline_state_t rstate;
> int err = 0;
>
> if (ts && !ts->crp) {
> @@ -755,6 +842,10 @@ int thread_stack__process(struct thread *thread, struct comm *comm,
> ts->comm = comm;
> }
>
> + rstate = ts->rstate;
> + if (rstate == X86_RETPOLINE_DETECTED)
> + ts->rstate = X86_RETPOLINE_POSSIBLE;
> +
> /* Flush stack on exec */
> if (ts->comm != comm && thread->pid_ == thread->tid) {
> err = __thread_stack__flush(thread, ts);
> @@ -791,10 +882,25 @@ int thread_stack__process(struct thread *thread, struct comm *comm,
> ts->kernel_start);
> err = thread_stack__push_cp(ts, ret_addr, sample->time, ref,
> cp, false, trace_end);
> +
> + /*
> + * A call to the same symbol but not the start of the symbol,
> + * may be the start of a x86 retpoline.
> + */
> + if (!err && rstate == X86_RETPOLINE_POSSIBLE && to_al->sym &&
> + from_al->sym == to_al->sym &&
> + to_al->addr != to_al->sym->start)
> + ts->rstate = X86_RETPOLINE_DETECTED;
> +
> } else if (sample->flags & PERF_IP_FLAG_RETURN) {
> if (!sample->ip || !sample->addr)
> return 0;
>
> + /* x86 retpoline 'return' doesn't match the stack */
> + if (rstate == X86_RETPOLINE_DETECTED && ts->cnt > 2 &&
> + ts->stack[ts->cnt - 1].ret_addr != sample->addr)
> + return thread_stack__x86_retpoline(ts, sample, to_al);
> +
> err = thread_stack__pop_cp(thread, ts, sample->addr,
> sample->time, ref, from_al->sym);
> if (err) {
> --
> 2.17.1
--
- Arnaldo
next prev parent reply other threads:[~2019-02-22 19:42 UTC|newest]
Thread overview: 23+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-01-09 9:18 [PATCH 0/6] perf thread-stack: x86 retpolines Adrian Hunter
2019-01-09 9:18 ` [PATCH 1/6] perf tools: Fix split_kallsyms_for_kcore for trampoline symbols Adrian Hunter
2019-02-09 12:57 ` [tip:perf/core] perf tools: Fix split_kallsyms_for_kcore() " tip-bot for Adrian Hunter
2019-01-09 9:18 ` [PATCH 2/6] perf thread-stack: Tidy thread_stack__push_cp() usage Adrian Hunter
2019-02-09 12:58 ` [tip:perf/core] " tip-bot for Adrian Hunter
2019-01-09 9:18 ` [PATCH 3/6] perf thread-stack: Tidy thread_stack__no_call_return() by adding more local variables Adrian Hunter
2019-02-09 12:58 ` [tip:perf/core] " tip-bot for Adrian Hunter
2019-01-09 9:18 ` [PATCH 4/6] perf thread-stack: Represent jmps to the start of a different symbol Adrian Hunter
2019-02-06 12:39 ` Arnaldo Carvalho de Melo
2019-02-06 13:25 ` Adrian Hunter
2019-02-09 12:59 ` [tip:perf/core] " tip-bot for Adrian Hunter
2019-01-09 9:18 ` [PATCH 5/6] perf thread-stack: Improve thread_stack__no_call_return() Adrian Hunter
2019-02-22 9:48 ` Adrian Hunter
2019-02-22 14:39 ` Arnaldo Carvalho de Melo
2019-02-28 7:49 ` [tip:perf/core] " tip-bot for Adrian Hunter
2019-01-09 9:18 ` [PATCH 6/6] perf thread-stack: Hide x86 retpolines Adrian Hunter
2019-01-09 15:38 ` Jiri Olsa
2019-01-10 7:52 ` Adrian Hunter
2019-02-22 19:42 ` Arnaldo Carvalho de Melo [this message]
2019-02-22 20:53 ` Arnaldo Carvalho de Melo
2019-02-28 7:49 ` [tip:perf/core] " tip-bot for Adrian Hunter
2019-01-10 9:55 ` [PATCH 0/6] perf thread-stack: " Jiri Olsa
2019-02-06 9:10 ` Adrian Hunter
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20190222194244.GF26132@kernel.org \
--to=arnaldo.melo@gmail.com \
--cc=adrian.hunter@intel.com \
--cc=jolsa@redhat.com \
--cc=linux-kernel@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.