* [PATCH 2/4] perf, tools: Add probing for udev86
2016-03-25 23:02 [PATCH 1/4] perf, tools: Add support for skipping itrace instructions Andi Kleen
@ 2016-03-25 23:02 ` Andi Kleen
2016-03-27 11:26 ` Jiri Olsa
2016-03-25 23:02 ` [PATCH 3/4] perf, tools, script: Add support for printing assembler Andi Kleen
2016-03-25 23:02 ` [PATCH 4/4] perf, tools, script: Add brstackasm output for branch stacks Andi Kleen
2 siblings, 1 reply; 8+ messages in thread
From: Andi Kleen @ 2016-03-25 23:02 UTC (permalink / raw)
To: acme; +Cc: jolsa, linux-kernel, Andi Kleen
From: Andi Kleen <ak@linux.intel.com>
Add autoprobing for the udis86 disassembler library.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
tools/build/Makefile.feature | 6 ++++--
tools/build/feature/Makefile | 8 ++++++--
tools/build/feature/test-all.c | 5 +++++
tools/build/feature/test-udis86.c | 8 ++++++++
tools/perf/config/Makefile | 5 +++++
5 files changed, 28 insertions(+), 4 deletions(-)
create mode 100644 tools/build/feature/test-udis86.c
diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature
index 6b77072..db4f426 100644
--- a/tools/build/Makefile.feature
+++ b/tools/build/Makefile.feature
@@ -55,7 +55,8 @@ FEATURE_TESTS_BASIC := \
zlib \
lzma \
get_cpuid \
- bpf
+ bpf \
+ udis86
# FEATURE_TESTS_BASIC + FEATURE_TESTS_EXTRA is the complete list
# of all feature tests
@@ -94,7 +95,8 @@ FEATURE_DISPLAY ?= \
zlib \
lzma \
get_cpuid \
- bpf
+ bpf \
+ udis86
# Set FEATURE_CHECK_(C|LD)FLAGS-all for all FEATURE_TESTS features.
# If in the future we need per-feature checks/flags for features not
diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index c5f4c41..d05c312 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -36,7 +36,8 @@ FILES= \
test-zlib.bin \
test-lzma.bin \
test-bpf.bin \
- test-get_cpuid.bin
+ test-get_cpuid.bin \
+ test-udis86.bin
FILES := $(addprefix $(OUTPUT),$(FILES))
@@ -51,7 +52,7 @@ __BUILD = $(CC) $(CFLAGS) -Wall -Werror -o $@ $(patsubst %.bin,%.c,$(@F)) $(LDFL
###############################
$(OUTPUT)test-all.bin:
- $(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -laudit -I/usr/include/slang -lslang $(shell $(PKG_CONFIG) --libs --cflags gtk+-2.0 2>/dev/null) $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -DPACKAGE='"perf"' -lbfd -ldl -lz -llzma
+ $(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -laudit -I/usr/include/slang -lslang $(shell $(PKG_CONFIG) --libs --cflags gtk+-2.0 2>/dev/null) $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -DPACKAGE='"perf"' -lbfd -ldl -lz -llzma -ludis86
$(OUTPUT)test-hello.bin:
$(BUILD)
@@ -97,6 +98,9 @@ $(OUTPUT)test-numa_num_possible_cpus.bin:
$(OUTPUT)test-libunwind.bin:
$(BUILD) -lelf
+$(OUTPUT)test-udis86.bin:
+ $(BUILD) -ludis86
+
$(OUTPUT)test-libunwind-debug-frame.bin:
$(BUILD) -lelf
diff --git a/tools/build/feature/test-all.c b/tools/build/feature/test-all.c
index e499a36..76b0de3 100644
--- a/tools/build/feature/test-all.c
+++ b/tools/build/feature/test-all.c
@@ -133,6 +133,10 @@
# include "test-libcrypto.c"
#undef main
+#define main main_test_udis86
+# include "test-udis86.c"
+#endif
+
int main(int argc, char *argv[])
{
main_test_libpython();
@@ -163,6 +167,7 @@ int main(int argc, char *argv[])
main_test_get_cpuid();
main_test_bpf();
main_test_libcrypto();
+ main_test_udis86();
return 0;
}
diff --git a/tools/build/feature/test-udis86.c b/tools/build/feature/test-udis86.c
new file mode 100644
index 0000000..623c545
--- /dev/null
+++ b/tools/build/feature/test-udis86.c
@@ -0,0 +1,8 @@
+#include <udis86.h>
+
+int main(void)
+{
+ ud_t ud;
+ ud_init(&ud);
+ return 0;
+}
diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile
index f7aeaf3..970d800 100644
--- a/tools/perf/config/Makefile
+++ b/tools/perf/config/Makefile
@@ -580,6 +580,11 @@ ifneq ($(filter -lbfd,$(EXTLIBS)),)
CFLAGS += -DHAVE_LIBBFD_SUPPORT
endif
+ifeq ($(feature-udis86), 1)
+ CFLAGS += -DHAVE_UDIS86
+ EXTLIBS += -ludis86
+endif
+
ifndef NO_ZLIB
ifeq ($(feature-zlib), 1)
CFLAGS += -DHAVE_ZLIB_SUPPORT
--
2.5.5
^ permalink raw reply related [flat|nested] 8+ messages in thread* Re: [PATCH 2/4] perf, tools: Add probing for udev86
2016-03-25 23:02 ` [PATCH 2/4] perf, tools: Add probing for udev86 Andi Kleen
@ 2016-03-27 11:26 ` Jiri Olsa
0 siblings, 0 replies; 8+ messages in thread
From: Jiri Olsa @ 2016-03-27 11:26 UTC (permalink / raw)
To: Andi Kleen; +Cc: acme, jolsa, linux-kernel, Andi Kleen
On Fri, Mar 25, 2016 at 04:02:36PM -0700, Andi Kleen wrote:
> From: Andi Kleen <ak@linux.intel.com>
>
> Add autoprobing for the udev86 disassembler library.
>
> Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
thanks,
jirka
^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH 3/4] perf, tools, script: Add support for printing assembler
2016-03-25 23:02 [PATCH 1/4] perf, tools: Add support for skipping itrace instructions Andi Kleen
2016-03-25 23:02 ` [PATCH 2/4] perf, tools: Add probing for udev86 Andi Kleen
@ 2016-03-25 23:02 ` Andi Kleen
2016-03-27 11:05 ` Jiri Olsa
2016-03-25 23:02 ` [PATCH 4/4] perf, tools, script: Add brstackasm output for branch stacks Andi Kleen
2 siblings, 1 reply; 8+ messages in thread
From: Andi Kleen @ 2016-03-25 23:02 UTC (permalink / raw)
To: acme; +Cc: jolsa, linux-kernel, Andi Kleen, adrian.hunter
From: Andi Kleen <ak@linux.intel.com>
When dumping PT traces with perf script it is very useful to see the
assembler for each sample, so that it is easily possible to follow
the control flow.
As using objdump is difficult and inefficient from perf script this
patch uses the udis86 library to implement assembler output.
The library can be downloaded from http://udis86.sourceforge.net/
The library is probed as an external dependency in the usual way. Then perf
script calls into it when needed, and handles callbacks to resolve
symbols.
% perf record -e intel_pt//u true
% perf script -F sym,symoff,ip,asm --itrace=i0ns | head
7fc7188b4190 _start+0x0 mov %rsp, %rdi
7fc7188b4193 _start+0x3 call _dl_start
7fc7188b7710 _dl_start+0x0 push %rbp
7fc7188b7711 _dl_start+0x1 mov %rsp, %rbp
7fc7188b7714 _dl_start+0x4 push %r15
7fc7188b7716 _dl_start+0x6 push %r14
7fc7188b7718 _dl_start+0x8 push %r13
7fc7188b771a _dl_start+0xa push %r12
7fc7188b771c _dl_start+0xc mov %rdi, %r12
7fc7188b771f _dl_start+0xf push %rbx
Current issues:
- Some jump references do not get resolved to symbols.
- udis86 release does not support STAC/CLAC, which are used in the kernel,
but there is a pending patch for it.
Cc: adrian.hunter@intel.com
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
tools/perf/Documentation/perf-script.txt | 4 +-
tools/perf/builtin-script.c | 108 +++++++++++++++++++++++++++++--
2 files changed, 106 insertions(+), 6 deletions(-)
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 382ddfb..c834f4d 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -116,7 +116,7 @@ OPTIONS
--fields::
Comma separated list of fields to print. Options are:
comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff,
- srcline, period, iregs, brstack, brstacksym, flags.
+ srcline, period, iregs, brstack, brstacksym, flags, asm.
Field list can be prepended with the type, trace, sw or hw,
to indicate to which event type the field list applies.
e.g., -f sw:comm,tid,time,ip,sym and -f trace:time,cpu,trace
@@ -185,6 +185,8 @@ OPTIONS
The brstacksym is identical to brstack, except that the FROM and TO addresses are printed in a symbolic form if possible.
+ When asm is specified the assembler instruction of each sample is printed in disassembled form.
+
-k::
--vmlinux=<file>::
vmlinux pathname
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 57f9a7e..706ece8 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -25,6 +25,10 @@
#include "asm/bug.h"
#include "util/mem-events.h"
+#ifdef HAVE_UDIS86
+#include <udis86.h>
+#endif
+
static char const *script_name;
static char const *generate_script_lang;
static bool debug_mode;
@@ -62,6 +66,7 @@ enum perf_output_field {
PERF_OUTPUT_DATA_SRC = 1U << 17,
PERF_OUTPUT_WEIGHT = 1U << 18,
PERF_OUTPUT_BPF_OUTPUT = 1U << 19,
+ PERF_OUTPUT_ASM = 1U << 20,
};
struct output_option {
@@ -88,6 +93,7 @@ struct output_option {
{.str = "data_src", .field = PERF_OUTPUT_DATA_SRC},
{.str = "weight", .field = PERF_OUTPUT_WEIGHT},
{.str = "bpf-output", .field = PERF_OUTPUT_BPF_OUTPUT},
+ {.str = "asm", .field = PERF_OUTPUT_ASM},
};
/* default set to maintain compatibility with current format */
@@ -282,7 +288,11 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
"selected. Hence, no address to lookup the source line number.\n");
return -EINVAL;
}
-
+ if (PRINT_FIELD(ASM) && !PRINT_FIELD(IP)) {
+ pr_err("Display of assembler requested but sample IP is not\n"
+ "selected.\n");
+ return -EINVAL;
+ }
if ((PRINT_FIELD(PID) || PRINT_FIELD(TID)) &&
perf_evsel__check_stype(evsel, PERF_SAMPLE_TID, "TID",
PERF_OUTPUT_TID|PERF_OUTPUT_PID))
@@ -423,6 +433,89 @@ static void print_sample_iregs(union perf_event *event __maybe_unused,
}
}
+#ifdef HAVE_UDIS86
+
+struct perf_ud {
+ ud_t ud_obj;
+ struct thread *thread;
+ u8 cpumode;
+ int cpu;
+};
+
+static const char *dis_resolve(struct ud *u, uint64_t addr, int64_t *off)
+{
+ struct perf_ud *ud = container_of(u, struct perf_ud, ud_obj);
+ struct addr_location al;
+
+ memset(&al, 0, sizeof(struct addr_location));
+
+ thread__find_addr_map(ud->thread, ud->cpumode, MAP__FUNCTION, addr, &al);
+ if (!al.map)
+ thread__find_addr_map(ud->thread, ud->cpumode, MAP__VARIABLE,
+ addr, &al);
+ al.cpu = ud->cpu;
+ al.sym = NULL;
+
+ if (al.map)
+ al.sym = map__find_symbol(al.map, al.addr, NULL);
+
+ if (!al.sym)
+ return NULL;
+
+ if (addr < al.sym->end)
+ *off = addr - al.sym->start;
+ else
+ *off = addr - al.map->start - al.sym->start;
+ return al.sym->name;
+}
+#endif
+
+static void print_sample_asm(union perf_event *event __maybe_unused,
+ struct perf_sample *sample __maybe_unused,
+ struct thread *thread __maybe_unused,
+ struct perf_event_attr *attr __maybe_unused,
+ struct addr_location *al __maybe_unused,
+ struct machine *machine __maybe_unused)
+{
+#ifdef HAVE_UDIS86
+ static bool ud_initialized = false;
+ static struct perf_ud ud;
+ u8 buffer[32];
+ int len;
+ u64 offset;
+
+ if (!ud_initialized) {
+ ud_initialized = true;
+ ud_init(&ud.ud_obj);
+ ud_set_syntax(&ud.ud_obj, UD_SYN_ATT);
+ ud_set_sym_resolver(&ud.ud_obj, dis_resolve);
+ }
+ ud.thread = thread;
+ ud.cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
+ ud.cpu = sample->cpu;
+
+ if (!al->map || !al->map->dso)
+ return;
+ if (al->map->dso->data.status == DSO_DATA_STATUS_ERROR)
+ return;
+
+ /* Load maps to ensure dso->is_64_bit has been updated */
+ map__load(al->map, machine->symbol_filter);
+
+ offset = al->map->map_ip(al->map, sample->ip);
+ len = dso__data_read_offset(al->map->dso, machine,
+ offset, buffer, 32);
+ if (len <= 0)
+ return;
+
+ ud_set_mode(&ud.ud_obj, al->map->dso->is_64_bit ? 64 : 32);
+ ud_set_pc(&ud.ud_obj, sample->ip);
+ ud_set_input_buffer(&ud.ud_obj, buffer, len);
+ ud_disassemble(&ud.ud_obj);
+ printf("\t%s", ud_insn_asm(&ud.ud_obj));
+#endif
+}
+
static void print_sample_start(struct perf_sample *sample,
struct thread *thread,
struct perf_evsel *evsel)
@@ -749,7 +842,8 @@ static size_t data_src__printf(u64 data_src)
static void process_event(struct perf_script *script, union perf_event *event,
struct perf_sample *sample, struct perf_evsel *evsel,
- struct addr_location *al)
+ struct addr_location *al,
+ struct machine *machine)
{
struct thread *thread = al->thread;
struct perf_event_attr *attr = &evsel->attr;
@@ -777,7 +871,7 @@ static void process_event(struct perf_script *script, union perf_event *event,
if (is_bts_event(attr)) {
print_sample_bts(event, sample, evsel, thread, al);
- return;
+ goto print_rest;
}
if (PRINT_FIELD(TRACE))
@@ -806,6 +900,7 @@ static void process_event(struct perf_script *script, union perf_event *event,
if (PRINT_FIELD(IREGS))
print_sample_iregs(event, sample, thread, attr);
+print_rest:
if (PRINT_FIELD(BRSTACK))
print_sample_brstack(event, sample, thread, attr);
else if (PRINT_FIELD(BRSTACKSYM))
@@ -814,6 +909,9 @@ static void process_event(struct perf_script *script, union perf_event *event,
if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT))
print_sample_bpf_output(sample);
+ if (PRINT_FIELD(ASM))
+ print_sample_asm(event, sample, thread, attr, al, machine);
+
printf("\n");
}
@@ -920,7 +1018,7 @@ static int process_sample_event(struct perf_tool *tool,
if (scripting_ops)
scripting_ops->process_event(event, sample, evsel, &al);
else
- process_event(scr, event, sample, evsel, &al);
+ process_event(scr, event, sample, evsel, &al, machine);
out_put:
addr_location__put(&al);
@@ -2020,7 +2118,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
"comma separated output fields prepend with 'type:'. "
"Valid types: hw,sw,trace,raw. "
"Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,"
- "addr,symoff,period,iregs,brstack,brstacksym,flags", parse_output_fields),
+ "addr,symoff,period,iregs,brstack,brstacksym,flags,asm", parse_output_fields),
OPT_BOOLEAN('a', "all-cpus", &system_wide,
"system-wide collection from all CPUs"),
OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]",
--
2.5.5
^ permalink raw reply related [flat|nested] 8+ messages in thread* Re: [PATCH 3/4] perf, tools, script: Add support for printing assembler
2016-03-25 23:02 ` [PATCH 3/4] perf, tools, script: Add support for printing assembler Andi Kleen
@ 2016-03-27 11:05 ` Jiri Olsa
2016-03-27 15:35 ` Andi Kleen
0 siblings, 1 reply; 8+ messages in thread
From: Jiri Olsa @ 2016-03-27 11:05 UTC (permalink / raw)
To: Andi Kleen; +Cc: acme, jolsa, linux-kernel, Andi Kleen, adrian.hunter
On Fri, Mar 25, 2016 at 04:02:37PM -0700, Andi Kleen wrote:
> From: Andi Kleen <ak@linux.intel.com>
>
> When dumping PT traces with perf script it is very useful to see the
> assembler for each sample, so that it is easily possible to follow
> the control flow.
>
> As using objdump is difficult and inefficient from perf script this
> patch uses the udis86 library to implement assembler output.
> The library can be downloaded from http://udis86.sourceforge.net/
>
> The library is probed as an external dependency in the usual way. Then perf
> script calls into it when needed, and handles callbacks to resolve
> symbols.
>
> % perf record -e intel_pt//u true
> % perf script -F sym,symoff,ip,asm --itrace=i0ns | head
> 7fc7188b4190 _start+0x0 mov %rsp, %rdi
> 7fc7188b4193 _start+0x3 call _dl_start
> 7fc7188b7710 _dl_start+0x0 push %rbp
> 7fc7188b7711 _dl_start+0x1 mov %rsp, %rbp
> 7fc7188b7714 _dl_start+0x4 push %r15
> 7fc7188b7716 _dl_start+0x6 push %r14
> 7fc7188b7718 _dl_start+0x8 push %r13
> 7fc7188b771a _dl_start+0xa push %r12
> 7fc7188b771c _dl_start+0xc mov %rdi, %r12
> 7fc7188b771f _dl_start+0xf push %rbx
>
> Current issues:
> - Some jump references do not get resolved to symbols.
> - udis86 release does not support STAC/CLAC, which are used in the kernel,
> but there is a pending patch for it.
can't apply this on latest Arnaldo's tree:
patching file Documentation/perf-script.txt
patching file builtin-script.c
Hunk #5 succeeded at 431 (offset -2 lines).
Hunk #6 succeeded at 832 with fuzz 2 (offset -10 lines).
Hunk #7 FAILED at 871.
Hunk #8 succeeded at 890 with fuzz 2 (offset -10 lines).
Hunk #9 succeeded at 899 (offset -10 lines).
Hunk #10 FAILED at 1018.
Hunk #11 succeeded at 2108 (offset -10 lines).
2 out of 11 hunks FAILED -- saving rejects to file builtin-script.c.rej
jirka
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 3/4] perf, tools, script: Add support for printing assembler
2016-03-27 11:05 ` Jiri Olsa
@ 2016-03-27 15:35 ` Andi Kleen
0 siblings, 0 replies; 8+ messages in thread
From: Andi Kleen @ 2016-03-27 15:35 UTC (permalink / raw)
To: Jiri Olsa
Cc: Andi Kleen, acme, jolsa, linux-kernel, Andi Kleen, adrian.hunter
> > Current issues:
> > - Some jump references do not get resolved to symbols.
> > - udis86 release does not support STAC/CLAC, which are used in the kernel,
> > but there is a pending patch for it.
>
> can't apply this on latest Arnaldo's tree:
This is due to the following commit, which is only in Arnaldo's tree:
commit a3dff304ca50cbfbe3f0e76ee49c439990932ed5
Author: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed Mar 23 11:55:24 2016 -0300
perf script: Remove lots of unused arguments
But the asm printer actually needs the "event" argument,
so that change would break my patchkit.
Arnaldo, can you undo the patch or at least keep "event" ?
-Andi
^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH 4/4] perf, tools, script: Add brstackasm output for branch stacks
2016-03-25 23:02 [PATCH 1/4] perf, tools: Add support for skipping itrace instructions Andi Kleen
2016-03-25 23:02 ` [PATCH 2/4] perf, tools: Add probing for udev86 Andi Kleen
2016-03-25 23:02 ` [PATCH 3/4] perf, tools, script: Add support for printing assembler Andi Kleen
@ 2016-03-25 23:02 ` Andi Kleen
2 siblings, 0 replies; 8+ messages in thread
From: Andi Kleen @ 2016-03-25 23:02 UTC (permalink / raw)
To: acme; +Cc: jolsa, linux-kernel, Andi Kleen
From: Andi Kleen <ak@linux.intel.com>
Implement printing full disassembled sequences for branch stacks in perf
script. This allows directly printing hot paths for individual samples,
together with branch misprediction and even cycle count information.
% perf record -b ...
% perf script -F brstackasm
...
00007f0668d54e88 movsx (%rsi), %ecx
00007f0668d54e8b lea -0x30(%rcx), %eax
00007f0668d54e8e cmp $0x9, %al
00007f0668d54e90 jbe 0x68d54eaf
00007f0668d54e92 cmp %cl, %dl
00007f0668d54e94 jnz 0x68d54eb5
00007f0668d54e96 add $0x1, %rdi
00007f0668d54e9a movsx (%rdi), %edx
00007f0668d54e9d add $0x1, %rsi
00007f0668d54ea1 test %dl, %dl
00007f0668d54ea3 jnz _dl_cache_libcmp+11 # PRED 21 cycles
00007f0668d54dfb lea -0x30(%rdx), %eax
00007f0668d54dfe cmp $0x9, %al
00007f0668d54e00 ja _dl_cache_libcmp+152 # PRED 2 cycles
00007f0668d54e88 movsx (%rsi), %ecx
00007f0668d54e8b lea -0x30(%rcx), %eax
00007f0668d54e8e cmp $0x9, %al
00007f0668d54e90 jbe 0x68d54eaf
00007f0668d54e92 cmp %cl, %dl
00007f0668d54e94 jnz 0x68d54eb5 # PRED 3 cycles
00007f0668d54eb5 movsx %dl, %eax
00007f0668d54eb8 sub %ecx, %eax
00007f0668d54eba ret # PRED 1 cycles
00007f0668d54fae test %eax, %eax
00007f0668d54fb0 jz _dl_load_cache_lookup+688
00007f0668d54fb6 jns 0x68d54f70
00007f0668d54fb8 lea 0x1(%r14), %ebx
00007f0668d54fbc cmp %r15d, %ebx
00007f0668d54fbf nop
00007f0668d54fc0 jle 0x68d54f79 # PRED 2 cycles
Open issues:
- Occasionally the path does not reach up to the sample IP, as the LBRs
may be frozen earlier.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
tools/perf/Documentation/perf-script.txt | 9 +-
tools/perf/builtin-script.c | 191 ++++++++++++++++++++++++++++++-
2 files changed, 195 insertions(+), 5 deletions(-)
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index c834f4d..4a30f02 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -116,7 +116,7 @@ OPTIONS
--fields::
Comma separated list of fields to print. Options are:
comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff,
- srcline, period, iregs, brstack, brstacksym, flags, asm.
+ srcline, period, iregs, brstack, brstacksym, flags, asm, brstackasm.
Field list can be prepended with the type, trace, sw or hw,
to indicate to which event type the field list applies.
e.g., -f sw:comm,tid,time,ip,sym and -f trace:time,cpu,trace
@@ -176,17 +176,22 @@ OPTIONS
i.e., -f "" is not allowed.
The brstack output includes branch related information with raw addresses using the
- /v/v/v/v/ syntax in the following order:
+ /v/v/v/v/cycles syntax in the following order:
FROM: branch source instruction
TO : branch target instruction
M/P/-: M=branch target mispredicted or branch direction was mispredicted, P=target predicted or direction predicted, -=not supported
X/- : X=branch inside a transactional region, -=not in transaction region or not supported
A/- : A=TSX abort entry, -=not aborted region or not supported
+ cycles
The brstacksym is identical to brstack, except that the FROM and TO addresses are printed in a symbolic form if possible.
When asm is specified the assembler instruction of each sample is printed in disassembled form.
+ When brstackasm is specified the full assembler sequences of branch sequences for each sample
+ are printed. This is the full execution path leading to the sample. This is only supported when the
+ sample was recorded with perf record -b or -j any.
+
-k::
--vmlinux=<file>::
vmlinux pathname
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 706ece8..766242b 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -42,6 +42,7 @@ static bool nanosecs;
static const char *cpu_list;
static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
static struct perf_stat_config stat_config;
+static int max_blocks;
unsigned int scripting_max_stack = PERF_MAX_STACK_DEPTH;
@@ -67,6 +68,7 @@ enum perf_output_field {
PERF_OUTPUT_WEIGHT = 1U << 18,
PERF_OUTPUT_BPF_OUTPUT = 1U << 19,
PERF_OUTPUT_ASM = 1U << 20,
+ PERF_OUTPUT_BRSTACKASM = 1U << 21,
};
struct output_option {
@@ -94,6 +96,7 @@ struct output_option {
{.str = "weight", .field = PERF_OUTPUT_WEIGHT},
{.str = "bpf-output", .field = PERF_OUTPUT_BPF_OUTPUT},
{.str = "asm", .field = PERF_OUTPUT_ASM},
+ {.str = "brstackasm", .field = PERF_OUTPUT_BRSTACKASM},
};
/* default set to maintain compatibility with current format */
@@ -293,6 +296,13 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
"selected.\n");
return -EINVAL;
}
+ if (PRINT_FIELD(BRSTACKASM) &&
+ !(perf_evlist__combined_branch_type(session->evlist) &
+ PERF_SAMPLE_BRANCH_ANY)) {
+ pr_err("Display of branch stack assembler requested, but non all-branch filter set\n");
+ return -EINVAL;
+ }
+
if ((PRINT_FIELD(PID) || PRINT_FIELD(TID)) &&
perf_evsel__check_stype(evsel, PERF_SAMPLE_TID, "TID",
PERF_OUTPUT_TID|PERF_OUTPUT_PID))
@@ -462,10 +472,10 @@ static const char *dis_resolve(struct ud *u, uint64_t addr, int64_t *off)
if (!al.sym)
return NULL;
- if (addr < al.sym->end)
- *off = addr - al.sym->start;
+ if (al.addr < al.sym->end)
+ *off = al.addr - al.sym->start;
else
- *off = addr - al.map->start - al.sym->start;
+ *off = al.addr - al.map->start - al.sym->start;
return al.sym->name;
}
#endif
@@ -630,6 +640,176 @@ static void print_sample_brstacksym(union perf_event *event __maybe_unused,
}
}
+#ifdef HAVE_UDIS86
+#define MAXBB 16384UL
+#define MAXINSN 16
+
+static int grab_bb(char *buffer, u64 start, u64 end,
+ struct machine *machine, struct thread *thread,
+ bool *is64bit, u8 *cpumode)
+{
+ int offset, len;
+ struct addr_location al;
+ bool kernel;
+
+ if (!start || !end)
+ return 0;
+
+ kernel = machine__kernel_ip(machine, start);
+ if (kernel)
+ *cpumode = PERF_RECORD_MISC_KERNEL;
+ else
+ *cpumode = PERF_RECORD_MISC_USER;
+ if (kernel != machine__kernel_ip(machine, end))
+ return 0;
+
+ memset(&al, 0, sizeof(al));
+ if (end - start > MAXBB - MAXINSN) {
+ pr_debug("\tbasic block %" PRIx64 "-%" PRIx64 " (%ld) too long to dump\n",
+ start, end, end - start);
+ return 0;
+ }
+
+ thread__find_addr_map(thread, *cpumode, MAP__FUNCTION, start, &al);
+ if (!al.map || !al.map->dso) {
+ printf("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n",
+ start, end);
+ return 0;
+ }
+ if (al.map->dso->data.status == DSO_DATA_STATUS_ERROR) {
+ printf("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n",
+ start, end);
+ return 0;
+ }
+
+ /* Load maps to ensure dso->is_64_bit has been updated */
+ map__load(al.map, machine->symbol_filter);
+
+ offset = al.map->map_ip(al.map, start);
+ len = dso__data_read_offset(al.map->dso, machine,
+ offset, (u8 *)buffer,
+ end - start + MAXINSN);
+
+ *is64bit = al.map->dso->is_64_bit;
+ return len;
+}
+#endif
+
+static void print_sample_brstackasm(union perf_event *event __maybe_unused,
+ struct perf_sample *sample,
+ struct thread *thread __maybe_unused,
+ struct perf_event_attr *attr __maybe_unused,
+ struct machine *machine __maybe_unused)
+{
+#ifdef HAVE_UDIS86
+ struct branch_stack *br = sample->branch_stack;
+ u64 start, end;
+ int i;
+ static bool ud_initialized = false;
+ static struct perf_ud ud;
+ char buffer[MAXBB];
+ int len;
+ bool last;
+ bool is64bit;
+ int nr;
+
+ if (!(br && br->nr))
+ return;
+ nr = br->nr;
+ if (max_blocks && nr > max_blocks + 1)
+ nr = max_blocks + 1;
+
+ if (!ud_initialized) {
+ ud_initialized = true;
+ ud_init(&ud.ud_obj);
+ ud_set_syntax(&ud.ud_obj, UD_SYN_ATT);
+ ud_set_sym_resolver(&ud.ud_obj, dis_resolve);
+ }
+ ud.thread = thread;
+ ud.cpu = sample->cpu;
+
+ putchar('\n');
+ for (i = nr - 2; i >= 0; i--) {
+ if (br->entries[i].from || br->entries[i].to)
+ printf("%d: %lx-%lx\n", i,
+ br->entries[i].from,
+ br->entries[i].to);
+ start = br->entries[i + 1].to;
+ end = br->entries[i].from;
+
+ /*
+ * Leave extra bytes for the final jump instruction for
+ * which we don't know the length
+ */
+ len = grab_bb(buffer, start, end + MAXINSN,
+ machine, thread, &is64bit,
+ &ud.cpumode);
+ if (len <= 0)
+ continue;
+
+ ud_set_mode(&ud.ud_obj, is64bit ? 64 : 32);
+ ud_set_pc(&ud.ud_obj, start);
+ ud_set_input_buffer(&ud.ud_obj, (uint8_t *)buffer, len);
+ last = false;
+ while (ud_disassemble(&ud.ud_obj) && !last) {
+ if (ud_insn_ptr(&ud.ud_obj) ==
+ (uint8_t *)buffer + end - start) {
+ printf("\t%016" PRIx64 "\t%-30s\t#%s%s%s%s\n",
+ ud_insn_off(&ud.ud_obj),
+ ud_insn_asm(&ud.ud_obj),
+ br->entries[i].flags.predicted ? " PRED" : "",
+ br->entries[i].flags.mispred ? " MISPRED" : "",
+ br->entries[i].flags.in_tx ? " INTX" : "",
+ br->entries[i].flags.abort ? " ABORT" : "");
+ if (br->entries[i].flags.cycles)
+ printf(" %d cycles", br->entries[i].flags.cycles);
+ last = true;
+ } else {
+ printf("\t%016" PRIx64 "\t%s\n",
+ ud_insn_off(&ud.ud_obj),
+ ud_insn_asm(&ud.ud_obj));
+ }
+ }
+ }
+
+ /*
+ * Hit the branch? In this case we are already done, and the target
+ * has not been executed yet.
+ */
+ if (br->entries[0].from == sample->ip)
+ return;
+ if (br->entries[0].flags.abort)
+ return;
+
+ /*
+ * Print final block upto sample
+ */
+ start = br->entries[0].to;
+ end = sample->ip;
+ len = grab_bb(buffer, start, end, machine, thread, &is64bit,
+ &ud.cpumode);
+ ud_set_input_buffer(&ud.ud_obj, (uint8_t *)buffer, len);
+ if (len <= 0) {
+ /* Print at least last IP if basic block did not work */
+ len = grab_bb(buffer, sample->ip, sample->ip + MAXINSN,
+ machine, thread, &is64bit, &ud.cpumode);
+ if (len <= 0)
+ return;
+ ud_set_mode(&ud.ud_obj, is64bit ? 64 : 32);
+ ud_set_pc(&ud.ud_obj, sample->ip);
+ if (ud_disassemble(&ud.ud_obj))
+ printf("\t%016" PRIx64 "\t%s\n", ud_insn_off(&ud.ud_obj),
+ ud_insn_asm(&ud.ud_obj));
+ return;
+ }
+ ud_set_mode(&ud.ud_obj, is64bit ? 64 : 32);
+ ud_set_pc(&ud.ud_obj, start);
+ while (ud_disassemble(&ud.ud_obj) &&
+ ud_insn_ptr(&ud.ud_obj) <= (uint8_t *)buffer + end - start)
+ printf("\t%016" PRIx64 "\t%s\n", ud_insn_off(&ud.ud_obj),
+ ud_insn_asm(&ud.ud_obj));
+#endif
+}
static void print_sample_addr(union perf_event *event,
struct perf_sample *sample,
@@ -909,6 +1089,9 @@ print_rest:
if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT))
print_sample_bpf_output(sample);
+ if (PRINT_FIELD(BRSTACKASM))
+ print_sample_brstackasm(event, sample, thread, attr,
+ machine);
if (PRINT_FIELD(ASM))
print_sample_asm(event, sample, thread, attr, al, machine);
@@ -2140,6 +2323,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
"Show the mmap events"),
OPT_BOOLEAN('\0', "show-switch-events", &script.show_switch_events,
"Show context switch events (if recorded)"),
+ OPT_INTEGER(0, "max-blocks", &max_blocks,
+ "Maximum number of code blocks to dump with brstackasm"),
OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
OPT_BOOLEAN(0, "ns", &nanosecs,
"Use 9 decimal places when displaying time"),
--
2.5.5
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 3/4] perf, tools, script: Add support for printing assembler
2017-01-03 9:00 New attempt at adding an disassembler to perf Andi Kleen
@ 2017-01-03 9:00 ` Andi Kleen
0 siblings, 0 replies; 8+ messages in thread
From: Andi Kleen @ 2017-01-03 9:00 UTC (permalink / raw)
To: acme; +Cc: jolsa, mingo, linux-kernel, Andi Kleen, adrian.hunter
From: Andi Kleen <ak@linux.intel.com>
When dumping PT traces with perf script it is very useful to see the
assembler for each sample, so that it is easily possible to follow
the control flow.
As using objdump is difficult and inefficient from perf script this
patch uses the Intel xed library to implement assembler output.
The library can be downloaded from http://github.com/intelxed/xed
The previous version of this patch used udis86, but was
rejected because udis86 was unmaintained and a runtime dependency.
Using the recently released xed avoids both of these problems:
- XED is well maintained and used by many Intel tools
- XED is linked statically so there is no runtime dependency.
The library is probed as an external dependency in the usual way. Then perf
script calls into it when needed, and handles callbacks to resolve
symbols.
% perf record -e intel_pt//u true
% perf script -F sym,symoff,ip,asm --itrace=i0ns | head
7fc7188b4190 _start+0x0 mov %rsp, %rdi
7fc7188b4193 _start+0x3 call _dl_start
7fc7188b7710 _dl_start+0x0 push %rbp
7fc7188b7711 _dl_start+0x1 mov %rsp, %rbp
7fc7188b7714 _dl_start+0x4 push %r15
7fc7188b7716 _dl_start+0x6 push %r14
7fc7188b7718 _dl_start+0x8 push %r13
7fc7188b771a _dl_start+0xa push %r12
7fc7188b771c _dl_start+0xc mov %rdi, %r12
7fc7188b771f _dl_start+0xf push %rbx
v2:
Converted to use XED instead of udis86.
Separate disassembler interface into separate arch specific file.
Lots of cleanups and improvements.
Cc: adrian.hunter@intel.com
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
tools/perf/Documentation/perf-script.txt | 4 +-
tools/perf/builtin-script.c | 72 +++++++++++++++++++++++++++-----
2 files changed, 64 insertions(+), 12 deletions(-)
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 5dc5c6a09ac4..863579b4d2e2 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -116,7 +116,7 @@ OPTIONS
--fields::
Comma separated list of fields to print. Options are:
comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff,
- srcline, period, iregs, brstack, brstacksym, flags, bpf-output,
+ srcline, period, iregs, brstack, brstacksym, flags, bpf-output, asm,
callindent, insn, insnlen. Field list can be prepended with the type, trace, sw or hw,
to indicate to which event type the field list applies.
e.g., -F sw:comm,tid,time,ip,sym and -F trace:time,cpu,trace
@@ -198,6 +198,8 @@ OPTIONS
The brstacksym is identical to brstack, except that the FROM and TO addresses are printed in a symbolic form if possible.
+ When asm is specified the assembler instruction of each sample is printed in disassembled form.
+
-k::
--vmlinux=<file>::
vmlinux pathname
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 2f3ff69fc4e7..e96668b1cc63 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -28,6 +28,7 @@
#include <linux/time64.h>
#include "asm/bug.h"
#include "util/mem-events.h"
+#include "util/dis.h"
static char const *script_name;
static char const *generate_script_lang;
@@ -69,6 +70,7 @@ enum perf_output_field {
PERF_OUTPUT_CALLINDENT = 1U << 20,
PERF_OUTPUT_INSN = 1U << 21,
PERF_OUTPUT_INSNLEN = 1U << 22,
+ PERF_OUTPUT_ASM = 1U << 23,
};
struct output_option {
@@ -98,6 +100,7 @@ struct output_option {
{.str = "callindent", .field = PERF_OUTPUT_CALLINDENT},
{.str = "insn", .field = PERF_OUTPUT_INSN},
{.str = "insnlen", .field = PERF_OUTPUT_INSNLEN},
+ {.str = "asm", .field = PERF_OUTPUT_ASM},
};
/* default set to maintain compatibility with current format */
@@ -292,7 +295,11 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
"selected. Hence, no address to lookup the source line number.\n");
return -EINVAL;
}
-
+ if (PRINT_FIELD(ASM) && !PRINT_FIELD(IP)) {
+ pr_err("Display of assembler requested but sample IP is not\n"
+ "selected.\n");
+ return -EINVAL;
+ }
if ((PRINT_FIELD(PID) || PRINT_FIELD(TID)) &&
perf_evsel__check_stype(evsel, PERF_SAMPLE_TID, "TID",
PERF_OUTPUT_TID|PERF_OUTPUT_PID))
@@ -436,6 +443,39 @@ static void print_sample_iregs(struct perf_sample *sample,
}
}
+static void print_sample_asm(union perf_event *event,
+ struct perf_sample *sample,
+ struct thread *thread,
+ struct addr_location *al,
+ struct machine *machine)
+{
+ struct perf_dis x;
+ u8 buffer[32];
+ int len;
+ u64 offset;
+
+ x.thread = thread;
+ x.cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
+ x.cpu = sample->cpu;
+
+ if (!al->map || !al->map->dso)
+ return;
+ if (al->map->dso->data.status == DSO_DATA_STATUS_ERROR)
+ return;
+
+ /* Load maps to ensure dso->is_64_bit has been updated */
+ map__load(al->map);
+ x.is64bit = al->map->dso->is_64_bit;
+
+ offset = al->map->map_ip(al->map, sample->ip);
+ len = dso__data_read_offset(al->map->dso, machine,
+ offset, buffer, MAXINSN);
+ if (len <= 0)
+ return;
+
+ printf("\t%s", disas_inst(&x, sample->ip, buffer, len, NULL));
+}
+
static void print_sample_start(struct perf_sample *sample,
struct thread *thread,
struct perf_evsel *evsel)
@@ -631,8 +671,12 @@ static void print_sample_callindent(struct perf_sample *sample,
printf("%*s", spacing - len, "");
}
-static void print_insn(struct perf_sample *sample,
- struct perf_event_attr *attr)
+static void print_insn(union perf_event *event,
+ struct perf_sample *sample,
+ struct perf_event_attr *attr,
+ struct thread *thread,
+ struct addr_location *al,
+ struct machine *machine)
{
if (PRINT_FIELD(INSNLEN))
printf(" ilen: %d", sample->insn_len);
@@ -643,12 +687,16 @@ static void print_insn(struct perf_sample *sample,
for (i = 0; i < sample->insn_len; i++)
printf(" %02x", (unsigned char)sample->insn[i]);
}
+ if (PRINT_FIELD(ASM))
+ print_sample_asm(event, sample, thread, al, machine);
}
-static void print_sample_bts(struct perf_sample *sample,
+static void print_sample_bts(union perf_event *event,
+ struct perf_sample *sample,
struct perf_evsel *evsel,
struct thread *thread,
- struct addr_location *al)
+ struct addr_location *al,
+ struct machine *machine)
{
struct perf_event_attr *attr = &evsel->attr;
bool print_srcline_last = false;
@@ -689,7 +737,7 @@ static void print_sample_bts(struct perf_sample *sample,
if (print_srcline_last)
map__fprintf_srcline(al->map, al->addr, "\n ", stdout);
- print_insn(sample, attr);
+ print_insn(event, sample, attr, thread, al, machine);
printf("\n");
}
@@ -871,7 +919,9 @@ static size_t data_src__printf(u64 data_src)
static void process_event(struct perf_script *script,
struct perf_sample *sample, struct perf_evsel *evsel,
- struct addr_location *al)
+ struct addr_location *al,
+ struct machine *machine,
+ union perf_event *event)
{
struct thread *thread = al->thread;
struct perf_event_attr *attr = &evsel->attr;
@@ -898,7 +948,7 @@ static void process_event(struct perf_script *script,
print_sample_flags(sample->flags);
if (is_bts_event(attr)) {
- print_sample_bts(sample, evsel, thread, al);
+ print_sample_bts(event, sample, evsel, thread, al, machine);
return;
}
@@ -936,7 +986,7 @@ static void process_event(struct perf_script *script,
if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT))
print_sample_bpf_output(sample);
- print_insn(sample, attr);
+ print_insn(event, sample, attr, thread, al, machine);
printf("\n");
}
@@ -1046,7 +1096,7 @@ static int process_sample_event(struct perf_tool *tool,
if (scripting_ops)
scripting_ops->process_event(event, sample, evsel, &al);
else
- process_event(scr, sample, evsel, &al);
+ process_event(scr, sample, evsel, &al, machine, event);
out_put:
addr_location__put(&al);
@@ -2152,7 +2202,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
"Valid types: hw,sw,trace,raw. "
"Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,"
"addr,symoff,period,iregs,brstack,brstacksym,flags,"
- "bpf-output,callindent,insn,insnlen", parse_output_fields),
+ "bpf-output,callindent,insn,insnlen,asm", parse_output_fields),
OPT_BOOLEAN('a', "all-cpus", &system_wide,
"system-wide collection from all CPUs"),
OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]",
--
2.9.3
^ permalink raw reply related [flat|nested] 8+ messages in thread