* [PATCH 1/2] perf callchain: Create an address space per thread
  From: Namhyung Kim @ 2014-09-23 6:30 UTC
  To: Arnaldo Carvalho de Melo
  Cc: Peter Zijlstra, Ingo Molnar, Paul Mackerras, Namhyung Kim, LKML,
      Jiri Olsa, Jean Pihet, Arun Sharma

The unw_addr_space_t in libunwind represents an address space to be
used for stack unwinding.  It doesn't need to be created and destroyed
on every callchain unwind (as get_entries currently does) and can have
the same lifetime as the thread (unless exec is called).

So move the address space construction/destruction logic into the
thread lifetime handling functions.  This is a preparation for enabling
caching in the unwind library.

Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Jean Pihet <jean.pihet@linaro.org>
Cc: Arun Sharma <asharma@fb.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/thread.c           |  8 ++++++++
 tools/perf/util/unwind-libunwind.c | 30 +++++++++++++++++++++++++-----
 tools/perf/util/unwind.h           | 17 +++++++++++++++++
 3 files changed, 50 insertions(+), 5 deletions(-)

diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index a9df7f2c6dc9..c1fa4a3597ea 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -7,6 +7,7 @@
 #include "util.h"
 #include "debug.h"
 #include "comm.h"
+#include "unwind.h"
 
 int thread__init_map_groups(struct thread *thread, struct machine *machine)
 {
@@ -48,6 +49,12 @@ struct thread *thread__new(pid_t pid, pid_t tid)
                        goto err_thread;
 
                list_add(&comm->list, &thread->comm_list);
+
+               if (unwind__prepare_access(thread) < 0) {
+                       list_del(&comm->list);
+                       free(comm);
+                       goto err_thread;
+               }
        }
 
        return thread;
@@ -69,6 +76,7 @@ void thread__delete(struct thread *thread)
                list_del(&comm->list);
                comm__free(comm);
        }
+       unwind__finish_access(thread);
        free(thread);
 }
 
diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c
index 92b56db52471..76ec25663c95 100644
--- a/tools/perf/util/unwind-libunwind.c
+++ b/tools/perf/util/unwind-libunwind.c
@@ -525,12 +525,9 @@ static unw_accessors_t accessors = {
        .get_proc_name          = get_proc_name,
 };
 
-static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb,
-                      void *arg, int max_stack)
+int unwind__prepare_access(struct thread *thread)
 {
        unw_addr_space_t addr_space;
-       unw_cursor_t c;
-       int ret;
 
        addr_space = unw_create_addr_space(&accessors, 0);
        if (!addr_space) {
@@ -538,6 +535,30 @@ static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb,
                return -ENOMEM;
        }
 
+       thread__set_priv(thread, addr_space);
+
+       return 0;
+}
+
+void unwind__finish_access(struct thread *thread)
+{
+       unw_addr_space_t addr_space;
+
+       addr_space = thread__priv(thread);
+       unw_destroy_addr_space(addr_space);
+}
+
+static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb,
+                      void *arg, int max_stack)
+{
+       unw_addr_space_t addr_space;
+       unw_cursor_t c;
+       int ret;
+
+       addr_space = thread__priv(ui->thread);
+       if (addr_space == NULL)
+               return -1;
+
        ret = unw_init_remote(&c, addr_space, ui);
        if (ret)
                display_error(ret);
@@ -549,7 +570,6 @@ static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb,
                ret = ip ? entry(ip, ui->thread, ui->machine, cb, arg) : 0;
        }
 
-       unw_destroy_addr_space(addr_space);
        return ret;
 }
 
diff --git a/tools/perf/util/unwind.h b/tools/perf/util/unwind.h
index f03061260b4e..4b99c6280c2a 100644
--- a/tools/perf/util/unwind.h
+++ b/tools/perf/util/unwind.h
@@ -4,6 +4,7 @@
 #include <linux/types.h>
 #include "event.h"
 #include "symbol.h"
+#include "thread.h"
 
 struct unwind_entry {
        struct map      *map;
@@ -21,6 +22,15 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
 /* libunwind specific */
 #ifdef HAVE_LIBUNWIND_SUPPORT
 int libunwind__arch_reg_id(int regnum);
+int unwind__prepare_access(struct thread *thread);
+void unwind__finish_access(struct thread *thread);
+#else
+static inline int unwind__prepare_access(struct thread *thread)
+{
+       return 0;
+}
+
+static inline void unwind__finish_access(struct thread *thread) {}
 #endif
 #else
 static inline int
@@ -33,5 +43,12 @@ unwind__get_entries(unwind_entry_cb_t cb __maybe_unused,
 {
        return 0;
 }
+
+static inline int unwind__prepare_access(struct thread *thread)
+{
+       return 0;
+}
+
+static inline void unwind__finish_access(struct thread *thread) {}
 #endif /* HAVE_DWARF_UNWIND_SUPPORT */
 #endif /* __UNWIND_H */
-- 
2.1.0
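For readers unfamiliar with libunwind's remote API, the minimal sketch
below illustrates the create-once/reuse/destroy-once lifecycle the
patch establishes.  It is not perf code: the empty accessors table is a
placeholder for the register- and stack-reading callbacks perf installs,
so this compiles and runs but cannot unwind a real target, and the link
flags (typically -lunwind plus the arch-specific remote library) vary
by installation.

#include <libunwind.h>
#include <stdio.h>

/*
 * Placeholder accessor table: enough to create and destroy an address
 * space, but not to actually unwind.  perf fills this in with callbacks
 * that read the target's registers and stack memory from the recorded
 * sample (the `accessors` struct in unwind-libunwind.c).
 */
static unw_accessors_t accessors;

int main(void)
{
        /* Created once per thread (unwind__prepare_access above)... */
        unw_addr_space_t as = unw_create_addr_space(&accessors, 0);

        if (!as) {
                fprintf(stderr, "unw_create_addr_space failed\n");
                return 1;
        }

        /*
         * ...reused for every callchain unwound for that thread:
         *
         *      unw_init_remote(&cursor, as, ui);
         *      while (unw_step(&cursor) > 0)
         *              unw_get_reg(&cursor, UNW_REG_IP, &ip);
         *
         * ...and destroyed only when the thread itself goes away
         * (unwind__finish_access above).
         */
        unw_destroy_addr_space(as);
        return 0;
}

The point of the patch is simply to widen the span between the first
and last calls from a single get_entries() invocation to the whole
thread lifetime, which is what makes the caching in patch 2 pay off.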
* [PATCH 2/2] perf callchain: Use global caching provided by libunwind
  From: Namhyung Kim @ 2014-09-23 6:30 UTC
  To: Arnaldo Carvalho de Melo
  Cc: Peter Zijlstra, Ingo Molnar, Paul Mackerras, Namhyung Kim, LKML,
      Jiri Olsa, Jean Pihet, Arun Sharma

libunwind provides two caching policies: global and per-thread.  As
perf unwinds callchains in a single thread, it's sufficient to use
global caching.

This speeds up my perf report from 14s to 7s on a ~260MB data file,
although the output contains a slight difference (~0.01% of the number
of lines printed) on callchains which were not resolved.

Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Jean Pihet <jean.pihet@linaro.org>
Cc: Arun Sharma <asharma@fb.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/thread.c           | 3 +++
 tools/perf/util/unwind-libunwind.c | 9 +++++++++
 tools/perf/util/unwind.h           | 3 +++
 3 files changed, 15 insertions(+)

diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index c1fa4a3597ea..e67d4ca6de44 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -119,6 +119,9 @@ int __thread__set_comm(struct thread *thread, const char *str, u64 timestamp,
                if (!new)
                        return -ENOMEM;
                list_add(&new->list, &thread->comm_list);
+
+               if (exec)
+                       unwind__flush_access(thread);
        }
 
        thread->comm_set = true;
diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c
index 76ec25663c95..6df06f0cd177 100644
--- a/tools/perf/util/unwind-libunwind.c
+++ b/tools/perf/util/unwind-libunwind.c
@@ -535,11 +535,20 @@ int unwind__prepare_access(struct thread *thread)
                return -ENOMEM;
        }
 
+       unw_set_caching_policy(addr_space, UNW_CACHE_GLOBAL);
        thread__set_priv(thread, addr_space);
 
        return 0;
 }
 
+void unwind__flush_access(struct thread *thread)
+{
+       unw_addr_space_t addr_space;
+
+       addr_space = thread__priv(thread);
+       unw_flush_cache(addr_space, 0, 0);
+}
+
 void unwind__finish_access(struct thread *thread)
 {
        unw_addr_space_t addr_space;
diff --git a/tools/perf/util/unwind.h b/tools/perf/util/unwind.h
index 4b99c6280c2a..d68f24d4f01b 100644
--- a/tools/perf/util/unwind.h
+++ b/tools/perf/util/unwind.h
@@ -23,6 +23,7 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
 #ifdef HAVE_LIBUNWIND_SUPPORT
 int libunwind__arch_reg_id(int regnum);
 int unwind__prepare_access(struct thread *thread);
+void unwind__flush_access(struct thread *thread);
 void unwind__finish_access(struct thread *thread);
 #else
 static inline int unwind__prepare_access(struct thread *thread)
@@ -30,6 +31,7 @@ static inline int unwind__prepare_access(struct thread *thread)
        return 0;
 }
 
+static inline void unwind__flush_access(struct thread *thread) {}
 static inline void unwind__finish_access(struct thread *thread) {}
 #endif
 #else
@@ -49,6 +51,7 @@ static inline int unwind__prepare_access(struct thread *thread)
        return 0;
 }
 
+static inline void unwind__flush_access(struct thread *thread) {}
 static inline void unwind__finish_access(struct thread *thread) {}
 #endif /* HAVE_DWARF_UNWIND_SUPPORT */
 #endif /* __UNWIND_H */
-- 
2.1.0
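The two libunwind knobs this patch relies on are unw_set_caching_policy()
and unw_flush_cache().  The fragment below is a hedged sketch of the
wiring, kept separate from the perf code: accessors is again just a
stand-in for perf's remote-unwind callbacks, and prepare_addr_space()
and flush_on_exec() are hypothetical names mirroring what
unwind__prepare_access() and unwind__flush_access() do in the patch.

#include <libunwind.h>

static unw_accessors_t accessors;      /* stand-in for perf's callbacks */

/*
 * Enable the global (process-wide, lock-protected) cache right after
 * the address space is created, as unwind__prepare_access() now does.
 */
static unw_addr_space_t prepare_addr_space(void)
{
        unw_addr_space_t as = unw_create_addr_space(&accessors, 0);

        if (as)
                unw_set_caching_policy(as, UNW_CACHE_GLOBAL);
        return as;
}

/*
 * On exec the old mappings become invalid, so the cached unwind info
 * must be dropped wholesale; (0, 0) means "flush everything" rather
 * than a specific [lo, hi) address range.
 */
static void flush_on_exec(unw_addr_space_t as)
{
        unw_flush_cache(as, 0, 0);
}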
* Re: [PATCH 2/2] perf callchain: Use global caching provided by libunwind
  From: Jiri Olsa @ 2014-09-23 12:28 UTC
  To: Namhyung Kim
  Cc: Arnaldo Carvalho de Melo, Peter Zijlstra, Ingo Molnar, Paul Mackerras,
      Namhyung Kim, LKML, Jean Pihet, Arun Sharma

On Tue, Sep 23, 2014 at 03:30:28PM +0900, Namhyung Kim wrote:
> libunwind provides two caching policies: global and per-thread.  As
> perf unwinds callchains in a single thread, it's sufficient to use
> global caching.
>
> This speeds up my perf report from 14s to 7s on a ~260MB data file,
> although the output contains a slight difference (~0.01% of the number
> of lines printed) on callchains which were not resolved.

hum, the speedup is nice, but what was the diff in the output.. any
example?  Does the new version not print some lines, or print different
ones?

thanks,
jirka
* Re: [PATCH 2/2] perf callchain: Use global caching provided by libunwind
  From: Namhyung Kim @ 2014-09-23 12:53 UTC
  To: Jiri Olsa
  Cc: Arnaldo Carvalho de Melo, Peter Zijlstra, Ingo Molnar, Paul Mackerras,
      Namhyung Kim, LKML, Jean Pihet, Arun Sharma

On Tue, Sep 23, 2014 at 9:28 PM, Jiri Olsa <jolsa@redhat.com> wrote:
> On Tue, Sep 23, 2014 at 03:30:28PM +0900, Namhyung Kim wrote:
>> libunwind provides two caching policies: global and per-thread.  As
>> perf unwinds callchains in a single thread, it's sufficient to use
>> global caching.
>>
>> This speeds up my perf report from 14s to 7s on a ~260MB data file,
>> although the output contains a slight difference (~0.01% of the number
>> of lines printed) on callchains which were not resolved.
>
> hum, the speedup is nice, but what was the diff in the output.. any
> example?  Does the new version not print some lines, or print different
> ones?

I don't have the result now - I will post the diff when I get to the
office tomorrow.  But IIRC the new version only adds new lines..

Thanks,
Namhyung
* Re: [PATCH 2/2] perf callchain: Use global caching provided by libunwind
  From: Namhyung Kim @ 2014-09-24 1:04 UTC
  To: Jiri Olsa
  Cc: Arnaldo Carvalho de Melo, Peter Zijlstra, Ingo Molnar, Paul Mackerras,
      Namhyung Kim, LKML, Jean Pihet, Arun Sharma

On Tue, 23 Sep 2014 21:53:43 +0900, Namhyung Kim wrote:
> On Tue, Sep 23, 2014 at 9:28 PM, Jiri Olsa <jolsa@redhat.com> wrote:
>> hum, the speedup is nice, but what was the diff in the output.. any
>> example?  Does the new version not print some lines, or print different
>> ones?
>
> I don't have the result now - I will post the diff when I get to the
> office tomorrow.  But IIRC the new version only adds new lines..

Okay, this is the result.. but it doesn't always make a difference: I
can see it produce exactly the same output for other (even bigger) data
files.

$ diff -U0 callchain-result.{old,new}
--- callchain-result.old        2014-09-23 14:52:29.630711402 +0900
+++ callchain-result.new        2014-09-23 14:52:52.709505500 +0900
@@ -5322,0 +5323,9 @@
+        |        |        |
+        |        |         --0.00%-- 0x406520
+        |        |                   0x4064f0
+        |        |                   |
+        |        |                   |--0.00%-- 0x1ad52a0
+        |        |                   |
+        |        |                   |--0.00%-- 0x1a0f2e0
+        |        |                   |
+        |        |                    --0.00%-- 0x1a9e0c0
@@ -108899,0 +108909,3 @@
+        |        |        0x406520
+        |        |        0x4064f0
+        |        |        0x1a9e0c0
@@ -180410,0 +180423,9 @@
+        |
+         --0.00%-- 0x406520
+                   0x4064f0
+                   |
+                   |--0.00%-- 0x1ad52a0
+                   |
+                   |--0.00%-- 0x1a0f2e0
+                   |
+                    --0.00%-- 0x1a9e0c0

$ wc -l callchain-result.{old,new}
  191412 callchain-result.old
  191433 callchain-result.new
  382845 total

Thanks,
Namhyung
* Re: [PATCH 2/2] perf callchain: Use global caching provided by libunwind
  From: Arun Sharma @ 2014-09-23 14:01 UTC
  To: Namhyung Kim, Arnaldo Carvalho de Melo
  Cc: Peter Zijlstra, Ingo Molnar, Paul Mackerras, Namhyung Kim, LKML,
      Jiri Olsa, Jean Pihet

On 9/23/14, 12:00 PM, Namhyung Kim wrote:

> +	unw_set_caching_policy(addr_space, UNW_CACHE_GLOBAL);

The result is a bit surprising to me.  In micro benchmarking (eg:
Lperf-simple), the per-thread policy is generally faster because it
doesn't involve locking:

  libunwind/tests/Lperf-simple
  unw_getcontext  : cold avg=  109.673 nsec, warm avg= 28.610 nsec
  unw_init_local  : cold avg=  259.876 nsec, warm avg=  9.537 nsec
  no cache        : unw_step : 1st= 3258.387 min= 2922.331 avg= 3002.384 nsec
  global cache    : unw_step : 1st= 1192.093 min=  960.486 avg=  982.208 nsec
  per-thread cache: unw_step : 1st=  429.153 min=  113.533 avg=  121.762 nsec

I can see how the global policy would involve less memory allocation
because of shared data structures.  Curious about the reason for the
speedup (specifically, whether libunwind should change the defaults for
the non-local unwinding case).

 -Arun
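Arun's numbers come from libunwind's own Lperf-simple test.  For anyone
wanting to reproduce a similar comparison without the libunwind tree,
here is a rough, self-contained skeleton of the same idea; the loop
count, clock source, and per-unwind (rather than per-step) accounting
are arbitrary simplifications, and the real test is considerably more
careful about warm-up and outliers.

#define UNW_LOCAL_ONLY
#include <libunwind.h>
#include <stdio.h>
#include <time.h>

/* Time a full local unwind under the given caching policy and return
 * the average cost per unwind in nanoseconds. */
static double time_unwind(unw_caching_policy_t policy, int iters)
{
        struct timespec t0, t1;
        int i;

        unw_set_caching_policy(unw_local_addr_space, policy);

        clock_gettime(CLOCK_MONOTONIC, &t0);
        for (i = 0; i < iters; i++) {
                unw_context_t uc;
                unw_cursor_t c;

                unw_getcontext(&uc);
                unw_init_local(&c, &uc);
                while (unw_step(&c) > 0)
                        ;       /* walk to the outermost frame */
        }
        clock_gettime(CLOCK_MONOTONIC, &t1);

        return ((t1.tv_sec - t0.tv_sec) * 1e9 +
                (t1.tv_nsec - t0.tv_nsec)) / iters;
}

int main(void)
{
        printf("no cache   : %8.1f nsec/unwind\n",
               time_unwind(UNW_CACHE_NONE, 100000));
        printf("global     : %8.1f nsec/unwind\n",
               time_unwind(UNW_CACHE_GLOBAL, 100000));
        printf("per-thread : %8.1f nsec/unwind\n",
               time_unwind(UNW_CACHE_PER_THREAD, 100000));
        return 0;
}

Build with something like `cc -O2 bench.c -lunwind`; since the process
is single-threaded, the gap between the last two rows is essentially
the cost of the global cache's locking.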
* Re: [PATCH 2/2] perf callchain: Use global caching provided by libunwind
  From: Namhyung Kim @ 2014-09-24 2:24 UTC
  To: Arun Sharma
  Cc: Arnaldo Carvalho de Melo, Peter Zijlstra, Ingo Molnar, Paul Mackerras,
      Namhyung Kim, LKML, Jiri Olsa, Jean Pihet

Hi Arun,

On Tue, 23 Sep 2014 14:01:22 +0000, Arun Sharma wrote:
> On 9/23/14, 12:00 PM, Namhyung Kim wrote:
>
>> +	unw_set_caching_policy(addr_space, UNW_CACHE_GLOBAL);
>
> The result is a bit surprising to me.  In micro benchmarking (eg:
> Lperf-simple), the per-thread policy is generally faster because it
> doesn't involve locking:
>
>   libunwind/tests/Lperf-simple
>   unw_getcontext  : cold avg=  109.673 nsec, warm avg= 28.610 nsec
>   unw_init_local  : cold avg=  259.876 nsec, warm avg=  9.537 nsec
>   no cache        : unw_step : 1st= 3258.387 min= 2922.331 avg= 3002.384 nsec
>   global cache    : unw_step : 1st= 1192.093 min=  960.486 avg=  982.208 nsec
>   per-thread cache: unw_step : 1st=  429.153 min=  113.533 avg=  121.762 nsec

Yes, the per-thread policy is faster than the global caching policy.
Below is my test result.  Note that I had already run this several
times beforehand, to remove the effect of the file contents being
loaded into the page cache.

Performance counter stats for 'perf report -i
/home/namhyung/tmp/perf-testing/perf.data.kbuild.dwarf --stdio' (3 runs):

                           UNW_CACHE_NONE   UNW_CACHE_GLOBAL   UNW_CACHE_PER_THREAD
  ---------------------------------------------------------------------------------
  task-clock (msec)          14298.911947        7112.171928           6913.244797
  context-switches                  1,507                762                    742
  cpu-migrations                        1                  2                      1
  page-faults                   2,924,889          1,101,380              1,101,380
  cycles                   53,895,784,665     26,798,627,423         26,070,728,349
  stalled-cycles-frontend  24,472,506,687     12,577,760,746         12,435,320,081
  stalled-cycles-backend   17,550,483,726      9,075,054,009          9,035,478,957
  instructions             73,544,039,490     34,352,889,707         33,283,120,736
  branches                 14,969,890,371      7,139,469,848          6,926,994,151
  branch-misses               193,852,116        100,455,431             99,757,213
  time elapsed               14.905719730        7.455597356            7.242275972

> I can see how the global policy would involve less memory allocation
> because of shared data structures.  Curious about the reason for the
> speedup (specifically, whether libunwind should change the defaults for
> the non-local unwinding case).

I don't see much difference between global and per-thread caching for
remote unwinding (besides the rs_cache->lock you mentioned).  Also, I'm
curious how rs_new() is protected from concurrent accesses with
per-thread caching.  That's why I chose global caching - yeah, it
probably doesn't matter for a single thread, but... :)

Thanks,
Namhyung
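As a reproduction note: the caching policy is hard-coded in
unwind__prepare_access(), so comparing the three columns above means
building one perf binary per policy and driving each with perf stat.
A plausible driver, with hypothetical binary names (the data file name
is the one from the stats header), would be:

# One perf build per caching policy; the ./perf-cache-* names are
# hypothetical.  perf stat -r 3 repeats each run three times and
# averages, matching the "(3 runs)" header above.
for p in none global per-thread; do
        perf stat -r 3 -- ./perf-cache-$p report \
                -i /home/namhyung/tmp/perf-testing/perf.data.kbuild.dwarf \
                --stdio > /dev/null
done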
* Re: [PATCH 2/2] perf callchain: Use global caching provided by libunwind
  From: Jean Pihet @ 2014-09-24 13:45 UTC
  To: Namhyung Kim
  Cc: Arun Sharma, Arnaldo Carvalho de Melo, Peter Zijlstra, Ingo Molnar,
      Paul Mackerras, Namhyung Kim, LKML, Jiri Olsa

Hi!

Here are the test results on ARMv7 for the 2 patches.  The speedup is
about x2.1 for identical unwinding output data.

'perf record --call-graph dwarf -- stress --cpu 2 --io 2 --vm 2
--timeout 10s' generates a 365 MB perf.data file.

time perf.orig report --sort symbol --call-graph --stdio > /dev/null 2>&1

  average over 3 runs:
  real 36.736
  user 14.79
  sys  21.91

time perf.libunwind.speedup report --sort symbol --call-graph --stdio > /dev/null 2>&1

  average over 3 runs:
  real 17.41   (x2.11)
  user  6.42   (x2.3)
  sys  10.97   (x2)

So the patches definitely speed up the unwinding.
FWIW: Acked-by: Jean Pihet <jean.pihet@linaro.org>

For info, unwinding using libdw is about 5x faster:

time perf.libdw.speedup report --sort symbol --call-graph --stdio > /dev/null 2>&1

  real 0m3.484s
  user 0m2.360s
  sys  0m1.070s

Thanks,
Jean

On 24 September 2014 04:24, Namhyung Kim <namhyung@kernel.org> wrote:
> Hi Arun,
>
> On Tue, 23 Sep 2014 14:01:22 +0000, Arun Sharma wrote:
>> On 9/23/14, 12:00 PM, Namhyung Kim wrote:
>>
>>> +	unw_set_caching_policy(addr_space, UNW_CACHE_GLOBAL);
>>
>> The result is a bit surprising to me.  In micro benchmarking (eg:
>> Lperf-simple), the per-thread policy is generally faster because it
>> doesn't involve locking:
>>
>> [SNIP]
>
> Yes, the per-thread policy is faster than the global caching policy.
> Below is my test result.
>
> [SNIP]
>
>> I can see how the global policy would involve less memory allocation
>> because of shared data structures.  Curious about the reason for the
>> speedup (specifically, whether libunwind should change the defaults for
>> the non-local unwinding case).
>
> I don't see much difference between global and per-thread caching for
> remote unwinding (besides the rs_cache->lock you mentioned).  Also, I'm
> curious how rs_new() is protected from concurrent accesses with
> per-thread caching.  That's why I chose global caching - yeah, it
> probably doesn't matter for a single thread, but... :)
>
> Thanks,
> Namhyung
* Re: [PATCH 2/2] perf callchain: Use global caching provided by libunwind
  From: Namhyung Kim @ 2014-09-26 5:50 UTC
  To: Jean Pihet
  Cc: Arun Sharma, Arnaldo Carvalho de Melo, Peter Zijlstra, Ingo Molnar,
      Paul Mackerras, Namhyung Kim, LKML, Jiri Olsa

Hi Jean,

On Wed, 24 Sep 2014 15:45:57 +0200, Jean Pihet wrote:
> Hi!
>
> Here are the test results on ARMv7 for the 2 patches.  The speedup is
> about x2.1 for identical unwinding output data.
>
> 'perf record --call-graph dwarf -- stress --cpu 2 --io 2 --vm 2
> --timeout 10s' generates a 365 MB perf.data file.
>
> time perf.orig report --sort symbol --call-graph --stdio > /dev/null 2>&1
>
>   average over 3 runs:
>   real 36.736
>   user 14.79
>   sys  21.91
>
> time perf.libunwind.speedup report --sort symbol --call-graph --stdio > /dev/null 2>&1
>
>   average over 3 runs:
>   real 17.41   (x2.11)
>   user  6.42   (x2.3)
>   sys  10.97   (x2)
>
> So the patches definitely speed up the unwinding.
> FWIW: Acked-by: Jean Pihet <jean.pihet@linaro.org>

Thanks for your test!

Btw, have you checked whether the output differs before and after the
caching change?

> For info, unwinding using libdw is about 5x faster:
>
> time perf.libdw.speedup report --sort symbol --call-graph --stdio > /dev/null 2>&1
>
>   real 0m3.484s
>   user 0m2.360s
>   sys  0m1.070s

Wow, that's pretty nice.  I'll take a look at the libdw unwinding later.

Thanks,
Namhyung
* Re: [PATCH 2/2] perf callchain: Use global caching provided by libunwind
  From: Jean Pihet @ 2014-09-26 7:14 UTC
  To: Namhyung Kim
  Cc: Arun Sharma, Arnaldo Carvalho de Melo, Peter Zijlstra, Ingo Molnar,
      Paul Mackerras, Namhyung Kim, LKML, Jiri Olsa

Hi,

On 26 September 2014 07:50, Namhyung Kim <namhyung@kernel.org> wrote:
> Hi Jean,
>
> On Wed, 24 Sep 2014 15:45:57 +0200, Jean Pihet wrote:
>> Hi!
>>
>> Here are the test results on ARMv7 for the 2 patches.  The speedup is
>> about x2.1 for identical unwinding output data.
>>
>> [SNIP]
>>
>> So the patches definitely speed up the unwinding.
>> FWIW: Acked-by: Jean Pihet <jean.pihet@linaro.org>
>
> Thanks for your test!
>
> Btw, have you checked whether the output differs before and after the
> caching change?

Yes, the output is identical (as stated above).

>> For info, unwinding using libdw is about 5x faster:
>>
>> [SNIP]
>
> Wow, that's pretty nice.  I'll take a look at the libdw unwinding later.

Thanks for the patches!
Jean
* Re: [PATCH 2/2] perf callchain: Use global caching provided by libunwind
  From: Namhyung Kim @ 2014-09-29 2:35 UTC
  To: Jean Pihet
  Cc: Arun Sharma, Arnaldo Carvalho de Melo, Peter Zijlstra, Ingo Molnar,
      Paul Mackerras, Namhyung Kim, LKML, Jiri Olsa

Hi Jean,

On Fri, 26 Sep 2014 09:14:41 +0200, Jean Pihet wrote:
> On 26 September 2014 07:50, Namhyung Kim <namhyung@kernel.org> wrote:
>> On Wed, 24 Sep 2014 15:45:57 +0200, Jean Pihet wrote:
>>> Here are the test results on ARMv7 for the 2 patches.  The speedup is
>>> about x2.1 for identical unwinding output data.
>>>
>>> [SNIP]
>>
>> Thanks for your test!
>>
>> Btw, have you checked whether the output differs before and after the
>> caching change?
>
> Yes, the output is identical (as stated above).

Ah, I misunderstood.  Thanks for your confirmation. :)

Thanks,
Namhyung
* Re: [PATCH 1/2] perf callchain: Create an address space per thread
  From: Jiri Olsa @ 2014-09-23 12:24 UTC
  To: Namhyung Kim
  Cc: Arnaldo Carvalho de Melo, Peter Zijlstra, Ingo Molnar, Paul Mackerras,
      Namhyung Kim, LKML, Jean Pihet, Arun Sharma

On Tue, Sep 23, 2014 at 03:30:27PM +0900, Namhyung Kim wrote:
> The unw_addr_space_t in libunwind represents an address space to be
> used for stack unwinding.  It doesn't need to be created and destroyed
> on every callchain unwind (as get_entries currently does) and can have
> the same lifetime as the thread (unless exec is called).
>
> So move the address space construction/destruction logic into the
> thread lifetime handling functions.  This is a preparation for enabling
> caching in the unwind library.
>
> [SNIP]
>
> diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
> index a9df7f2c6dc9..c1fa4a3597ea 100644
> --- a/tools/perf/util/thread.c
> +++ b/tools/perf/util/thread.c
> @@ -7,6 +7,7 @@
>  #include "util.h"
>  #include "debug.h"
>  #include "comm.h"
> +#include "unwind.h"
>  
>  int thread__init_map_groups(struct thread *thread, struct machine *machine)
>  {
> @@ -48,6 +49,12 @@ struct thread *thread__new(pid_t pid, pid_t tid)
>                         goto err_thread;
>  
>                 list_add(&comm->list, &thread->comm_list);
> +
> +               if (unwind__prepare_access(thread) < 0) {

you could call list_add() below this call and thus save the list_del()
call in the error path below.  It looks like you don't need comm to be
on comm_list within the unwind__prepare_access() call.

> +                       list_del(&comm->list);
> +                       free(comm);
> +                       goto err_thread;
> +               }
>         }

SNIP

jirka
* Re: [PATCH 1/2] perf callchain: Create an address space per thread
  From: Namhyung Kim @ 2014-09-23 12:49 UTC
  To: Jiri Olsa
  Cc: Arnaldo Carvalho de Melo, Peter Zijlstra, Ingo Molnar, Paul Mackerras,
      Namhyung Kim, LKML, Jean Pihet, Arun Sharma

Hi Jiri,

On Tue, Sep 23, 2014 at 9:24 PM, Jiri Olsa <jolsa@redhat.com> wrote:
> On Tue, Sep 23, 2014 at 03:30:27PM +0900, Namhyung Kim wrote:
>> [SNIP]
>>
>>                 list_add(&comm->list, &thread->comm_list);
>> +
>> +               if (unwind__prepare_access(thread) < 0) {
>
> you could call list_add() below this call and thus save the list_del()
> call in the error path below.  It looks like you don't need comm to be
> on comm_list within the unwind__prepare_access() call.

Right.  Will change.

Thanks,
Namhyung
* Re: [PATCH 1/2] perf callchain: Create an address space per thread
  From: Arnaldo Carvalho de Melo @ 2014-09-26 15:35 UTC
  To: Namhyung Kim
  Cc: Jiri Olsa, Peter Zijlstra, Ingo Molnar, Paul Mackerras, Namhyung Kim,
      LKML, Jean Pihet, Arun Sharma

On Tue, Sep 23, 2014 at 09:49:22PM +0900, Namhyung Kim wrote:
> On Tue, Sep 23, 2014 at 9:24 PM, Jiri Olsa <jolsa@redhat.com> wrote:
>>>                 list_add(&comm->list, &thread->comm_list);
>>> +
>>> +               if (unwind__prepare_access(thread) < 0) {
>>
>> you could call list_add() below this call and thus save the list_del()
>> call in the error path below.  It looks like you don't need comm to be
>> on comm_list within the unwind__prepare_access() call.
>
> Right.  Will change.

Ok, waiting for v2 then.

- Arnaldo