* [PATCH] perf record: mmap output file - RFC
@ 2013-09-23 2:05 David Ahern
2013-09-26 17:51 ` Jiri Olsa
2013-10-09 13:33 ` Jiri Olsa
0 siblings, 2 replies; 9+ messages in thread
From: David Ahern @ 2013-09-23 2:05 UTC (permalink / raw)
To: acme, linux-kernel
Cc: David Ahern, Ingo Molnar, Frederic Weisbecker, Peter Zijlstra,
Jiri Olsa, Namhyung Kim, Mike Galbraith, Stephane Eranian
When recording raw_syscalls for the entire system, e.g.,
perf record -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
you end up with a negative feedback loop as perf itself calls
write() fairly often. This patch mmap's the file in chunks of 64M
at a time and copies events from the event buffers to the file
avoiding write system calls.
Before (with write syscall):
perf record -o /tmp/perf.data -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
[ perf record: Woken up 0 times to write data ]
[ perf record: Captured and wrote 81.843 MB /tmp/perf.data (~3575786 samples) ]
After (using mmap):
perf record -o /tmp/perf.data -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
[ perf record: Woken up 31 times to write data ]
[ perf record: Captured and wrote 8.203 MB /tmp/perf.data (~358388 samples) ]
Before I get too far down this path I wanted to get comments on the approach.
Signed-off-by: David Ahern <dsahern@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Stephane Eranian <eranian@google.com>
---
tools/perf/builtin-record.c | 87 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 87 insertions(+)
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index da13840..45bb565 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -29,6 +29,9 @@
#include <sched.h>
#include <sys/mman.h>
+/* mmap file big chunks at a time */
+#define MMAP_OUTPUT_SIZE (64*1024*1024)
+
#ifndef HAVE_ON_EXIT
#ifndef ATEXIT_MAX
#define ATEXIT_MAX 32
@@ -64,6 +67,14 @@ static void __handle_on_exit_funcs(void)
struct perf_record {
struct perf_tool tool;
struct perf_record_opts opts;
+
+ /* for MMAP based file writes */
+ void *mmap_addr;
+ u64 bytes_at_mmap_start; /* bytes in file when mmap use starts */
+ u64 mmap_offset; /* current location within mmap */
+ size_t mmap_size; /* size of mmap segments */
+ bool use_mmap;
+
u64 bytes_written;
const char *output_name;
struct perf_evlist *evlist;
@@ -82,8 +93,66 @@ static void advance_output(struct perf_record *rec, size_t size)
rec->bytes_written += size;
}
+static int do_mmap_output(struct perf_record *rec, void *buf, size_t size)
+{
+ u64 remaining;
+ off_t offset;
+
+ if (rec->mmap_addr == NULL) {
+do_mmap:
+ offset = rec->bytes_at_mmap_start + rec->bytes_written;
+ if (offset < (ssize_t) rec->mmap_size) {
+ rec->mmap_offset = offset;
+ offset = 0;
+ } else
+ rec->mmap_offset = 0;
+
+ rec->mmap_addr = mmap(NULL, rec->mmap_size,
+ PROT_WRITE | PROT_READ,
+ MAP_SHARED,
+ rec->output,
+ offset);
+
+ if (rec->mmap_addr == MAP_FAILED) {
+ pr_err("mmap failed: %d: %s\n", errno, strerror(errno));
+ return -1;
+ }
+
+ /* expand file to include this mmap segment */
+ if (ftruncate(rec->output, offset + rec->mmap_size) != 0) {
+ pr_err("ftruncate failed\n");
+ return -1;
+ }
+ }
+
+ remaining = rec->mmap_size - rec->mmap_offset;
+
+ if (size > remaining) {
+ memcpy(rec->mmap_addr + rec->mmap_offset, buf, remaining);
+ rec->bytes_written += remaining;
+
+ size -= remaining;
+ buf += remaining;
+
+ msync(rec->mmap_addr, rec->mmap_size, MS_ASYNC);
+ munmap(rec->mmap_addr, rec->mmap_size);
+ goto do_mmap;
+ }
+
+ if (size) {
+ memcpy(rec->mmap_addr + rec->mmap_offset, buf, size);
+ rec->bytes_written += size;
+ rec->mmap_offset += size;
+ }
+
+ return 0;
+}
+
static int write_output(struct perf_record *rec, void *buf, size_t size)
{
+ if (rec->use_mmap)
+ return do_mmap_output(rec, buf, size);
+
while (size) {
int ret = write(rec->output, buf, size);
@@ -546,6 +615,11 @@ static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
if (forks)
perf_evlist__start_workload(evsel_list);
+ if (!rec->opts.pipe_output && stat(output_name, &st) == 0) {
+ rec->use_mmap = true;
+ rec->bytes_at_mmap_start = st.st_size - rec->bytes_written;
+ }
+
for (;;) {
int hits = rec->samples;
@@ -572,6 +646,18 @@ static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
}
}
+ if (rec->use_mmap) {
+ off_t len = rec->bytes_at_mmap_start + rec->bytes_written;
+
+ rec->use_mmap = false;
+ msync(rec->mmap_addr, rec->mmap_size, MS_ASYNC);
+ munmap(rec->mmap_addr, rec->mmap_size);
+ rec->mmap_addr = NULL;
+
+ if (ftruncate(rec->output, len) != 0)
+ pr_err("ftruncate failed\n");
+ }
+
if (quiet || signr == SIGUSR1)
return 0;
@@ -804,6 +890,7 @@ static struct perf_record record = {
.uses_mmap = true,
},
},
+ .mmap_size = MMAP_OUTPUT_SIZE,
};
#define CALLCHAIN_HELP "do call-graph (stack chain/backtrace) recording: "
--
1.7.10.1
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH] perf record: mmap output file - RFC
2013-09-23 2:05 [PATCH] perf record: mmap output file - RFC David Ahern
@ 2013-09-26 17:51 ` Jiri Olsa
2013-09-26 18:12 ` David Ahern
` (2 more replies)
2013-10-09 13:33 ` Jiri Olsa
1 sibling, 3 replies; 9+ messages in thread
From: Jiri Olsa @ 2013-09-26 17:51 UTC (permalink / raw)
To: David Ahern
Cc: acme, linux-kernel, Ingo Molnar, Frederic Weisbecker,
Peter Zijlstra, Namhyung Kim, Mike Galbraith, Stephane Eranian
On Sun, Sep 22, 2013 at 08:05:59PM -0600, David Ahern wrote:
> When recording raw_syscalls for the entire system, e.g.,
> perf record -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
>
> you end up with a negative feedback loop as perf itself calls
> write() fairly often. This patch mmap's the file in chunks of 64M
> at a time and copies events from the event buffers to the file
> avoiding write system calls.
moved processing into userspace:
17.24% -17.10% libpthread-2.15.so [.] __write_nocancel
...
0.07% +0.64% libc-2.15.so [.] __memcpy_sse2
0.02% +51.84% libc-2.15.so [.] __memcpy_ssse3_back
0.01% +0.34% libc-2.15.so [.] __mempcpy_sse2
...
>
> Before (with write syscall):
>
> perf record -o /tmp/perf.data -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
> [ perf record: Woken up 0 times to write data ]
> [ perf record: Captured and wrote 81.843 MB /tmp/perf.data (~3575786 samples) ]
>
> After (using mmap):
>
> perf record -o /tmp/perf.data -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
> [ perf record: Woken up 31 times to write data ]
^^^^^^^^
but it's still faster, since we finally get perf a chance to sleep ;-)
new time:
real 0m30.392s
user 0m0.041s
sys 0m0.389s
old time:
real 0m32.235s
user 0m3.080s
sys 0m14.444s
> [ perf record: Captured and wrote 8.203 MB /tmp/perf.data (~358388 samples) ]
>
> Before I get too far down this path I wanted to get comments on the approach.
I think it's worthwhile doing this
jirka
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] perf record: mmap output file - RFC
2013-09-26 17:51 ` Jiri Olsa
@ 2013-09-26 18:12 ` David Ahern
2013-09-26 18:23 ` Jiri Olsa
2013-09-26 23:17 ` David Ahern
2013-09-27 2:24 ` Namhyung Kim
2 siblings, 1 reply; 9+ messages in thread
From: David Ahern @ 2013-09-26 18:12 UTC (permalink / raw)
To: Jiri Olsa
Cc: acme, linux-kernel, Ingo Molnar, Frederic Weisbecker,
Peter Zijlstra, Namhyung Kim, Mike Galbraith, Stephane Eranian
On 9/26/13 11:51 AM, Jiri Olsa wrote:
> but it's still faster, since we finally get perf a chance to sleep ;-)
>
> new time:
> real 0m30.392s
> user 0m0.041s
> sys 0m0.389s
>
> old time:
> real 0m32.235s
> user 0m3.080s
> sys 0m14.444s
excellent.
>
>
>> [ perf record: Captured and wrote 8.203 MB /tmp/perf.data (~358388 samples) ]
>>
>> Before I get too far down this path I wanted to get comments on the approach.
>
> I think it's worthwhile doing this
If you can toss it onto your queue to run through it other use cases --
like the 'make -j' on kernel builds. I am doing the same here and if all
goes well I submit with some other patches next week.
Thanks,
David
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] perf record: mmap output file - RFC
2013-09-26 18:12 ` David Ahern
@ 2013-09-26 18:23 ` Jiri Olsa
0 siblings, 0 replies; 9+ messages in thread
From: Jiri Olsa @ 2013-09-26 18:23 UTC (permalink / raw)
To: David Ahern
Cc: acme, linux-kernel, Ingo Molnar, Frederic Weisbecker,
Peter Zijlstra, Namhyung Kim, Mike Galbraith, Stephane Eranian
On Thu, Sep 26, 2013 at 12:12:02PM -0600, David Ahern wrote:
> On 9/26/13 11:51 AM, Jiri Olsa wrote:
> >but it's still faster, since we finally get perf a chance to sleep ;-)
> >
> >new time:
> > real 0m30.392s
> > user 0m0.041s
> > sys 0m0.389s
> >
> >old time:
> > real 0m32.235s
> > user 0m3.080s
> > sys 0m14.444s
>
> excellent.
>
> >
> >
> >>[ perf record: Captured and wrote 8.203 MB /tmp/perf.data (~358388 samples) ]
> >>
> >>Before I get too far down this path I wanted to get comments on the approach.
> >
> >I think it's worthwhile doing this
>
> If you can toss it onto your queue to run through it other use cases
> -- like the 'make -j' on kernel builds. I am doing the same here and
> if all goes well I submit with some other patches next week.
ok, np
jirka
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] perf record: mmap output file - RFC
2013-09-26 17:51 ` Jiri Olsa
2013-09-26 18:12 ` David Ahern
@ 2013-09-26 23:17 ` David Ahern
2013-09-27 6:29 ` Ingo Molnar
2013-09-27 2:24 ` Namhyung Kim
2 siblings, 1 reply; 9+ messages in thread
From: David Ahern @ 2013-09-26 23:17 UTC (permalink / raw)
To: Jiri Olsa, acme
Cc: linux-kernel, Ingo Molnar, Frederic Weisbecker, Peter Zijlstra,
Namhyung Kim, Mike Galbraith, Stephane Eranian
On 9/26/13 11:51 AM, Jiri Olsa wrote:
> but it's still faster, since we finally get perf a chance to sleep ;-)
>
> new time:
> real 0m30.392s
> user 0m0.041s
> sys 0m0.389s
>
> old time:
> real 0m32.235s
> user 0m3.080s
> sys 0m14.444s
>
Another data point on the performance improvement of perf itself. Using
openssl speed as a workload and perf-stat to collect information about
the perf-record process only:
perf stat -i -- perf record -g -o /tmp/perf.data openssl speed aes
With write():
158.606380 task-clock
72 context-switches
34 cpu-migrations
5,400 page-faults
336,054,007 cycles
137,804,036 stalled-cycles-frontend
74,505,914 stalled-cycles-backend
474,401,639 instructions
91,246,072 branches
1,968,289 branch-misses
With mmap():
50.314270 task-clock
61 context-switches
7 cpu-migrations
3,958 page-faults
93,585,618 cycles
64,878,225 stalled-cycles-frontend
41,680,427 stalled-cycles-backend
81,552,219 instructions
15,301,389 branches
387,230 branch-misses
So time, CPU cycles, instructions all drop by more than a factor of 3.
David
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] perf record: mmap output file - RFC
2013-09-26 17:51 ` Jiri Olsa
2013-09-26 18:12 ` David Ahern
2013-09-26 23:17 ` David Ahern
@ 2013-09-27 2:24 ` Namhyung Kim
2 siblings, 0 replies; 9+ messages in thread
From: Namhyung Kim @ 2013-09-27 2:24 UTC (permalink / raw)
To: Jiri Olsa
Cc: David Ahern, acme, linux-kernel, Ingo Molnar, Frederic Weisbecker,
Peter Zijlstra, Mike Galbraith, Stephane Eranian
Hi Jiri and David,
On Thu, 26 Sep 2013 19:51:05 +0200, Jiri Olsa wrote:
> On Sun, Sep 22, 2013 at 08:05:59PM -0600, David Ahern wrote:
>> When recording raw_syscalls for the entire system, e.g.,
>> perf record -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
>>
>> you end up with a negative feedback loop as perf itself calls
>> write() fairly often. This patch mmap's the file in chunks of 64M
>> at a time and copies events from the event buffers to the file
>> avoiding write system calls.
>
> moved processing into userspace:
>
> 17.24% -17.10% libpthread-2.15.so [.] __write_nocancel
> ...
> 0.07% +0.64% libc-2.15.so [.] __memcpy_sse2
> 0.02% +51.84% libc-2.15.so [.] __memcpy_ssse3_back
> 0.01% +0.34% libc-2.15.so [.] __mempcpy_sse2
> ...
>>
>> Before (with write syscall):
>>
>> perf record -o /tmp/perf.data -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
>> [ perf record: Woken up 0 times to write data ]
>> [ perf record: Captured and wrote 81.843 MB /tmp/perf.data (~3575786 samples) ]
>>
>> After (using mmap):
>>
>> perf record -o /tmp/perf.data -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
>> [ perf record: Woken up 31 times to write data ]
>
> ^^^^^^^^
> but it's still faster, since we finally get perf a chance to sleep ;-)
>
> new time:
> real 0m30.392s
> user 0m0.041s
> sys 0m0.389s
>
> old time:
> real 0m32.235s
> user 0m3.080s
> sys 0m14.444s
But why the new user time took so short? I guess it should take at
least 10 seconds or so. Any ideas?
>
>
>> [ perf record: Captured and wrote 8.203 MB /tmp/perf.data (~358388 samples) ]
>>
>> Before I get too far down this path I wanted to get comments on the approach.
>
> I think it's worthwhile doing this
Indeed! It looks like a nice improvement.
Thanks,
Namhyung
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] perf record: mmap output file - RFC
2013-09-26 23:17 ` David Ahern
@ 2013-09-27 6:29 ` Ingo Molnar
0 siblings, 0 replies; 9+ messages in thread
From: Ingo Molnar @ 2013-09-27 6:29 UTC (permalink / raw)
To: David Ahern
Cc: Jiri Olsa, acme, linux-kernel, Frederic Weisbecker,
Peter Zijlstra, Namhyung Kim, Mike Galbraith, Stephane Eranian
* David Ahern <dsahern@gmail.com> wrote:
> On 9/26/13 11:51 AM, Jiri Olsa wrote:
> >but it's still faster, since we finally get perf a chance to sleep ;-)
> >
> >new time:
> > real 0m30.392s
> > user 0m0.041s
> > sys 0m0.389s
> >
> >old time:
> > real 0m32.235s
> > user 0m3.080s
> > sys 0m14.444s
> >
>
> Another data point on the performance improvement of perf itself.
> Using openssl speed as a workload and perf-stat to collect
> information about the perf-record process only:
>
> perf stat -i -- perf record -g -o /tmp/perf.data openssl speed aes
>
> With write():
> 158.606380 task-clock
> 72 context-switches
> 34 cpu-migrations
> 5,400 page-faults
> 336,054,007 cycles
> 137,804,036 stalled-cycles-frontend
> 74,505,914 stalled-cycles-backend
> 474,401,639 instructions
> 91,246,072 branches
> 1,968,289 branch-misses
>
> With mmap():
> 50.314270 task-clock
> 61 context-switches
> 7 cpu-migrations
> 3,958 page-faults
> 93,585,618 cycles
> 64,878,225 stalled-cycles-frontend
> 41,680,427 stalled-cycles-backend
> 81,552,219 instructions
> 15,301,389 branches
> 387,230 branch-misses
>
> So time, CPU cycles, instructions all drop by more than a factor of 3.
Impressive!
Btw., a perf stat bugreport: it's not clear at all that 'task-clock' is in
units of milliseconds. Would be nice to print the unit as '(ms)' or so ...
For the other entries the unit is obvious (a count) - with the exception
of the task-clock.
Thanks,
Ingo
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] perf record: mmap output file - RFC
2013-09-23 2:05 [PATCH] perf record: mmap output file - RFC David Ahern
2013-09-26 17:51 ` Jiri Olsa
@ 2013-10-09 13:33 ` Jiri Olsa
2013-10-09 13:46 ` David Ahern
1 sibling, 1 reply; 9+ messages in thread
From: Jiri Olsa @ 2013-10-09 13:33 UTC (permalink / raw)
To: David Ahern
Cc: acme, linux-kernel, Ingo Molnar, Frederic Weisbecker,
Peter Zijlstra, Namhyung Kim, Mike Galbraith, Stephane Eranian
On Sun, Sep 22, 2013 at 08:05:59PM -0600, David Ahern wrote:
SNIP
> +
> + if (size > remaining) {
> + memcpy(rec->mmap_addr + rec->mmap_offset, buf, remaining);
> + rec->bytes_written += remaining;
> +
> + size -= remaining;
> + buf += remaining;
> +
> + msync(rec->mmap_addr, rec->mmap_size, MS_ASYNC);
> + munmap(rec->mmap_addr, rec->mmap_size);
do we need msync call if munmap is called right away?
> + goto do_mmap;
SNIP
> + if (rec->use_mmap) {
> + off_t len = rec->bytes_at_mmap_start + rec->bytes_written;
> +
> + rec->use_mmap = false;
> + msync(rec->mmap_addr, rec->mmap_size, MS_ASYNC);
> + munmap(rec->mmap_addr, rec->mmap_size);
ditto
thanks,
jirka
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] perf record: mmap output file - RFC
2013-10-09 13:33 ` Jiri Olsa
@ 2013-10-09 13:46 ` David Ahern
0 siblings, 0 replies; 9+ messages in thread
From: David Ahern @ 2013-10-09 13:46 UTC (permalink / raw)
To: Jiri Olsa
Cc: acme, linux-kernel, Ingo Molnar, Frederic Weisbecker,
Peter Zijlstra, Namhyung Kim, Mike Galbraith, Stephane Eranian
On 10/9/13 7:33 AM, Jiri Olsa wrote:
>> +
>> + if (size > remaining) {
>> + memcpy(rec->mmap_addr + rec->mmap_offset, buf, remaining);
>> + rec->bytes_written += remaining;
>> +
>> + size -= remaining;
>> + buf += remaining;
>> +
>> + msync(rec->mmap_addr, rec->mmap_size, MS_ASYNC);
>> + munmap(rec->mmap_addr, rec->mmap_size);
>
> do we need msync call if munmap is called right away?
I was being overly cautious. I will remove it in v2.
David
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2013-10-09 13:47 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-09-23 2:05 [PATCH] perf record: mmap output file - RFC David Ahern
2013-09-26 17:51 ` Jiri Olsa
2013-09-26 18:12 ` David Ahern
2013-09-26 18:23 ` Jiri Olsa
2013-09-26 23:17 ` David Ahern
2013-09-27 6:29 ` Ingo Molnar
2013-09-27 2:24 ` Namhyung Kim
2013-10-09 13:33 ` Jiri Olsa
2013-10-09 13:46 ` David Ahern
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).