linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] perf record: mmap output file - RFC
@ 2013-09-23  2:05 David Ahern
  2013-09-26 17:51 ` Jiri Olsa
  2013-10-09 13:33 ` Jiri Olsa
  0 siblings, 2 replies; 9+ messages in thread
From: David Ahern @ 2013-09-23  2:05 UTC (permalink / raw)
  To: acme, linux-kernel
  Cc: David Ahern, Ingo Molnar, Frederic Weisbecker, Peter Zijlstra,
	Jiri Olsa, Namhyung Kim, Mike Galbraith, Stephane Eranian

When recording raw_syscalls for the entire system, e.g.,
    perf record -e raw_syscalls:*,sched:sched_switch -a -- sleep 1

you end up with a negative feedback loop as perf itself calls
write() fairly often. This patch mmap's the file in chunks of 64M
at a time and copies events from the event buffers to the file
avoiding write system calls.

Before (with write syscall):

perf record -o /tmp/perf.data -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
[ perf record: Woken up 0 times to write data ]
[ perf record: Captured and wrote 81.843 MB /tmp/perf.data (~3575786 samples) ]

After (using mmap):

perf record -o /tmp/perf.data -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
[ perf record: Woken up 31 times to write data ]
[ perf record: Captured and wrote 8.203 MB /tmp/perf.data (~358388 samples) ]

Before I get too far down this path I wanted to get comments on the approach.

Signed-off-by: David Ahern <dsahern@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Stephane Eranian <eranian@google.com>
---
 tools/perf/builtin-record.c |   87 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index da13840..45bb565 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -29,6 +29,9 @@
 #include <sched.h>
 #include <sys/mman.h>
 
+/* mmap file big chunks at a time */
+#define MMAP_OUTPUT_SIZE   (64*1024*1024)
+
 #ifndef HAVE_ON_EXIT
 #ifndef ATEXIT_MAX
 #define ATEXIT_MAX 32
@@ -64,6 +67,14 @@ static void __handle_on_exit_funcs(void)
 struct perf_record {
 	struct perf_tool	tool;
 	struct perf_record_opts	opts;
+
+	/* for MMAP based file writes */
+	void			*mmap_addr;
+	u64			bytes_at_mmap_start; /* bytes in file when mmap use starts */
+	u64			mmap_offset;    /* current location within mmap */
+	size_t			mmap_size;      /* size of mmap segments */
+	bool			use_mmap;
+
 	u64			bytes_written;
 	const char		*output_name;
 	struct perf_evlist	*evlist;
@@ -82,8 +93,66 @@ static void advance_output(struct perf_record *rec, size_t size)
 	rec->bytes_written += size;
 }
 
+static int do_mmap_output(struct perf_record *rec, void *buf, size_t size)
+{
+	u64 remaining;
+	off_t offset;
+
+	if (rec->mmap_addr == NULL) {
+do_mmap:
+		offset = rec->bytes_at_mmap_start + rec->bytes_written;
+		if (offset < (ssize_t) rec->mmap_size) {
+			rec->mmap_offset = offset;
+			offset = 0;
+		} else
+			rec->mmap_offset = 0;
+
+		rec->mmap_addr = mmap(NULL, rec->mmap_size,
+				     PROT_WRITE | PROT_READ,
+				     MAP_SHARED,
+				     rec->output,
+				     offset);
+
+		if (rec->mmap_addr == MAP_FAILED) {
+			pr_err("mmap failed: %d: %s\n", errno, strerror(errno));
+			return -1;
+		}
+
+		/* expand file to include this mmap segment */
+		if (ftruncate(rec->output, offset + rec->mmap_size) != 0) {
+			pr_err("ftruncate failed\n");
+			return -1;
+		}
+	}
+
+	remaining = rec->mmap_size - rec->mmap_offset;
+
+	if (size > remaining) {
+		memcpy(rec->mmap_addr + rec->mmap_offset, buf, remaining);
+		rec->bytes_written += remaining;
+
+		size -= remaining;
+		buf  += remaining;
+
+		msync(rec->mmap_addr, rec->mmap_size, MS_ASYNC);
+		munmap(rec->mmap_addr, rec->mmap_size);
+		goto do_mmap;
+	}
+
+	if (size) {
+		memcpy(rec->mmap_addr + rec->mmap_offset, buf, size);
+		rec->bytes_written += size;
+		rec->mmap_offset += size;
+	}
+
+	return 0;
+}
+
 static int write_output(struct perf_record *rec, void *buf, size_t size)
 {
+	if (rec->use_mmap)
+		return do_mmap_output(rec, buf, size);
+
 	while (size) {
 		int ret = write(rec->output, buf, size);
 
@@ -546,6 +615,11 @@ static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
 	if (forks)
 		perf_evlist__start_workload(evsel_list);
 
+	if (!rec->opts.pipe_output && stat(output_name, &st) == 0) {
+		rec->use_mmap = true;
+		rec->bytes_at_mmap_start = st.st_size - rec->bytes_written;
+	}
+
 	for (;;) {
 		int hits = rec->samples;
 
@@ -572,6 +646,18 @@ static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
 		}
 	}
 
+	if (rec->use_mmap) {
+		off_t len = rec->bytes_at_mmap_start + rec->bytes_written;
+
+		rec->use_mmap = false;
+		msync(rec->mmap_addr, rec->mmap_size, MS_ASYNC);
+		munmap(rec->mmap_addr, rec->mmap_size);
+		rec->mmap_addr = NULL;
+
+		if (ftruncate(rec->output, len) != 0)
+			pr_err("ftruncate failed\n");
+	}
+
 	if (quiet || signr == SIGUSR1)
 		return 0;
 
@@ -804,6 +890,7 @@ static struct perf_record record = {
 			.uses_mmap   = true,
 		},
 	},
+	.mmap_size = MMAP_OUTPUT_SIZE,
 };
 
 #define CALLCHAIN_HELP "do call-graph (stack chain/backtrace) recording: "
-- 
1.7.10.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] perf record: mmap output file - RFC
  2013-09-23  2:05 [PATCH] perf record: mmap output file - RFC David Ahern
@ 2013-09-26 17:51 ` Jiri Olsa
  2013-09-26 18:12   ` David Ahern
                     ` (2 more replies)
  2013-10-09 13:33 ` Jiri Olsa
  1 sibling, 3 replies; 9+ messages in thread
From: Jiri Olsa @ 2013-09-26 17:51 UTC (permalink / raw)
  To: David Ahern
  Cc: acme, linux-kernel, Ingo Molnar, Frederic Weisbecker,
	Peter Zijlstra, Namhyung Kim, Mike Galbraith, Stephane Eranian

On Sun, Sep 22, 2013 at 08:05:59PM -0600, David Ahern wrote:
> When recording raw_syscalls for the entire system, e.g.,
>     perf record -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
> 
> you end up with a negative feedback loop as perf itself calls
> write() fairly often. This patch mmap's the file in chunks of 64M
> at a time and copies events from the event buffers to the file
> avoiding write system calls.

moved processing into userspace:

    17.24%  -17.10%  libpthread-2.15.so  [.] __write_nocancel                    
     ...
     0.07%   +0.64%  libc-2.15.so        [.] __memcpy_sse2                               
     0.02%  +51.84%  libc-2.15.so        [.] __memcpy_ssse3_back                         
     0.01%   +0.34%  libc-2.15.so        [.] __mempcpy_sse2                              
     ...
> 
> Before (with write syscall):
> 
> perf record -o /tmp/perf.data -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
> [ perf record: Woken up 0 times to write data ]
> [ perf record: Captured and wrote 81.843 MB /tmp/perf.data (~3575786 samples) ]
> 
> After (using mmap):
> 
> perf record -o /tmp/perf.data -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
> [ perf record: Woken up 31 times to write data ]

                          ^^^^^^^^
but it's still faster, since we finally get perf a chance to sleep ;-)

new time:
	real    0m30.392s
	user    0m0.041s
	sys     0m0.389s

old time:
	real    0m32.235s
	user    0m3.080s
	sys     0m14.444s


> [ perf record: Captured and wrote 8.203 MB /tmp/perf.data (~358388 samples) ]
> 
> Before I get too far down this path I wanted to get comments on the approach.

I think it's worthwhile doing this

jirka

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] perf record: mmap output file - RFC
  2013-09-26 17:51 ` Jiri Olsa
@ 2013-09-26 18:12   ` David Ahern
  2013-09-26 18:23     ` Jiri Olsa
  2013-09-26 23:17   ` David Ahern
  2013-09-27  2:24   ` Namhyung Kim
  2 siblings, 1 reply; 9+ messages in thread
From: David Ahern @ 2013-09-26 18:12 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: acme, linux-kernel, Ingo Molnar, Frederic Weisbecker,
	Peter Zijlstra, Namhyung Kim, Mike Galbraith, Stephane Eranian

On 9/26/13 11:51 AM, Jiri Olsa wrote:
> but it's still faster, since we finally get perf a chance to sleep ;-)
>
> new time:
> 	real    0m30.392s
> 	user    0m0.041s
> 	sys     0m0.389s
>
> old time:
> 	real    0m32.235s
> 	user    0m3.080s
> 	sys     0m14.444s

excellent.

>
>
>> [ perf record: Captured and wrote 8.203 MB /tmp/perf.data (~358388 samples) ]
>>
>> Before I get too far down this path I wanted to get comments on the approach.
>
> I think it's worthwhile doing this

If you can toss it onto your queue to run through it other use cases -- 
like the 'make -j' on kernel builds. I am doing the same here and if all 
goes well I submit with some other patches next week.

Thanks,

David

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] perf record: mmap output file - RFC
  2013-09-26 18:12   ` David Ahern
@ 2013-09-26 18:23     ` Jiri Olsa
  0 siblings, 0 replies; 9+ messages in thread
From: Jiri Olsa @ 2013-09-26 18:23 UTC (permalink / raw)
  To: David Ahern
  Cc: acme, linux-kernel, Ingo Molnar, Frederic Weisbecker,
	Peter Zijlstra, Namhyung Kim, Mike Galbraith, Stephane Eranian

On Thu, Sep 26, 2013 at 12:12:02PM -0600, David Ahern wrote:
> On 9/26/13 11:51 AM, Jiri Olsa wrote:
> >but it's still faster, since we finally get perf a chance to sleep ;-)
> >
> >new time:
> >	real    0m30.392s
> >	user    0m0.041s
> >	sys     0m0.389s
> >
> >old time:
> >	real    0m32.235s
> >	user    0m3.080s
> >	sys     0m14.444s
> 
> excellent.
> 
> >
> >
> >>[ perf record: Captured and wrote 8.203 MB /tmp/perf.data (~358388 samples) ]
> >>
> >>Before I get too far down this path I wanted to get comments on the approach.
> >
> >I think it's worthwhile doing this
> 
> If you can toss it onto your queue to run through it other use cases
> -- like the 'make -j' on kernel builds. I am doing the same here and
> if all goes well I submit with some other patches next week.

ok, np

jirka

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] perf record: mmap output file - RFC
  2013-09-26 17:51 ` Jiri Olsa
  2013-09-26 18:12   ` David Ahern
@ 2013-09-26 23:17   ` David Ahern
  2013-09-27  6:29     ` Ingo Molnar
  2013-09-27  2:24   ` Namhyung Kim
  2 siblings, 1 reply; 9+ messages in thread
From: David Ahern @ 2013-09-26 23:17 UTC (permalink / raw)
  To: Jiri Olsa, acme
  Cc: linux-kernel, Ingo Molnar, Frederic Weisbecker, Peter Zijlstra,
	Namhyung Kim, Mike Galbraith, Stephane Eranian

On 9/26/13 11:51 AM, Jiri Olsa wrote:
> but it's still faster, since we finally get perf a chance to sleep ;-)
>
> new time:
> 	real    0m30.392s
> 	user    0m0.041s
> 	sys     0m0.389s
>
> old time:
> 	real    0m32.235s
> 	user    0m3.080s
> 	sys     0m14.444s
>

Another data point on the performance improvement of perf itself. Using 
openssl speed as a workload and perf-stat to collect information about 
the perf-record process only:

perf stat -i -- perf record -g -o /tmp/perf.data openssl speed aes

With write():
        158.606380 task-clock
                 72 context-switches
                 34 cpu-migrations
              5,400 page-faults
        336,054,007 cycles
        137,804,036 stalled-cycles-frontend
         74,505,914 stalled-cycles-backend
        474,401,639 instructions
         91,246,072 branches
          1,968,289 branch-misses

With mmap():
         50.314270 task-clock
                 61 context-switches
                  7 cpu-migrations
              3,958 page-faults
         93,585,618 cycles
         64,878,225 stalled-cycles-frontend
         41,680,427 stalled-cycles-backend
         81,552,219 instructions
         15,301,389 branches
            387,230 branch-misses

So time, CPU cycles, instructions all drop by more than a factor of 3.

David

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] perf record: mmap output file - RFC
  2013-09-26 17:51 ` Jiri Olsa
  2013-09-26 18:12   ` David Ahern
  2013-09-26 23:17   ` David Ahern
@ 2013-09-27  2:24   ` Namhyung Kim
  2 siblings, 0 replies; 9+ messages in thread
From: Namhyung Kim @ 2013-09-27  2:24 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: David Ahern, acme, linux-kernel, Ingo Molnar, Frederic Weisbecker,
	Peter Zijlstra, Mike Galbraith, Stephane Eranian

Hi Jiri and David,

On Thu, 26 Sep 2013 19:51:05 +0200, Jiri Olsa wrote:
> On Sun, Sep 22, 2013 at 08:05:59PM -0600, David Ahern wrote:
>> When recording raw_syscalls for the entire system, e.g.,
>>     perf record -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
>> 
>> you end up with a negative feedback loop as perf itself calls
>> write() fairly often. This patch mmap's the file in chunks of 64M
>> at a time and copies events from the event buffers to the file
>> avoiding write system calls.
>
> moved processing into userspace:
>
>     17.24%  -17.10%  libpthread-2.15.so  [.] __write_nocancel                    
>      ...
>      0.07%   +0.64%  libc-2.15.so        [.] __memcpy_sse2                               
>      0.02%  +51.84%  libc-2.15.so        [.] __memcpy_ssse3_back                         
>      0.01%   +0.34%  libc-2.15.so        [.] __mempcpy_sse2                              
>      ...
>> 
>> Before (with write syscall):
>> 
>> perf record -o /tmp/perf.data -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
>> [ perf record: Woken up 0 times to write data ]
>> [ perf record: Captured and wrote 81.843 MB /tmp/perf.data (~3575786 samples) ]
>> 
>> After (using mmap):
>> 
>> perf record -o /tmp/perf.data -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
>> [ perf record: Woken up 31 times to write data ]
>
>                           ^^^^^^^^
> but it's still faster, since we finally get perf a chance to sleep ;-)
>
> new time:
> 	real    0m30.392s
> 	user    0m0.041s
> 	sys     0m0.389s
>
> old time:
> 	real    0m32.235s
> 	user    0m3.080s
> 	sys     0m14.444s

But why the new user time took so short?  I guess it should take at
least 10 seconds or so.  Any ideas?

>
>
>> [ perf record: Captured and wrote 8.203 MB /tmp/perf.data (~358388 samples) ]
>> 
>> Before I get too far down this path I wanted to get comments on the approach.
>
> I think it's worthwhile doing this

Indeed!  It looks like a nice improvement.

Thanks,
Namhyung

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] perf record: mmap output file - RFC
  2013-09-26 23:17   ` David Ahern
@ 2013-09-27  6:29     ` Ingo Molnar
  0 siblings, 0 replies; 9+ messages in thread
From: Ingo Molnar @ 2013-09-27  6:29 UTC (permalink / raw)
  To: David Ahern
  Cc: Jiri Olsa, acme, linux-kernel, Frederic Weisbecker,
	Peter Zijlstra, Namhyung Kim, Mike Galbraith, Stephane Eranian


* David Ahern <dsahern@gmail.com> wrote:

> On 9/26/13 11:51 AM, Jiri Olsa wrote:
> >but it's still faster, since we finally get perf a chance to sleep ;-)
> >
> >new time:
> >	real    0m30.392s
> >	user    0m0.041s
> >	sys     0m0.389s
> >
> >old time:
> >	real    0m32.235s
> >	user    0m3.080s
> >	sys     0m14.444s
> >
> 
> Another data point on the performance improvement of perf itself.
> Using openssl speed as a workload and perf-stat to collect
> information about the perf-record process only:
> 
> perf stat -i -- perf record -g -o /tmp/perf.data openssl speed aes
> 
> With write():
>        158.606380 task-clock
>                 72 context-switches
>                 34 cpu-migrations
>              5,400 page-faults
>        336,054,007 cycles
>        137,804,036 stalled-cycles-frontend
>         74,505,914 stalled-cycles-backend
>        474,401,639 instructions
>         91,246,072 branches
>          1,968,289 branch-misses
> 
> With mmap():
>         50.314270 task-clock
>                 61 context-switches
>                  7 cpu-migrations
>              3,958 page-faults
>         93,585,618 cycles
>         64,878,225 stalled-cycles-frontend
>         41,680,427 stalled-cycles-backend
>         81,552,219 instructions
>         15,301,389 branches
>            387,230 branch-misses
> 
> So time, CPU cycles, instructions all drop by more than a factor of 3.

Impressive!

Btw., a perf stat bugreport: it's not clear at all that 'task-clock' is in 
units of milliseconds. Would be nice to print the unit as '(ms)' or so ...

For the other entries the unit is obvious (a count) - with the exception 
of the task-clock.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] perf record: mmap output file - RFC
  2013-09-23  2:05 [PATCH] perf record: mmap output file - RFC David Ahern
  2013-09-26 17:51 ` Jiri Olsa
@ 2013-10-09 13:33 ` Jiri Olsa
  2013-10-09 13:46   ` David Ahern
  1 sibling, 1 reply; 9+ messages in thread
From: Jiri Olsa @ 2013-10-09 13:33 UTC (permalink / raw)
  To: David Ahern
  Cc: acme, linux-kernel, Ingo Molnar, Frederic Weisbecker,
	Peter Zijlstra, Namhyung Kim, Mike Galbraith, Stephane Eranian

On Sun, Sep 22, 2013 at 08:05:59PM -0600, David Ahern wrote:

SNIP

> +
> +	if (size > remaining) {
> +		memcpy(rec->mmap_addr + rec->mmap_offset, buf, remaining);
> +		rec->bytes_written += remaining;
> +
> +		size -= remaining;
> +		buf  += remaining;
> +
> +		msync(rec->mmap_addr, rec->mmap_size, MS_ASYNC);
> +		munmap(rec->mmap_addr, rec->mmap_size);

do we need msync call if munmap is called right away?


> +		goto do_mmap;

SNIP

> +	if (rec->use_mmap) {
> +		off_t len = rec->bytes_at_mmap_start + rec->bytes_written;
> +
> +		rec->use_mmap = false;
> +		msync(rec->mmap_addr, rec->mmap_size, MS_ASYNC);
> +		munmap(rec->mmap_addr, rec->mmap_size);

ditto

thanks,
jirka

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] perf record: mmap output file - RFC
  2013-10-09 13:33 ` Jiri Olsa
@ 2013-10-09 13:46   ` David Ahern
  0 siblings, 0 replies; 9+ messages in thread
From: David Ahern @ 2013-10-09 13:46 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: acme, linux-kernel, Ingo Molnar, Frederic Weisbecker,
	Peter Zijlstra, Namhyung Kim, Mike Galbraith, Stephane Eranian

On 10/9/13 7:33 AM, Jiri Olsa wrote:
>> +
>> +	if (size > remaining) {
>> +		memcpy(rec->mmap_addr + rec->mmap_offset, buf, remaining);
>> +		rec->bytes_written += remaining;
>> +
>> +		size -= remaining;
>> +		buf  += remaining;
>> +
>> +		msync(rec->mmap_addr, rec->mmap_size, MS_ASYNC);
>> +		munmap(rec->mmap_addr, rec->mmap_size);
>
> do we need msync call if munmap is called right away?

I was being overly cautious. I will remove it in v2.

David

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2013-10-09 13:47 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-09-23  2:05 [PATCH] perf record: mmap output file - RFC David Ahern
2013-09-26 17:51 ` Jiri Olsa
2013-09-26 18:12   ` David Ahern
2013-09-26 18:23     ` Jiri Olsa
2013-09-26 23:17   ` David Ahern
2013-09-27  6:29     ` Ingo Molnar
2013-09-27  2:24   ` Namhyung Kim
2013-10-09 13:33 ` Jiri Olsa
2013-10-09 13:46   ` David Ahern

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).