All of lore.kernel.org
 help / color / mirror / Atom feed
From: David Ahern <dsahern@gmail.com>
To: acme@ghostprotocols.net, linux-kernel@vger.kernel.org
Cc: mingo@kernel.org, jolsa@redhat.com,
	David Ahern <dsahern@gmail.com>,
	Frederic Weisbecker <fweisbec@gmail.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Namhyung Kim <namhyung@kernel.org>,
	Mike Galbraith <efault@gmx.de>,
	Stephane Eranian <eranian@google.com>
Subject: [PATCH 4/5] perf record: mmap output file - v5
Date: Tue, 12 Nov 2013 07:46:56 -0700	[thread overview]
Message-ID: <1384267617-3446-5-git-send-email-dsahern@gmail.com> (raw)
In-Reply-To: <1384267617-3446-1-git-send-email-dsahern@gmail.com>

When recording raw_syscalls for the entire system, e.g.,
    perf record -e raw_syscalls:*,sched:sched_switch -a -- sleep 1

you end up with a self-reinforcing feedback loop, as perf itself calls write()
fairly often. This patch handles the problem by mmap'ing the file in chunks of
64M at a time and copying events from the event buffers to the file, avoiding
write system calls.

Before (with write syscall):

    perf record -o /tmp/perf.data -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
    [ perf record: Woken up 0 times to write data ]
    [ perf record: Captured and wrote 81.843 MB /tmp/perf.data (~3575786 samples) ]

After (using mmap):

    perf record -o /tmp/perf.data -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
    [ perf record: Woken up 31 times to write data ]
    [ perf record: Captured and wrote 8.203 MB /tmp/perf.data (~358388 samples) ]

In addition to the perf-trace benefits, using mmap lowers the overhead of
perf-record. For example,

  perf stat -i -- perf record -g -o /tmp/perf.data openssl speed aes

shows time, CPU cycles, and instructions all dropping by more than a
factor of 3. Jiri also ran a test that showed a big improvement.

v5: Addressed misc comments from Jiri, Adrian and Arnaldo. Added -O shortcut
    for --out-pages. Added -O 0 as a means to fall back to write

v4: Refactoring per Ingo's comments

v3: Removed use of bytes_at_mmap_start at the stat() that set it
    Added user option to control the size of the mmap for writing file.

v2: Removed msync call before munmap per Jiri's suggestion

Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: David Ahern <dsahern@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Stephane Eranian <eranian@google.com>
---
 tools/perf/Documentation/perf-record.txt |   7 ++
 tools/perf/builtin-record.c              | 164 +++++++++++++++++++++++++++++++
 tools/perf/util/evlist.c                 |  23 +++++
 tools/perf/util/evlist.h                 |   3 +
 4 files changed, 197 insertions(+)

diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 052f7c4dc00c..7c67dad9e341 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -201,6 +201,13 @@ abort events and some memory events in precise mode on modern Intel CPUs.
 --transaction::
 Record transaction flags for transaction related events.
 
+-O::
+--out-pages=::
+Number of pages to mmap for writing data to file, or a size specification
+with an appended unit character - B/K/M/G. The size is rounded up to the
+nearest power-of-two number of pages. 0 falls back to write() instead of
+mmap. Default size is 64M.
+
 SEE ALSO
 --------
 linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 880227eae20f..1a4fa5df215b 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -30,6 +30,9 @@
 #include <sched.h>
 #include <sys/mman.h>
 
+/* default size of each mmap'ed segment of the output file */
+#define MMAP_OUTPUT_SIZE   (64*1024*1024)
+
 #ifndef HAVE_ON_EXIT_SUPPORT
 #ifndef ATEXIT_MAX
 #define ATEXIT_MAX 32
@@ -65,6 +68,16 @@ static void __handle_on_exit_funcs(void)
 struct perf_record {
 	struct perf_tool	tool;
 	struct perf_record_opts	opts;
+
+	/* for MMAP based file writes */
+	struct {
+		void		*addr;
+		u64		offset;     /* current location within mmap */
+		unsigned int	out_pages;  /* user configurable option */
+		size_t		out_size;   /* size of mmap segments */
+		bool		use;
+	} mmap;
+
 	u64			bytes_written;
 	struct perf_data_file	file;
 	struct perf_evlist	*evlist;
@@ -76,6 +89,95 @@ struct perf_record {
 	long			samples;
 };
 
+/*
+ * Grow the output file by one segment and map that segment for writing.
+ *
+ * @rec:    record state; uses rec->file.fd and rec->mmap.out_size, and
+ *          stores the new mapping in rec->mmap.addr
+ * @offset: file offset at which the segment starts; it is handed to mmap()
+ *          unchanged, so it has to be a multiple of the page size
+ *
+ * Returns 0 on success. On failure returns -1 and leaves rec->mmap.addr
+ * set to NULL, so callers never see MAP_FAILED or a stale address.
+ */
+static int mmap_next_segment(struct perf_record *rec, off_t offset)
+{
+	struct perf_data_file *file = &rec->file;
+	void *addr;
+
+	/* whatever was mapped before is not valid for this segment */
+	rec->mmap.addr = NULL;
+
+	/* extend file to include a new mmap segment */
+	if (ftruncate(file->fd, offset + rec->mmap.out_size) != 0) {
+		pr_err("ftruncate failed\n");
+		return -1;
+	}
+
+	addr = mmap(NULL, rec->mmap.out_size, PROT_WRITE | PROT_READ,
+		    MAP_SHARED, file->fd, offset);
+	if (addr == MAP_FAILED) {
+		pr_err("mmap failed: %d: %s\n", errno, strerror(errno));
+
+		/* reset file size */
+		if (ftruncate(file->fd, offset) != 0)
+			pr_err("ftruncate failed too. Is it Halloween?\n");
+
+		return -1;
+	}
+
+	/* publish the mapping only once it is known to be good */
+	rec->mmap.addr = addr;
+	return 0;
+}
+
+/*
+ * Work out where the next output segment has to be mapped.
+ *
+ * The current end of data is header.data_offset + bytes_written. The
+ * segment containing that position starts at the position rounded down to
+ * a multiple of rec->mmap.out_size; rec->mmap.offset is set to the distance
+ * from that start, i.e. where the next byte goes inside the segment.
+ *
+ * Rounding down, rather than special-casing only the first segment, keeps
+ * the returned value a multiple of out_size even when the data already in
+ * the file is larger than one segment.
+ *
+ * Must only be called with a non-zero rec->mmap.out_size.
+ *
+ * Returns the file offset to hand to mmap_next_segment().
+ */
+static off_t next_mmap_offset(struct perf_record *rec)
+{
+	u64 pos = rec->session->header.data_offset + rec->bytes_written;
+
+	/* position of the next byte within its segment */
+	rec->mmap.offset = pos % rec->mmap.out_size;
+
+	/* returning offset within file - used for mmap of next segment */
+	return pos - rec->mmap.offset;
+}
+
+/*
+ * Copy @size bytes from @buf into the mmap'ed output file.
+ *
+ * Data is copied into the current segment. Whenever the segment cannot
+ * hold what is left, it is filled to the end, unmapped, and the next
+ * segment is mapped before continuing. rec->bytes_written is advanced by
+ * every byte copied; rec->mmap.offset tracks the fill level of the current
+ * segment.
+ *
+ * rec->mmap.addr is reset to NULL as soon as a segment is unmapped or a
+ * new one cannot be mapped, so it never refers to a dead mapping.
+ *
+ * Returns 0 on success, -1 if a new segment could not be set up. In the
+ * failure case the bytes copied so far remain accounted in bytes_written.
+ */
+static int do_mmap_output(struct perf_record *rec, void *buf, size_t size)
+{
+	char *src = buf;
+	u64 room;
+
+	for (;;) {
+		if (rec->mmap.addr == NULL) {
+			off_t offset = next_mmap_offset(rec);
+
+			if (mmap_next_segment(rec, offset) != 0) {
+				rec->mmap.addr = NULL;
+				return -1;
+			}
+		}
+
+		/* amount of space in current mmap segment */
+		room = rec->mmap.out_size - rec->mmap.offset;
+		if (size <= room)
+			break;
+
+		/* fill what is left of this segment, then move on */
+		memcpy((char *) rec->mmap.addr + rec->mmap.offset, src, room);
+		rec->bytes_written += room;
+
+		size -= room;
+		src  += room;
+
+		munmap(rec->mmap.addr, rec->mmap.out_size);
+		rec->mmap.addr = NULL;
+	}
+
+	/* the remainder fits in the current segment */
+	if (size) {
+		memcpy((char *) rec->mmap.addr + rec->mmap.offset, src, size);
+		rec->bytes_written += size;
+		rec->mmap.offset += size;
+	}
+
+	return 0;
+}
+
 static int do_write_output(struct perf_record *rec, void *buf, size_t size)
 {
 	struct perf_data_file *file = &rec->file;
@@ -99,6 +201,9 @@ static int do_write_output(struct perf_record *rec, void *buf, size_t size)
 
 static int write_output(struct perf_record *rec, void *buf, size_t size)
 {
+	if (rec->mmap.use)	/* copy into the mmap'ed output file instead */
+		return do_mmap_output(rec, buf, size);
+
 	return do_write_output(rec, buf, size);
 }
 
@@ -361,6 +466,52 @@ static void perf_record__init_features(struct perf_record *rec)
 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
 }
 
+/*
+ * Stop using mmap for output and trim the file to the data written.
+ *
+ * Segments are added by extending the file a whole segment at a time, so
+ * the file can be longer than header.data_offset + bytes_written. This
+ * drops the current mapping (if any), truncates the file to that length
+ * and moves the file position to end of file.
+ *
+ * Returns 0 on success or when mmap output was not in use, -1 if the
+ * truncate or the seek fails.
+ */
+static int mmap_output_fini(struct perf_record *rec)
+{
+	off_t len;
+	int fd;
+
+	if (!rec->mmap.use)
+		return 0;
+
+	/* from here on write_output() takes the do_write_output() path */
+	rec->mmap.use = false;
+
+	len = rec->session->header.data_offset + rec->bytes_written;
+	fd = rec->file.fd;
+
+	/* nothing to unmap if no segment was ever (successfully) mapped */
+	if (rec->mmap.addr != NULL && rec->mmap.addr != MAP_FAILED)
+		munmap(rec->mmap.addr, rec->mmap.out_size);
+	rec->mmap.addr = NULL;
+
+	if (ftruncate(fd, len) != 0) {
+		pr_err("ftruncate failed\n");
+		return -1;
+	}
+
+	/*
+	 * Set output pointer to end of file
+	 * eg., needed for buildid processing
+	 */
+	if (lseek(fd, 0, SEEK_END) == (off_t) -1) {
+		pr_err("lseek failed\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Decide whether event data is written through an mmap of the output file.
+ *
+ * Pipe output is never mmap'ed. Otherwise the segment size is computed from
+ * rec->mmap.out_pages, and mmap output is switched on when that size is
+ * non-zero.
+ *
+ * NOTE(review): out_size is recomputed unconditionally, so any value preset
+ * on the struct is discarded and out_pages == 0 always ends up meaning
+ * "use write()". Confirm that out_pages receives a non-zero default
+ * somewhere if mmap output is meant to be active when -O is not given.
+ */
+static void mmap_output_init(struct perf_record *rec)
+{
+	struct perf_data_file *file = &rec->file;
+
+	if (file->is_pipe)
+		return;
+
+	/* widen before multiplying so a large page count cannot wrap */
+	rec->mmap.out_size = (size_t) rec->mmap.out_pages * page_size;
+
+	if (rec->mmap.out_size)
+		rec->mmap.use = true;
+}
+
 static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
 {
 	int err;
@@ -434,6 +585,8 @@ static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
 		goto out_delete_session;
 	}
 
+	mmap_output_init(rec);
+
 	machine = &session->machines.host;
 
 	if (file->is_pipe) {
@@ -541,6 +694,11 @@ static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
 		}
 	}
 
+	if (mmap_output_fini(rec) != 0) {
+		err = -1;
+		goto out_delete_session;
+	}
+
 	if (quiet || signr == SIGUSR1)
 		return 0;
 
@@ -802,6 +960,9 @@ static struct perf_record record = {
 			.uses_mmap   = true,
 		},
 	},
+	.mmap = {
+		.out_size = MMAP_OUTPUT_SIZE,
+	},
 };
 
 #define CALLCHAIN_HELP "setup and enables call-graph (stack chain/backtrace) recording: "
@@ -888,6 +1049,9 @@ const struct option record_options[] = {
 		    "sample by weight (on special events only)"),
 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
 		    "sample transaction flags (special events only)"),
+	OPT_CALLBACK('O', "out-pages", &record.mmap.out_pages, "pages",
+		     "Number of pages or size with units to use for output (default 64M)",
+		     perf_evlist__parse_out_pages),
 	OPT_END()
 };
 
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index cb19044601bb..3d1f7faa30d7 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -767,6 +767,29 @@ int perf_evlist__parse_mmap_pages(const struct option *opt, const char *str,
 	return 0;
 }
 
+/*
+ * Option callback for -O/--out-pages.
+ *
+ * @opt:   option descriptor; opt->value points at the unsigned int that
+ *         receives the page count
+ * @str:   user supplied argument, parsed by parse_pages_arg()
+ * @unset: unused
+ *
+ * The largest accepted value is UINT_MAX pages, lowered further when that
+ * many pages would not fit in a size_t byte count. The limit is only ever
+ * clamped down, so an accepted value always fits the unsigned int it is
+ * stored in. 0 is accepted; the debug message below says it selects
+ * write() instead of mmap.
+ *
+ * Returns 0 on success, -1 if the argument is rejected.
+ */
+int perf_evlist__parse_out_pages(const struct option *opt, const char *str,
+				  int unset __maybe_unused)
+{
+	unsigned int *out_pages = opt->value;
+	unsigned long max = UINT_MAX;
+	long pages;
+
+	/* clamp down, never up: the result is stored in an unsigned int */
+	if (max > SIZE_MAX / page_size)
+		max = SIZE_MAX / page_size;
+
+	pages = parse_pages_arg(str, 0, max);
+	if (pages < 0) {
+		pr_err("Invalid argument for --out-pages/-O\n");
+		return -1;
+	}
+
+	if (pages == 0)
+		pr_debug("Reverting to write instead of mmap for output file\n");
+
+	*out_pages = pages;
+	return 0;
+}
+
 /**
  * perf_evlist__mmap - Create mmaps to receive events.
  * @evlist: list of events
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index ecaa582f40e2..749488147276 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -107,6 +107,9 @@ int perf_evlist__prepare_workload(struct perf_evlist *evlist,
 				  bool want_signal);
 int perf_evlist__start_workload(struct perf_evlist *evlist);
 
+int perf_evlist__parse_out_pages(const struct option *opt,
+				  const char *str, int unset);
+
 int perf_evlist__parse_mmap_pages(const struct option *opt,
 				  const char *str,
 				  int unset);
-- 
1.8.3.4 (Apple Git-47)


  parent reply	other threads:[~2013-11-12 14:48 UTC|newest]

Thread overview: 42+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-11-12 14:46 [PATCH 0/5] perf record: mmap output file - v5 David Ahern
2013-11-12 14:46 ` [PATCH 1/5] perf record: Fix segfault with --no-mmap-pages David Ahern
2013-11-12 21:57   ` [tip:perf/urgent] " tip-bot for David Ahern
2013-11-12 14:46 ` [PATCH 2/5] perf tool: Round mmap pages to power 2 - v2 David Ahern
2013-11-12 21:57   ` [tip:perf/urgent] perf evlist: " tip-bot for David Ahern
2013-11-12 14:46 ` [PATCH 3/5] perf tool: Refactor mmap_pages parsing David Ahern
2013-11-12 21:57   ` [tip:perf/urgent] perf evlist: " tip-bot for David Ahern
2013-11-12 14:46 ` David Ahern [this message]
2013-11-12 14:57   ` [PATCH 4/5] perf record: mmap output file - v5 Peter Zijlstra
2013-11-12 15:07     ` Arnaldo Carvalho de Melo
2013-11-12 15:19       ` Peter Zijlstra
2013-11-12 15:36         ` David Ahern
2013-11-12 21:11           ` Ingo Molnar
2013-11-13 11:34             ` Peter Zijlstra
2013-11-13 11:50               ` Ingo Molnar
2013-11-13 12:16                 ` Peter Zijlstra
2013-11-13 14:29                 ` David Ahern
2013-11-15 16:41               ` David Ahern
2013-11-18  9:01                 ` Peter Zijlstra
2013-11-18  9:40                   ` Ingo Molnar
2013-11-19  0:24                     ` Namhyung Kim
2013-11-19  0:34                       ` David Ahern
2013-11-19  1:48                         ` Namhyung Kim
2013-11-19  2:02                         ` Namhyung Kim
2013-11-19  2:13                         ` Namhyung Kim
2013-11-19  2:17                           ` David Ahern
2013-11-19  2:30                             ` Namhyung Kim
2013-11-19  2:33                               ` David Ahern
2013-11-19  2:36                                 ` Namhyung Kim
2013-11-19  6:58                                 ` Ingo Molnar
2013-11-19 11:48                                   ` Peter Zijlstra
2013-11-19 11:49                                     ` Peter Zijlstra
2013-11-19 13:13                                       ` Ingo Molnar
2013-11-19 13:45                                         ` Peter Zijlstra
2013-11-19 15:31                                           ` Ingo Molnar
2013-11-19 16:09                                             ` David Ahern
2013-11-19 16:14                                               ` Ingo Molnar
2013-11-19 12:08                         ` Peter Zijlstra
2013-11-19  6:54                       ` Ingo Molnar
2013-11-12 14:46 ` [PATCH 5/5] perf record: Handle out of space failures writing data with mmap David Ahern
2013-11-12 21:19   ` Ingo Molnar
2013-11-13 14:33     ` David Ahern

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1384267617-3446-5-git-send-email-dsahern@gmail.com \
    --to=dsahern@gmail.com \
    --cc=acme@ghostprotocols.net \
    --cc=efault@gmx.de \
    --cc=eranian@google.com \
    --cc=fweisbec@gmail.com \
    --cc=jolsa@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@kernel.org \
    --cc=namhyung@kernel.org \
    --cc=peterz@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.