public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [RFC/PATCH] perf sched: Implement 'perf sched runtime' command
@ 2015-08-25  6:11 Namhyung Kim
  2015-08-25  6:19 ` David Ahern
  0 siblings, 1 reply; 4+ messages in thread
From: Namhyung Kim @ 2015-08-25  6:11 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Ingo Molnar, Peter Zijlstra, Jiri Olsa, LKML, David Ahern

Currently the 'perf sched record' command records various task stat info but
other perf sched commands don't use it.  This is a very simple
implementation of showing that info to users.

  $ sudo perf sched record -a sleep 1
  [ perf record: Woken up 1 times to write data ]
  [ perf record: Captured and wrote 0.346 MB perf.data (774 samples) ]

  $ sudo perf sched runtime | head
    TID:COMM                  Run-time    Sleep-time     Wait-time   IOWait-time  Switch  Migrate
  ----------------------  ------------  ------------  ------------  ------------  ------  -------
   1347:Socket Thread         20901793    4895322293             0             0       2        2
  14855:sshd                   7404420     660792968             0             0       4        0
   1342:firefox                3657944     998405339         82358             0      11        0
   1360:Timer                  1429014    1002239227             0             0      11        0
    170:systemd-journal        1209607     644878917             0             0       1        0
      7:rcu_preempt            1081471     958866004        416514             0      26        6
  14856:sshd                    973632     506809918             0             0       6        0
  14859:sleep                   887558    1030418783          4055             0       3        1

Cc: David Ahern <dsahern@gmail.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/builtin-sched.c | 361 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 359 insertions(+), 2 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 33962612a5e9..112596121242 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -98,6 +98,18 @@ struct work_atoms {
 	int			num_merged;
 };
 
+struct task_runtime {
+	struct list_head	list;
+	struct rb_node		node;
+	struct thread		*thread;
+	u64			run_time;
+	u64			sleep_time;
+	u64			wait_time;
+	u64			iowait_time;
+	int			num_switch;
+	int			num_migrate;
+};
+
 typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);
 
 struct perf_sched;
@@ -109,6 +121,15 @@ struct trace_sched_handler {
 	int (*runtime_event)(struct perf_sched *sched, struct perf_evsel *evsel,
 			     struct perf_sample *sample, struct machine *machine);
 
+	int (*sleep_event)(struct perf_sched *sched, struct perf_evsel *evsel,
+			   struct perf_sample *sample, struct machine *machine);
+
+	int (*wait_event)(struct perf_sched *sched, struct perf_evsel *evsel,
+			  struct perf_sample *sample, struct machine *machine);
+
+	int (*iowait_event)(struct perf_sched *sched, struct perf_evsel *evsel,
+			    struct perf_sample *sample, struct machine *machine);
+
 	int (*wakeup_event)(struct perf_sched *sched, struct perf_evsel *evsel,
 			    struct perf_sample *sample, struct machine *machine);
 
@@ -170,7 +191,7 @@ struct perf_sched {
 	u64		 all_count;
 	u64		 cpu_last_switched[MAX_CPUS];
 	struct rb_root	 atom_root, sorted_atom_root, merged_atom_root;
-	struct list_head sort_list, cmp_pid;
+	struct list_head sort_list, cmp_pid, runtime_list;
 	bool force;
 	bool skip_merge;
 };
@@ -1424,6 +1445,120 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
 	return 0;
 }
 
+static struct task_runtime *get_task_runtime(struct perf_sched *sched,
+					     struct machine *machine, int pid)
+{
+	struct thread *th;
+	struct task_runtime *tr;
+
+	th = machine__findnew_thread(machine, -1, pid);
+	if (th == NULL)
+		return NULL;
+
+	tr = thread__priv(th);
+	if (tr == NULL) {
+		tr = zalloc(sizeof(*tr));
+		if (tr == NULL)
+			return NULL;
+
+		thread__set_priv(th, tr);
+
+		/* hold a refcnt not to free the thread */
+		tr->thread = thread__get(th);
+
+		list_add(&tr->list, &sched->runtime_list);
+	}
+	return tr;
+}
+
+static int runtime_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
+				struct perf_sample *sample, struct machine *machine)
+{
+	const u32 pid = perf_evsel__intval(evsel, sample, "prev_pid");
+	struct task_runtime *tr;
+
+	tr = get_task_runtime(sched, machine, pid);
+	if (tr == NULL)
+		return -1;
+
+	tr->num_switch++;
+	return 0;
+}
+
+static int runtime_runtime_event(struct perf_sched *sched, struct perf_evsel *evsel,
+				 struct perf_sample *sample, struct machine *machine)
+{
+	const u32 pid = perf_evsel__intval(evsel, sample, "pid");
+	const u64 time = perf_evsel__intval(evsel, sample, "runtime");
+	struct task_runtime *tr;
+
+	tr = get_task_runtime(sched, machine, pid);
+	if (tr == NULL)
+		return -1;
+
+	tr->run_time += time;
+	return 0;
+}
+
+static int runtime_sleep_event(struct perf_sched *sched, struct perf_evsel *evsel,
+			       struct perf_sample *sample, struct machine *machine)
+{
+	const u32 pid = perf_evsel__intval(evsel, sample, "pid");
+	const u64 time = perf_evsel__intval(evsel, sample, "delay");
+	struct task_runtime *tr;
+
+	tr = get_task_runtime(sched, machine, pid);
+	if (tr == NULL)
+		return -1;
+
+	tr->sleep_time += time;
+	return 0;
+}
+
+static int runtime_wait_event(struct perf_sched *sched, struct perf_evsel *evsel,
+			      struct perf_sample *sample, struct machine *machine)
+{
+	const u32 pid = perf_evsel__intval(evsel, sample, "pid");
+	const u64 time = perf_evsel__intval(evsel, sample, "delay");
+	struct task_runtime *tr;
+
+	tr = get_task_runtime(sched, machine, pid);
+	if (tr == NULL)
+		return -1;
+
+	tr->wait_time += time;
+	return 0;
+}
+
+static int runtime_iowait_event(struct perf_sched *sched, struct perf_evsel *evsel,
+				struct perf_sample *sample, struct machine *machine)
+{
+	const u32 pid = perf_evsel__intval(evsel, sample, "pid");
+	const u64 time = perf_evsel__intval(evsel, sample, "delay");
+	struct task_runtime *tr;
+
+	tr = get_task_runtime(sched, machine, pid);
+	if (tr == NULL)
+		return -1;
+
+	tr->iowait_time += time;
+	return 0;
+}
+
+static int runtime_migrate_event(struct perf_sched *sched, struct perf_evsel *evsel,
+				 struct perf_sample *sample, struct machine *machine)
+{
+	const u32 pid = perf_evsel__intval(evsel, sample, "pid");
+	struct task_runtime *tr;
+
+	tr = get_task_runtime(sched, machine, pid);
+	if (tr == NULL)
+		return -1;
+
+	tr->num_migrate++;
+	return 0;
+}
+
 static int process_sched_switch_event(struct perf_tool *tool,
 				      struct perf_evsel *evsel,
 				      struct perf_sample *sample,
@@ -1463,6 +1598,45 @@ static int process_sched_runtime_event(struct perf_tool *tool,
 	return 0;
 }
 
+static int process_sched_sleep_event(struct perf_tool *tool,
+				     struct perf_evsel *evsel,
+				     struct perf_sample *sample,
+				     struct machine *machine)
+{
+	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
+
+	if (sched->tp_handler->sleep_event)
+		return sched->tp_handler->sleep_event(sched, evsel, sample, machine);
+
+	return 0;
+}
+
+static int process_sched_wait_event(struct perf_tool *tool,
+				    struct perf_evsel *evsel,
+				    struct perf_sample *sample,
+				    struct machine *machine)
+{
+	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
+
+	if (sched->tp_handler->wait_event)
+		return sched->tp_handler->wait_event(sched, evsel, sample, machine);
+
+	return 0;
+}
+
+static int process_sched_iowait_event(struct perf_tool *tool,
+				      struct perf_evsel *evsel,
+				      struct perf_sample *sample,
+				      struct machine *machine)
+{
+	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
+
+	if (sched->tp_handler->iowait_event)
+		return sched->tp_handler->iowait_event(sched, evsel, sample, machine);
+
+	return 0;
+}
+
 static int perf_sched__process_fork_event(struct perf_tool *tool,
 					  union perf_event *event,
 					  struct perf_sample *sample,
@@ -1560,6 +1734,143 @@ out_delete:
 	return rc;
 }
 
+static int perf_sched__read_runtime_events(struct perf_sched *sched)
+{
+	const struct perf_evsel_str_handler handlers[] = {
+		{ "sched:sched_switch",	      process_sched_switch_event, },
+		{ "sched:sched_stat_runtime", process_sched_runtime_event, },
+		{ "sched:sched_stat_sleep",   process_sched_sleep_event, },
+		{ "sched:sched_stat_wait",    process_sched_wait_event, },
+		{ "sched:sched_stat_iowait",  process_sched_iowait_event, },
+		{ "sched:sched_migrate_task", process_sched_migrate_task_event, },
+	};
+	struct perf_session *session;
+	struct perf_data_file file = {
+		.path = input_name,
+		.mode = PERF_DATA_MODE_READ,
+		.force = sched->force,
+	};
+	int rc = -1;
+
+	session = perf_session__new(&file, false, &sched->tool);
+	if (session == NULL) {
+		pr_debug("No Memory for session\n");
+		return -1;
+	}
+
+	symbol__init(&session->header.env);
+
+	if (perf_session__set_tracepoints_handlers(session, handlers))
+		goto out_delete;
+
+	if (perf_session__has_traces(session, "record -R")) {
+		int err = perf_session__process_events(session);
+		if (err) {
+			pr_err("Failed to process events, error %d", err);
+			goto out_delete;
+		}
+
+		sched->nr_events      = session->evlist->stats.nr_events[0];
+		sched->nr_lost_events = session->evlist->stats.total_lost;
+		sched->nr_lost_chunks = session->evlist->stats.nr_events[PERF_RECORD_LOST];
+	}
+
+	rc = 0;
+out_delete:
+	perf_session__delete(session);
+	return rc;
+}
+
+static int task_runtime_cmp(struct perf_sched *sched __maybe_unused,
+			    struct task_runtime *a, struct task_runtime *b)
+{
+	return b->run_time - a->run_time;
+}
+
+static void sort_task_runtime(struct perf_sched *sched, struct task_runtime *tr)
+{
+	struct rb_root *root = &sched->sorted_atom_root;
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*p) {
+		struct task_runtime *iter;
+		int cmp;
+
+		parent = *p;
+		iter = rb_entry(parent, struct task_runtime, node);
+
+		cmp = task_runtime_cmp(sched, iter, tr);
+
+		if (cmp > 0)
+			p = &parent->rb_left;
+		else
+			p = &parent->rb_right;
+	}
+
+	rb_link_node(&tr->node, parent, p);
+	rb_insert_color(&tr->node, root);
+}
+
+static void perf_sched__sort_runtime(struct perf_sched *sched)
+{
+	while (!list_empty(&sched->runtime_list)) {
+		struct task_runtime *tr;
+
+		tr = list_first_entry(&sched->runtime_list,
+				      struct task_runtime, list);
+		list_del(&tr->list);
+
+		sort_task_runtime(sched, tr);
+	}
+}
+
+static void print_task_runtime(struct task_runtime *tr)
+{
+	struct thread *th = tr->thread;
+
+	/*      tid:comm   run_time     sleep_time   wait_time    iowait_time  switch  migrate*/
+	printf("%5d:%-16s  %12"PRIu64"  %12"PRIu64"  %12"PRIu64"  %12"PRIu64"  %6d  %7d\n",
+	       th->tid, thread__comm_str(th), tr->run_time, tr->sleep_time,
+	       tr->wait_time, tr->iowait_time, tr->num_switch, tr->num_migrate);
+}
+
+static void perf_sched__print_runtime(struct perf_sched *sched)
+{
+	struct task_runtime *tr;
+	struct rb_root *root = &sched->sorted_atom_root;
+	struct rb_node *node = rb_first(root);
+
+	printf("%5s:%-16s  %12s  %12s  %12s  %12s  %6s  %7s\n",
+	       "TID", "COMM", "Run-time", "Sleep-time", "Wait-time",
+	       "IOWait-time", "Switch", "Migrate");
+	printf("%.22s  %.12s  %.12s  %.12s  %.12s  %.6s  %.7s\n",
+	       graph_dotted_line, graph_dotted_line, graph_dotted_line,
+	       graph_dotted_line, graph_dotted_line, graph_dotted_line,
+	       graph_dotted_line);
+
+	while (node) {
+		tr = rb_entry(node, struct task_runtime, node);
+		print_task_runtime(tr);
+
+		node = rb_next(node);
+	}
+
+	/* release thread refcnt */
+	node = rb_first(root);
+	while (node) {
+		struct rb_node *next = rb_next(node);
+		rb_erase(node, root);
+
+		tr = rb_entry(node, struct task_runtime, node);
+		thread__set_priv(tr->thread, NULL);
+		thread__put(tr->thread);
+		free(tr);
+
+		node = next;
+	}
+}
+
 static void print_bad_events(struct perf_sched *sched)
 {
 	if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
@@ -1720,6 +2031,23 @@ static int perf_sched__replay(struct perf_sched *sched)
 	return 0;
 }
 
+static int perf_sched__runtime(struct perf_sched *sched)
+{
+	setup_pager();
+
+	INIT_LIST_HEAD(&sched->runtime_list);
+	if (perf_sched__read_runtime_events(sched))
+		return -1;
+
+	perf_sched__sort_runtime(sched);
+	perf_sched__print_runtime(sched);
+
+	print_bad_events(sched);
+	printf("\n");
+
+	return 0;
+}
+
 static void setup_sorting(struct perf_sched *sched, const struct option *options,
 			  const char * const usage_msg[])
 {
@@ -1821,6 +2149,14 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
 	OPT_BOOLEAN('f', "force", &sched.force, "don't complain, do it"),
 	OPT_END()
 	};
+	const struct option runtime_options[] = {
+	OPT_INCR('v', "verbose", &verbose,
+		    "be more verbose (show symbol address, etc)"),
+	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+		    "dump raw trace in ASCII"),
+	OPT_BOOLEAN('f', "force", &sched.force, "don't complain, do it"),
+	OPT_END()
+	};
 	const struct option sched_options[] = {
 	OPT_STRING('i', "input", &input_name, "file",
 		    "input file name"),
@@ -1838,8 +2174,13 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
 		"perf sched replay [<options>]",
 		NULL
 	};
+	const char * const runtime_usage[] = {
+		"perf sched runtime [<options>]",
+		NULL
+	};
 	const char *const sched_subcommands[] = { "record", "latency", "map",
-						  "replay", "script", NULL };
+						  "replay", "script", "runtime",
+						  NULL };
 	const char *sched_usage[] = {
 		NULL,
 		NULL
@@ -1858,6 +2199,14 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
 		.switch_event	    = replay_switch_event,
 		.fork_event	    = replay_fork_event,
 	};
+	struct trace_sched_handler runtime_ops  = {
+		.switch_event	    = runtime_switch_event,
+		.runtime_event	    = runtime_runtime_event,
+		.sleep_event	    = runtime_sleep_event,
+		.wait_event	    = runtime_wait_event,
+		.iowait_event	    = runtime_iowait_event,
+		.migrate_task_event = runtime_migrate_event,
+	};
 	unsigned int i;
 
 	for (i = 0; i < ARRAY_SIZE(sched.curr_pid); i++)
@@ -1897,6 +2246,14 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
 				usage_with_options(replay_usage, replay_options);
 		}
 		return perf_sched__replay(&sched);
+	} else if (!strncmp(argv[0], "run", 3)) {
+		sched.tp_handler = &runtime_ops;
+		if (argc) {
+			argc = parse_options(argc, argv, runtime_options, runtime_usage, 0);
+			if (argc)
+				usage_with_options(runtime_usage, runtime_options);
+		}
+		return perf_sched__runtime(&sched);
 	} else {
 		usage_with_options(sched_usage, sched_options);
 	}
-- 
2.5.0


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [RFC/PATCH] perf sched: Implement 'perf sched runtime' command
  2015-08-25  6:11 [RFC/PATCH] perf sched: Implement 'perf sched runtime' command Namhyung Kim
@ 2015-08-25  6:19 ` David Ahern
  2015-08-25  8:07   ` Namhyung Kim
  0 siblings, 1 reply; 4+ messages in thread
From: David Ahern @ 2015-08-25  6:19 UTC (permalink / raw)
  To: Namhyung Kim, Arnaldo Carvalho de Melo
  Cc: Ingo Molnar, Peter Zijlstra, Jiri Olsa, LKML

On 8/24/15 11:11 PM, Namhyung Kim wrote:
>
> +static int perf_sched__read_runtime_events(struct perf_sched *sched)
> +{
> +	const struct perf_evsel_str_handler handlers[] = {
> +		{ "sched:sched_switch",	      process_sched_switch_event, },
> +		{ "sched:sched_stat_runtime", process_sched_runtime_event, },
> +		{ "sched:sched_stat_sleep",   process_sched_sleep_event, },
> +		{ "sched:sched_stat_wait",    process_sched_wait_event, },
> +		{ "sched:sched_stat_iowait",  process_sched_iowait_event, },
> +		{ "sched:sched_migrate_task", process_sched_migrate_task_event, },
> +	};

I like the concept (obviously since I have invested so much time in 
other solutions). But... you cannot rely on the sched_stat tracepoints 
being enabled. I believe it is a performance hit to have 
CONFIG_SCHEDSTATS enabled and distributions (e.g., Fedora) do not have 
it enabled.

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [RFC/PATCH] perf sched: Implement 'perf sched runtime' command
  2015-08-25  6:19 ` David Ahern
@ 2015-08-25  8:07   ` Namhyung Kim
  2015-08-25 17:09     ` David Ahern
  0 siblings, 1 reply; 4+ messages in thread
From: Namhyung Kim @ 2015-08-25  8:07 UTC (permalink / raw)
  To: David Ahern
  Cc: Arnaldo Carvalho de Melo, Ingo Molnar, Peter Zijlstra, Jiri Olsa,
	LKML

Hi David,

On Mon, Aug 24, 2015 at 11:19:44PM -0700, David Ahern wrote:
> On 8/24/15 11:11 PM, Namhyung Kim wrote:
> >
> >+static int perf_sched__read_runtime_events(struct perf_sched *sched)
> >+{
> >+	const struct perf_evsel_str_handler handlers[] = {
> >+		{ "sched:sched_switch",	      process_sched_switch_event, },
> >+		{ "sched:sched_stat_runtime", process_sched_runtime_event, },
> >+		{ "sched:sched_stat_sleep",   process_sched_sleep_event, },
> >+		{ "sched:sched_stat_wait",    process_sched_wait_event, },
> >+		{ "sched:sched_stat_iowait",  process_sched_iowait_event, },
> >+		{ "sched:sched_migrate_task", process_sched_migrate_task_event, },
> >+	};
> 
> I like the concept (obviously since I have invested so much time in other
> solutions). But... you cannot rely on the sched_stat tracepoints being
> enabled. I believe it is a performance hit to have CONFIG_SCHEDSTATS enabled
> and distributions (e.g., Fedora) do not have it enabled.

Yes, I know that some distributions don't enable SCHEDSTATS.  But as
perf sched record already requires it, I think it's ok to use those
events in other sched commands.

Maybe we could fix 'perf sched record' not to require SCHEDSTATS.
All commands in perf sched just use stat_runtime event currently.  And
I guess we can calculate/approximate the stat times using sched_switch
events (and wakeup events) only.  But it also requires changing other
sched commands not to use sched_stat_runtime.

Thanks,
Namhyung

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [RFC/PATCH] perf sched: Implement 'perf sched runtime' command
  2015-08-25  8:07   ` Namhyung Kim
@ 2015-08-25 17:09     ` David Ahern
  0 siblings, 0 replies; 4+ messages in thread
From: David Ahern @ 2015-08-25 17:09 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Arnaldo Carvalho de Melo, Ingo Molnar, Peter Zijlstra, Jiri Olsa,
	LKML

On 8/25/15 1:07 AM, Namhyung Kim wrote:
> Hi David,
>
> On Mon, Aug 24, 2015 at 11:19:44PM -0700, David Ahern wrote:
>> On 8/24/15 11:11 PM, Namhyung Kim wrote:
>>>
>>> +static int perf_sched__read_runtime_events(struct perf_sched *sched)
>>> +{
>>> +	const struct perf_evsel_str_handler handlers[] = {
>>> +		{ "sched:sched_switch",	      process_sched_switch_event, },
>>> +		{ "sched:sched_stat_runtime", process_sched_runtime_event, },
>>> +		{ "sched:sched_stat_sleep",   process_sched_sleep_event, },
>>> +		{ "sched:sched_stat_wait",    process_sched_wait_event, },
>>> +		{ "sched:sched_stat_iowait",  process_sched_iowait_event, },
>>> +		{ "sched:sched_migrate_task", process_sched_migrate_task_event, },
>>> +	};
>>
>> I like the concept (obviously since I have invested so much time in other
>> solutions). But... you cannot rely on the sched_stat tracepoints being
>> enabled. I believe it is a performance hit to have CONFIG_SCHEDSTATS enabled
>> and distributions (e.g., Fedora) do not have it enabled.
>
> Yes, I know that some distributions don't enable SCHEDSTATS.  But as
> perf sched record already requires it, I think it's ok to use those
> events in other sched commands.
>
> Maybe we could fix 'perf sched record' not to requires SCHEDSTATS.

I was going to suggest that a couple of years ago. You can't rely on it 
in production environments, so my timehist command for example used 
other approximations. Then if it is enabled it balloons the data file.

> All commands in perf sched just use stat_runtime event currently.  And
> I guess we can calculate/approximate the stat times using sched_switch
> events (and wakeup events) only.  But it also requires changing other
> sched commands not to use sched_stat_runtime.

That's what I do in my 'perf sched timehist' command.

David


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2015-08-25 17:09 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-08-25  6:11 [RFC/PATCH] perf sched: Implement 'perf sched runtime' command Namhyung Kim
2015-08-25  6:19 ` David Ahern
2015-08-25  8:07   ` Namhyung Kim
2015-08-25 17:09     ` David Ahern

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox