* [RFC/PATCH] perf sched: Implement 'perf sched runtime' command
@ 2015-08-25 6:11 Namhyung Kim
2015-08-25 6:19 ` David Ahern
0 siblings, 1 reply; 4+ messages in thread
From: Namhyung Kim @ 2015-08-25 6:11 UTC (permalink / raw)
To: Arnaldo Carvalho de Melo
Cc: Ingo Molnar, Peter Zijlstra, Jiri Olsa, LKML, David Ahern
Currently 'perf sched record' command records various task stat info but
other perf sched commands don't use them. This is a very simple
implementation of showing those info to users.
$ sudo perf sched record -a sleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.346 MB perf.data (774 samples) ]
$ sudo perf sched runtime | head
TID:COMM Run-time Sleep-time Wait-time IOWait-time Switch Migrate
---------------------- ------------ ------------ ------------ ------------ ------ -------
1347:Socket Thread 20901793 4895322293 0 0 2 2
14855:sshd 7404420 660792968 0 0 4 0
1342:firefox 3657944 998405339 82358 0 11 0
1360:Timer 1429014 1002239227 0 0 11 0
170:systemd-journal 1209607 644878917 0 0 1 0
7:rcu_preempt 1081471 958866004 416514 0 26 6
14856:sshd 973632 506809918 0 0 6 0
14859:sleep 887558 1030418783 4055 0 3 1
Cc: David Ahern <dsahern@gmail.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
tools/perf/builtin-sched.c | 361 ++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 359 insertions(+), 2 deletions(-)
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 33962612a5e9..112596121242 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -98,6 +98,18 @@ struct work_atoms {
int num_merged;
};
+struct task_runtime {
+ struct list_head list;
+ struct rb_node node;
+ struct thread *thread;
+ u64 run_time;
+ u64 sleep_time;
+ u64 wait_time;
+ u64 iowait_time;
+ int num_switch;
+ int num_migrate;
+};
+
typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);
struct perf_sched;
@@ -109,6 +121,15 @@ struct trace_sched_handler {
int (*runtime_event)(struct perf_sched *sched, struct perf_evsel *evsel,
struct perf_sample *sample, struct machine *machine);
+ int (*sleep_event)(struct perf_sched *sched, struct perf_evsel *evsel,
+ struct perf_sample *sample, struct machine *machine);
+
+ int (*wait_event)(struct perf_sched *sched, struct perf_evsel *evsel,
+ struct perf_sample *sample, struct machine *machine);
+
+ int (*iowait_event)(struct perf_sched *sched, struct perf_evsel *evsel,
+ struct perf_sample *sample, struct machine *machine);
+
int (*wakeup_event)(struct perf_sched *sched, struct perf_evsel *evsel,
struct perf_sample *sample, struct machine *machine);
@@ -170,7 +191,7 @@ struct perf_sched {
u64 all_count;
u64 cpu_last_switched[MAX_CPUS];
struct rb_root atom_root, sorted_atom_root, merged_atom_root;
- struct list_head sort_list, cmp_pid;
+ struct list_head sort_list, cmp_pid, runtime_list;
bool force;
bool skip_merge;
};
@@ -1424,6 +1445,120 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
return 0;
}
+static struct task_runtime *get_task_runtime(struct perf_sched *sched,
+ struct machine *machine, int pid)
+{
+ struct thread *th;
+ struct task_runtime *tr;
+
+ th = machine__findnew_thread(machine, -1, pid);
+ if (th == NULL)
+ return NULL;
+
+ tr = thread__priv(th);
+ if (tr == NULL) {
+ tr = zalloc(sizeof(*tr));
+ if (tr == NULL)
+ return NULL;
+
+ thread__set_priv(th, tr);
+
+ /* hold a refcnt not to free the thread */
+ tr->thread = thread__get(th);
+
+ list_add(&tr->list, &sched->runtime_list);
+ }
+ return tr;
+}
+
+static int runtime_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
+ struct perf_sample *sample, struct machine *machine)
+{
+ const u32 pid = perf_evsel__intval(evsel, sample, "prev_pid");
+ struct task_runtime *tr;
+
+ tr = get_task_runtime(sched, machine, pid);
+ if (tr == NULL)
+ return -1;
+
+ tr->num_switch++;
+ return 0;
+}
+
+static int runtime_runtime_event(struct perf_sched *sched, struct perf_evsel *evsel,
+ struct perf_sample *sample, struct machine *machine)
+{
+ const u32 pid = perf_evsel__intval(evsel, sample, "pid");
+ const u64 time = perf_evsel__intval(evsel, sample, "runtime");
+ struct task_runtime *tr;
+
+ tr = get_task_runtime(sched, machine, pid);
+ if (tr == NULL)
+ return -1;
+
+ tr->run_time += time;
+ return 0;
+}
+
+static int runtime_sleep_event(struct perf_sched *sched, struct perf_evsel *evsel,
+ struct perf_sample *sample, struct machine *machine)
+{
+ const u32 pid = perf_evsel__intval(evsel, sample, "pid");
+ const u64 time = perf_evsel__intval(evsel, sample, "delay");
+ struct task_runtime *tr;
+
+ tr = get_task_runtime(sched, machine, pid);
+ if (tr == NULL)
+ return -1;
+
+ tr->sleep_time += time;
+ return 0;
+}
+
+static int runtime_wait_event(struct perf_sched *sched, struct perf_evsel *evsel,
+ struct perf_sample *sample, struct machine *machine)
+{
+ const u32 pid = perf_evsel__intval(evsel, sample, "pid");
+ const u64 time = perf_evsel__intval(evsel, sample, "delay");
+ struct task_runtime *tr;
+
+ tr = get_task_runtime(sched, machine, pid);
+ if (tr == NULL)
+ return -1;
+
+ tr->wait_time += time;
+ return 0;
+}
+
+static int runtime_iowait_event(struct perf_sched *sched, struct perf_evsel *evsel,
+ struct perf_sample *sample, struct machine *machine)
+{
+ const u32 pid = perf_evsel__intval(evsel, sample, "pid");
+ const u64 time = perf_evsel__intval(evsel, sample, "delay");
+ struct task_runtime *tr;
+
+ tr = get_task_runtime(sched, machine, pid);
+ if (tr == NULL)
+ return -1;
+
+ tr->iowait_time += time;
+ return 0;
+}
+
+static int runtime_migrate_event(struct perf_sched *sched, struct perf_evsel *evsel,
+ struct perf_sample *sample, struct machine *machine)
+{
+ const u32 pid = perf_evsel__intval(evsel, sample, "pid");
+ struct task_runtime *tr;
+
+ tr = get_task_runtime(sched, machine, pid);
+ if (tr == NULL)
+ return -1;
+
+ tr->num_migrate++;
+ return 0;
+}
+
static int process_sched_switch_event(struct perf_tool *tool,
struct perf_evsel *evsel,
struct perf_sample *sample,
@@ -1463,6 +1598,45 @@ static int process_sched_runtime_event(struct perf_tool *tool,
return 0;
}
+static int process_sched_sleep_event(struct perf_tool *tool,
+ struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ struct machine *machine)
+{
+ struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
+
+ if (sched->tp_handler->sleep_event)
+ return sched->tp_handler->sleep_event(sched, evsel, sample, machine);
+
+ return 0;
+}
+
+static int process_sched_wait_event(struct perf_tool *tool,
+ struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ struct machine *machine)
+{
+ struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
+
+ if (sched->tp_handler->wait_event)
+ return sched->tp_handler->wait_event(sched, evsel, sample, machine);
+
+ return 0;
+}
+
+static int process_sched_iowait_event(struct perf_tool *tool,
+ struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ struct machine *machine)
+{
+ struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
+
+ if (sched->tp_handler->iowait_event)
+ return sched->tp_handler->iowait_event(sched, evsel, sample, machine);
+
+ return 0;
+}
+
static int perf_sched__process_fork_event(struct perf_tool *tool,
union perf_event *event,
struct perf_sample *sample,
@@ -1560,6 +1734,143 @@ out_delete:
return rc;
}
+static int perf_sched__read_runtime_events(struct perf_sched *sched)
+{
+ const struct perf_evsel_str_handler handlers[] = {
+ { "sched:sched_switch", process_sched_switch_event, },
+ { "sched:sched_stat_runtime", process_sched_runtime_event, },
+ { "sched:sched_stat_sleep", process_sched_sleep_event, },
+ { "sched:sched_stat_wait", process_sched_wait_event, },
+ { "sched:sched_stat_iowait", process_sched_iowait_event, },
+ { "sched:sched_migrate_task", process_sched_migrate_task_event, },
+ };
+ struct perf_session *session;
+ struct perf_data_file file = {
+ .path = input_name,
+ .mode = PERF_DATA_MODE_READ,
+ .force = sched->force,
+ };
+ int rc = -1;
+
+ session = perf_session__new(&file, false, &sched->tool);
+ if (session == NULL) {
+ pr_debug("No Memory for session\n");
+ return -1;
+ }
+
+ symbol__init(&session->header.env);
+
+ if (perf_session__set_tracepoints_handlers(session, handlers))
+ goto out_delete;
+
+ if (perf_session__has_traces(session, "record -R")) {
+ int err = perf_session__process_events(session);
+ if (err) {
+ pr_err("Failed to process events, error %d", err);
+ goto out_delete;
+ }
+
+ sched->nr_events = session->evlist->stats.nr_events[0];
+ sched->nr_lost_events = session->evlist->stats.total_lost;
+ sched->nr_lost_chunks = session->evlist->stats.nr_events[PERF_RECORD_LOST];
+ }
+
+ rc = 0;
+out_delete:
+ perf_session__delete(session);
+ return rc;
+}
+
+static int task_runtime_cmp(struct perf_sched *sched __maybe_unused,
+ struct task_runtime *a, struct task_runtime *b)
+{
+ return b->run_time - a->run_time;
+}
+
+static void sort_task_runtime(struct perf_sched *sched, struct task_runtime *tr)
+{
+ struct rb_root *root = &sched->sorted_atom_root;
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+
+ while (*p) {
+ struct task_runtime *iter;
+ int cmp;
+
+ parent = *p;
+ iter = rb_entry(parent, struct task_runtime, node);
+
+ cmp = task_runtime_cmp(sched, iter, tr);
+
+ if (cmp > 0)
+ p = &parent->rb_left;
+ else
+ p = &parent->rb_right;
+ }
+
+ rb_link_node(&tr->node, parent, p);
+ rb_insert_color(&tr->node, root);
+}
+
+static void perf_sched__sort_runtime(struct perf_sched *sched)
+{
+ while (!list_empty(&sched->runtime_list)) {
+ struct task_runtime *tr;
+
+ tr = list_first_entry(&sched->runtime_list,
+ struct task_runtime, list);
+ list_del(&tr->list);
+
+ sort_task_runtime(sched, tr);
+ }
+}
+
+static void print_task_runtime(struct task_runtime *tr)
+{
+ struct thread *th = tr->thread;
+
+ /* tid:comm run_time sleep_time wait_time iowait_time switch migrate*/
+ printf("%5d:%-16s %12"PRIu64" %12"PRIu64" %12"PRIu64" %12"PRIu64" %6d %7d\n",
+ th->tid, thread__comm_str(th), tr->run_time, tr->sleep_time,
+ tr->wait_time, tr->iowait_time, tr->num_switch, tr->num_migrate);
+}
+
+static void perf_sched__print_runtime(struct perf_sched *sched)
+{
+ struct task_runtime *tr;
+ struct rb_root *root = &sched->sorted_atom_root;
+ struct rb_node *node = rb_first(root);
+
+ printf("%5s:%-16s %12s %12s %12s %12s %6s %7s\n",
+ "TID", "COMM", "Run-time", "Sleep-time", "Wait-time",
+ "IOWait-time", "Switch", "Migrate");
+ printf("%.22s %.12s %.12s %.12s %.12s %.6s %.7s\n",
+ graph_dotted_line, graph_dotted_line, graph_dotted_line,
+ graph_dotted_line, graph_dotted_line, graph_dotted_line,
+ graph_dotted_line);
+
+ while (node) {
+ tr = rb_entry(node, struct task_runtime, node);
+ print_task_runtime(tr);
+
+ node = rb_next(node);
+ }
+
+ /* release thread refcnt */
+ node = rb_first(root);
+ while (node) {
+ struct rb_node *next = rb_next(node);
+ rb_erase(node, root);
+
+ tr = rb_entry(node, struct task_runtime, node);
+ thread__set_priv(tr->thread, NULL);
+ thread__put(tr->thread);
+ free(tr);
+
+ node = next;
+ }
+}
+
static void print_bad_events(struct perf_sched *sched)
{
if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
@@ -1720,6 +2031,23 @@ static int perf_sched__replay(struct perf_sched *sched)
return 0;
}
+static int perf_sched__runtime(struct perf_sched *sched)
+{
+ setup_pager();
+
+ INIT_LIST_HEAD(&sched->runtime_list);
+ if (perf_sched__read_runtime_events(sched))
+ return -1;
+
+ perf_sched__sort_runtime(sched);
+ perf_sched__print_runtime(sched);
+
+ print_bad_events(sched);
+ printf("\n");
+
+ return 0;
+}
+
static void setup_sorting(struct perf_sched *sched, const struct option *options,
const char * const usage_msg[])
{
@@ -1821,6 +2149,14 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
OPT_BOOLEAN('f', "force", &sched.force, "don't complain, do it"),
OPT_END()
};
+ const struct option runtime_options[] = {
+ OPT_INCR('v', "verbose", &verbose,
+ "be more verbose (show symbol address, etc)"),
+ OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+ "dump raw trace in ASCII"),
+ OPT_BOOLEAN('f', "force", &sched.force, "don't complain, do it"),
+ OPT_END()
+ };
const struct option sched_options[] = {
OPT_STRING('i', "input", &input_name, "file",
"input file name"),
@@ -1838,8 +2174,13 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
"perf sched replay [<options>]",
NULL
};
+ const char * const runtime_usage[] = {
+ "perf sched runtime [<options>]",
+ NULL
+ };
const char *const sched_subcommands[] = { "record", "latency", "map",
- "replay", "script", NULL };
+ "replay", "script", "runtime",
+ NULL };
const char *sched_usage[] = {
NULL,
NULL
@@ -1858,6 +2199,14 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
.switch_event = replay_switch_event,
.fork_event = replay_fork_event,
};
+ struct trace_sched_handler runtime_ops = {
+ .switch_event = runtime_switch_event,
+ .runtime_event = runtime_runtime_event,
+ .sleep_event = runtime_sleep_event,
+ .wait_event = runtime_wait_event,
+ .iowait_event = runtime_iowait_event,
+ .migrate_task_event = runtime_migrate_event,
+ };
unsigned int i;
for (i = 0; i < ARRAY_SIZE(sched.curr_pid); i++)
@@ -1897,6 +2246,14 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
usage_with_options(replay_usage, replay_options);
}
return perf_sched__replay(&sched);
+ } else if (!strncmp(argv[0], "run", 3)) {
+ sched.tp_handler = &runtime_ops;
+ if (argc) {
+ argc = parse_options(argc, argv, runtime_options, runtime_usage, 0);
+ if (argc)
+ usage_with_options(runtime_usage, runtime_options);
+ }
+ return perf_sched__runtime(&sched);
} else {
usage_with_options(sched_usage, sched_options);
}
--
2.5.0
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [RFC/PATCH] perf sched: Implement 'perf sched runtime' command
2015-08-25 6:11 [RFC/PATCH] perf sched: Implement 'perf sched runtime' command Namhyung Kim
@ 2015-08-25 6:19 ` David Ahern
2015-08-25 8:07 ` Namhyung Kim
0 siblings, 1 reply; 4+ messages in thread
From: David Ahern @ 2015-08-25 6:19 UTC (permalink / raw)
To: Namhyung Kim, Arnaldo Carvalho de Melo
Cc: Ingo Molnar, Peter Zijlstra, Jiri Olsa, LKML
On 8/24/15 11:11 PM, Namhyung Kim wrote:
>
> +static int perf_sched__read_runtime_events(struct perf_sched *sched)
> +{
> + const struct perf_evsel_str_handler handlers[] = {
> + { "sched:sched_switch", process_sched_switch_event, },
> + { "sched:sched_stat_runtime", process_sched_runtime_event, },
> + { "sched:sched_stat_sleep", process_sched_sleep_event, },
> + { "sched:sched_stat_wait", process_sched_wait_event, },
> + { "sched:sched_stat_iowait", process_sched_iowait_event, },
> + { "sched:sched_migrate_task", process_sched_migrate_task_event, },
> + };
I like the concept (obviously since I have invested so much time in
other solutions). But... you cannot rely on the sched_stat tracepoints
being enabled. I believe it is a performance hit to have
CONFIG_SCHEDSTATS enabled and distributions (e.g., Fedora) do not have
it enabled.
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [RFC/PATCH] perf sched: Implement 'perf sched runtime' command
2015-08-25 6:19 ` David Ahern
@ 2015-08-25 8:07 ` Namhyung Kim
2015-08-25 17:09 ` David Ahern
0 siblings, 1 reply; 4+ messages in thread
From: Namhyung Kim @ 2015-08-25 8:07 UTC (permalink / raw)
To: David Ahern
Cc: Arnaldo Carvalho de Melo, Ingo Molnar, Peter Zijlstra, Jiri Olsa,
LKML
Hi David,
On Mon, Aug 24, 2015 at 11:19:44PM -0700, David Ahern wrote:
> On 8/24/15 11:11 PM, Namhyung Kim wrote:
> >
> >+static int perf_sched__read_runtime_events(struct perf_sched *sched)
> >+{
> >+ const struct perf_evsel_str_handler handlers[] = {
> >+ { "sched:sched_switch", process_sched_switch_event, },
> >+ { "sched:sched_stat_runtime", process_sched_runtime_event, },
> >+ { "sched:sched_stat_sleep", process_sched_sleep_event, },
> >+ { "sched:sched_stat_wait", process_sched_wait_event, },
> >+ { "sched:sched_stat_iowait", process_sched_iowait_event, },
> >+ { "sched:sched_migrate_task", process_sched_migrate_task_event, },
> >+ };
>
> I like the concept (obviously since I have invested so much time in other
> solutions). But... you cannot rely on the sched_stat tracepoints being
> enabled. I believe it is a performance hit to have CONFIG_SCHEDSTATS enabled
> and distributions (e.g., Fedora) do not have it enabled.
Yes, I know that some distributions don't enable SCHEDSTATS. But as
perf sched record already requires it, I think it's ok to use those
events in other sched commands.
Maybe we could fix 'perf sched record' not to require SCHEDSTATS.
All commands in perf sched just use stat_runtime event currently. And
I guess we can calculate/approximate the stat times using sched_switch
events (and wakeup events) only. But it also requires changing other
sched commands not to use sched_stat_runtime.
Thanks,
Namhyung
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [RFC/PATCH] perf sched: Implement 'perf sched runtime' command
2015-08-25 8:07 ` Namhyung Kim
@ 2015-08-25 17:09 ` David Ahern
0 siblings, 0 replies; 4+ messages in thread
From: David Ahern @ 2015-08-25 17:09 UTC (permalink / raw)
To: Namhyung Kim
Cc: Arnaldo Carvalho de Melo, Ingo Molnar, Peter Zijlstra, Jiri Olsa,
LKML
On 8/25/15 1:07 AM, Namhyung Kim wrote:
> Hi David,
>
> On Mon, Aug 24, 2015 at 11:19:44PM -0700, David Ahern wrote:
>> On 8/24/15 11:11 PM, Namhyung Kim wrote:
>>>
>>> +static int perf_sched__read_runtime_events(struct perf_sched *sched)
>>> +{
>>> + const struct perf_evsel_str_handler handlers[] = {
>>> + { "sched:sched_switch", process_sched_switch_event, },
>>> + { "sched:sched_stat_runtime", process_sched_runtime_event, },
>>> + { "sched:sched_stat_sleep", process_sched_sleep_event, },
>>> + { "sched:sched_stat_wait", process_sched_wait_event, },
>>> + { "sched:sched_stat_iowait", process_sched_iowait_event, },
>>> + { "sched:sched_migrate_task", process_sched_migrate_task_event, },
>>> + };
>>
>> I like the concept (obviously since I have invested so much time in other
>> solutions). But... you cannot rely on the sched_stat tracepoints being
>> enabled. I believe it is a performance hit to have CONFIG_SCHEDSTATS enabled
>> and distributions (e.g., Fedora) do not have it enabled.
>
> Yes, I know that some distributions don't enable SCHEDSTATS. But as
> perf sched record already requires it, I think it's ok to use those
> events in other sched commands.
>
> Maybe we could fix 'perf sched record' not to require SCHEDSTATS.
I was going to suggest that a couple of years ago. You can't rely on it
in production environments, so my timehist command for example used
other approximations. Then if it is enabled it balloons the data file.
> All commands in perf sched just use stat_runtime event currently. And
> I guess we can calculate/approximate the stat times using sched_switch
> events (and wakeup events) only. But it also requires changing other
> sched commands not to use sched_stat_runtime.
That's what I do in my 'perf sched timehist' command.
David
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2015-08-25 17:09 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-08-25 6:11 [RFC/PATCH] perf sched: Implement 'perf sched runtime' command Namhyung Kim
2015-08-25 6:19 ` David Ahern
2015-08-25 8:07 ` Namhyung Kim
2015-08-25 17:09 ` David Ahern
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox