* [PATCH] perf sched timehist: Add pre-migration wait time option
@ 2024-09-17 9:15 Madadi Vineeth Reddy
2024-09-26 23:04 ` Namhyung Kim
0 siblings, 1 reply; 3+ messages in thread
From: Madadi Vineeth Reddy @ 2024-09-17 9:15 UTC (permalink / raw)
To: Arnaldo Carvalho de Melo, Namhyung Kim, Ian Rogers, Athira Rajeev,
Peter Zijlstra, Ingo Molnar, Mark Rutland, Alexander Shishkin,
Jiri Olsa, Adrian Hunter, Kan Liang, acme, linux-perf-users, LKML
Cc: Madadi Vineeth Reddy
pre-migration wait time is the time that a task unnecessarily spends
on the runqueue of a CPU but doesn't get switched-in there. In terms
of tracepoints, it is the time between sched:sched_wakeup and
sched:sched_migrate_task.
Let's say a task woke up on CPU2, then it got migrated to CPU4 and
then it's switched-in to CPU4. So, here pre-migration wait time is
time that it was waiting on runqueue of CPU2 after it is woken up.
The general pattern for pre-migration to occur is:
sched:sched_wakeup
sched:sched_migrate_task
sched:sched_switch
So, this option expects sched_wakeup also to be recorded and fails
if attempted to use the option without sched:sched_wakeup tracepoint.
pre-migrations are generally not useful and it increases migrations.
This metric would be helpful in testing patches mainly related to wakeup
and load-balancer code paths as better wakeup logic would choose an
optimal CPU where task would be switched-in and thereby reducing pre-
migrations.
The sample output(s) when -P or --pre-migrations is used:
=================
time cpu task name wait time sch delay run time pre-mig time
[tid/pid] (msec) (msec) (msec) (msec)
--------------- ------ ------------------------------ --------- --------- --------- ---------
103032.721020 [0000] perf[47206] 0.000 0.000 0.000 0.000
103032.721034 [0000] migration/0[18] 0.000 0.003 0.013 0.000
103032.736716 [0001] schbench[47229/47207] 10.664 9.231 0.039 9.170
103032.736719 [0005] <idle> 0.000 0.000 15.405 0.000
103032.736726 [0003] schbench[47216/47207] 11.251 9.193 0.022 9.126
103032.736727 [0001] schbench[47228/47207] 10.752 9.264 0.010 9.169
103032.736731 [0007] <idle> 0.000 0.000 15.314 0.000
103032.736739 [0008] <idle> 0.000 0.000 15.249 0.000
103032.736742 [0003] schbench[47257/47207] 9.498 9.184 0.015 9.100
51370.894024 [0012] schbench[38775/38770] 21.047 21.047 8.950 0.000
51370.894024 [0008] schbench[38781/38770] 39.597 39.448 9.984 39.437
51370.894025 [0009] stress-ng-cpu[38743] 7.971 7.971 9.997 0.000
Signed-off-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
---
tools/perf/Documentation/perf-sched.txt | 8 ++
tools/perf/builtin-sched.c | 110 +++++++++++++++++-------
2 files changed, 85 insertions(+), 33 deletions(-)
diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
index 3db64954a267..7935499b88b0 100644
--- a/tools/perf/Documentation/perf-sched.txt
+++ b/tools/perf/Documentation/perf-sched.txt
@@ -221,6 +221,14 @@ OPTIONS for 'perf sched timehist'
priorities are specified with -: 120-129. A combination of both can also be
provided: 0,120-129.
+-P::
+--pre-migrations::
+ Show pre-migration wait time. pre-migration wait time is the time spent
+ by a task waiting on a runqueue but not getting the chance to run there
+ and is migrated to a different runqueue where it is finally run. This
+ time between migrate_task and sched_wakeup is the pre-migration wait
+ time. This option is valid only if sched_wakeup tracepoint is recorded.
+
OPTIONS for 'perf sched replay'
------------------------------
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 5981cc51abc8..252e71d83d82 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -228,6 +228,7 @@ struct perf_sched {
bool show_wakeups;
bool show_next;
bool show_migrations;
+ bool pre_migrations;
bool show_state;
bool show_prio;
u64 skipped_samples;
@@ -247,7 +248,10 @@ struct thread_runtime {
u64 dt_iowait; /* time between CPU access by iowait (off cpu) */
u64 dt_preempt; /* time between CPU access by preempt (off cpu) */
u64 dt_delay; /* time between wakeup and sched-in */
+ u64 dt_pre_mig; /* time between migration and wakeup */
u64 ready_to_run; /* time of wakeup */
+ u64 woken; /* time when sched_wakeup tracepoint is hit */
+ u64 migrated; /* time when a thread is migrated */
struct stats run_stats;
u64 total_run_time;
@@ -255,6 +259,7 @@ struct thread_runtime {
u64 total_iowait_time;
u64 total_preempt_time;
u64 total_delay_time;
+ u64 total_pre_mig_time;
char last_state;
@@ -2083,14 +2088,15 @@ static void timehist_header(struct perf_sched *sched)
printf(" ");
}
- if (sched->show_prio) {
- printf(" %-*s %-*s %9s %9s %9s",
- comm_width, "task name", MAX_PRIO_STR_LEN, "prio",
- "wait time", "sch delay", "run time");
- } else {
- printf(" %-*s %9s %9s %9s", comm_width,
- "task name", "wait time", "sch delay", "run time");
- }
+ printf(" %-*s", comm_width, "task name");
+
+ if (sched->show_prio)
+ printf(" %-*s", MAX_PRIO_STR_LEN, "prio");
+
+ printf(" %9s %9s %9s", "wait time", "sch delay", "run time");
+
+ if (sched->pre_migrations)
+ printf(" %9s", "pre-mig time");
if (sched->show_state)
printf(" %s", "state");
@@ -2105,17 +2111,15 @@ static void timehist_header(struct perf_sched *sched)
if (sched->show_cpu_visual)
printf(" %*s ", ncpus, "");
- if (sched->show_prio) {
- printf(" %-*s %-*s %9s %9s %9s",
- comm_width, "[tid/pid]", MAX_PRIO_STR_LEN, "",
- "(msec)", "(msec)", "(msec)");
- } else {
- printf(" %-*s %9s %9s %9s", comm_width,
- "[tid/pid]", "(msec)", "(msec)", "(msec)");
- }
+ printf(" %-*s", comm_width, "[tid/pid]");
- if (sched->show_state)
- printf(" %5s", "");
+ if (sched->show_prio)
+ printf(" %-*s", MAX_PRIO_STR_LEN, "");
+
+ printf(" %9s %9s %9s", "(msec)", "(msec)", "(msec)");
+
+ if (sched->pre_migrations)
+ printf(" %9s", "(msec)");
printf("\n");
@@ -2127,15 +2131,15 @@ static void timehist_header(struct perf_sched *sched)
if (sched->show_cpu_visual)
printf(" %.*s ", ncpus, graph_dotted_line);
- if (sched->show_prio) {
- printf(" %.*s %.*s %.9s %.9s %.9s",
- comm_width, graph_dotted_line, MAX_PRIO_STR_LEN, graph_dotted_line,
- graph_dotted_line, graph_dotted_line, graph_dotted_line);
- } else {
- printf(" %.*s %.9s %.9s %.9s", comm_width,
- graph_dotted_line, graph_dotted_line, graph_dotted_line,
- graph_dotted_line);
- }
+ printf(" %.*s", comm_width, graph_dotted_line);
+
+ if (sched->show_prio)
+ printf(" %.*s", MAX_PRIO_STR_LEN, graph_dotted_line);
+
+ printf(" %.9s %.9s %.9s", graph_dotted_line, graph_dotted_line, graph_dotted_line);
+
+ if (sched->pre_migrations)
+ printf(" %.9s", graph_dotted_line);
if (sched->show_state)
printf(" %.5s", graph_dotted_line);
@@ -2190,6 +2194,8 @@ static void timehist_print_sample(struct perf_sched *sched,
print_sched_time(tr->dt_delay, 6);
print_sched_time(tr->dt_run, 6);
+ if (sched->pre_migrations)
+ print_sched_time(tr->dt_pre_mig, 6);
if (sched->show_state)
printf(" %5c ", thread__tid(thread) == 0 ? 'I' : state);
@@ -2249,6 +2255,7 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
r->dt_iowait = 0;
r->dt_preempt = 0;
r->dt_run = 0;
+ r->dt_pre_mig = 0;
if (tprev) {
r->dt_run = t - tprev;
@@ -2257,6 +2264,11 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
pr_debug("time travel: wakeup time for task > previous sched_switch event\n");
else
r->dt_delay = tprev - r->ready_to_run;
+
+ if (r->woken && r->migrated) {
+ if ((r->migrated > r->woken) && (r->migrated < tprev))
+ r->dt_pre_mig = r->migrated - r->woken;
+ }
}
if (r->last_time > tprev)
@@ -2280,6 +2292,7 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
r->total_sleep_time += r->dt_sleep;
r->total_iowait_time += r->dt_iowait;
r->total_preempt_time += r->dt_preempt;
+ r->total_pre_mig_time += r->dt_pre_mig;
}
static bool is_idle_sample(struct perf_sample *sample,
@@ -2579,12 +2592,27 @@ static void timehist_print_wakeup_event(struct perf_sched *sched,
printf("\n");
}
-static int timehist_sched_wakeup_ignore(const struct perf_tool *tool __maybe_unused,
+static int timehist_sched_wakeup_consider(const struct perf_tool *tool __maybe_unused,
union perf_event *event __maybe_unused,
struct evsel *evsel __maybe_unused,
struct perf_sample *sample __maybe_unused,
struct machine *machine __maybe_unused)
{
+ struct thread *thread;
+ struct thread_runtime *tr = NULL;
+ const u32 pid = evsel__intval(evsel, sample, "pid");
+
+ thread = machine__findnew_thread(machine, 0, pid);
+ if (thread == NULL)
+ return -1;
+
+ tr = thread__get_runtime(thread);
+ if (tr == NULL)
+ return -1;
+
+ if (tr->woken == 0)
+ tr->woken = sample->time;
+
return 0;
}
@@ -2694,8 +2722,14 @@ static int timehist_migrate_task_event(const struct perf_tool *tool,
tr->migrations++;
+ if (tr->migrated == 0)
+ tr->migrated = sample->time;
+
/* show migrations if requested */
- timehist_print_migration_event(sched, evsel, sample, machine, thread);
+ if (sched->show_migrations) {
+ timehist_print_migration_event(sched, evsel, sample,
+ machine, thread);
+ }
return 0;
}
@@ -2846,11 +2880,14 @@ static int timehist_sched_change_event(const struct perf_tool *tool,
/* last state is used to determine where to account wait time */
tr->last_state = state;
- /* sched out event for task so reset ready to run time */
+ /* sched out event for task so reset ready to run time, woken and migrated time */
if (state == 'R')
tr->ready_to_run = t;
else
tr->ready_to_run = 0;
+
+ tr->woken = 0;
+ tr->migrated = 0;
}
evsel__save_time(evsel, sample->time, sample->cpu);
@@ -3278,7 +3315,7 @@ static int perf_sched__timehist(struct perf_sched *sched)
/* prefer sched_waking if it is captured */
if (evlist__find_tracepoint_by_name(session->evlist, "sched:sched_waking"))
- handlers[1].handler = timehist_sched_wakeup_ignore;
+ handlers[1].handler = timehist_sched_wakeup_consider;
/* setup per-evsel handlers */
if (perf_session__set_tracepoints_handlers(session, handlers))
@@ -3290,8 +3327,14 @@ static int perf_sched__timehist(struct perf_sched *sched)
goto out;
}
- if (sched->show_migrations &&
- perf_session__set_tracepoints_handlers(session, migrate_handlers))
+ if (sched->pre_migrations && !evlist__find_tracepoint_by_name(session->evlist,
+ "sched:sched_wakeup")) {
+ pr_err("No sched_wakeup events found. sched_wakeup tracepoint is mandatory for -P option\n");
+ goto out;
+ }
+
+ if ((sched->show_migrations || sched->pre_migrations) &&
+ (perf_session__set_tracepoints_handlers(session, migrate_handlers)))
goto out;
/* pre-allocate struct for per-CPU idle stats */
@@ -3833,6 +3876,7 @@ int cmd_sched(int argc, const char **argv)
OPT_BOOLEAN(0, "show-prio", &sched.show_prio, "Show task priority"),
OPT_STRING(0, "prio", &sched.prio_str, "prio",
"analyze events only for given task priority(ies)"),
+ OPT_BOOLEAN('P', "pre-migrations", &sched.pre_migrations, "Show pre-migration wait time"),
OPT_PARENT(sched_options)
};
--
2.43.1
^ permalink raw reply related [flat|nested] 3+ messages in thread* Re: [PATCH] perf sched timehist: Add pre-migration wait time option
2024-09-17 9:15 [PATCH] perf sched timehist: Add pre-migration wait time option Madadi Vineeth Reddy
@ 2024-09-26 23:04 ` Namhyung Kim
2024-10-01 6:03 ` Madadi Vineeth Reddy
0 siblings, 1 reply; 3+ messages in thread
From: Namhyung Kim @ 2024-09-26 23:04 UTC (permalink / raw)
To: Madadi Vineeth Reddy
Cc: Arnaldo Carvalho de Melo, Ian Rogers, Athira Rajeev,
Peter Zijlstra, Ingo Molnar, Mark Rutland, Alexander Shishkin,
Jiri Olsa, Adrian Hunter, Kan Liang, acme, linux-perf-users, LKML
Hello,
On Tue, Sep 17, 2024 at 02:45:37PM +0530, Madadi Vineeth Reddy wrote:
> pre-migration wait time is the time that a task unnecessarily spends
> on the runqueue of a CPU but doesn't get switched-in there. In terms
> of tracepoints, it is the time between sched:sched_wakeup and
> sched:sched_migrate_task.
Sounds useful.
>
> Let's say a task woke up on CPU2, then it got migrated to CPU4 and
> then it's switched-in to CPU4. So, here pre-migration wait time is
> time that it was waiting on runqueue of CPU2 after it is woken up.
>
> The general pattern for pre-migration to occur is:
> sched:sched_wakeup
> sched:sched_migrate_task
> sched:sched_switch
> So, this option expects sched_wakeup also to be recorded and fails
> if attempted to use the option without sched:sched_wakeup tracepoint.
I'm curious if you check sched_migrate_task as well.
>
> pre-migrations are generally not useful and it increases migrations.
> This metric would be helpful in testing patches mainly related to wakeup
> and load-balancer code paths as better wakeup logic would choose an
> optimal CPU where task would be switched-in and thereby reducing pre-
> migrations.
>
> The sample output(s) when -P or --pre-migrations is used:
> =================
> time cpu task name wait time sch delay run time pre-mig time
> [tid/pid] (msec) (msec) (msec) (msec)
> --------------- ------ ------------------------------ --------- --------- --------- ---------
> 103032.721020 [0000] perf[47206] 0.000 0.000 0.000 0.000
> 103032.721034 [0000] migration/0[18] 0.000 0.003 0.013 0.000
> 103032.736716 [0001] schbench[47229/47207] 10.664 9.231 0.039 9.170
> 103032.736719 [0005] <idle> 0.000 0.000 15.405 0.000
> 103032.736726 [0003] schbench[47216/47207] 11.251 9.193 0.022 9.126
> 103032.736727 [0001] schbench[47228/47207] 10.752 9.264 0.010 9.169
> 103032.736731 [0007] <idle> 0.000 0.000 15.314 0.000
> 103032.736739 [0008] <idle> 0.000 0.000 15.249 0.000
> 103032.736742 [0003] schbench[47257/47207] 9.498 9.184 0.015 9.100
>
> 51370.894024 [0012] schbench[38775/38770] 21.047 21.047 8.950 0.000
> 51370.894024 [0008] schbench[38781/38770] 39.597 39.448 9.984 39.437
> 51370.894025 [0009] stress-ng-cpu[38743] 7.971 7.971 9.997 0.000
>
> Signed-off-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
> ---
> tools/perf/Documentation/perf-sched.txt | 8 ++
> tools/perf/builtin-sched.c | 110 +++++++++++++++++-------
> 2 files changed, 85 insertions(+), 33 deletions(-)
>
> diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
> index 3db64954a267..7935499b88b0 100644
> --- a/tools/perf/Documentation/perf-sched.txt
> +++ b/tools/perf/Documentation/perf-sched.txt
> @@ -221,6 +221,14 @@ OPTIONS for 'perf sched timehist'
> priorities are specified with -: 120-129. A combination of both can also be
> provided: 0,120-129.
>
> +-P::
> +--pre-migrations::
> + Show pre-migration wait time. pre-migration wait time is the time spent
> + by a task waiting on a runqueue but not getting the chance to run there
> + and is migrated to a different runqueue where it is finally run. This
> + time between migrate_task and sched_wakeup is the pre-migration wait
> + time. This option is valid only if sched_wakeup tracepoint is recorded.
> +
> OPTIONS for 'perf sched replay'
> ------------------------------
>
> diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
> index 5981cc51abc8..252e71d83d82 100644
> --- a/tools/perf/builtin-sched.c
> +++ b/tools/perf/builtin-sched.c
> @@ -228,6 +228,7 @@ struct perf_sched {
> bool show_wakeups;
> bool show_next;
> bool show_migrations;
> + bool pre_migrations;
> bool show_state;
> bool show_prio;
> u64 skipped_samples;
> @@ -247,7 +248,10 @@ struct thread_runtime {
> u64 dt_iowait; /* time between CPU access by iowait (off cpu) */
> u64 dt_preempt; /* time between CPU access by preempt (off cpu) */
> u64 dt_delay; /* time between wakeup and sched-in */
> + u64 dt_pre_mig; /* time between migration and wakeup */
> u64 ready_to_run; /* time of wakeup */
> + u64 woken; /* time when sched_wakeup tracepoint is hit */
> + u64 migrated; /* time when a thread is migrated */
>
> struct stats run_stats;
> u64 total_run_time;
> @@ -255,6 +259,7 @@ struct thread_runtime {
> u64 total_iowait_time;
> u64 total_preempt_time;
> u64 total_delay_time;
> + u64 total_pre_mig_time;
>
> char last_state;
>
> @@ -2083,14 +2088,15 @@ static void timehist_header(struct perf_sched *sched)
> printf(" ");
> }
>
> - if (sched->show_prio) {
> - printf(" %-*s %-*s %9s %9s %9s",
> - comm_width, "task name", MAX_PRIO_STR_LEN, "prio",
> - "wait time", "sch delay", "run time");
> - } else {
> - printf(" %-*s %9s %9s %9s", comm_width,
> - "task name", "wait time", "sch delay", "run time");
> - }
> + printf(" %-*s", comm_width, "task name");
> +
> + if (sched->show_prio)
> + printf(" %-*s", MAX_PRIO_STR_LEN, "prio");
> +
> + printf(" %9s %9s %9s", "wait time", "sch delay", "run time");
> +
> + if (sched->pre_migrations)
> + printf(" %9s", "pre-mig time");
>
> if (sched->show_state)
> printf(" %s", "state");
> @@ -2105,17 +2111,15 @@ static void timehist_header(struct perf_sched *sched)
> if (sched->show_cpu_visual)
> printf(" %*s ", ncpus, "");
>
> - if (sched->show_prio) {
> - printf(" %-*s %-*s %9s %9s %9s",
> - comm_width, "[tid/pid]", MAX_PRIO_STR_LEN, "",
> - "(msec)", "(msec)", "(msec)");
> - } else {
> - printf(" %-*s %9s %9s %9s", comm_width,
> - "[tid/pid]", "(msec)", "(msec)", "(msec)");
> - }
> + printf(" %-*s", comm_width, "[tid/pid]");
>
> - if (sched->show_state)
> - printf(" %5s", "");
> + if (sched->show_prio)
> + printf(" %-*s", MAX_PRIO_STR_LEN, "");
> +
> + printf(" %9s %9s %9s", "(msec)", "(msec)", "(msec)");
> +
> + if (sched->pre_migrations)
> + printf(" %9s", "(msec)");
>
> printf("\n");
>
> @@ -2127,15 +2131,15 @@ static void timehist_header(struct perf_sched *sched)
> if (sched->show_cpu_visual)
> printf(" %.*s ", ncpus, graph_dotted_line);
>
> - if (sched->show_prio) {
> - printf(" %.*s %.*s %.9s %.9s %.9s",
> - comm_width, graph_dotted_line, MAX_PRIO_STR_LEN, graph_dotted_line,
> - graph_dotted_line, graph_dotted_line, graph_dotted_line);
> - } else {
> - printf(" %.*s %.9s %.9s %.9s", comm_width,
> - graph_dotted_line, graph_dotted_line, graph_dotted_line,
> - graph_dotted_line);
> - }
> + printf(" %.*s", comm_width, graph_dotted_line);
> +
> + if (sched->show_prio)
> + printf(" %.*s", MAX_PRIO_STR_LEN, graph_dotted_line);
> +
> + printf(" %.9s %.9s %.9s", graph_dotted_line, graph_dotted_line, graph_dotted_line);
> +
> + if (sched->pre_migrations)
> + printf(" %.9s", graph_dotted_line);
>
> if (sched->show_state)
> printf(" %.5s", graph_dotted_line);
> @@ -2190,6 +2194,8 @@ static void timehist_print_sample(struct perf_sched *sched,
>
> print_sched_time(tr->dt_delay, 6);
> print_sched_time(tr->dt_run, 6);
> + if (sched->pre_migrations)
> + print_sched_time(tr->dt_pre_mig, 6);
>
> if (sched->show_state)
> printf(" %5c ", thread__tid(thread) == 0 ? 'I' : state);
> @@ -2249,6 +2255,7 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
> r->dt_iowait = 0;
> r->dt_preempt = 0;
> r->dt_run = 0;
> + r->dt_pre_mig = 0;
>
> if (tprev) {
> r->dt_run = t - tprev;
> @@ -2257,6 +2264,11 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
> pr_debug("time travel: wakeup time for task > previous sched_switch event\n");
> else
> r->dt_delay = tprev - r->ready_to_run;
> +
> + if (r->woken && r->migrated) {
> + if ((r->migrated > r->woken) && (r->migrated < tprev))
> + r->dt_pre_mig = r->migrated - r->woken;
> + }
> }
>
> if (r->last_time > tprev)
> @@ -2280,6 +2292,7 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
> r->total_sleep_time += r->dt_sleep;
> r->total_iowait_time += r->dt_iowait;
> r->total_preempt_time += r->dt_preempt;
> + r->total_pre_mig_time += r->dt_pre_mig;
> }
>
> static bool is_idle_sample(struct perf_sample *sample,
> @@ -2579,12 +2592,27 @@ static void timehist_print_wakeup_event(struct perf_sched *sched,
> printf("\n");
> }
>
> -static int timehist_sched_wakeup_ignore(const struct perf_tool *tool __maybe_unused,
> +static int timehist_sched_wakeup_consider(const struct perf_tool *tool __maybe_unused,
No this function is to not duplicate work when you have both
sched_wakeup and sched_waking tracepoints.
> union perf_event *event __maybe_unused,
> struct evsel *evsel __maybe_unused,
> struct perf_sample *sample __maybe_unused,
> struct machine *machine __maybe_unused)
> {
> + struct thread *thread;
> + struct thread_runtime *tr = NULL;
> + const u32 pid = evsel__intval(evsel, sample, "pid");
> +
> + thread = machine__findnew_thread(machine, 0, pid);
> + if (thread == NULL)
> + return -1;
> +
> + tr = thread__get_runtime(thread);
> + if (tr == NULL)
> + return -1;
> +
> + if (tr->woken == 0)
> + tr->woken = sample->time;
I think you can add this to timehist_sched_wakeup_event().
> +
> return 0;
> }
>
> @@ -2694,8 +2722,14 @@ static int timehist_migrate_task_event(const struct perf_tool *tool,
>
> tr->migrations++;
>
> + if (tr->migrated == 0)
> + tr->migrated = sample->time;
> +
> /* show migrations if requested */
> - timehist_print_migration_event(sched, evsel, sample, machine, thread);
> + if (sched->show_migrations) {
> + timehist_print_migration_event(sched, evsel, sample,
> + machine, thread);
> + }
>
> return 0;
> }
> @@ -2846,11 +2880,14 @@ static int timehist_sched_change_event(const struct perf_tool *tool,
> /* last state is used to determine where to account wait time */
> tr->last_state = state;
>
> - /* sched out event for task so reset ready to run time */
> + /* sched out event for task so reset ready to run time, woken and migrated time */
> if (state == 'R')
> tr->ready_to_run = t;
> else
> tr->ready_to_run = 0;
> +
> + tr->woken = 0;
> + tr->migrated = 0;
> }
>
> evsel__save_time(evsel, sample->time, sample->cpu);
> @@ -3278,7 +3315,7 @@ static int perf_sched__timehist(struct perf_sched *sched)
>
> /* prefer sched_waking if it is captured */
> if (evlist__find_tracepoint_by_name(session->evlist, "sched:sched_waking"))
> - handlers[1].handler = timehist_sched_wakeup_ignore;
> + handlers[1].handler = timehist_sched_wakeup_consider;
>
> /* setup per-evsel handlers */
> if (perf_session__set_tracepoints_handlers(session, handlers))
> @@ -3290,8 +3327,14 @@ static int perf_sched__timehist(struct perf_sched *sched)
> goto out;
> }
>
> - if (sched->show_migrations &&
> - perf_session__set_tracepoints_handlers(session, migrate_handlers))
> + if (sched->pre_migrations && !evlist__find_tracepoint_by_name(session->evlist,
> + "sched:sched_wakeup")) {
> + pr_err("No sched_wakeup events found. sched_wakeup tracepoint is mandatory for -P option\n");
> + goto out;
> + }
> +
> + if ((sched->show_migrations || sched->pre_migrations) &&
> + (perf_session__set_tracepoints_handlers(session, migrate_handlers)))
No need for parenthesis.
Thanks,
Namhyung
> goto out;
>
> /* pre-allocate struct for per-CPU idle stats */
> @@ -3833,6 +3876,7 @@ int cmd_sched(int argc, const char **argv)
> OPT_BOOLEAN(0, "show-prio", &sched.show_prio, "Show task priority"),
> OPT_STRING(0, "prio", &sched.prio_str, "prio",
> "analyze events only for given task priority(ies)"),
> + OPT_BOOLEAN('P', "pre-migrations", &sched.pre_migrations, "Show pre-migration wait time"),
> OPT_PARENT(sched_options)
> };
>
> --
> 2.43.1
>
^ permalink raw reply [flat|nested] 3+ messages in thread* Re: [PATCH] perf sched timehist: Add pre-migration wait time option
2024-09-26 23:04 ` Namhyung Kim
@ 2024-10-01 6:03 ` Madadi Vineeth Reddy
0 siblings, 0 replies; 3+ messages in thread
From: Madadi Vineeth Reddy @ 2024-10-01 6:03 UTC (permalink / raw)
To: Namhyung Kim
Cc: Arnaldo Carvalho de Melo, Ian Rogers, Athira Rajeev,
Peter Zijlstra, Ingo Molnar, Mark Rutland, Alexander Shishkin,
Jiri Olsa, Adrian Hunter, Kan Liang, acme, linux-perf-users, LKML,
Madadi Vineeth Reddy
Hi Namhyung,
On 27/09/24 04:34, Namhyung Kim wrote:
> Hello,
>
> On Tue, Sep 17, 2024 at 02:45:37PM +0530, Madadi Vineeth Reddy wrote:
>> pre-migration wait time is the time that a task unnecessarily spends
>> on the runqueue of a CPU but doesn't get switched-in there. In terms
>> of tracepoints, it is the time between sched:sched_wakeup and
>> sched:sched_migrate_task.
>
> Sounds useful.
>
>>
>> Let's say a task woke up on CPU2, then it got migrated to CPU4 and
>> then it's switched-in to CPU4. So, here pre-migration wait time is
>> time that it was waiting on runqueue of CPU2 after it is woken up.
>>
>> The general pattern for pre-migration to occur is:
>> sched:sched_wakeup
>> sched:sched_migrate_task
>> sched:sched_switch
>> So, this option expects sched_wakeup also to be recorded and fails
>> if attempted to use the option without sched:sched_wakeup tracepoint.
>
> I'm curious if you check sched_migrate_task as well.
Yes, I do check it in timehist_migrate_task_event().
>
>>
>> pre-migrations are generally not useful and it increases migrations.
>> This metric would be helpful in testing patches mainly related to wakeup
>> and load-balancer code paths as better wakeup logic would choose an
>> optimal CPU where task would be switched-in and thereby reducing pre-
>> migrations.
>>
>> The sample output(s) when -P or --pre-migrations is used:
>> =================
>> time cpu task name wait time sch delay run time pre-mig time
>> [tid/pid] (msec) (msec) (msec) (msec)
>> --------------- ------ ------------------------------ --------- --------- --------- ---------
>> 103032.721020 [0000] perf[47206] 0.000 0.000 0.000 0.000
>> 103032.721034 [0000] migration/0[18] 0.000 0.003 0.013 0.000
>> 103032.736716 [0001] schbench[47229/47207] 10.664 9.231 0.039 9.170
>> 103032.736719 [0005] <idle> 0.000 0.000 15.405 0.000
>> 103032.736726 [0003] schbench[47216/47207] 11.251 9.193 0.022 9.126
>> 103032.736727 [0001] schbench[47228/47207] 10.752 9.264 0.010 9.169
>> 103032.736731 [0007] <idle> 0.000 0.000 15.314 0.000
>> 103032.736739 [0008] <idle> 0.000 0.000 15.249 0.000
>> 103032.736742 [0003] schbench[47257/47207] 9.498 9.184 0.015 9.100
>>
>> 51370.894024 [0012] schbench[38775/38770] 21.047 21.047 8.950 0.000
>> 51370.894024 [0008] schbench[38781/38770] 39.597 39.448 9.984 39.437
>> 51370.894025 [0009] stress-ng-cpu[38743] 7.971 7.971 9.997 0.000
>>
>> Signed-off-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
>> ---
>> tools/perf/Documentation/perf-sched.txt | 8 ++
>> tools/perf/builtin-sched.c | 110 +++++++++++++++++-------
>> 2 files changed, 85 insertions(+), 33 deletions(-)
>>
>> diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
>> index 3db64954a267..7935499b88b0 100644
>> --- a/tools/perf/Documentation/perf-sched.txt
>> +++ b/tools/perf/Documentation/perf-sched.txt
>> @@ -221,6 +221,14 @@ OPTIONS for 'perf sched timehist'
>> priorities are specified with -: 120-129. A combination of both can also be
>> provided: 0,120-129.
>>
>> +-P::
>> +--pre-migrations::
>> + Show pre-migration wait time. pre-migration wait time is the time spent
>> + by a task waiting on a runqueue but not getting the chance to run there
>> + and is migrated to a different runqueue where it is finally run. This
>> + time between migrate_task and sched_wakeup is the pre-migration wait
>> + time. This option is valid only if sched_wakeup tracepoint is recorded.
>> +
>> OPTIONS for 'perf sched replay'
>> ------------------------------
>>
>> diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
>> index 5981cc51abc8..252e71d83d82 100644
>> --- a/tools/perf/builtin-sched.c
>> +++ b/tools/perf/builtin-sched.c
>> @@ -228,6 +228,7 @@ struct perf_sched {
>> bool show_wakeups;
>> bool show_next;
>> bool show_migrations;
>> + bool pre_migrations;
>> bool show_state;
>> bool show_prio;
>> u64 skipped_samples;
>> @@ -247,7 +248,10 @@ struct thread_runtime {
>> u64 dt_iowait; /* time between CPU access by iowait (off cpu) */
>> u64 dt_preempt; /* time between CPU access by preempt (off cpu) */
>> u64 dt_delay; /* time between wakeup and sched-in */
>> + u64 dt_pre_mig; /* time between migration and wakeup */
>> u64 ready_to_run; /* time of wakeup */
>> + u64 woken; /* time when sched_wakeup tracepoint is hit */
>> + u64 migrated; /* time when a thread is migrated */
>>
>> struct stats run_stats;
>> u64 total_run_time;
>> @@ -255,6 +259,7 @@ struct thread_runtime {
>> u64 total_iowait_time;
>> u64 total_preempt_time;
>> u64 total_delay_time;
>> + u64 total_pre_mig_time;
>>
>> char last_state;
>>
>> @@ -2083,14 +2088,15 @@ static void timehist_header(struct perf_sched *sched)
>> printf(" ");
>> }
>>
>> - if (sched->show_prio) {
>> - printf(" %-*s %-*s %9s %9s %9s",
>> - comm_width, "task name", MAX_PRIO_STR_LEN, "prio",
>> - "wait time", "sch delay", "run time");
>> - } else {
>> - printf(" %-*s %9s %9s %9s", comm_width,
>> - "task name", "wait time", "sch delay", "run time");
>> - }
>> + printf(" %-*s", comm_width, "task name");
>> +
>> + if (sched->show_prio)
>> + printf(" %-*s", MAX_PRIO_STR_LEN, "prio");
>> +
>> + printf(" %9s %9s %9s", "wait time", "sch delay", "run time");
>> +
>> + if (sched->pre_migrations)
>> + printf(" %9s", "pre-mig time");
>>
>> if (sched->show_state)
>> printf(" %s", "state");
>> @@ -2105,17 +2111,15 @@ static void timehist_header(struct perf_sched *sched)
>> if (sched->show_cpu_visual)
>> printf(" %*s ", ncpus, "");
>>
>> - if (sched->show_prio) {
>> - printf(" %-*s %-*s %9s %9s %9s",
>> - comm_width, "[tid/pid]", MAX_PRIO_STR_LEN, "",
>> - "(msec)", "(msec)", "(msec)");
>> - } else {
>> - printf(" %-*s %9s %9s %9s", comm_width,
>> - "[tid/pid]", "(msec)", "(msec)", "(msec)");
>> - }
>> + printf(" %-*s", comm_width, "[tid/pid]");
>>
>> - if (sched->show_state)
>> - printf(" %5s", "");
>> + if (sched->show_prio)
>> + printf(" %-*s", MAX_PRIO_STR_LEN, "");
>> +
>> + printf(" %9s %9s %9s", "(msec)", "(msec)", "(msec)");
>> +
>> + if (sched->pre_migrations)
>> + printf(" %9s", "(msec)");
>>
>> printf("\n");
>>
>> @@ -2127,15 +2131,15 @@ static void timehist_header(struct perf_sched *sched)
>> if (sched->show_cpu_visual)
>> printf(" %.*s ", ncpus, graph_dotted_line);
>>
>> - if (sched->show_prio) {
>> - printf(" %.*s %.*s %.9s %.9s %.9s",
>> - comm_width, graph_dotted_line, MAX_PRIO_STR_LEN, graph_dotted_line,
>> - graph_dotted_line, graph_dotted_line, graph_dotted_line);
>> - } else {
>> - printf(" %.*s %.9s %.9s %.9s", comm_width,
>> - graph_dotted_line, graph_dotted_line, graph_dotted_line,
>> - graph_dotted_line);
>> - }
>> + printf(" %.*s", comm_width, graph_dotted_line);
>> +
>> + if (sched->show_prio)
>> + printf(" %.*s", MAX_PRIO_STR_LEN, graph_dotted_line);
>> +
>> + printf(" %.9s %.9s %.9s", graph_dotted_line, graph_dotted_line, graph_dotted_line);
>> +
>> + if (sched->pre_migrations)
>> + printf(" %.9s", graph_dotted_line);
>>
>> if (sched->show_state)
>> printf(" %.5s", graph_dotted_line);
>> @@ -2190,6 +2194,8 @@ static void timehist_print_sample(struct perf_sched *sched,
>>
>> print_sched_time(tr->dt_delay, 6);
>> print_sched_time(tr->dt_run, 6);
>> + if (sched->pre_migrations)
>> + print_sched_time(tr->dt_pre_mig, 6);
>>
>> if (sched->show_state)
>> printf(" %5c ", thread__tid(thread) == 0 ? 'I' : state);
>> @@ -2249,6 +2255,7 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
>> r->dt_iowait = 0;
>> r->dt_preempt = 0;
>> r->dt_run = 0;
>> + r->dt_pre_mig = 0;
>>
>> if (tprev) {
>> r->dt_run = t - tprev;
>> @@ -2257,6 +2264,11 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
>> pr_debug("time travel: wakeup time for task > previous sched_switch event\n");
>> else
>> r->dt_delay = tprev - r->ready_to_run;
>> +
>> + if (r->woken && r->migrated) {
>> + if ((r->migrated > r->woken) && (r->migrated < tprev))
>> + r->dt_pre_mig = r->migrated - r->woken;
>> + }
>> }
>>
>> if (r->last_time > tprev)
>> @@ -2280,6 +2292,7 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
>> r->total_sleep_time += r->dt_sleep;
>> r->total_iowait_time += r->dt_iowait;
>> r->total_preempt_time += r->dt_preempt;
>> + r->total_pre_mig_time += r->dt_pre_mig;
>> }
>>
>> static bool is_idle_sample(struct perf_sample *sample,
>> @@ -2579,12 +2592,27 @@ static void timehist_print_wakeup_event(struct perf_sched *sched,
>> printf("\n");
>> }
>>
>> -static int timehist_sched_wakeup_ignore(const struct perf_tool *tool __maybe_unused,
>> +static int timehist_sched_wakeup_consider(const struct perf_tool *tool __maybe_unused,
>
> No this function is to not duplicate work when you have both
> sched_wakeup and sched_waking tracepoints.
>
>> union perf_event *event __maybe_unused,
>> struct evsel *evsel __maybe_unused,
>> struct perf_sample *sample __maybe_unused,
>> struct machine *machine __maybe_unused)
>> {
>> + struct thread *thread;
>> + struct thread_runtime *tr = NULL;
>> + const u32 pid = evsel__intval(evsel, sample, "pid");
>> +
>> + thread = machine__findnew_thread(machine, 0, pid);
>> + if (thread == NULL)
>> + return -1;
>> +
>> + tr = thread__get_runtime(thread);
>> + if (tr == NULL)
>> + return -1;
>> +
>> + if (tr->woken == 0)
>> + tr->woken = sample->time;
>
> I think you can add this to timehist_sched_wakeup_event().
Got it. I will do that. May be I will do it by skipping
if (evlist__find_tracepoint_by_name(session->evlist, "sched:sched_waking"))
handlers[1].handler = timehist_sched_wakeup_ignore;
when using pre-migration option.
>
>> +
>> return 0;
>> }
>>
>> @@ -2694,8 +2722,14 @@ static int timehist_migrate_task_event(const struct perf_tool *tool,
>>
>> tr->migrations++;
>>
>> + if (tr->migrated == 0)
>> + tr->migrated = sample->time;
>> +
>> /* show migrations if requested */
>> - timehist_print_migration_event(sched, evsel, sample, machine, thread);
>> + if (sched->show_migrations) {
>> + timehist_print_migration_event(sched, evsel, sample,
>> + machine, thread);
>> + }
>>
>> return 0;
>> }
>> @@ -2846,11 +2880,14 @@ static int timehist_sched_change_event(const struct perf_tool *tool,
>> /* last state is used to determine where to account wait time */
>> tr->last_state = state;
>>
>> - /* sched out event for task so reset ready to run time */
>> + /* sched out event for task so reset ready to run time, woken and migrated time */
>> if (state == 'R')
>> tr->ready_to_run = t;
>> else
>> tr->ready_to_run = 0;
>> +
>> + tr->woken = 0;
>> + tr->migrated = 0;
>> }
>>
>> evsel__save_time(evsel, sample->time, sample->cpu);
>> @@ -3278,7 +3315,7 @@ static int perf_sched__timehist(struct perf_sched *sched)
>>
>> /* prefer sched_waking if it is captured */
>> if (evlist__find_tracepoint_by_name(session->evlist, "sched:sched_waking"))
>> - handlers[1].handler = timehist_sched_wakeup_ignore;
>> + handlers[1].handler = timehist_sched_wakeup_consider;
>>
>> /* setup per-evsel handlers */
>> if (perf_session__set_tracepoints_handlers(session, handlers))
>> @@ -3290,8 +3327,14 @@ static int perf_sched__timehist(struct perf_sched *sched)
>> goto out;
>> }
>>
>> - if (sched->show_migrations &&
>> - perf_session__set_tracepoints_handlers(session, migrate_handlers))
>> + if (sched->pre_migrations && !evlist__find_tracepoint_by_name(session->evlist,
>> + "sched:sched_wakeup")) {
>> + pr_err("No sched_wakeup events found. sched_wakeup tracepoint is mandatory for -P option\n");
>> + goto out;
>> + }
>> +
>> + if ((sched->show_migrations || sched->pre_migrations) &&
>> + (perf_session__set_tracepoints_handlers(session, migrate_handlers)))
>
> No need for parenthesis.
Got it. Thanks for your time. Will send v2 with the changes.
Thanks,
Madadi Vineeth Reddy
>
> Thanks,
> Namhyung
>
>
>> goto out;
>>
>> /* pre-allocate struct for per-CPU idle stats */
>> @@ -3833,6 +3876,7 @@ int cmd_sched(int argc, const char **argv)
>> OPT_BOOLEAN(0, "show-prio", &sched.show_prio, "Show task priority"),
>> OPT_STRING(0, "prio", &sched.prio_str, "prio",
>> "analyze events only for given task priority(ies)"),
>> + OPT_BOOLEAN('P', "pre-migrations", &sched.pre_migrations, "Show pre-migration wait time"),
>> OPT_PARENT(sched_options)
>> };
>>
>> --
>> 2.43.1
>>
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2024-10-01 6:04 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-09-17 9:15 [PATCH] perf sched timehist: Add pre-migration wait time option Madadi Vineeth Reddy
2024-09-26 23:04 ` Namhyung Kim
2024-10-01 6:03 ` Madadi Vineeth Reddy
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).