* [PATCH] schedstats additions
@ 2004-09-04 5:07 Nick Piggin
2004-09-04 8:26 ` Rafael J. Wysocki
2004-09-08 8:09 ` Rick Lindsley
0 siblings, 2 replies; 5+ messages in thread
From: Nick Piggin @ 2004-09-04 5:07 UTC (permalink / raw)
To: Rick Lindsley, linux-kernel
[-- Attachment #1: Type: text/plain, Size: 443 bytes --]
Hi,
I have a patch here to provide more useful statistics for me. Basically
it moves a lot more of the balancing information into the domains instead
of the runqueue, where it is nearly useless on multi-domain setups (eg.
SMT+SMP, SMP+NUMA).
It requires a version number bump, but that isn't much of an issue because
I think we're about the only two using it at the moment. But your tools
will need a little bit of work.
What do you think?
[-- Attachment #2: sched-stat.patch --]
[-- Type: text/x-patch, Size: 10525 bytes --]
---
linux-2.6-npiggin/kernel/sched.c | 155 ++++++++++++++++++++-------------------
1 files changed, 82 insertions(+), 73 deletions(-)
diff -puN kernel/sched.c~sched-stat kernel/sched.c
--- linux-2.6/kernel/sched.c~sched-stat 2004-09-04 13:08:54.000000000 +1000
+++ linux-2.6-npiggin/kernel/sched.c 2004-09-04 15:02:07.000000000 +1000
@@ -194,7 +194,6 @@ struct runqueue {
unsigned long yld_cnt;
/* schedule() stats */
- unsigned long sched_noswitch;
unsigned long sched_switch;
unsigned long sched_cnt;
unsigned long sched_goidle;
@@ -203,26 +202,9 @@ struct runqueue {
unsigned long pt_gained[MAX_IDLE_TYPES];
unsigned long pt_lost[MAX_IDLE_TYPES];
- /* active_load_balance() stats */
- unsigned long alb_cnt;
- unsigned long alb_lost;
- unsigned long alb_gained;
- unsigned long alb_failed;
-
/* try_to_wake_up() stats */
unsigned long ttwu_cnt;
- unsigned long ttwu_attempts;
- unsigned long ttwu_moved;
-
- /* wake_up_new_task() stats */
- unsigned long wunt_cnt;
- unsigned long wunt_moved;
-
- /* sched_migrate_task() stats */
- unsigned long smt_cnt;
-
- /* sched_balance_exec() stats */
- unsigned long sbe_cnt;
+ unsigned long ttwu_remote;
#endif
};
@@ -277,15 +259,24 @@ struct sched_domain {
/* load_balance() stats */
unsigned long lb_cnt[MAX_IDLE_TYPES];
unsigned long lb_failed[MAX_IDLE_TYPES];
+ unsigned long lb_balanced[MAX_IDLE_TYPES];
unsigned long lb_imbalance[MAX_IDLE_TYPES];
+ unsigned long lb_pulled[MAX_IDLE_TYPES];
+ unsigned long lb_hot_pulled[MAX_IDLE_TYPES];
unsigned long lb_nobusyg[MAX_IDLE_TYPES];
unsigned long lb_nobusyq[MAX_IDLE_TYPES];
+ /* Active load balancing */
+ unsigned long alb_cnt;
+ unsigned long alb_failed;
+ unsigned long alb_pushed;
+
/* sched_balance_exec() stats */
unsigned long sbe_attempts;
unsigned long sbe_pushed;
/* try_to_wake_up() stats */
+ unsigned long ttwu_wake_remote;
unsigned long ttwu_wake_affine;
unsigned long ttwu_wake_balance;
#endif
@@ -409,7 +400,7 @@ static inline void task_rq_unlock(runque
* bump this up when changing the output format or the meaning of an existing
* format, so that tools can adapt (or abort)
*/
-#define SCHEDSTAT_VERSION 10
+#define SCHEDSTAT_VERSION 11
static int show_schedstat(struct seq_file *seq, void *v)
{
@@ -427,17 +418,12 @@ static int show_schedstat(struct seq_fil
/* runqueue-specific stats */
seq_printf(seq,
- "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu "
- "%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
+ "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
cpu, rq->yld_both_empty,
- rq->yld_act_empty, rq->yld_exp_empty,
- rq->yld_cnt, rq->sched_noswitch,
+ rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
- rq->alb_cnt, rq->alb_gained, rq->alb_lost,
- rq->alb_failed,
- rq->ttwu_cnt, rq->ttwu_moved, rq->ttwu_attempts,
- rq->wunt_cnt, rq->wunt_moved,
- rq->smt_cnt, rq->sbe_cnt, rq->rq_sched_info.cpu_time,
+ rq->ttwu_cnt, rq->ttwu_remote,
+ rq->rq_sched_info.cpu_time,
rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
for (itype = IDLE; itype < MAX_IDLE_TYPES; itype++)
@@ -453,16 +439,20 @@ static int show_schedstat(struct seq_fil
cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
seq_printf(seq, "domain%d %s", dcnt++, mask_str);
for (itype = IDLE; itype < MAX_IDLE_TYPES; itype++) {
- seq_printf(seq, " %lu %lu %lu %lu %lu",
+ seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu",
sd->lb_cnt[itype],
+ sd->lb_balanced[itype],
sd->lb_failed[itype],
sd->lb_imbalance[itype],
+ sd->lb_pulled[itype],
+ sd->lb_hot_pulled[itype],
sd->lb_nobusyq[itype],
sd->lb_nobusyg[itype]);
}
- seq_printf(seq, " %lu %lu %lu %lu\n",
+ seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n",
+ sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
sd->sbe_pushed, sd->sbe_attempts,
- sd->ttwu_wake_affine, sd->ttwu_wake_balance);
+ sd->ttwu_wake_remote, sd->ttwu_wake_affine, sd->ttwu_wake_balance);
}
#endif
}
@@ -1058,6 +1048,10 @@ static int try_to_wake_up(task_t * p, un
unsigned long load, this_load;
struct sched_domain *sd;
int new_cpu;
+
+#ifdef CONFIG_SCHEDSTATS
+ struct sched_domain *stat_sd = NULL;
+#endif
#endif
rq = task_rq_lock(p, &flags);
@@ -1076,8 +1070,19 @@ static int try_to_wake_up(task_t * p, un
if (unlikely(task_running(rq, p)))
goto out_activate;
- new_cpu = cpu;
+#ifdef CONFIG_SCHEDSTATS
+ if (cpu != this_cpu) {
+ schedstat_inc(rq, ttwu_remote);
+ for_each_domain(this_cpu, stat_sd) {
+ if (cpu_isset(cpu, stat_sd->span)) {
+ schedstat_inc(stat_sd, ttwu_wake_remote);
+ break;
+ }
+ }
+ }
+#endif
+ new_cpu = cpu;
if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
goto out_set_cpu;
@@ -1103,30 +1108,32 @@ static int try_to_wake_up(task_t * p, un
*/
for_each_domain(this_cpu, sd) {
unsigned int imbalance;
- /*
- * Start passive balancing when half the imbalance_pct
- * limit is reached.
- */
- imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
- if ((sd->flags & SD_WAKE_AFFINE) &&
- !task_hot(p, rq->timestamp_last_tick, sd)) {
+ if (cpu_isset(cpu, sd->span)) {
/*
- * This domain has SD_WAKE_AFFINE and p is cache cold
- * in this domain.
+ * Start passive balancing when half the imbalance_pct
+ * limit is reached.
*/
- if (cpu_isset(cpu, sd->span)) {
- schedstat_inc(sd, ttwu_wake_affine);
+ if ((sd->flags & SD_WAKE_AFFINE) &&
+ !task_hot(p, rq->timestamp_last_tick, sd)) {
+ /*
+ * This domain has SD_WAKE_AFFINE and p is
+ * cache cold in this domain.
+ */
+ schedstat_inc(stat_sd, ttwu_wake_affine);
goto out_set_cpu;
}
- } else if ((sd->flags & SD_WAKE_BALANCE) &&
+
+ imbalance = sd->imbalance_pct +
+ (sd->imbalance_pct - 100) / 2;
+
+ if ((sd->flags & SD_WAKE_BALANCE) &&
imbalance*this_load <= 100*load) {
- /*
- * This domain has SD_WAKE_BALANCE and there is
- * an imbalance.
- */
- if (cpu_isset(cpu, sd->span)) {
- schedstat_inc(sd, ttwu_wake_balance);
+ /*
+ * This domain has SD_WAKE_BALANCE and there is
+ * an imbalance.
+ */
+ schedstat_inc(stat_sd, ttwu_wake_balance);
goto out_set_cpu;
}
}
@@ -1134,10 +1141,8 @@ static int try_to_wake_up(task_t * p, un
new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
out_set_cpu:
- schedstat_inc(rq, ttwu_attempts);
new_cpu = wake_idle(new_cpu, p);
if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) {
- schedstat_inc(rq, ttwu_moved);
set_task_cpu(p, new_cpu);
task_rq_unlock(rq, &flags);
/* might preempt at this point */
@@ -1282,8 +1287,6 @@ void fastcall wake_up_new_task(task_t *
this_cpu = smp_processor_id();
cpu = task_cpu(p);
- schedstat_inc(rq, wunt_cnt);
-
array = rq->active;
if (unlikely(p->used_slice == -1)) {
p->used_slice = 0;
@@ -1329,8 +1332,6 @@ void fastcall wake_up_new_task(task_t *
__activate_task(p, rq, array);
if (TASK_PREEMPTS_CURR(p, rq))
resched_task(rq->curr);
-
- schedstat_inc(rq, wunt_moved);
#endif
}
task_rq_unlock(rq, &flags);
@@ -1582,7 +1583,6 @@ static void sched_migrate_task(task_t *p
|| unlikely(cpu_is_offline(dest_cpu)))
goto out;
- schedstat_inc(rq, smt_cnt);
/* force the process onto the specified CPU */
if (migrate_task(p, dest_cpu, &req)) {
/* Need to wait for migration thread (might exit: take ref). */
@@ -1610,7 +1610,6 @@ void sched_exec(void)
struct sched_domain *tmp, *sd = NULL;
int new_cpu, this_cpu = get_cpu();
- schedstat_inc(this_rq(), sbe_cnt);
/* Prefer the current CPU if there's only this task running */
if (this_rq()->nr_running <= 1)
goto out;
@@ -1752,13 +1751,10 @@ skip_queue:
goto skip_bitmap;
}
- /*
- * Right now, this is the only place pull_task() is called,
- * so we can safely collect pull_task() stats here rather than
- * inside pull_task().
- */
- schedstat_inc(this_rq, pt_gained[idle]);
- schedstat_inc(busiest, pt_lost[idle]);
+#ifdef CONFIG_SCHEDSTATS
+ if (task_hot(tmp, busiest->timestamp_last_tick, sd))
+ schedstat_inc(sd, lb_hot_pulled[idle]);
+#endif
pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
pulled++;
@@ -1771,6 +1767,15 @@ skip_queue:
goto skip_bitmap;
}
out:
+ /*
+ * Right now, this is the only place pull_task() is called,
+ * so we can safely collect pull_task() stats here rather than
+ * inside pull_task().
+ */
+ schedstat_add(this_rq, pt_gained[idle], pulled);
+ schedstat_add(busiest, pt_lost[idle], pulled);
+ schedstat_add(sd, lb_pulled[idle], pulled);
+
return pulled;
}
@@ -2025,6 +2030,8 @@ static int load_balance(int this_cpu, ru
return nr_moved;
out_balanced:
+ schedstat_inc(sd, lb_balanced[idle]);
+
/* tune up the balancing interval */
if (sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;
@@ -2066,8 +2073,11 @@ static int load_balance_newidle(int this
schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
nr_moved = move_tasks(this_rq, this_cpu, busiest,
imbalance, sd, NEWLY_IDLE);
- if (!nr_moved)
+ if (!nr_moved) {
schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
+ } else {
+ schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+ }
spin_unlock(&busiest->lock);
@@ -2107,7 +2117,6 @@ static void active_load_balance(runqueue
struct sched_group *group, *busy_group;
int i;
- schedstat_inc(busiest, alb_cnt);
if (busiest->nr_running <= 1)
return;
@@ -2117,6 +2126,8 @@ static void active_load_balance(runqueue
if (!sd)
return;
+ schedstat_inc(sd, alb_cnt);
+
group = sd->groups;
while (!cpu_isset(busiest_cpu, group->cpumask))
group = group->next;
@@ -2153,10 +2164,9 @@ static void active_load_balance(runqueue
goto next_group;
double_lock_balance(busiest, rq);
if (move_tasks(rq, push_cpu, busiest, 1, sd, IDLE)) {
- schedstat_inc(busiest, alb_lost);
- schedstat_inc(rq, alb_gained);
+ schedstat_inc(sd, alb_pushed);
} else {
- schedstat_inc(busiest, alb_failed);
+ schedstat_inc(sd, alb_failed);
}
spin_unlock(&rq->lock);
next_group:
@@ -2567,8 +2577,7 @@ go_idle:
rq->expired = array;
rq->expired->min_prio = MAX_PRIO;
array = rq->active;
- } else
- schedstat_inc(rq, sched_noswitch);
+ }
idx = sched_find_first_bit(array->bitmap);
queue = array->queue + idx;
_
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] schedstats additions
2004-09-04 5:07 [PATCH] schedstats additions Nick Piggin
@ 2004-09-04 8:26 ` Rafael J. Wysocki
2004-09-04 9:04 ` Nick Piggin
2004-09-08 8:09 ` Rick Lindsley
1 sibling, 1 reply; 5+ messages in thread
From: Rafael J. Wysocki @ 2004-09-04 8:26 UTC (permalink / raw)
To: linux-kernel; +Cc: Nick Piggin, Rick Lindsley
Dnia Saturday 04 of September 2004 07:07, Nick Piggin napisał:
> Hi,
> I have a patch here to provide more useful statistics for me. Basically
> it moves a lot more of the balancing information into the domains instead
> of the runqueue, where it is nearly useless on multi-domain setups (eg.
> SMT+SMP, SMP+NUMA).
Which kernel version is it against?
RJW
--
For a successful technology, reality must take precedence over public
relations, for nature cannot be fooled.
-- Richard P. Feynman
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] schedstats additions
2004-09-04 8:26 ` Rafael J. Wysocki
@ 2004-09-04 9:04 ` Nick Piggin
0 siblings, 0 replies; 5+ messages in thread
From: Nick Piggin @ 2004-09-04 9:04 UTC (permalink / raw)
To: Rafael J. Wysocki; +Cc: linux-kernel, Rick Lindsley
Rafael J. Wysocki wrote:
> Dnia Saturday 04 of September 2004 07:07, Nick Piggin napisał:
>
>>Hi,
>>I have a patch here to provide more useful statistics for me. Basically
>>it moves a lot more of the balancing information into the domains instead
>>of the runqueue, where it is nearly useless on multi-domain setups (eg.
>>SMT+SMP, SMP+NUMA).
>
>
> Which kernel version is it against?
>
-mm3 ... oh yeah that has nicksched in it, sorry that would put a spanner
in the works.
I'll redo it to suit 2.6 if Rick acks it - the main info he needs is still
valid, that is the output format.
Thanks
Nick
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] schedstats additions
2004-09-04 5:07 [PATCH] schedstats additions Nick Piggin
2004-09-04 8:26 ` Rafael J. Wysocki
@ 2004-09-08 8:09 ` Rick Lindsley
2004-09-08 22:56 ` Nick Piggin
1 sibling, 1 reply; 5+ messages in thread
From: Rick Lindsley @ 2004-09-08 8:09 UTC (permalink / raw)
To: Nick Piggin; +Cc: linux-kernel
I have a patch here to provide more useful statistics for me. Basically
it moves a lot more of the balancing information into the domains instead
of the runqueue, where it is nearly useless on multi-domain setups (eg.
SMT+SMP, SMP+NUMA).
It requires a version number bump, but that isn't much of an issue because
I think we're about the only two using it at the moment. But your tools
will need a little bit of work.
What do you think?
The idea of moving some counters from runqueues to domains is fine in
general, but I've some questions about a couple of specific changes in
your patch.
It looks to me like there are some changes in try_to_wake_up() that
aren't schedstats related, although schedstats code is among some
that is moved around. Is there some code there that should be
broken out separately?
alb_cnt
by moving this, we won't get an accurate look at the number of
times we called active_load_balance and returned immediately
because nr_running had slipped to 0 or 1. how about we add
another counter to count that too, and/or change the name of
this one?
lb_balanced
are you sure lb_balanced[idle] can't be deduced from lb_cnt[idle]
and lb_failed[idle]?
ttwu_attempts
ttwu_moved
removing these makes it harder to determine how successful
try_to_wake_up() was at moving a process. What counters would
I use to get this information if these were removed?
ttwu_remote
ttwu_wake_remote
so what's the one line description of what these count now?
smt_cnt
sbe_cnt
how might I see how often sched_migrate_task() and sched_exec()
were called if these were deleted?
lb_pulled
Rather than add another counter here, would it be as effective
to make pt_gained a domain counter? Looks like you're collecting
the same information. pt_lost would have to remain a runqueue
counter, though, since losing a task has nothing to do with a
particular domain.
Rick
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] schedstats additions
2004-09-08 8:09 ` Rick Lindsley
@ 2004-09-08 22:56 ` Nick Piggin
0 siblings, 0 replies; 5+ messages in thread
From: Nick Piggin @ 2004-09-08 22:56 UTC (permalink / raw)
To: Rick Lindsley; +Cc: linux-kernel
Rick Lindsley wrote:
> I have a patch here to provide more useful statistics for me. Basically
> it moves a lot more of the balancing information into the domains instead
> of the runqueue, where it is nearly useless on multi-domain setups (eg.
> SMT+SMP, SMP+NUMA).
>
> It requires a version number bump, but that isn't much of an issue because
> I think we're about the only two using it at the moment. But your tools
> will need a little bit of work.
>
> What do you think?
>
> The idea of moving some counters from runqueues to domains is fine in
> general, but I've some questions about a couple of specific changes in
> your patch.
>
> It looks to me like there are some changes in try_to_wake_up() that
> aren't schedstats related, although schedstats code is among some
> that is moved around. Is there some code there that should be
> broken out separately?
>
There is, yes. I'll be sure to separate it.
> alb_cnt
> by moving this, we won't get an accurate look at the number of
> times we called active_load_balance and returned immediately
> because nr_running had slipped to 0 or 1. how about we add
> another counter to count that too, and/or change the name of
> this one?
>
OK.
> lb_balanced
> are you sure lb_balanced[idle] can't be deduced from lb_cnt[idle]
> and lb_failed[idle]?
>
I don't think so, because you also have the success case, which is
!balanced && !failed.
> ttwu_attempts
> ttwu_moved
> removing these makes it harder to determine how successful
> try_to_wake_up() was at moving a process. What counters would
> I use to get this information if these were removed?
>
ttwu_cnt in the rq stats, and ttwu_wake_affine / ttwu_wake_balance
in the domain stats.
> ttwu_remote
> ttwu_wake_remote
> so what's the one line description of what these count now?
>
ttwu_remote/ttwu_wake_remote are the number of times a runqueue has
woken a remote task / a remote task within that domain, respectively.
Regardless of whether or not it gets pulled onto the local CPU.
> smt_cnt
> sbe_cnt
> how might I see how often sched_migrate_task() and sched_exec()
> were called if these were deleted?
>
sbe_pushed should basically be the same as smt_cnt, barring rare
races with the cpus_allowed mask. I guess sbe_cnt doesn't have to
go.
> lb_pulled
> Rather than add another counter here, would it be as effective
> to make pt_gained a domain counter? Looks like you're collecting
Yeah removing the runqueue counters for these would be good.
> the same information. pt_lost would have to remain a runqueue
> counter, though, since losing a task has nothing to do with a
> particular domain.
Whatever domain that the pulling CPU was in, is also a fair candidate
for pt_lost. Remember, all the domains are per-CPU so any information
you can get from a per-runqueue counter you can also get from a domain
counter.
I'll make a few changes and give you another look. Thanks for the comments.
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2004-09-09 0:03 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-09-04 5:07 [PATCH] schedstats additions Nick Piggin
2004-09-04 8:26 ` Rafael J. Wysocki
2004-09-04 9:04 ` Nick Piggin
2004-09-08 8:09 ` Rick Lindsley
2004-09-08 22:56 ` Nick Piggin
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox