* [PATCH] schedstats additions
@ 2004-09-04 5:07 Nick Piggin
2004-09-04 8:26 ` Rafael J. Wysocki
2004-09-08 8:09 ` Rick Lindsley
0 siblings, 2 replies; 5+ messages in thread
From: Nick Piggin @ 2004-09-04 5:07 UTC (permalink / raw)
To: Rick Lindsley, linux-kernel
[-- Attachment #1: Type: text/plain, Size: 443 bytes --]
Hi,
I have a patch here to provide more useful statistics for me. Basically
it moves a lot more of the balancing information into the domains instead
of the runqueue, where it is nearly useless on multi-domain setups (eg.
SMT+SMP, SMP+NUMA).
It requires a version number bump, but that isn't much of an issue because
I think we're about the only two using it at the moment. But your tools
will need a little bit of work.
What do you think?
[-- Attachment #2: sched-stat.patch --]
[-- Type: text/x-patch, Size: 10525 bytes --]
---
linux-2.6-npiggin/kernel/sched.c | 155 ++++++++++++++++++++-------------------
1 files changed, 82 insertions(+), 73 deletions(-)
diff -puN kernel/sched.c~sched-stat kernel/sched.c
--- linux-2.6/kernel/sched.c~sched-stat 2004-09-04 13:08:54.000000000 +1000
+++ linux-2.6-npiggin/kernel/sched.c 2004-09-04 15:02:07.000000000 +1000
@@ -194,7 +194,6 @@ struct runqueue {
unsigned long yld_cnt;
/* schedule() stats */
- unsigned long sched_noswitch;
unsigned long sched_switch;
unsigned long sched_cnt;
unsigned long sched_goidle;
@@ -203,26 +202,9 @@ struct runqueue {
unsigned long pt_gained[MAX_IDLE_TYPES];
unsigned long pt_lost[MAX_IDLE_TYPES];
- /* active_load_balance() stats */
- unsigned long alb_cnt;
- unsigned long alb_lost;
- unsigned long alb_gained;
- unsigned long alb_failed;
-
/* try_to_wake_up() stats */
unsigned long ttwu_cnt;
- unsigned long ttwu_attempts;
- unsigned long ttwu_moved;
-
- /* wake_up_new_task() stats */
- unsigned long wunt_cnt;
- unsigned long wunt_moved;
-
- /* sched_migrate_task() stats */
- unsigned long smt_cnt;
-
- /* sched_balance_exec() stats */
- unsigned long sbe_cnt;
+ unsigned long ttwu_remote;
#endif
};
@@ -277,15 +259,24 @@ struct sched_domain {
/* load_balance() stats */
unsigned long lb_cnt[MAX_IDLE_TYPES];
unsigned long lb_failed[MAX_IDLE_TYPES];
+ unsigned long lb_balanced[MAX_IDLE_TYPES];
unsigned long lb_imbalance[MAX_IDLE_TYPES];
+ unsigned long lb_pulled[MAX_IDLE_TYPES];
+ unsigned long lb_hot_pulled[MAX_IDLE_TYPES];
unsigned long lb_nobusyg[MAX_IDLE_TYPES];
unsigned long lb_nobusyq[MAX_IDLE_TYPES];
+ /* Active load balancing */
+ unsigned long alb_cnt;
+ unsigned long alb_failed;
+ unsigned long alb_pushed;
+
/* sched_balance_exec() stats */
unsigned long sbe_attempts;
unsigned long sbe_pushed;
/* try_to_wake_up() stats */
+ unsigned long ttwu_wake_remote;
unsigned long ttwu_wake_affine;
unsigned long ttwu_wake_balance;
#endif
@@ -409,7 +400,7 @@ static inline void task_rq_unlock(runque
* bump this up when changing the output format or the meaning of an existing
* format, so that tools can adapt (or abort)
*/
-#define SCHEDSTAT_VERSION 10
+#define SCHEDSTAT_VERSION 11
static int show_schedstat(struct seq_file *seq, void *v)
{
@@ -427,17 +418,12 @@ static int show_schedstat(struct seq_fil
/* runqueue-specific stats */
seq_printf(seq,
- "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu "
- "%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
+ "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
cpu, rq->yld_both_empty,
- rq->yld_act_empty, rq->yld_exp_empty,
- rq->yld_cnt, rq->sched_noswitch,
+ rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
- rq->alb_cnt, rq->alb_gained, rq->alb_lost,
- rq->alb_failed,
- rq->ttwu_cnt, rq->ttwu_moved, rq->ttwu_attempts,
- rq->wunt_cnt, rq->wunt_moved,
- rq->smt_cnt, rq->sbe_cnt, rq->rq_sched_info.cpu_time,
+ rq->ttwu_cnt, rq->ttwu_remote,
+ rq->rq_sched_info.cpu_time,
rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
for (itype = IDLE; itype < MAX_IDLE_TYPES; itype++)
@@ -453,16 +439,20 @@ static int show_schedstat(struct seq_fil
cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
seq_printf(seq, "domain%d %s", dcnt++, mask_str);
for (itype = IDLE; itype < MAX_IDLE_TYPES; itype++) {
- seq_printf(seq, " %lu %lu %lu %lu %lu",
+ seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu",
sd->lb_cnt[itype],
+ sd->lb_balanced[itype],
sd->lb_failed[itype],
sd->lb_imbalance[itype],
+ sd->lb_pulled[itype],
+ sd->lb_hot_pulled[itype],
sd->lb_nobusyq[itype],
sd->lb_nobusyg[itype]);
}
- seq_printf(seq, " %lu %lu %lu %lu\n",
+ seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n",
+ sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
sd->sbe_pushed, sd->sbe_attempts,
- sd->ttwu_wake_affine, sd->ttwu_wake_balance);
+ sd->ttwu_wake_remote, sd->ttwu_wake_affine, sd->ttwu_wake_balance);
}
#endif
}
@@ -1058,6 +1048,10 @@ static int try_to_wake_up(task_t * p, un
unsigned long load, this_load;
struct sched_domain *sd;
int new_cpu;
+
+#ifdef CONFIG_SCHEDSTATS
+ struct sched_domain *stat_sd = NULL;
+#endif
#endif
rq = task_rq_lock(p, &flags);
@@ -1076,8 +1070,19 @@ static int try_to_wake_up(task_t * p, un
if (unlikely(task_running(rq, p)))
goto out_activate;
- new_cpu = cpu;
+#ifdef CONFIG_SCHEDSTATS
+ if (cpu != this_cpu) {
+ schedstat_inc(rq, ttwu_remote);
+ for_each_domain(this_cpu, stat_sd) {
+ if (cpu_isset(cpu, stat_sd->span)) {
+ schedstat_inc(stat_sd, ttwu_wake_remote);
+ break;
+ }
+ }
+ }
+#endif
+ new_cpu = cpu;
if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
goto out_set_cpu;
@@ -1103,30 +1108,32 @@ static int try_to_wake_up(task_t * p, un
*/
for_each_domain(this_cpu, sd) {
unsigned int imbalance;
- /*
- * Start passive balancing when half the imbalance_pct
- * limit is reached.
- */
- imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
- if ((sd->flags & SD_WAKE_AFFINE) &&
- !task_hot(p, rq->timestamp_last_tick, sd)) {
+ if (cpu_isset(cpu, sd->span)) {
/*
- * This domain has SD_WAKE_AFFINE and p is cache cold
- * in this domain.
+ * Start passive balancing when half the imbalance_pct
+ * limit is reached.
*/
- if (cpu_isset(cpu, sd->span)) {
- schedstat_inc(sd, ttwu_wake_affine);
+ if ((sd->flags & SD_WAKE_AFFINE) &&
+ !task_hot(p, rq->timestamp_last_tick, sd)) {
+ /*
+ * This domain has SD_WAKE_AFFINE and p is
+ * cache cold in this domain.
+ */
+ schedstat_inc(stat_sd, ttwu_wake_affine);
goto out_set_cpu;
}
- } else if ((sd->flags & SD_WAKE_BALANCE) &&
+
+ imbalance = sd->imbalance_pct +
+ (sd->imbalance_pct - 100) / 2;
+
+ if ((sd->flags & SD_WAKE_BALANCE) &&
imbalance*this_load <= 100*load) {
- /*
- * This domain has SD_WAKE_BALANCE and there is
- * an imbalance.
- */
- if (cpu_isset(cpu, sd->span)) {
- schedstat_inc(sd, ttwu_wake_balance);
+ /*
+ * This domain has SD_WAKE_BALANCE and there is
+ * an imbalance.
+ */
+ schedstat_inc(stat_sd, ttwu_wake_balance);
goto out_set_cpu;
}
}
@@ -1134,10 +1141,8 @@ static int try_to_wake_up(task_t * p, un
new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
out_set_cpu:
- schedstat_inc(rq, ttwu_attempts);
new_cpu = wake_idle(new_cpu, p);
if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) {
- schedstat_inc(rq, ttwu_moved);
set_task_cpu(p, new_cpu);
task_rq_unlock(rq, &flags);
/* might preempt at this point */
@@ -1282,8 +1287,6 @@ void fastcall wake_up_new_task(task_t *
this_cpu = smp_processor_id();
cpu = task_cpu(p);
- schedstat_inc(rq, wunt_cnt);
-
array = rq->active;
if (unlikely(p->used_slice == -1)) {
p->used_slice = 0;
@@ -1329,8 +1332,6 @@ void fastcall wake_up_new_task(task_t *
__activate_task(p, rq, array);
if (TASK_PREEMPTS_CURR(p, rq))
resched_task(rq->curr);
-
- schedstat_inc(rq, wunt_moved);
#endif
}
task_rq_unlock(rq, &flags);
@@ -1582,7 +1583,6 @@ static void sched_migrate_task(task_t *p
|| unlikely(cpu_is_offline(dest_cpu)))
goto out;
- schedstat_inc(rq, smt_cnt);
/* force the process onto the specified CPU */
if (migrate_task(p, dest_cpu, &req)) {
/* Need to wait for migration thread (might exit: take ref). */
@@ -1610,7 +1610,6 @@ void sched_exec(void)
struct sched_domain *tmp, *sd = NULL;
int new_cpu, this_cpu = get_cpu();
- schedstat_inc(this_rq(), sbe_cnt);
/* Prefer the current CPU if there's only this task running */
if (this_rq()->nr_running <= 1)
goto out;
@@ -1752,13 +1751,10 @@ skip_queue:
goto skip_bitmap;
}
- /*
- * Right now, this is the only place pull_task() is called,
- * so we can safely collect pull_task() stats here rather than
- * inside pull_task().
- */
- schedstat_inc(this_rq, pt_gained[idle]);
- schedstat_inc(busiest, pt_lost[idle]);
+#ifdef CONFIG_SCHEDSTATS
+ if (task_hot(tmp, busiest->timestamp_last_tick, sd))
+ schedstat_inc(sd, lb_hot_pulled[idle]);
+#endif
pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
pulled++;
@@ -1771,6 +1767,15 @@ skip_queue:
goto skip_bitmap;
}
out:
+ /*
+ * Right now, this is the only place pull_task() is called,
+ * so we can safely collect pull_task() stats here rather than
+ * inside pull_task().
+ */
+ schedstat_add(this_rq, pt_gained[idle], pulled);
+ schedstat_add(busiest, pt_lost[idle], pulled);
+ schedstat_add(sd, lb_pulled[idle], pulled);
+
return pulled;
}
@@ -2025,6 +2030,8 @@ static int load_balance(int this_cpu, ru
return nr_moved;
out_balanced:
+ schedstat_inc(sd, lb_balanced[idle]);
+
/* tune up the balancing interval */
if (sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;
@@ -2066,8 +2073,11 @@ static int load_balance_newidle(int this
schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
nr_moved = move_tasks(this_rq, this_cpu, busiest,
imbalance, sd, NEWLY_IDLE);
- if (!nr_moved)
+ if (!nr_moved) {
schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
+ } else {
+ schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+ }
spin_unlock(&busiest->lock);
@@ -2107,7 +2117,6 @@ static void active_load_balance(runqueue
struct sched_group *group, *busy_group;
int i;
- schedstat_inc(busiest, alb_cnt);
if (busiest->nr_running <= 1)
return;
@@ -2117,6 +2126,8 @@ static void active_load_balance(runqueue
if (!sd)
return;
+ schedstat_inc(sd, alb_cnt);
+
group = sd->groups;
while (!cpu_isset(busiest_cpu, group->cpumask))
group = group->next;
@@ -2153,10 +2164,9 @@ static void active_load_balance(runqueue
goto next_group;
double_lock_balance(busiest, rq);
if (move_tasks(rq, push_cpu, busiest, 1, sd, IDLE)) {
- schedstat_inc(busiest, alb_lost);
- schedstat_inc(rq, alb_gained);
+ schedstat_inc(sd, alb_pushed);
} else {
- schedstat_inc(busiest, alb_failed);
+ schedstat_inc(sd, alb_failed);
}
spin_unlock(&rq->lock);
next_group:
@@ -2567,8 +2577,7 @@ go_idle:
rq->expired = array;
rq->expired->min_prio = MAX_PRIO;
array = rq->active;
- } else
- schedstat_inc(rq, sched_noswitch);
+ }
idx = sched_find_first_bit(array->bitmap);
queue = array->queue + idx;
_
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] schedstats additions
2004-09-04 5:07 [PATCH] schedstats additions Nick Piggin
@ 2004-09-04 8:26 ` Rafael J. Wysocki
2004-09-04 9:04 ` Nick Piggin
2004-09-08 8:09 ` Rick Lindsley
1 sibling, 1 reply; 5+ messages in thread
From: Rafael J. Wysocki @ 2004-09-04 8:26 UTC (permalink / raw)
To: linux-kernel; +Cc: Nick Piggin, Rick Lindsley
Dnia Saturday 04 of September 2004 07:07, Nick Piggin napisał:
> Hi,
> I have a patch here to provide more useful statistics for me. Basically
> it moves a lot more of the balancing information into the domains instead
> of the runqueue, where it is nearly useless on multi-domain setups (eg.
> SMT+SMP, SMP+NUMA).
Which kernel version is it against?
RJW
--
For a successful technology, reality must take precedence over public
relations, for nature cannot be fooled.
-- Richard P. Feynman
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] schedstats additions
2004-09-04 8:26 ` Rafael J. Wysocki
@ 2004-09-04 9:04 ` Nick Piggin
0 siblings, 0 replies; 5+ messages in thread
From: Nick Piggin @ 2004-09-04 9:04 UTC (permalink / raw)
To: Rafael J. Wysocki; +Cc: linux-kernel, Rick Lindsley
Rafael J. Wysocki wrote:
> Dnia Saturday 04 of September 2004 07:07, Nick Piggin napisał:
>
>>Hi,
>>I have a patch here to provide more useful statistics for me. Basically
>>it moves a lot more of the balancing information into the domains instead
>>of the runqueue, where it is nearly useless on multi-domain setups (eg.
>>SMT+SMP, SMP+NUMA).
>
>
> Which kernel version is it against?
>
-mm3 ... oh yeah that has nicksched in it, sorry that would put a spanner
in the works.
I'll redo it to suit 2.6 if Rick acks it - the main info he needs is still
valid, that is the output format.
Thanks
Nick
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] schedstats additions
2004-09-04 5:07 [PATCH] schedstats additions Nick Piggin
2004-09-04 8:26 ` Rafael J. Wysocki
@ 2004-09-08 8:09 ` Rick Lindsley
2004-09-08 22:56 ` Nick Piggin
1 sibling, 1 reply; 5+ messages in thread
From: Rick Lindsley @ 2004-09-08 8:09 UTC (permalink / raw)
To: Nick Piggin; +Cc: linux-kernel
I have a patch here to provide more useful statistics for me. Basically
it moves a lot more of the balancing information into the domains instead
of the runqueue, where it is nearly useless on multi-domain setups (eg.
SMT+SMP, SMP+NUMA).
It requires a version number bump, but that isn't much of an issue because
I think we're about the only two using it at the moment. But your tools
will need a little bit of work.
What do you think?
The idea of moving some counters from runqueues to domains is fine in
general, but I've some questions about a couple of specific changes in
your patch.
It looks to me like there are some changes in try_to_wake_up() that
aren't schedstats related, although schedstats code is among some
that is moved around. Is there some code there that should be
broken out separately?
alb_cnt
by moving this, we won't get an accurate look at the number of
times we called active_load_balance and returned immediately
because nr_running had slipped to 0 or 1. how about we add
another counter to count that too, and/or change the name of
this one?
lb_balanced
are you sure lb_balanced[idle] can't be deduced from lb_cnt[idle]
and lb_failed[idle]?
ttwu_attempts
ttwu_moved
removing these makes it harder to determine how successful
try_to_wake_up() was at moving a process. What counters would
I use to get this information if these were removed?
ttwu_remote
ttwu_wake_remote
so what's the one line description of what these count now?
smt_cnt
sbe_cnt
how might I see how often sched_migrate_task() and sched_exec()
were called if these were deleted?
lb_pulled
Rather than add another counter here, would it be as effective
to make pt_gained a domain counter? Looks like you're collecting
the same information. pt_lost would have to remain a runqueue
counter, though, since losing a task has nothing to do with a
particular domain.
Rick
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] schedstats additions
2004-09-08 8:09 ` Rick Lindsley
@ 2004-09-08 22:56 ` Nick Piggin
0 siblings, 0 replies; 5+ messages in thread
From: Nick Piggin @ 2004-09-08 22:56 UTC (permalink / raw)
To: Rick Lindsley; +Cc: linux-kernel
Rick Lindsley wrote:
> I have a patch here to provide more useful statistics for me. Basically
> it moves a lot more of the balancing information into the domains instead
> of the runqueue, where it is nearly useless on multi-domain setups (eg.
> SMT+SMP, SMP+NUMA).
>
> It requires a version number bump, but that isn't much of an issue because
> I think we're about the only two using it at the moment. But your tools
> will need a little bit of work.
>
> What do you think?
>
> The idea of moving some counters from runqueues to domains is fine in
> general, but I've some questions about a couple of specific changes in
> your patch.
>
> It looks to me like there are some changes in try_to_wake_up() that
> aren't schedstats related, although schedstats code is among some
> that is moved around. Is there some code there that should be
> broken out separately?
>
There is, yes. I'll be sure to separate it.
> alb_cnt
> by moving this, we won't get an accurate look at the number of
> times we called active_load_balance and returned immediately
> because nr_running had slipped to 0 or 1. how about we add
> another counter to count that too, and/or change the name of
> this one?
>
OK.
> lb_balanced
> are you sure lb_balanced[idle] can't be deduced from lb_cnt[idle]
> and lb_failed[idle]?
>
I don't think so, because you also have the success case, which is
!balanced && !failed.
> ttwu_attempts
> ttwu_moved
> removing these makes it harder to determine how successful
> try_to_wake_up() was at moving a process. What counters would
> I use to get this information if these were removed?
>
ttwu_cnt in the rq stats, and ttwu_wake_affine / ttwu_wake_balance
in the domain stats.
> ttwu_remote
> ttwu_wake_remote
> so what's the one line description of what these count now?
>
ttwu_remote/ttwu_wake_remote are the number of times a runqueue has
woken a remote task / a remote task within that domain, respectively.
Regardless of whether or not it gets pulled onto the local CPU.
> smt_cnt
> sbe_cnt
> how might I see how often sched_migrate_task() and sched_exec()
> were called if these were deleted?
>
sbe_pushed should basically be the same as smt_cnt, barring rare
races with the cpus_allowed mask. I guess sbe_cnt doesn't have to
go.
> lb_pulled
> Rather than add another counter here, would it be as effective
> to make pt_gained a domain counter? Looks like you're collecting
Yeah removing the runqueue counters for these would be good.
> the same information. pt_lost would have to remain a runqueue
> counter, though, since losing a task has nothing to do with a
> particular domain.
Whatever domain that the pulling CPU was in, is also a fair candidate
for pt_lost. Remember, all the domains are per-CPU so any information
you can get from a per-runqueue counter you can also get from a domain
counter.
I'll make a few changes and give you another look. Thanks for the comments.
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2004-09-09 0:03 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-09-04 5:07 [PATCH] schedstats additions Nick Piggin
2004-09-04 8:26 ` Rafael J. Wysocki
2004-09-04 9:04 ` Nick Piggin
2004-09-08 8:09 ` Rick Lindsley
2004-09-08 22:56 ` Nick Piggin
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox