Re: [patch 2.6.16-rc5-mm2] sched_cleanup-V17 - task throttling patch 1 of 2

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Mike Galbraith <efault@gmx.de>
To: Peter Williams <pwil3058@bigpond.net.au>
Cc: lkml <linux-kernel@vger.kernel.org>,
	mingo@elte.hu, kernel@kolivas.org, nickpiggin@yahoo.com.au,
	"Chen, Kenneth W" <kenneth.w.chen@intel.com>,
	Andrew Morton <akpm@osdl.org>
Subject: Re: [patch 2.6.16-rc5-mm2]  sched_cleanup-V17 - task throttling patch 1 of 2
Date: Sat, 04 Mar 2006 11:53:40 +0100	[thread overview]
Message-ID: <1141469620.13593.10.camel@homer> (raw)
In-Reply-To: <1141449654.7703.36.camel@homer>

On Sat, 2006-03-04 at 06:21 +0100, Mike Galbraith wrote:

> I need to reconsider the nanosecond tracking a bit anyway.  I was too
> quick on the draw with the granularity change.  It doesn't do what the
> original does, and won't work at all when interrupts become tasks.
> 
> To do this properly, I need to maintain separate tick hit count (a.k.a.
> time_slice;) and run_time.  If I had slice_info in the first patch, I
> could store granularity there and not have to add anything to the task
> struct, but alas...

Ala the below.  Leave the ticked timeslice available for those places
where nanoseconds are useless, and do precise counting only where that
is useful.  I need to come up with a better name that slice_ticks, and
time_slice is already taken.

	-Mike

--- linux-2.6.16-rc5-mm2/include/linux/sched.h.org	2006-03-01 15:06:22.000000000 +0100
+++ linux-2.6.16-rc5-mm2/include/linux/sched.h	2006-03-02 08:33:12.000000000 +0100
@@ -720,7 +720,8 @@
 
 	unsigned long policy;
 	cpumask_t cpus_allowed;
-	unsigned int time_slice, first_time_slice;
+	int time_slice;
+	unsigned int slice_ticks, first_time_slice;
 
 #ifdef CONFIG_SCHEDSTATS
 	struct sched_info sched_info;
--- linux-2.6.16-rc5-mm2/kernel/sched.c.org	2006-03-01 15:05:56.000000000 +0100
+++ linux-2.6.16-rc5-mm2/kernel/sched.c	2006-03-02 10:05:47.000000000 +0100
@@ -99,6 +99,10 @@
 #define MAX_SLEEP_AVG		(DEF_TIMESLICE * MAX_BONUS)
 #define STARVATION_LIMIT	(MAX_SLEEP_AVG)
 #define NS_MAX_SLEEP_AVG	(JIFFIES_TO_NS(MAX_SLEEP_AVG))
+#define NS_MAX_SLEEP_AVG_PCNT	(NS_MAX_SLEEP_AVG / 100)
+#define PCNT_PER_DYNPRIO	(100 / MAX_BONUS)
+#define NS_PER_DYNPRIO		(PCNT_PER_DYNPRIO * NS_MAX_SLEEP_AVG_PCNT)
+#define NS_TICK			(JIFFIES_TO_NS(1))
 
 /*
  * If a task is 'interactive' then we reinsert it in the active
@@ -153,9 +157,25 @@
 #define TASK_INTERACTIVE(p) \
 	((p)->prio <= (p)->static_prio - DELTA(p))
 
-#define INTERACTIVE_SLEEP(p) \
-	(JIFFIES_TO_NS(MAX_SLEEP_AVG * \
-		(MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
+#define SLEEP_AVG_DIVISOR(p) (1 + CURRENT_BONUS(p))
+
+#define INTERACTIVE_SLEEP_AVG(p) \
+	(min(JIFFIES_TO_NS(MAX_SLEEP_AVG * (MAX_BONUS / 2 + DELTA(p)) / \
+	MAX_BONUS), NS_MAX_SLEEP_AVG))
+
+/*
+ * Returns whether a task has been asleep long enough to be considered idle.
+ * The metric is whether this quantity of sleep would promote the task more
+ * than one priority beyond marginally interactive.
+ */
+static int task_interactive_idle(task_t *p, unsigned long sleep_time)
+{
+	unsigned long ceiling = (CURRENT_BONUS(p) + 2) * NS_PER_DYNPRIO;
+
+	if (p->sleep_avg + sleep_time < ceiling)
+		return 0;
+	return p->sleep_avg + sleep_time >= INTERACTIVE_SLEEP_AVG(p);
+}
 
 #define TASK_PREEMPTS_CURR(p, rq) \
 	((p)->prio < (rq)->curr->prio)
@@ -182,7 +202,7 @@
 
 static inline unsigned int task_timeslice(task_t *p)
 {
-	return static_prio_timeslice(p->static_prio);
+	return JIFFIES_TO_NS(static_prio_timeslice(p->static_prio));
 }
 
 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran)	\
@@ -777,6 +797,9 @@
 	unsigned long long __sleep_time = now - p->timestamp;
 	unsigned long sleep_time;
 
+	if (unlikely(now < p->timestamp))
+		__sleep_time = 0ULL;
+
 	if (unlikely(p->policy == SCHED_BATCH))
 		sleep_time = 0;
 	else {
@@ -788,32 +811,32 @@
 
 	if (likely(sleep_time > 0)) {
 		/*
-		 * User tasks that sleep a long time are categorised as
-		 * idle. They will only have their sleep_avg increased to a
+		 * Tasks that sleep a long time are categorised as idle.
+		 * They will only have their sleep_avg increased to a
 		 * level that makes them just interactive priority to stay
 		 * active yet prevent them suddenly becoming cpu hogs and
 		 * starving other processes.
 		 */
-		if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) {
-				unsigned long ceiling;
-
-				ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG -
-					DEF_TIMESLICE);
-				if (p->sleep_avg < ceiling)
-					p->sleep_avg = ceiling;
-		} else {
+		if (task_interactive_idle(p, sleep_time)) {
+			unsigned long ceiling = INTERACTIVE_SLEEP_AVG(p);
 
 			/*
-			 * The lower the sleep avg a task has the more
-			 * rapidly it will rise with sleep time. This enables
-			 * tasks to rapidly recover to a low latency priority.
-			 * If a task was sleeping with the noninteractive
-			 * label do not apply this non-linear boost
+			 * Promote previously interactive task.
 			 */
-			if (p->sleep_type != SLEEP_NONINTERACTIVE || !p->mm)
-				sleep_time *=
-					(MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
+			if (p->sleep_avg > ceiling) {
+				ceiling = p->sleep_avg / NS_PER_DYNPRIO;
+				if (ceiling < MAX_BONUS)
+					ceiling++;
+				ceiling *= NS_PER_DYNPRIO;
+			} else {
+				ceiling += p->time_slice >> 2;
+				if (ceiling > NS_MAX_SLEEP_AVG)
+					ceiling = NS_MAX_SLEEP_AVG;
+			}
 
+			if (p->sleep_avg < ceiling)
+				p->sleep_avg = ceiling;
+		} else {
 			/*
 			 * This code gives a bonus to interactive tasks.
 			 *
@@ -1367,7 +1390,8 @@
 
 out_activate:
 #endif /* CONFIG_SMP */
-	if (old_state == TASK_UNINTERRUPTIBLE) {
+
+	if (old_state & TASK_UNINTERRUPTIBLE) {
 		rq->nr_uninterruptible--;
 		/*
 		 * Tasks waking from uninterruptible sleep are likely
@@ -1461,6 +1485,8 @@
 	 */
 	local_irq_disable();
 	p->time_slice = (current->time_slice + 1) >> 1;
+	if (unlikely(p->time_slice < NS_TICK))
+		p->time_slice = NS_TICK;
 	/*
 	 * The remainder of the first timeslice might be recovered by
 	 * the parent if the child exits early enough.
@@ -1468,13 +1494,12 @@
 	p->first_time_slice = 1;
 	current->time_slice >>= 1;
 	p->timestamp = sched_clock();
-	if (unlikely(!current->time_slice)) {
+	if (unlikely(current->time_slice < NS_TICK)) {
 		/*
 		 * This case is rare, it happens when the parent has only
 		 * a single jiffy left from its timeslice. Taking the
 		 * runqueue lock is not a problem.
 		 */
-		current->time_slice = 1;
 		scheduler_tick();
 	}
 	local_irq_enable();
@@ -2586,6 +2611,7 @@
 {
 	unsigned long long last = max(p->timestamp, rq->timestamp_last_tick);
 	p->sched_time += now - last;
+	p->time_slice -= now - last;
 }
 
 /*
@@ -2730,13 +2756,16 @@
 	 * timeslice. This makes it possible for interactive tasks
 	 * to use up their timeslices at their highest priority levels.
 	 */
+	if (p->slice_ticks > 1)
+		p->slice_ticks--;
 	if (rt_task(p)) {
 		/*
 		 * RR tasks need a special form of timeslice management.
 		 * FIFO tasks have no timeslices.
 		 */
-		if ((p->policy == SCHED_RR) && !--p->time_slice) {
-			p->time_slice = task_timeslice(p);
+		if ((p->policy == SCHED_RR) && p->time_slice < NS_TICK) {
+			p->time_slice += task_timeslice(p);
+			p->slice_ticks = NS_TO_JIFFIES(p->time_slice);
 			p->first_time_slice = 0;
 			set_tsk_need_resched(p);
 
@@ -2745,11 +2774,22 @@
 		}
 		goto out_unlock;
 	}
-	if (!--p->time_slice) {
+	if (p->time_slice < NS_TICK) {
+		unsigned int time_slice = task_timeslice(p);
+		int run_time = time_slice - p->time_slice;
 		dequeue_task(p, rq->active);
 		set_tsk_need_resched(p);
+		p->time_slice += time_slice;
+		p->slice_ticks = NS_TO_JIFFIES(p->time_slice);
+		/*
+		 * Tasks are charged proportionately less run_time at high
+		 * sleep_avg to delay them losing their interactive status
+		 */
+		run_time /= SLEEP_AVG_DIVISOR(p);
+		p->sleep_avg -= run_time;
+		if ((long)p->sleep_avg < 0)
+			p->sleep_avg = 0;
 		p->prio = effective_prio(p);
-		p->time_slice = task_timeslice(p);
 		p->first_time_slice = 0;
 
 		if (!rq->expired_timestamp)
@@ -2777,13 +2817,12 @@
 		 * This only applies to tasks in the interactive
 		 * delta range with at least TIMESLICE_GRANULARITY to requeue.
 		 */
-		if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
-			p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
-			(p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
+		if (TASK_INTERACTIVE(p) && !((NS_TO_JIFFIES(task_timeslice(p)) -
+			p->slice_ticks) % TIMESLICE_GRANULARITY(p)) &&
+			(p->slice_ticks >= TIMESLICE_GRANULARITY(p)) &&
 			(p->array == rq->active)) {
-
-			requeue_task(p, rq->active);
-			set_tsk_need_resched(p);
+				requeue_task(p, rq->active);
+				set_tsk_need_resched(p);
 		}
 	}
 out_unlock:
@@ -2851,7 +2890,7 @@
  */
 static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
 {
-	return p->time_slice * (100 - sd->per_cpu_gain) / 100;
+	return p->slice_ticks * (100 - sd->per_cpu_gain) / 100;
 }
 
 static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
@@ -3014,7 +3053,6 @@
 	prio_array_t *array;
 	struct list_head *queue;
 	unsigned long long now;
-	unsigned long run_time;
 	int cpu, idx, new_prio;
 
 	/*
@@ -3050,19 +3088,6 @@
 
 	schedstat_inc(rq, sched_cnt);
 	now = sched_clock();
-	if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
-		run_time = now - prev->timestamp;
-		if (unlikely((long long)(now - prev->timestamp) < 0))
-			run_time = 0;
-	} else
-		run_time = NS_MAX_SLEEP_AVG;
-
-	/*
-	 * Tasks charged proportionately less run_time at high sleep_avg to
-	 * delay them losing their interactive status
-	 */
-	run_time /= (CURRENT_BONUS(prev) ? : 1);
-
 	spin_lock_irq(&rq->lock);
 
 	if (unlikely(prev->flags & PF_DEAD))
@@ -3075,7 +3100,7 @@
 				unlikely(signal_pending(prev))))
 			prev->state = TASK_RUNNING;
 		else {
-			if (prev->state == TASK_UNINTERRUPTIBLE)
+			if (prev->state & TASK_UNINTERRUPTIBLE)
 				rq->nr_uninterruptible++;
 			deactivate_task(prev, rq);
 		}
@@ -3136,7 +3161,6 @@
 		if (next->sleep_type == SLEEP_INTERACTIVE)
 			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
 
-		array = next->array;
 		new_prio = recalc_task_prio(next, next->timestamp + delta);
 
 		if (unlikely(next->prio != new_prio)) {
@@ -3156,9 +3180,6 @@
 
 	update_cpu_clock(prev, rq, now);
 
-	prev->sleep_avg -= run_time;
-	if ((long)prev->sleep_avg <= 0)
-		prev->sleep_avg = 0;
 	prev->timestamp = prev->last_ran = now;
 
 	sched_info_switch(prev, next);

next prev parent reply	other threads:[~2006-03-04 10:51 UTC|newest]

Thread overview: 32+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-02-17 13:45 [patch 2.6.16-rc3-mm1] Task Throttling V9 MIke Galbraith
2006-02-24 20:29 ` [patch 2.6.16-rc4-mm1] Task Throttling V14 MIke Galbraith
2006-02-24 22:15   ` Andrew Morton
2006-02-25  1:16     ` Peter Williams
2006-02-25  2:20       ` MIke Galbraith
2006-02-25  2:42       ` Nick Piggin
2006-02-25  2:57         ` Con Kolivas
2006-02-25  3:08           ` Nick Piggin
2006-02-25  3:35             ` MIke Galbraith
2006-02-25  2:23     ` MIke Galbraith
2006-03-03 10:43       ` [patch 2.6.16-rc5-mm2] sched_cleanup-V17 - task throttling patch 1 of 2 Mike Galbraith
2006-03-03 10:58         ` [patch 2.6.16-rc5-mm2] sched_throttle-V17 - task throttling patch 2 " Mike Galbraith
2006-03-03 23:58         ` [patch 2.6.16-rc5-mm2] sched_cleanup-V17 - task throttling patch 1 " Peter Williams
2006-03-04  4:54           ` Mike Galbraith
2006-03-04 21:37             ` Peter Williams
2006-03-05  4:53               ` Mike Galbraith
2006-03-05  6:54               ` Mike Galbraith
2006-03-04  2:33         ` Peter Williams
2006-03-04  5:20           ` Mike Galbraith
2006-03-04  5:24             ` Con Kolivas
2006-03-04  5:29               ` Mike Galbraith
2006-03-04  5:40                 ` Randy.Dunlap
2006-03-04  5:54                   ` Con Kolivas
2006-03-04  6:05                     ` Randy.Dunlap
2006-03-04  6:50                     ` Mike Galbraith
2006-03-04  6:50                       ` Con Kolivas
2006-03-04  7:04                         ` Mike Galbraith
2006-03-05 22:29                         ` Peter Williams
2006-03-04 21:44                       ` Peter Williams
2006-03-04 10:53             ` Mike Galbraith [this message]
2006-02-26 11:26   ` [patch 2.6.16-rc4-mm1] Task Throttling V14 Daniel K.
2006-02-26 13:19     ` MIke Galbraith

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1141469620.13593.10.camel@homer \
    --to=efault@gmx.de \
    --cc=akpm@osdl.org \
    --cc=kenneth.w.chen@intel.com \
    --cc=kernel@kolivas.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=nickpiggin@yahoo.com.au \
    --cc=pwil3058@bigpond.net.au \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.