public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Mike Galbraith <efault@gmx.de>
To: Ingo Molnar <mingo@elte.hu>, Andrew Morton <akpm@osdl.org>
Cc: lkml <linux-kernel@vger.kernel.org>,
	Peter Williams <pwil3058@bigpond.net.au>,
	Nick Piggin <nickpiggin@yahoo.com.au>,
	Con Kolivas <kernel@kolivas.org>
Subject: Re: [patch 2.6.16-mm2 6/9] sched throttle tree extract - move division to slow path
Date: Sat, 01 Apr 2006 11:12:11 +0200	[thread overview]
Message-ID: <1143882731.7617.55.camel@homer> (raw)
In-Reply-To: <1143881983.7617.41.camel@homer>

This patch moves division of runtime from the fast path into the slow
path.  In doing so, it converts the task's time slice into nanoseconds,
which also disallows high frequency scheduling tasks from stealing time
from their neighbors by ducking the timer interrupt. (two birds, one
stone)

slice_time_ns is identical in function to time_slice, but conversion of
everything that referred to time_slice to nanoseconds just made a mess.
Ergo, I close the lesser of two evils, and duplicated what was necessary
to allow use of nanoseconds only where needed.

Signed-off-by: Mike Galbraith <efault@gmx.de>

--- linux-2.6.16-mm2/include/linux/sched.h.org	2006-03-31 11:25:02.000000000 +0200
+++ linux-2.6.16-mm2/include/linux/sched.h	2006-03-31 11:27:32.000000000 +0200
@@ -744,6 +744,7 @@ struct task_struct {
 	unsigned long policy;
 	cpumask_t cpus_allowed;
 	unsigned int time_slice, first_time_slice;
+	long slice_time_ns;
 
 #ifdef CONFIG_SCHEDSTATS
 	struct sched_info sched_info;
--- linux-2.6.16-mm2/kernel/sched.c-5.interactive_idle	2006-04-01 09:12:51.000000000 +0200
+++ linux-2.6.16-mm2/kernel/sched.c	2006-03-31 13:34:12.000000000 +0200
@@ -102,6 +102,7 @@
 #define NS_MAX_SLEEP_AVG_PCNT	(NS_MAX_SLEEP_AVG / 100)
 #define PCNT_PER_DYNPRIO	(100 / MAX_BONUS)
 #define NS_PER_DYNPRIO		(PCNT_PER_DYNPRIO * NS_MAX_SLEEP_AVG_PCNT)
+#define NS_TICK			(1000000000 / HZ)
 
 /*
  * If a task is 'interactive' then we reinsert it in the active
@@ -156,6 +157,8 @@
 #define TASK_INTERACTIVE(p) \
 	((p)->prio <= (p)->static_prio - DELTA(p))
 
+#define SLEEP_AVG_DIVISOR(p) (1 + CURRENT_BONUS(p))
+
 #define INTERACTIVE_SLEEP_AVG(p) \
 	(min(JIFFIES_TO_NS(MAX_SLEEP_AVG * (MAX_BONUS / 2 + DELTA(p)) / \
 	MAX_BONUS), NS_MAX_SLEEP_AVG))
@@ -202,6 +205,11 @@ static inline unsigned int task_timeslic
 	return static_prio_timeslice(p->static_prio);
 }
 
+static inline unsigned int task_timeslice_ns(task_t *p)
+{
+	return static_prio_timeslice(p->static_prio) * NS_TICK;
+}
+
 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran)	\
 				< (long long) (sd)->cache_hot_time)
 
@@ -1563,21 +1571,23 @@ void fastcall sched_fork(task_t *p, int 
 	 * resulting in more scheduling fairness.
 	 */
 	local_irq_disable();
-	p->time_slice = (current->time_slice + 1) >> 1;
+	p->slice_time_ns = current->slice_time_ns >> 1;
+	if (unlikely(p->slice_time_ns < NS_TICK))
+		p->slice_time_ns = NS_TICK;
+	p->time_slice = NS_TO_JIFFIES(p->slice_time_ns);
 	/*
 	 * The remainder of the first timeslice might be recovered by
 	 * the parent if the child exits early enough.
 	 */
 	p->first_time_slice = 1;
-	current->time_slice >>= 1;
 	p->timestamp = sched_clock();
-	if (unlikely(!current->time_slice)) {
+	current->slice_time_ns >>= 1;
+	if (unlikely(current->slice_time_ns < NS_TICK)) {
 		/*
 		 * This case is rare, it happens when the parent has only
 		 * a single jiffy left from its timeslice. Taking the
 		 * runqueue lock is not a problem.
 		 */
-		current->time_slice = 1;
 		scheduler_tick();
 	}
 	local_irq_enable();
@@ -1686,9 +1696,9 @@ void fastcall sched_exit(task_t *p)
 	 */
 	rq = task_rq_lock(p->parent, &flags);
 	if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
-		p->parent->time_slice += p->time_slice;
-		if (unlikely(p->parent->time_slice > task_timeslice(p)))
-			p->parent->time_slice = task_timeslice(p);
+		p->parent->slice_time_ns += p->slice_time_ns;
+		if (unlikely(p->parent->slice_time_ns > task_timeslice_ns(p)))
+			p->parent->slice_time_ns = task_timeslice_ns(p);
 	}
 	if (p->sleep_avg < p->parent->sleep_avg)
 		p->parent->sleep_avg = p->parent->sleep_avg /
@@ -2709,7 +2719,9 @@ static inline void update_cpu_clock(task
 				    unsigned long long now)
 {
 	unsigned long long last = max(p->timestamp, rq->timestamp_last_tick);
-	p->sched_time += now - last;
+	long run_time = now - last;
+	p->sched_time += run_time;
+	p->slice_time_ns -= run_time;
 }
 
 /*
@@ -2837,13 +2849,21 @@ void scheduler_tick(void)
 	 * priority until it either goes to sleep or uses up its
 	 * timeslice. This makes it possible for interactive tasks
 	 * to use up their timeslices at their highest priority levels.
-	 */
+	 *
+	 * We don't want to update every task at each tick, so we
+	 * update a task's time_slice according to how much cpu
+	 * time it has received since it was last ticked.
+	 */
+	if (p->slice_time_ns < NS_TICK)
+		p->time_slice = 0;
+	else p->time_slice = NS_TO_JIFFIES(p->slice_time_ns);
 	if (rt_task(p)) {
 		/*
 		 * RR tasks need a special form of timeslice management.
 		 * FIFO tasks have no timeslices.
 		 */
-		if ((p->policy == SCHED_RR) && !--p->time_slice) {
+		if ((p->policy == SCHED_RR) && !p->time_slice) {
+			p->slice_time_ns = task_timeslice_ns(p);
 			p->time_slice = task_timeslice(p);
 			p->first_time_slice = 0;
 			set_tsk_need_resched(p);
@@ -2853,11 +2873,22 @@ void scheduler_tick(void)
 		}
 		goto out_unlock;
 	}
-	if (!--p->time_slice) {
+	if (!p->time_slice) {
+		long slice_time_ns = task_timeslice_ns(p);
+		long run_time = (-1 * p->slice_time_ns) + slice_time_ns;
 		dequeue_task(p, rq->active);
 		set_tsk_need_resched(p);
-		p->prio = effective_prio(p);
+		p->slice_time_ns = slice_time_ns;
 		p->time_slice = task_timeslice(p);
+		/*
+		 * Tasks are charged proportionately less run_time at high
+		 * sleep_avg to delay them losing their interactive status
+		 */
+		run_time /= SLEEP_AVG_DIVISOR(p);
+		if (p->sleep_avg >= run_time)
+			p->sleep_avg -= run_time;
+		else p->sleep_avg = 0;
+		p->prio = effective_prio(p);
 		p->first_time_slice = 0;
 
 		if (!rq->expired_timestamp)
@@ -3122,7 +3153,6 @@ asmlinkage void __sched schedule(void)
 	prio_array_t *array;
 	struct list_head *queue;
 	unsigned long long now;
-	unsigned long run_time;
 	int cpu, idx, new_prio;
 
 	/*
@@ -3156,19 +3186,6 @@ need_resched_nonpreemptible:
 
 	schedstat_inc(rq, sched_cnt);
 	now = sched_clock();
-	if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
-		run_time = now - prev->timestamp;
-		if (unlikely((long long)(now - prev->timestamp) < 0))
-			run_time = 0;
-	} else
-		run_time = NS_MAX_SLEEP_AVG;
-
-	/*
-	 * Tasks charged proportionately less run_time at high sleep_avg to
-	 * delay them losing their interactive status
-	 */
-	run_time /= (CURRENT_BONUS(prev) ? : 1);
-
 	spin_lock_irq(&rq->lock);
 
 	if (unlikely(prev->flags & PF_DEAD))
@@ -3242,7 +3259,6 @@ go_idle:
 		if (next->sleep_type == SLEEP_INTERACTIVE)
 			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
 
-		array = next->array;
 		new_prio = recalc_task_prio(next, next->timestamp + delta);
 
 		if (unlikely(next->prio != new_prio)) {
@@ -3262,9 +3278,6 @@ switch_tasks:
 
 	update_cpu_clock(prev, rq, now);
 
-	prev->sleep_avg -= run_time;
-	if ((long)prev->sleep_avg <= 0)
-		prev->sleep_avg = 0;
 	prev->timestamp = prev->last_ran = now;
 
 	sched_info_switch(prev, next);



  reply	other threads:[~2006-04-01  9:11 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-04-01  8:28 [patch 2.6.16-mm2 0/9] sched throttle tree extract Mike Galbraith
2006-04-01  8:33 ` [patch 2.6.16-mm2 1/9] sched throttle tree extract - ignore invalid timestamps Mike Galbraith
2006-04-01  8:38   ` [patch 2.6.16-mm2 2/9] sched throttle tree extract - fix potential task uninterruptible bug Mike Galbraith
2006-04-01  8:44     ` [patch 2.6.16-mm2 3/9] sched throttle tree extract - remove IO priority barrier Mike Galbraith
2006-04-01  8:51       ` [patch 2.6.16-mm2 4/9] sched throttle tree extract - remove kthread barrier Mike Galbraith
2006-04-01  8:59         ` [patch 2.6.16-mm2 5/9] sched throttle tree extract - correct idle sleep logic Mike Galbraith
2006-04-01  9:12           ` Mike Galbraith [this message]
2006-04-01  9:23             ` [patch 2.6.16-mm2 7/9] sched throttle tree extract - implement throttling Mike Galbraith
2006-04-01  9:26               ` [patch 2.6.16-mm2 8/9] sched throttle tree extract - maximize timeslice accounting Mike Galbraith
2006-04-01  9:31                 ` [patch 2.6.16-mm2 9/9] sched throttle tree extract - export tunables Mike Galbraith
2006-04-05 17:38                   ` [patch 2.6.16-mm2 10/9] sched throttle tree extract - kill interactive task feedback loop Mike Galbraith
2006-04-05 23:15                     ` Con Kolivas
2006-04-06  4:10                       ` Mike Galbraith
2006-04-06  4:29                         ` Con Kolivas
2006-04-01 16:53           ` [patch 2.6.16-mm2 5/9] sched throttle tree extract - correct idle sleep logic Lee Revell
2006-04-01 18:00             ` Mike Galbraith

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1143882731.7617.55.camel@homer \
    --to=efault@gmx.de \
    --cc=akpm@osdl.org \
    --cc=kernel@kolivas.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=nickpiggin@yahoo.com.au \
    --cc=pwil3058@bigpond.net.au \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox