All of lore.kernel.org
 help / color / mirror / Atom feed
From: MIke Galbraith <efault@gmx.de>
To: Lee Revell <rlrevell@joe-job.com>
Cc: Jan Engelhardt <jengelh@linux01.gwdg.de>,
	Con Kolivas <kernel@kolivas.org>,
	gcoady@gmail.com, linux-kernel@vger.kernel.org,
	Ingo Molnar <mingo@elte.hu>
Subject: Re: 2.6 vs 2.4, ssh terminal slowdown
Date: Sun, 12 Feb 2006 22:36:33 +0100	[thread overview]
Message-ID: <1139780193.7837.7.camel@homer> (raw)
In-Reply-To: <1139771016.19342.253.camel@mindpipe>

On Sun, 2006-02-12 at 14:03 -0500, Lee Revell wrote:
> On Sun, 2006-02-12 at 14:47 +0100, MIke Galbraith wrote:
> > If you think it's the scheduler, how about try the patch below.  It's
> > against 2.6.16-rc2-mm1, and should tell you if it is the interactivity
> > logic in the scheduler or not.  I don't see other candidates in there,
> > not that that means there aren't any of course. 
> 
> I'll try, but it's a serious pain for me to build an -mm kernel.  A
> patch against 2.6.16-rc1 would be much easier.

Ok, here she comes.  It's a bit too reluctant to release a task so it
can reach interactive status at the moment, but for this test, that's a
feature. In fact, for this test, it's probably best to jump straight to
setting both g1 and g2 to zero.

	-Mike

--- linux-2.6.16-rc1/include/linux/sched.h.org	2006-02-12 21:28:28.000000000 +0100
+++ linux-2.6.16-rc1/include/linux/sched.h	2006-02-12 21:54:40.000000000 +0100
@@ -688,6 +688,13 @@
 struct audit_context;		/* See audit.c */
 struct mempolicy;
 
+enum sleep_type {
+	SLEEP_NORMAL,
+	SLEEP_NONINTERACTIVE,
+	SLEEP_INTERACTIVE,
+	SLEEP_INTERRUPTED,
+};
+
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	struct thread_info *thread_info;
@@ -709,14 +716,14 @@
 
 	unsigned short ioprio;
 
-	unsigned long sleep_avg;
+	unsigned long sleep_avg, last_slice, throttle_stamp;
 	unsigned long long timestamp, last_ran;
 	unsigned long long sched_time; /* sched_clock time spent running */
-	int activated;
+	enum sleep_type sleep_type;
 
 	unsigned long policy;
 	cpumask_t cpus_allowed;
-	unsigned int time_slice, first_time_slice;
+	unsigned int time_slice, slice_info;
 
 #ifdef CONFIG_SCHEDSTATS
 	struct sched_info sched_info;
--- linux-2.6.16-rc1/include/linux/sysctl.h.org	2006-02-12 21:28:44.000000000 +0100
+++ linux-2.6.16-rc1/include/linux/sysctl.h	2006-02-12 21:34:46.000000000 +0100
@@ -146,6 +146,8 @@
 	KERN_RANDOMIZE=68, /* int: randomize virtual address space */
 	KERN_SETUID_DUMPABLE=69, /* int: behaviour of dumps for setuid core */
 	KERN_SPIN_RETRY=70,	/* int: number of spinlock retries */
+	KERN_SCHED_THROTTLE1=71,  /* int: throttling grace period 1 in secs */
+	KERN_SCHED_THROTTLE2=72,  /* int: throttling grace period 2 in secs */
 };
 
 
--- linux-2.6.16-rc1/kernel/sched.c.org	2006-02-12 21:29:13.000000000 +0100
+++ linux-2.6.16-rc1/kernel/sched.c	2006-02-12 21:58:14.000000000 +0100
@@ -149,9 +149,195 @@
 #define TASK_INTERACTIVE(p) \
 	((p)->prio <= (p)->static_prio - DELTA(p))
 
-#define INTERACTIVE_SLEEP(p) \
-	(JIFFIES_TO_NS(MAX_SLEEP_AVG * \
-		(MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
+/*
+ * Interactivity boost can lead to serious starvation problems if the
+ * task being boosted turns out to be a cpu hog.  To combat this, we
+ * compute a running slice_avg, which is the sane upper limit for the
+ * task's sleep_avg.  If an 'interactive' task begins burning cpu, its
+ * slice_avg will decay, making it visible as a problem so corrective
+ * measures can be applied.
+ *
+ * /proc/sys/kernel tunables.
+ *
+ * sched_g1: Grace period in seconds that a task is allowed to run unchecked.
+ * sched_g2: seconds thereafter, to force a priority adjustment.
+ */
+
+int sched_g1 = 20;
+int sched_g2 = 10;
+
+/*
+ * Offset from the time we noticed a potential problem until we disable the
+ * interactive bonus multiplier, and adjust sleep_avg consumption rate.
+ */
+#define G1 (sched_g1 * HZ)
+
+/*
+ * Offset thereafter that we disable the interactive bonus divisor, and adjust
+ * a runaway task's priority.
+ */
+#define G2 (sched_g2 * HZ + G1)
+
+/*
+ * Grace period has expired.
+ */
+#define grace_expired(p, grace) ((p)->throttle_stamp && \
+	time_after_eq(jiffies, (p)->throttle_stamp + (grace)))
+
+#define NEXT_PRIO (NS_MAX_SLEEP_AVG / MAX_BONUS)
+
+/*
+ * Warning: do not reduce threshold below NS_MAX_SLEEP_AVG / MAX_BONUS
+ * else you may break the case where one of a pair of communicating tasks
+ * only sleeps a minuscule amount of time, but must be able to preempt
+ * its partner in order to get any cpu time to speak of.  If you push that
+ * task to the same level or below its partner, it will not be able to
+ * preempt and will starve.  This scenario was fixed for bonus calculation
+ * by converting sleep_avg to ns.
+ */
+#define THROTTLE_THRESHOLD (NEXT_PRIO)
+
+#define NS_MAX_SLEEP_AVG_PCNT (NS_MAX_SLEEP_AVG / 100)
+
+/*
+ * Masks for p->slice_info, formerly p->first_time_slice.
+ * SLICE_FTS:   0x80000000  Task is in its first ever timeslice.
+ * SLICE_NEW:   0x40000000  Slice refreshed.
+ * SLICE_SPA:   0x3FFE0000  Spare bits.
+ * SLICE_LTS:   0x0001FF80  Last time slice
+ * SLICE_AVG:   0x0000007F  Task slice_avg stored as percentage.
+ */
+#define SLICE_AVG_BITS    7
+#define SLICE_LTS_BITS   10
+#define SLICE_SPA_BITS   13
+#define SLICE_NEW_BITS    1
+#define SLICE_FTS_BITS    1
+
+#define SLICE_AVG_SHIFT   0
+#define SLICE_LTS_SHIFT   (SLICE_AVG_SHIFT + SLICE_AVG_BITS)
+#define SLICE_SPA_SHIFT   (SLICE_LTS_SHIFT + SLICE_LTS_BITS) 
+#define SLICE_NEW_SHIFT   (SLICE_SPA_SHIFT + SLICE_SPA_BITS)
+#define SLICE_FTS_SHIFT   (SLICE_NEW_SHIFT + SLICE_NEW_BITS)
+
+#define INFO_MASK(x)      ((1U << (x))-1)
+#define SLICE_AVG_MASK    (INFO_MASK(SLICE_AVG_BITS) << SLICE_AVG_SHIFT)
+#define SLICE_LTS_MASK    (INFO_MASK(SLICE_LTS_BITS) << SLICE_LTS_SHIFT)
+#define SLICE_SPA_MASK    (INFO_MASK(SLICE_SPA_BITS) << SLICE_SPA_SHIFT)
+#define SLICE_NEW_MASK    (INFO_MASK(SLICE_NEW_BITS) << SLICE_NEW_SHIFT)
+#define SLICE_FTS_MASK    (INFO_MASK(SLICE_FTS_BITS) << SLICE_FTS_SHIFT)
+
+#define first_time_slice(p) ((p)->slice_info & SLICE_FTS_MASK)
+#define set_first_time_slice(p) ((p)->slice_info |= SLICE_FTS_MASK)
+#define clr_first_time_slice(p) ((p)->slice_info &= ~SLICE_FTS_MASK)
+
+#define slice_is_new(p) ((p)->slice_info & SLICE_NEW_MASK)
+#define set_slice_is_new(p) ((p)->slice_info |= SLICE_NEW_MASK)
+#define clr_slice_is_new(p) ((p)->slice_info &= ~SLICE_NEW_MASK)
+
+#define last_slice(p) \
+	((((p)->slice_info & SLICE_LTS_MASK) >> SLICE_LTS_SHIFT) ? : \
+	DEF_TIMESLICE)
+#define set_last_slice(p, n) ((p)->slice_info = (((p)->slice_info & \
+	~SLICE_LTS_MASK) | (((n) << SLICE_LTS_SHIFT) & SLICE_LTS_MASK))) 
+
+#define slice_avg(p) \
+	((((p)->slice_info & SLICE_AVG_MASK) >> SLICE_AVG_SHIFT) * \
+	NS_MAX_SLEEP_AVG_PCNT)
+#define set_slice_avg(p, n) ((p)->slice_info = (((p)->slice_info & \
+	~SLICE_AVG_MASK) | ((((n) / NS_MAX_SLEEP_AVG_PCNT) \
+	<< SLICE_AVG_SHIFT) & SLICE_AVG_MASK))) 
+#define slice_avg_raw(p)  \
+	(((p)->slice_info & SLICE_AVG_MASK) >> SLICE_AVG_SHIFT)
+#define set_slice_avg_raw(p, n) ((p)->slice_info = (((p)->slice_info & \
+	~SLICE_AVG_MASK) | (((n) << SLICE_AVG_SHIFT) & SLICE_AVG_MASK))) 
+
+#define cpu_avg(p) \
+	(100 - slice_avg_raw(p))
+
+#define slice_time_avg(p) \
+	(100 * last_slice(p) / max((unsigned)cpu_avg(p), 1U))
+
+#define time_this_slice(p) \
+	(jiffies - (p)->last_slice)
+
+#define cpu_this_slice(p) \
+	(100 * last_slice(p) / max((unsigned)time_this_slice(p), \
+	(unsigned)last_slice(p)))
+
+#define this_slice_avg(p) \
+	((100 - cpu_this_slice(p)) * NS_MAX_SLEEP_AVG_PCNT)
+
+/*
+ * In order to prevent tasks from thrashing between domesticated livestock
+ * and irate rhino, once a throttle is hung on a task, the only way to get
+ * rid of it is to change behavior.  We push the throttle stamp forward in
+ * time as things improve until the stamp is in the future.  Only then may
+ * we safely pull our 'tranquilizer dart'. 
+ */
+#define conditional_tag(p) ((!(p)->throttle_stamp && 			\
+	(p)->sleep_avg > slice_avg(p) + THROTTLE_THRESHOLD) ?		\
+({									\
+	((p)->throttle_stamp = jiffies) ? : 1;				\
+}) : 0)
+
+/*
+ * Those who use the least cpu receive the most encouragement.
+ */
+#define SLICE_AVG_MULTIPLIER(p) \
+	(1 + NS_TO_JIFFIES(this_slice_avg(p)) * MAX_BONUS / MAX_SLEEP_AVG)
+
+#define conditional_release(p) (((p)->throttle_stamp &&			\
+	(p)->sched_time >= (G2 ? JIFFIES_TO_NS(HZ) : ~0ULL) &&		\
+	((20 + cpu_this_slice(p) < cpu_avg(p) && (p)->sleep_avg < 	\
+	slice_avg(p) + THROTTLE_THRESHOLD) || cpu_avg(p) <= 5)) ?	\
+({									\
+	int __ret = 0;							\
+	int delay = slice_time_avg(p) - last_slice(p);			\
+	if (delay > 0) {						\
+		delay *= SLICE_AVG_MULTIPLIER(p);			\
+		(p)->throttle_stamp += delay;				\
+	}								\
+	if (time_before(jiffies, (p)->throttle_stamp)) {		\
+		(p)->throttle_stamp = 0;				\
+		__ret++;						\
+		if (!((p)->state & TASK_NONINTERACTIVE))		\
+			(p)->sleep_type = SLEEP_NORMAL;			\
+	}								\
+	__ret;								\
+}) : 0)
+
+/*
+ * CURRENT_BONUS(p) adjusted to match slice_avg after grace expiration.
+ */
+#define ADJUSTED_BONUS(p, grace)				\
+({								\
+	unsigned long sleep_avg = (p)->sleep_avg;		\
+	if (grace_expired(p, (grace)))				\
+		sleep_avg = min((unsigned long)(p)->sleep_avg,	\
+		(unsigned long)slice_avg(p));			\
+	NS_TO_JIFFIES(sleep_avg) * MAX_BONUS / MAX_SLEEP_AVG;	\
+})
+
+#define BONUS_MULTIPLIER(p) \
+	(grace_expired(p, G1) ? : SLICE_AVG_MULTIPLIER(p))
+
+#define BONUS_DIVISOR(p) \
+	(grace_expired(p, G2) ? : (1 + ADJUSTED_BONUS(p, G1)))
+
+#define INTERACTIVE_SLEEP_AVG(p) \
+	(min(JIFFIES_TO_NS(MAX_SLEEP_AVG * (MAX_BONUS / 2 + DELTA(p)) / MAX_BONUS), \
+	NS_MAX_SLEEP_AVG))
+
+/*
+ * The quantity of sleep guaranteed to elevate a task to interactive status,
+ * or if already there, to elevate it to the next priority or beyond.
+ */
+#define INTERACTIVE_SLEEP_NS(p, ns) \
+	(BONUS_MULTIPLIER(p) * (ns) >= INTERACTIVE_SLEEP_AVG(p)	|| \
+	((p)->sleep_avg < INTERACTIVE_SLEEP_AVG(p) && BONUS_MULTIPLIER(p) * \
+	(ns) + (p)->sleep_avg >= INTERACTIVE_SLEEP_AVG(p))      || \
+	((p)->sleep_avg >= INTERACTIVE_SLEEP_AVG(p) && BONUS_MULTIPLIER(p) * \
+	(ns) + ((p)->sleep_avg % NEXT_PRIO) >= NEXT_PRIO))
 
 #define TASK_PREEMPTS_CURR(p, rq) \
 	((p)->prio < (rq)->curr->prio)
@@ -659,7 +845,7 @@
 	if (rt_task(p))
 		return p->prio;
 
-	bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
+	bonus = ADJUSTED_BONUS(p, G2) - MAX_BONUS / 2;
 
 	prio = p->static_prio - bonus;
 	if (prio < MAX_RT_PRIO)
@@ -759,36 +945,50 @@
 
 	if (likely(sleep_time > 0)) {
 		/*
-		 * User tasks that sleep a long time are categorised as
-		 * idle and will get just interactive status to stay active &
-		 * prevent them suddenly becoming cpu hogs and starving
-		 * other processes.
+		 * Tasks that sleep a long time are categorised as idle.
+		 * They will only have their sleep_avg increased to a
+		 * level that makes them just interactive priority to stay
+		 * active yet prevent them suddenly becoming cpu hogs and
+		 * starving other processes.  All tasks must stop at each
+		 * TASK_INTERACTIVE boundary before moving on so that no
+		 * single sleep slams it straight into NS_MAX_SLEEP_AVG.
 		 */
-		if (p->mm && p->activated != -1 &&
-			sleep_time > INTERACTIVE_SLEEP(p)) {
-				p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
-						DEF_TIMESLICE);
+		if (INTERACTIVE_SLEEP_NS(p, sleep_time)) {
+			int ticks = last_slice(p) / BONUS_DIVISOR(p);
+			unsigned long ceiling = INTERACTIVE_SLEEP_AVG(p);
+
+			ticks = JIFFIES_TO_NS(ticks);
+
+			if (grace_expired(p, G2) && slice_avg(p) < ceiling)
+				ceiling = slice_avg(p);
+			/* Promote previously interactive task. */
+			else if (p->sleep_avg >= INTERACTIVE_SLEEP_AVG(p) &&
+					!grace_expired(p, G2)) {
+
+				ceiling = p->sleep_avg / NEXT_PRIO;
+				if (ceiling < MAX_BONUS)
+					ceiling++;
+				ceiling *= NEXT_PRIO;
+			}
+
+			ceiling += ticks;
+
+			if (ceiling > NS_MAX_SLEEP_AVG)
+				ceiling = NS_MAX_SLEEP_AVG;
+
+			if (p->sleep_avg < ceiling)
+				p->sleep_avg = ceiling;
 		} else {
-			/*
-			 * The lower the sleep avg a task has the more
-			 * rapidly it will rise with sleep time.
-			 */
-			sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
 
 			/*
-			 * Tasks waking from uninterruptible sleep are
-			 * limited in their sleep_avg rise as they
-			 * are likely to be waiting on I/O
+			 * The lower the sleep avg a task has the more
+			 * rapidly it will rise with sleep time. This enables
+			 * tasks to rapidly recover to a low latency priority.
+			 * If a task was sleeping with the noninteractive
+			 * label do not apply this non-linear boost
 			 */
-			if (p->activated == -1 && p->mm) {
-				if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
-					sleep_time = 0;
-				else if (p->sleep_avg + sleep_time >=
-						INTERACTIVE_SLEEP(p)) {
-					p->sleep_avg = INTERACTIVE_SLEEP(p);
-					sleep_time = 0;
-				}
-			}
+			if (p->sleep_type != SLEEP_NONINTERACTIVE)
+				sleep_time *= BONUS_MULTIPLIER(p);
 
 			/*
 			 * This code gives a bonus to interactive tasks.
@@ -835,7 +1035,7 @@
 	 * This checks to make sure it's not an uninterruptible task
 	 * that is now waking up.
 	 */
-	if (!p->activated) {
+	if (p->sleep_type != SLEEP_NONINTERACTIVE) {
 		/*
 		 * Tasks which were woken up by interrupts (ie. hw events)
 		 * are most likely of interactive nature. So we give them
@@ -844,13 +1044,13 @@
 		 * on a CPU, first time around:
 		 */
 		if (in_interrupt())
-			p->activated = 2;
+			p->sleep_type = SLEEP_INTERRUPTED;
 		else {
 			/*
 			 * Normal first-time wakeups get a credit too for
 			 * on-runqueue time, but it will be weighted down:
 			 */
-			p->activated = 1;
+			p->sleep_type = SLEEP_INTERACTIVE;
 		}
 	}
 	p->timestamp = now;
@@ -1371,25 +1571,28 @@
 
 out_activate:
 #endif /* CONFIG_SMP */
-	if (old_state == TASK_UNINTERRUPTIBLE) {
-		rq->nr_uninterruptible--;
+
+	conditional_release(p);
+
+	if (old_state & TASK_UNINTERRUPTIBLE) {
 		/*
-		 * Tasks on involuntary sleep don't earn
-		 * sleep_avg beyond just interactive state.
+		 * Tasks waking from uninterruptible sleep are likely
+		 * to be sleeping involuntarily on I/O and are otherwise
+		 * cpu bound so label them as noninteractive.
 		 */
-		p->activated = -1;
-	}
+		p->sleep_type = SLEEP_NONINTERACTIVE;
+	} else
 
 	/*
 	 * Tasks that have marked their sleep as noninteractive get
-	 * woken up without updating their sleep average. (i.e. their
-	 * sleep is handled in a priority-neutral manner, no priority
-	 * boost and no penalty.)
+	 * woken up with their sleep average not weighted in an
+	 * interactive way.
 	 */
-	if (old_state & TASK_NONINTERACTIVE)
-		__activate_task(p, rq);
-	else
-		activate_task(p, rq, cpu == this_cpu);
+		if (old_state & TASK_NONINTERACTIVE)
+			p->sleep_type = SLEEP_NONINTERACTIVE;
+
+
+	activate_task(p, rq, cpu == this_cpu);
 	/*
 	 * Sync wakeups (i.e. those types of wakeups where the waker
 	 * has indicated that it will leave the CPU in short order)
@@ -1471,9 +1674,27 @@
 	 * The remainder of the first timeslice might be recovered by
 	 * the parent if the child exits early enough.
 	 */
-	p->first_time_slice = 1;
+	set_first_time_slice(p);
 	current->time_slice >>= 1;
 	p->timestamp = sched_clock();
+
+	/*
+	 * Set up slice_info for the child.
+	 *
+	 * Note: The child inherits the parent's throttle,
+	 * and must shake it loose.  It does not inherit
+	 * the parent's slice_avg.
+	 */
+	set_slice_avg(p, NS_MAX_SLEEP_AVG);
+	set_last_slice(p, p->time_slice);
+	set_slice_is_new(p);
+	p->last_slice = jiffies;
+	/*
+	 * Limit the difficulty to what the parent faced.
+	 */
+	if (p->throttle_stamp && grace_expired(p, G2))
+		p->throttle_stamp = jiffies - G2;
+
 	if (unlikely(!current->time_slice)) {
 		/*
 		 * This case is rare, it happens when the parent has only
@@ -1587,7 +1808,7 @@
 	 * the sleep_avg of the parent as well.
 	 */
 	rq = task_rq_lock(p->parent, &flags);
-	if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
+	if (first_time_slice(p) && task_cpu(p) == task_cpu(p->parent)) {
 		p->parent->time_slice += p->time_slice;
 		if (unlikely(p->parent->time_slice > task_timeslice(p)))
 			p->parent->time_slice = task_timeslice(p);
@@ -2655,6 +2876,51 @@
 }
 
 /*
+ * Calculate a task's average cpu usage rate in terms of sleep_avg, and
+ * check whether the task may soon need throttling.  Must be called after
+ * refreshing the task's time slice.
+ * @p: task for which slice_avg should be computed.
+ */
+static void recalc_task_slice_avg(task_t *p)
+{
+	unsigned int slice_avg = slice_avg_raw(p);
+	unsigned int time_slice = last_slice(p);
+	int w = MAX_BONUS, idle;
+
+	if (unlikely(!time_slice))
+		set_last_slice(p, p->time_slice);
+
+	idle = 100 - cpu_this_slice(p);
+
+	/*
+	 * If the task is lowering its cpu usage, speed up the
+	 * effect on slice_avg so we don't over-throttle.
+	 */
+	if (idle > slice_avg) {
+		w -= idle / w;
+		if (!w)
+			w = 1;
+	}
+
+	slice_avg = (w * (slice_avg ? : 1) + idle) / (w + 1);
+
+	/* Check to see if we should start/stop throttling. */
+	if(!rt_task(p) && !conditional_release(p))
+		conditional_tag(p);
+
+	/* Update slice_avg. */
+	set_slice_avg_raw(p, slice_avg);
+
+	/* Update cached slice length. */
+	if (time_slice != p->time_slice)
+		set_last_slice(p, p->time_slice);
+
+	/* And finally, stamp and tag the new slice. */
+	set_slice_is_new(p);
+	p->last_slice = jiffies;
+}
+
+/*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
  *
@@ -2699,20 +2965,24 @@
 		 */
 		if ((p->policy == SCHED_RR) && !--p->time_slice) {
 			p->time_slice = task_timeslice(p);
-			p->first_time_slice = 0;
+			recalc_task_slice_avg(p);
+			clr_first_time_slice(p);
 			set_tsk_need_resched(p);
 
 			/* put it at the end of the queue: */
 			requeue_task(p, rq->active);
 		}
+		if (unlikely(p->throttle_stamp))
+			p->throttle_stamp = 0;
 		goto out_unlock;
 	}
 	if (!--p->time_slice) {
 		dequeue_task(p, rq->active);
 		set_tsk_need_resched(p);
-		p->prio = effective_prio(p);
 		p->time_slice = task_timeslice(p);
-		p->first_time_slice = 0;
+		recalc_task_slice_avg(p);
+		p->prio = effective_prio(p);
+		clr_first_time_slice(p);
 
 		if (!rq->expired_timestamp)
 			rq->expired_timestamp = jiffies;
@@ -2959,6 +3229,12 @@
 
 #endif
 
+static inline int interactive_sleep(enum sleep_type sleep_type)
+{
+	return (sleep_type == SLEEP_INTERACTIVE ||
+		sleep_type == SLEEP_INTERRUPTED);
+}
+
 /*
  * schedule() is the main scheduler function.
  */
@@ -3017,7 +3293,7 @@
 	 * Tasks charged proportionately less run_time at high sleep_avg to
 	 * delay them losing their interactive status
 	 */
-	run_time /= (CURRENT_BONUS(prev) ? : 1);
+	run_time /= BONUS_DIVISOR(prev);
 
 	spin_lock_irq(&rq->lock);
 
@@ -3031,7 +3307,7 @@
 				unlikely(signal_pending(prev))))
 			prev->state = TASK_RUNNING;
 		else {
-			if (prev->state == TASK_UNINTERRUPTIBLE)
+			if (prev->state & TASK_UNINTERRUPTIBLE)
 				rq->nr_uninterruptible++;
 			deactivate_task(prev, rq);
 		}
@@ -3080,16 +3356,17 @@
 		rq->best_expired_prio = MAX_PRIO;
 	}
 
+repeat_selection:
 	idx = sched_find_first_bit(array->bitmap);
 	queue = array->queue + idx;
 	next = list_entry(queue->next, task_t, run_list);
 
-	if (!rt_task(next) && next->activated > 0) {
+	if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
 		unsigned long long delta = now - next->timestamp;
 		if (unlikely((long long)(now - next->timestamp) < 0))
 			delta = 0;
 
-		if (next->activated == 1)
+		if (next->sleep_type == SLEEP_INTERACTIVE)
 			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
 
 		array = next->array;
@@ -3099,10 +3376,16 @@
 			dequeue_task(next, array);
 			next->prio = new_prio;
 			enqueue_task(next, array);
-		} else
-			requeue_task(next, array);
+
+			/*
+			 * We may have just been demoted below other
+			 * runnable tasks in our previous queue.
+			 */
+			next->sleep_type = SLEEP_NORMAL;
+			goto repeat_selection;
+		}
 	}
-	next->activated = 0;
+	next->sleep_type = SLEEP_NORMAL;
 switch_tasks:
 	if (next == rq->idle)
 		schedstat_inc(rq, sched_goidle);
@@ -3118,6 +3401,14 @@
 		prev->sleep_avg = 0;
 	prev->timestamp = prev->last_ran = now;
 
+	/*
+	 * Tag start of execution of a new timeslice.
+	 */
+	if (unlikely(slice_is_new(next))) {
+		next->last_slice = jiffies;
+		clr_slice_is_new(next);
+	}
+
 	sched_info_switch(prev, next);
 	if (likely(prev != next)) {
 		next->timestamp = now;
--- linux-2.6.16-rc1/kernel/sysctl.c.org	2006-02-12 21:29:24.000000000 +0100
+++ linux-2.6.16-rc1/kernel/sysctl.c	2006-02-12 21:29:53.000000000 +0100
@@ -71,6 +71,8 @@
 extern int pid_max_min, pid_max_max;
 extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
+extern int sched_g1;
+extern int sched_g2;
 
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 int unknown_nmi_panic;
@@ -226,6 +228,11 @@
 	{ .ctl_name = 0 }
 };
 
+/* Constants for minimum and maximum testing in vm_table and
+ * kern_table.  We use these as one-element integer vectors. */
+static int zero;
+static int one_hundred = 100;
+
 static ctl_table kern_table[] = {
 	{
 		.ctl_name	= KERN_OSTYPE,
@@ -658,15 +665,29 @@
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
+	{
+		.ctl_name	= KERN_SCHED_THROTTLE1,
+		.procname	= "sched_g1",
+		.data		= &sched_g1,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
+	{
+		.ctl_name	= KERN_SCHED_THROTTLE2,
+		.procname	= "sched_g2",
+		.data		= &sched_g2,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
 	{ .ctl_name = 0 }
 };
 
-/* Constants for minimum and maximum testing in vm_table.
-   We use these as one-element integer vectors. */
-static int zero;
-static int one_hundred = 100;
-
-
 static ctl_table vm_table[] = {
 	{
 		.ctl_name	= VM_OVERCOMMIT_MEMORY,
--- linux-2.6.16-rc1/fs/pipe.c.org	2006-02-12 21:29:35.000000000 +0100
+++ linux-2.6.16-rc1/fs/pipe.c	2006-02-12 21:29:53.000000000 +0100
@@ -39,11 +39,7 @@
 {
 	DEFINE_WAIT(wait);
 
-	/*
-	 * Pipes are system-local resources, so sleeping on them
-	 * is considered a noninteractive wait:
-	 */
-	prepare_to_wait(PIPE_WAIT(*inode), &wait, TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE);
+	prepare_to_wait(PIPE_WAIT(*inode), &wait, TASK_INTERRUPTIBLE);
 	mutex_unlock(PIPE_MUTEX(*inode));
 	schedule();
 	finish_wait(PIPE_WAIT(*inode), &wait);



  reply	other threads:[~2006-02-12 21:31 UTC|newest]

Thread overview: 41+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-02-08  2:11 2.6 vs 2.4, ssh terminal slowdown Grant Coady
2006-02-08  2:24 ` Benjamin LaHaise
2006-02-08  2:50   ` Grant Coady
2006-02-08  3:02     ` Benjamin LaHaise
2006-02-08  2:35 ` Con Kolivas
2006-02-08  2:55   ` Grant Coady
2006-02-08  3:00     ` Con Kolivas
2006-02-08  4:12       ` Barry K. Nathan
2006-02-08  4:41         ` Grant Coady
2006-02-08  4:51       ` Grant Coady
2006-02-08  5:17         ` Willy Tarreau
2006-02-08  5:39           ` Grant Coady
2006-02-08  7:43       ` Lee Revell
2006-02-09 17:06   ` Jan Engelhardt
2006-02-09 20:06     ` Lee Revell
2006-02-10  6:35       ` MIke Galbraith
2006-02-12 13:47         ` MIke Galbraith
2006-02-12 19:03           ` Lee Revell
2006-02-12 21:36             ` MIke Galbraith [this message]
2006-02-12 23:23               ` Lee Revell
2006-02-12 23:39               ` Lee Revell
2006-02-13  3:09                 ` MIke Galbraith
2006-02-13  3:39                   ` Lee Revell
2006-02-13  4:59                     ` MIke Galbraith
2006-02-13  5:05                       ` Con Kolivas
2006-02-13  5:32                         ` MIke Galbraith
2006-02-13  5:37                           ` Con Kolivas
2006-02-13  5:57                             ` MIke Galbraith
2006-02-13  6:08                               ` Con Kolivas
2006-02-13  6:35                                 ` MIke Galbraith
2006-02-13  6:38                                   ` Lee Revell
2006-02-13  7:08                                     ` MIke Galbraith
2006-02-13  8:43                                       ` Lee Revell
2006-02-13 10:06                                         ` MIke Galbraith
2006-02-13 12:35                                         ` MIke Galbraith
2006-02-15  4:22                                           ` Lee Revell
2006-02-15  5:22                                             ` MIke Galbraith
2006-02-15  6:11                                               ` Lee Revell
2006-02-15  7:17                                                 ` MIke Galbraith
2006-02-13  7:15                                   ` Con Kolivas
2006-02-13  7:41                                     ` MIke Galbraith

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1139780193.7837.7.camel@homer \
    --to=efault@gmx.de \
    --cc=gcoady@gmail.com \
    --cc=jengelh@linux01.gwdg.de \
    --cc=kernel@kolivas.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=rlrevell@joe-job.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.