[patch 2.6.21-rc5] nicksched v33

All of lore.kernel.org
 help / color / mirror / Atom feed

* [patch 2.6.21-rc5] nicksched v33
@ 2007-03-29 11:58 Nick Piggin
  0 siblings, 0 replies; only message in thread
From: Nick Piggin @ 2007-03-29 11:58 UTC (permalink / raw)
  To: Linux Kernel Mailing List

[-- Attachment #1: Type: text/plain, Size: 398 bytes --]

This contains fixes for a bug that could cause too much latency with non-zero
niced processes, as well as fixes so that it SMP balancing actually works now
(the v32 forward port on top of smpnice was buggy).

X should be reniced to -10 for best results.

/proc/sys/kernel/base_timeslice can be reduced if the default (large) timeslice
is causing interactivity problems.

-- 
SUSE Labs, Novell Inc.

[-- Attachment #2: sched.patch --]
[-- Type: text/plain, Size: 57153 bytes --]

Index: linux-2.6/fs/proc/array.c
===================================================================
--- linux-2.6.orig/fs/proc/array.c	2007-03-29 19:45:08.000000000 +1000
+++ linux-2.6/fs/proc/array.c	2007-03-29 19:46:08.000000000 +1000
@@ -165,7 +165,13 @@ static inline char * task_state(struct t
 	rcu_read_lock();
 	buffer += sprintf(buffer,
 		"State:\t%s\n"
-		"SleepAVG:\t%lu%%\n"
+		"timeslice:\t%d\n"
+		"usedslice:\t%d\n"
+		"arrayseq:\t%lu\n"
+		"staticprio:\t%u\n"
+		"prio:\t%u\n"
+		"sleep_time:\t%lu\n"
+		"total_time:\t%lu\n"
 		"Tgid:\t%d\n"
 		"Pid:\t%d\n"
 		"PPid:\t%d\n"
@@ -173,7 +179,9 @@ static inline char * task_state(struct t
 		"Uid:\t%d\t%d\t%d\t%d\n"
 		"Gid:\t%d\t%d\t%d\t%d\n",
 		get_task_state(p),
-		(p->sleep_avg/1024)*100/(1020000000/1024),
+		get_task_timeslice(p), p->used_slice, p->array_sequence,
+		p->static_prio, p->prio,
+		p->sleep_time, p->total_time,
 	       	p->tgid, p->pid,
 	       	pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
 		pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0,
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h	2007-03-29 19:45:08.000000000 +1000
+++ linux-2.6/include/linux/sched.h	2007-03-29 19:52:33.000000000 +1000
@@ -523,7 +523,7 @@ struct signal_struct {
 #define MAX_USER_RT_PRIO	100
 #define MAX_RT_PRIO		MAX_USER_RT_PRIO
 
-#define MAX_PRIO		(MAX_RT_PRIO + 40)
+#define MAX_PRIO		(MAX_RT_PRIO + 59)
 
 #define rt_prio(prio)		unlikely((prio) < MAX_RT_PRIO)
 #define rt_task(p)		rt_prio((p)->prio)
@@ -788,24 +788,16 @@ struct mempolicy;
 struct pipe_inode_info;
 struct uts_namespace;
 
-enum sleep_type {
-	SLEEP_NORMAL,
-	SLEEP_NONINTERACTIVE,
-	SLEEP_INTERACTIVE,
-	SLEEP_INTERRUPTED,
-};
-
 struct prio_array;
 
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	struct thread_info *thread_info;
 	atomic_t usage;
+	int lock_depth;		/* BKL lock depth */
 	unsigned long flags;	/* per process flags, defined below */
 	unsigned long ptrace;
 
-	int lock_depth;		/* BKL lock depth */
-
 #ifdef CONFIG_SMP
 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
 	int oncpu;
@@ -813,27 +805,29 @@ struct task_struct {
 #endif
 	int load_weight;	/* for niceness load balancing purposes */
 	int prio, static_prio, normal_prio;
+	int used_slice, over_slice;
 	struct list_head run_list;
 	struct prio_array *array;
+	unsigned long array_sequence;
 
-	unsigned short ioprio;
-#ifdef CONFIG_BLK_DEV_IO_TRACE
-	unsigned int btrace_seq;
-#endif
-	unsigned long sleep_avg;
 	unsigned long long timestamp, last_ran;
 	unsigned long long sched_time; /* sched_clock time spent running */
-	enum sleep_type sleep_type;
+	unsigned long total_time, sleep_time;
+
+	struct list_head tasks;
+	struct mm_struct *mm, *active_mm;
 
 	unsigned long policy;
 	cpumask_t cpus_allowed;
-	unsigned int time_slice, first_time_slice;
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	struct sched_info sched_info;
 #endif
 
-	struct list_head tasks;
+	unsigned short ioprio;
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+	unsigned int btrace_seq;
+#endif
 	/*
 	 * ptrace_list/ptrace_children forms the list of my children
 	 * that were stolen by a ptracer.
@@ -841,8 +835,6 @@ struct task_struct {
 	struct list_head ptrace_children;
 	struct list_head ptrace_list;
 
-	struct mm_struct *mm, *active_mm;
-
 /* task state */
 	struct linux_binfmt *binfmt;
 	long exit_state;
@@ -1199,6 +1191,8 @@ extern unsigned long long sched_clock(vo
 extern unsigned long long
 current_sched_time(const struct task_struct *current_task);
 
+extern int get_task_timeslice(struct task_struct *t);
+
 /* sched_exec is called by processes performing an exec */
 #ifdef CONFIG_SMP
 extern void sched_exec(void);
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c	2007-03-29 19:45:08.000000000 +1000
+++ linux-2.6/kernel/sched.c	2007-03-29 21:37:26.000000000 +1000
@@ -71,135 +71,75 @@ unsigned long long __attribute__((weak))
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
  * and back.
  */
-#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
-#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
 #define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)
+#define NICE_TO_PRIO(nice)    (MAX_RT_PRIO + (nice) + 30)
+#define PRIO_TO_NICE(prio)    ((prio) - MAX_RT_PRIO - 30)
 
 /*
  * 'User priority' is the nice value converted to something we
  * can work with better when scaling various scheduler parameters,
- * it's a [ 0 ... 39 ] range.
+ * it's a [ 0 ... 58 ] range.
  */
 #define USER_PRIO(p)		((p)-MAX_RT_PRIO)
-#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
 #define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
 
+#define US_TO_JIFFIES(x)      ((x) * HZ / 1000000)
+#define JIFFIES_TO_US(x)      ((x) * 1000000 / HZ)
+
 /*
- * Some helpers for converting nanosecond timing to jiffy resolution
+ * MIN_TIMESLICE is the timeslice that a minimum priority process gets if there
+ * is a maximum priority process runnable. MAX_TIMESLICE is derived from the
+ * formula in task_timeslice. It cannot be changed here. It is the timesilce
+ * that the maximum priority process will get. Larger timeslices are attainable
+ * by low priority processes however.
  */
-#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
-#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
+int __read_mostly sched_base_timeslice = 300;
+int __read_mostly sched_min_base = 10;
+int __read_mostly sched_max_base = 10000;
+
+#define MIN_TIMESLICE		1
+#define RT_TIMESLICE		(50 * 1000 / HZ)                /* 50ms */
+#define BASE_TIMESLICE		(sched_base_timeslice)
+
+/* Maximum amount of history that will be used to calculate priority */
+#define MAX_SLEEP_SHIFT		18
+#define MAX_SLEEP		(1UL << MAX_SLEEP_SHIFT)        /* ~0.25s */
 
 /*
- * These are the 'tuning knobs' of the scheduler:
+ * Maximum effect that 1 block of activity (run/sleep/etc) can have. This is
+ * will moderate dicard freak events (eg. SIGSTOP)
  *
- * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
- * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
- * Timeslices get refilled after they expire.
  */
-#define MIN_TIMESLICE		max(5 * HZ / 1000, 1)
-#define DEF_TIMESLICE		(100 * HZ / 1000)
-#define ON_RUNQUEUE_WEIGHT	 30
-#define CHILD_PENALTY		 95
-#define PARENT_PENALTY		100
-#define EXIT_WEIGHT		  3
-#define PRIO_BONUS_RATIO	 25
-#define MAX_BONUS		(MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
-#define INTERACTIVE_DELTA	  2
-#define MAX_SLEEP_AVG		(DEF_TIMESLICE * MAX_BONUS)
-#define STARVATION_LIMIT	(MAX_SLEEP_AVG)
-#define NS_MAX_SLEEP_AVG	(JIFFIES_TO_NS(MAX_SLEEP_AVG))
+#define MAX_SLEEP_AFFECT	(MAX_SLEEP/4)
 
 /*
- * If a task is 'interactive' then we reinsert it in the active
- * array after it has expired its current timeslice. (it will not
- * continue to run immediately, it will still roundrobin with
- * other interactive tasks.)
- *
- * This part scales the interactivity limit depending on niceness.
- *
- * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
- * Here are a few examples of different nice levels:
- *
- *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
- *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
- *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
- *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
- *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
- *
- * (the X axis represents the possible -5 ... 0 ... +5 dynamic
- *  priority range a task can explore, a value of '1' means the
- *  task is rated interactive.)
- *
- * Ie. nice +19 tasks can never get 'interactive' enough to be
- * reinserted into the active array. And only heavily CPU-hog nice -20
- * tasks will be expired. Default nice 0 tasks are somewhere between,
- * it takes some effort for them to get interactive, but it's not
- * too hard.
+ * The amount of history can be decreased (on fork for example). This puts a
+ * lower bound on it.
  */
+#define MIN_HISTORY		(MAX_SLEEP/8)
+#define FORKED_TS_MAX		(US_TO_JIFFIES(10000) ?: 1)
 
-#define CURRENT_BONUS(p) \
-	(NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
-		MAX_SLEEP_AVG)
-
-#define GRANULARITY	(10 * HZ / 1000 ? : 1)
-
-#ifdef CONFIG_SMP
-#define TIMESLICE_GRANULARITY(p)	(GRANULARITY * \
-		(1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
-			num_online_cpus())
-#else
-#define TIMESLICE_GRANULARITY(p)	(GRANULARITY * \
-		(1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
-#endif
-
-#define SCALE(v1,v1_max,v2_max) \
-	(v1) * (v2_max) / (v1_max)
-
-#define DELTA(p) \
-	(SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
-		INTERACTIVE_DELTA)
-
-#define TASK_INTERACTIVE(p) \
-	((p)->prio <= (p)->static_prio - DELTA(p))
-
-#define INTERACTIVE_SLEEP(p) \
-	(JIFFIES_TO_NS(MAX_SLEEP_AVG * \
-		(MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
-
-#define TASK_PREEMPTS_CURR(p, rq) \
-	((p)->prio < (rq)->curr->prio)
-
-#define SCALE_PRIO(x, prio) \
-	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
-
-static unsigned int static_prio_timeslice(int static_prio)
-{
-	if (static_prio < NICE_TO_PRIO(0))
-		return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
-	else
-		return SCALE_PRIO(DEF_TIMESLICE, static_prio);
-}
+/*
+ * SLEEP_FACTOR is a fixed point factor used to scale history tracking things.
+ * In particular: total_time, sleep_time, sleep_avg.
+ */
+#define SLEEP_FACTOR		1024
 
 /*
- * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
- * to time slice values: [800ms ... 100ms ... 5ms]
- *
- * The higher a thread's priority, the bigger timeslices
- * it gets during one round of execution. But even the lowest
- * priority thread gets MIN_TIMESLICE worth of execution time.
+ * The scheduler classifies a process as performing one of the following
+ * activities
  */
+#define STIME_SLEEP           1       /* Sleeping */
+#define STIME_RUN             2       /* Using CPU */
 
-static inline unsigned int task_timeslice(struct task_struct *p)
-{
-	return static_prio_timeslice(p->static_prio);
-}
+#define TASK_PREEMPTS_CURR(p, rq)     ((p)->prio < (rq)->curr->prio)
 
 /*
  * These are the runqueue data structures:
  */
 
 struct prio_array {
+	int min_static_prio;
 	unsigned int nr_active;
 	DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
 	struct list_head queue[MAX_PRIO];
@@ -224,6 +164,7 @@ struct rq {
 #ifdef CONFIG_SMP
 	unsigned long cpu_load[3];
 #endif
+	unsigned long array_sequence;
 	unsigned long long nr_switches;
 
 	/*
@@ -234,15 +175,12 @@ struct rq {
 	 */
 	unsigned long nr_uninterruptible;
 
-	unsigned long expired_timestamp;
-	/* Cached timestamp set by update_cpu_clock() */
-	unsigned long long most_recent_timestamp;
 	struct task_struct *curr, *idle;
 	unsigned long next_balance;
 	struct mm_struct *prev_mm;
-	struct prio_array *active, *expired, arrays[2];
-	int best_expired_prio;
 	atomic_t nr_iowait;
+	struct prio_array *active, *expired, arrays[2];
+//	struct list_head quotient_queue[10];
 
 #ifdef CONFIG_SMP
 	struct sched_domain *sd;
@@ -261,9 +199,6 @@ struct rq {
 	struct sched_info rq_sched_info;
 
 	/* sys_sched_yield() stats */
-	unsigned long yld_exp_empty;
-	unsigned long yld_act_empty;
-	unsigned long yld_both_empty;
 	unsigned long yld_cnt;
 
 	/* schedule() stats */
@@ -411,9 +346,8 @@ static struct rq *task_rq_lock(struct ta
 	struct rq *rq;
 
 repeat_lock_task:
-	local_irq_save(*flags);
 	rq = task_rq(p);
-	spin_lock(&rq->lock);
+	spin_lock_irqsave(&rq->lock, *flags);
 	if (unlikely(rq != task_rq(p))) {
 		spin_unlock_irqrestore(&rq->lock, *flags);
 		goto repeat_lock_task;
@@ -456,8 +390,8 @@ static int show_schedstat(struct seq_fil
 		/* runqueue-specific stats */
 		seq_printf(seq,
 		    "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
-		    cpu, rq->yld_both_empty,
-		    rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
+		    cpu,
+		    0UL, 0UL, 0UL, rq->yld_cnt,
 		    rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
 		    rq->ttwu_cnt, rq->ttwu_local,
 		    rq->rq_sched_info.cpu_time,
@@ -561,21 +495,6 @@ rq_sched_info_depart(struct rq *rq, unsi
 # define schedstat_add(rq, field, amt)	do { } while (0)
 #endif
 
-/*
- * this_rq_lock - lock this runqueue and disable interrupts.
- */
-static inline struct rq *this_rq_lock(void)
-	__acquires(rq->lock)
-{
-	struct rq *rq;
-
-	local_irq_disable();
-	rq = this_rq();
-	spin_lock(&rq->lock);
-
-	return rq;
-}
-
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 /*
  * Called when a process is dequeued from the active array and given
@@ -682,6 +601,12 @@ sched_info_switch(struct task_struct *pr
 #define sched_info_switch(t, next)	do { } while (0)
 #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
 
+static void update_min_prio(struct task_struct *p, struct prio_array *array)
+{
+	if (!rt_task(p) && p->static_prio < array->min_static_prio)
+		array->min_static_prio = p->static_prio;
+}
+
 /*
  * Adding/removing a task to/from a priority array:
  */
@@ -695,20 +620,15 @@ static void dequeue_task(struct task_str
 
 static void enqueue_task(struct task_struct *p, struct prio_array *array)
 {
+	struct list_head *entry = array->queue + p->prio;
+
 	sched_info_queued(p);
-	list_add_tail(&p->run_list, array->queue + p->prio);
+	list_add_tail(&p->run_list, entry);
 	__set_bit(p->prio, array->bitmap);
 	array->nr_active++;
 	p->array = array;
-}
 
-/*
- * Put task to the end of the run list without the overhead of dequeue
- * followed by enqueue.
- */
-static void requeue_task(struct task_struct *p, struct prio_array *array)
-{
-	list_move_tail(&p->run_list, array->queue + p->prio);
+	update_min_prio(p, array);
 }
 
 static inline void
@@ -721,35 +641,6 @@ enqueue_task_head(struct task_struct *p,
 }
 
 /*
- * __normal_prio - return the priority that is based on the static
- * priority but is modified by bonuses/penalties.
- *
- * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
- * into the -5 ... 0 ... +5 bonus/penalty range.
- *
- * We use 25% of the full 0...39 priority range so that:
- *
- * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
- * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
- *
- * Both properties are important to certain workloads.
- */
-
-static inline int __normal_prio(struct task_struct *p)
-{
-	int bonus, prio;
-
-	bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
-
-	prio = p->static_prio - bonus;
-	if (prio < MAX_RT_PRIO)
-		prio = MAX_RT_PRIO;
-	if (prio > MAX_PRIO-1)
-		prio = MAX_PRIO-1;
-	return prio;
-}
-
-/*
  * To aid in avoiding the subversion of "niceness" due to uneven distribution
  * of tasks with abnormal "nice" values across CPUs the contribution that
  * each task makes to its run queue's load is weighted according to its
@@ -758,12 +649,17 @@ static inline int __normal_prio(struct t
  * slice expiry etc.
  */
 
+static int static_prio_timeslice(unsigned int prio)
+{
+	return 59 - USER_PRIO(prio);
+}
+
 /*
- * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
+ * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == BASE_TIMESLICE
  * If static_prio_timeslice() is ever changed to break this assumption then
  * this code will need modification
  */
-#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
+#define TIME_SLICE_NICE_ZERO 28
 #define LOAD_WEIGHT(lp) \
 	(((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
 #define PRIO_TO_LOAD_WEIGHT(prio) \
@@ -813,134 +709,155 @@ static inline void dec_nr_running(struct
 	dec_raw_weighted_load(rq, p);
 }
 
+
+static inline long long sched_clock_us(void)
+{
+	return ((unsigned long long)jiffies) << (20 - SHIFT_HZ);
+}
+
 /*
- * Calculate the expected normal priority: i.e. priority
- * without taking RT-inheritance into account. Might be
- * boosted by interactivity modifiers. Changes upon fork,
- * setprio syscalls, and whenever the interactivity
- * estimator recalculates.
+ * add_task_time updates a task p after time of doing the specified @type
+ * of activity. See STIME_*. This is used for priority calculation.
  */
-static inline int normal_prio(struct task_struct *p)
+static void add_task_time(struct task_struct *p,
+				unsigned long long time, unsigned long type)
 {
-	int prio;
+	unsigned long ratio;
+	unsigned long long tmp;
+	unsigned long t;
 
-	if (has_rt_policy(p))
-		prio = MAX_RT_PRIO-1 - p->rt_priority;
-	else
-		prio = __normal_prio(p);
-	return prio;
+	if (!time)
+		return;
+
+	t = time;
+	if (type == STIME_SLEEP) {
+		unsigned long fac;
+		fac = USER_PRIO(p->static_prio); /* 0..39 */
+		fac = fac + 12; /* 12..41 */
+		if (time > MAX_SLEEP_AFFECT*8)
+			t = MAX_SLEEP_AFFECT*8;
+		t = (t * fac) / 32;
+		t = (t + 7) / 8;
+	} else {
+		if (time > MAX_SLEEP)
+			t = MAX_SLEEP;
+	}
+
+	ratio = MAX_SLEEP - t;
+	tmp = (unsigned long long)ratio*p->total_time + MAX_SLEEP/2;
+	tmp >>= MAX_SLEEP_SHIFT;
+	p->total_time = (unsigned long)tmp;
+
+	tmp = (unsigned long long)ratio*p->sleep_time + MAX_SLEEP/2;
+	tmp >>= MAX_SLEEP_SHIFT;
+	p->sleep_time = (unsigned long)tmp;
+
+	p->total_time += t;
+	if (type == STIME_SLEEP)
+		p->sleep_time += t;
 }
 
-/*
- * Calculate the current priority, i.e. the priority
- * taken into account by the scheduler. This value might
- * be boosted by RT tasks, or might be boosted by
- * interactivity modifiers. Will be RT if the task got
- * RT-boosted. If not then it returns p->normal_prio.
- */
-static int effective_prio(struct task_struct *p)
-{
-	p->normal_prio = normal_prio(p);
-	/*
-	 * If we are RT tasks or we were boosted to RT priority,
-	 * keep the priority unchanged. Otherwise, update priority
-	 * to the normal priority:
-	 */
-	if (!rt_prio(p->prio))
-		return p->normal_prio;
-	return p->prio;
+static inline unsigned long task_sleep_avg(struct task_struct *p)
+{
+	return (SLEEP_FACTOR * p->sleep_time) / (p->total_time + 1);
 }
 
 /*
- * __activate_task - move a task to the runqueue.
+ * The higher a thread's priority, the bigger timeslices
+ * it gets during one round of execution. But even the lowest
+ * priority thread gets MIN_TIMESLICE worth of execution time.
+ *
+ * Timeslices are scaled, so if only low priority processes are running,
+ * they will all get long timeslices.
  */
-static void __activate_task(struct task_struct *p, struct rq *rq)
+static int task_timeslice(struct task_struct *p, struct rq *rq)
 {
-	struct prio_array *target = rq->active;
+	unsigned int prio, delta, scale, ts;
 
-	if (batch_task(p))
-		target = rq->expired;
-	enqueue_task(p, target);
-	inc_nr_running(p, rq);
+	if (rt_task(p))
+		return RT_TIMESLICE;
+
+	/* prio is 10..49 */
+	prio = USER_PRIO(p->static_prio) - 10 + 9;
+
+	ts = prio * 1024;	/* 10240..50176 */
+
+	/* delta is 3..42 (delta-3 <= prio-9) */
+	delta = p->static_prio - min(min(rq->active->min_static_prio,
+					 rq->expired->min_static_prio),
+				     p->static_prio) + 3;
+
+	scale = delta*delta;	/* 9..1764 */
+
+	ts = ts / scale;	/* 1137..(28..5575) */
+
+	/* a nice 0 task has ts (29*29/9) here. scale to BASE_TIMESLICE */
+	ts = ts * BASE_TIMESLICE / (29*1024/9);
+
+	ts = msecs_to_jiffies(ts);
+	if (unlikely(ts == 0))
+		ts = 1;
+
+	return ts;
 }
 
-/*
- * __activate_idle_task - move idle task to the _front_ of runqueue.
- */
-static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
+int get_task_timeslice(struct task_struct *p)
 {
-	enqueue_task_head(p, rq->active);
-	inc_nr_running(p, rq);
+	int ts;
+	struct rq *rq;
+	unsigned long flags;
+	rq = task_rq_lock(p, &flags);
+	ts = task_timeslice(p, rq);
+	task_rq_unlock(rq, &flags);
+
+	return ts;
 }
 
 /*
- * Recalculate p->normal_prio and p->prio after having slept,
- * updating the sleep-average too:
+ * task_priority: calculates a task's priority based on previous running
+ * history (see add_task_time). The priority is just a simple linear function
+ * based on sleep_avg and static_prio.
  */
-static int recalc_task_prio(struct task_struct *p, unsigned long long now)
+static int task_priority(struct task_struct *p)
 {
-	/* Caller must always ensure 'now >= p->timestamp' */
-	unsigned long sleep_time = now - p->timestamp;
+	unsigned long sleep_avg;
+	int bonus, prio;
 
-	if (batch_task(p))
-		sleep_time = 0;
+	if (rt_prio(p->prio))
+		return p->prio;
 
-	if (likely(sleep_time > 0)) {
-		/*
-		 * This ceiling is set to the lowest priority that would allow
-		 * a task to be reinserted into the active array on timeslice
-		 * completion.
-		 */
-		unsigned long ceiling = INTERACTIVE_SLEEP(p);
+	sleep_avg = task_sleep_avg(p);
 
-		if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
-			/*
-			 * Prevents user tasks from achieving best priority
-			 * with one single large enough sleep.
-			 */
-			p->sleep_avg = ceiling;
-			/*
-			 * Using INTERACTIVE_SLEEP() as a ceiling places a
-			 * nice(0) task 1ms sleep away from promotion, and
-			 * gives it 700ms to round-robin with no chance of
-			 * being demoted.  This is more than generous, so
-			 * mark this sleep as non-interactive to prevent the
-			 * on-runqueue bonus logic from intervening should
-			 * this task not receive cpu immediately.
-			 */
-			p->sleep_type = SLEEP_NONINTERACTIVE;
-		} else {
-			/*
-			 * Tasks waking from uninterruptible sleep are
-			 * limited in their sleep_avg rise as they
-			 * are likely to be waiting on I/O
-			 */
-			if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
-				if (p->sleep_avg >= ceiling)
-					sleep_time = 0;
-				else if (p->sleep_avg + sleep_time >=
-					 ceiling) {
-						p->sleep_avg = ceiling;
-						sleep_time = 0;
-				}
-			}
+	prio = USER_PRIO(p->static_prio) + 10;
+	bonus = (((MAX_USER_PRIO + 1) / 3) * sleep_avg + (SLEEP_FACTOR / 2))
+					/ SLEEP_FACTOR;
+	prio = MAX_RT_PRIO + prio - bonus;
 
-			/*
-			 * This code gives a bonus to interactive tasks.
-			 *
-			 * The boost works by updating the 'average sleep time'
-			 * value here, based on ->timestamp. The more time a
-			 * task spends sleeping, the higher the average gets -
-			 * and the higher the priority boost gets as well.
-			 */
-			p->sleep_avg += sleep_time;
+	if (prio < MAX_RT_PRIO)
+		return MAX_RT_PRIO;
+	if (prio > MAX_PRIO-1)
+		return MAX_PRIO-1;
 
-		}
-		if (p->sleep_avg > NS_MAX_SLEEP_AVG)
-			p->sleep_avg = NS_MAX_SLEEP_AVG;
-	}
+	return prio;
+}
 
-	return effective_prio(p);
+/*
+ * __activate_task - move a task to the runqueue.
+ */
+static inline void __activate_task(struct task_struct *p, struct rq *rq,
+					struct prio_array *array)
+{
+	enqueue_task(p, array);
+	inc_nr_running(p, rq);
+}
+
+/*
+ * __activate_idle_task - move idle task to the _front_ of runqueue.
+ */
+static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
+{
+	enqueue_task_head(p, rq->active);
+	inc_nr_running(p, rq);
 }
 
 /*
@@ -951,20 +868,14 @@ static int recalc_task_prio(struct task_
  */
 static void activate_task(struct task_struct *p, struct rq *rq, int local)
 {
-	unsigned long long now;
+	unsigned long long now, sleep;
+	struct prio_array *array;
 
+	array = rq->active;
 	if (rt_task(p))
 		goto out;
 
-	now = sched_clock();
-#ifdef CONFIG_SMP
-	if (!local) {
-		/* Compensate for drifting sched_clock */
-		struct rq *this_rq = this_rq();
-		now = (now - this_rq->most_recent_timestamp)
-			+ rq->most_recent_timestamp;
-	}
-#endif
+	now = sched_clock_us();
 
 	/*
 	 * Sleep time is in units of nanosecs, so shift by 20 to get a
@@ -976,41 +887,33 @@ static void activate_task(struct task_st
 			profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
 				     (now - p->timestamp) >> 20);
 	}
-
-	p->prio = recalc_task_prio(p, now);
-
 	/*
-	 * This checks to make sure it's not an uninterruptible task
-	 * that is now waking up.
+	 * If we have slept through an active/expired array switch, restart
+	 * our timeslice too.
 	 */
-	if (p->sleep_type == SLEEP_NORMAL) {
-		/*
-		 * Tasks which were woken up by interrupts (ie. hw events)
-		 * are most likely of interactive nature. So we give them
-		 * the credit of extending their sleep time to the period
-		 * of time they spend on the runqueue, waiting for execution
-		 * on a CPU, first time around:
-		 */
-		if (in_interrupt())
-			p->sleep_type = SLEEP_INTERRUPTED;
-		else {
-			/*
-			 * Normal first-time wakeups get a credit too for
-			 * on-runqueue time, but it will be weighted down:
-			 */
-			p->sleep_type = SLEEP_INTERACTIVE;
-		}
-	}
+	sleep = now - p->timestamp;
 	p->timestamp = now;
+	add_task_time(p, sleep, STIME_SLEEP);
+	p->prio = task_priority(p);
+
+	if (rq->array_sequence != p->array_sequence) {
+		p->used_slice = 0;
+		p->over_slice = 0;
+	} else if (unlikely(p->used_slice == -1)) {
+		array = rq->expired;
+		p->used_slice = 0;
+	}
+
 out:
-	__activate_task(p, rq);
+	__activate_task(p, rq, array);
 }
 
 /*
  * deactivate_task - remove a task from the runqueue.
  */
-static void deactivate_task(struct task_struct *p, struct rq *rq)
+static inline void deactivate_task(struct task_struct *p, struct rq *rq)
 {
+	p->array_sequence = rq->array_sequence;
 	dec_nr_running(p, rq);
 	dequeue_task(p, p->array);
 	p->array = NULL;
@@ -1508,10 +1411,12 @@ static int try_to_wake_up(struct task_st
 out_set_cpu:
 	new_cpu = wake_idle(new_cpu, p);
 	if (new_cpu != cpu) {
+		int seq_delta = p->array_sequence - rq->array_sequence;
 		set_task_cpu(p, new_cpu);
 		task_rq_unlock(rq, &flags);
 		/* might preempt at this point */
 		rq = task_rq_lock(p, &flags);
+		p->array_sequence = rq->array_sequence + seq_delta;
 		old_state = p->state;
 		if (!(old_state & state))
 			goto out;
@@ -1524,33 +1429,9 @@ out_set_cpu:
 
 out_activate:
 #endif /* CONFIG_SMP */
-	if (old_state == TASK_UNINTERRUPTIBLE) {
+	if (old_state == TASK_UNINTERRUPTIBLE)
 		rq->nr_uninterruptible--;
-		/*
-		 * Tasks on involuntary sleep don't earn
-		 * sleep_avg beyond just interactive state.
-		 */
-		p->sleep_type = SLEEP_NONINTERACTIVE;
-	} else
-
-	/*
-	 * Tasks that have marked their sleep as noninteractive get
-	 * woken up with their sleep average not weighted in an
-	 * interactive way.
-	 */
-		if (old_state & TASK_NONINTERACTIVE)
-			p->sleep_type = SLEEP_NONINTERACTIVE;
-
-
 	activate_task(p, rq, cpu == this_cpu);
-	/*
-	 * Sync wakeups (i.e. those types of wakeups where the waker
-	 * has indicated that it will leave the CPU in short order)
-	 * don't trigger a preemption, if the woken up task will run on
-	 * this cpu. (in this case the 'I will reschedule' promise of
-	 * the waker guarantees that the freshly woken up task is going
-	 * to be considered on this CPU.)
-	 */
 	if (!sync || cpu != this_cpu) {
 		if (TASK_PREEMPTS_CURR(p, rq))
 			resched_task(rq->curr);
@@ -1584,6 +1465,9 @@ static void task_running_tick(struct rq 
  */
 void fastcall sched_fork(struct task_struct *p, int clone_flags)
 {
+	unsigned long sleep_avg;
+	struct rq *rq;
+	struct task_struct *cur;
 	int cpu = get_cpu();
 
 #ifdef CONFIG_SMP
@@ -1617,31 +1501,47 @@ void fastcall sched_fork(struct task_str
 	/* Want to start with kernel preemption disabled. */
 	task_thread_info(p)->preempt_count = 1;
 #endif
-	/*
-	 * Share the timeslice between parent and child, thus the
-	 * total amount of pending timeslices in the system doesn't change,
-	 * resulting in more scheduling fairness.
-	 */
+	p->timestamp = sched_clock_us();
+	p->used_slice = 0;
+	p->over_slice = 0;
+	if (rt_task(p))
+		goto out;
+
+	rq = this_rq();
+	cur = current;
+
+	/* Get MIN_HISTORY of history with a bit less sleep_avg than parent. */
+	sleep_avg = task_sleep_avg(cur);
+	sleep_avg -= sleep_avg >> 2;
+	p->total_time = MIN_HISTORY;
+	p->sleep_time = MIN_HISTORY * sleep_avg / SLEEP_FACTOR;
+
+	p->prio = p->normal_prio = task_priority(p);
+
+	/* Parent loses sleep time the child took */
+	cur->sleep_time -= min(cur->sleep_time/4, p->sleep_time);
+
 	local_irq_disable();
-	p->time_slice = (current->time_slice + 1) >> 1;
-	/*
-	 * The remainder of the first timeslice might be recovered by
-	 * the parent if the child exits early enough.
-	 */
-	p->first_time_slice = 1;
-	current->time_slice >>= 1;
-	p->timestamp = sched_clock();
-	if (unlikely(!current->time_slice)) {
-		/*
-		 * This case is rare, it happens when the parent has only
-		 * a single jiffy left from its timeslice. Taking the
-		 * runqueue lock is not a problem.
-		 */
-		current->time_slice = 1;
-		task_running_tick(cpu_rq(cpu), current);
+	if (unlikely(cur->used_slice == -1 || cur == rq->idle))
+		p->used_slice = -1;
+	else {
+		int ts = task_timeslice(p, rq);
+		int child_ts = min_t(int, ts/4, FORKED_TS_MAX);
+
+		if (child_ts == 0) {
+			p->used_slice = -1;
+		} else {
+			p->used_slice = ts - child_ts;
+			cur->used_slice += child_ts;
+			if (cur->used_slice >= task_timeslice(p, rq)) {
+				cur->used_slice = -1;
+				set_need_resched();
+			}
+		}
 	}
 	local_irq_enable();
-	put_cpu();
+out:
+  	put_cpu();
 }
 
 /*
@@ -1653,108 +1553,54 @@ void fastcall sched_fork(struct task_str
  */
 void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 {
-	struct rq *rq, *this_rq;
 	unsigned long flags;
 	int this_cpu, cpu;
+	struct rq *rq;
+	struct prio_array *array;
+	struct task_struct *cur;
+
+	cpu = task_cpu(p);
 
 	rq = task_rq_lock(p, &flags);
 	BUG_ON(p->state != TASK_RUNNING);
 	this_cpu = smp_processor_id();
-	cpu = task_cpu(p);
 
-	/*
-	 * We decrease the sleep average of forking parents
-	 * and children as well, to keep max-interactive tasks
-	 * from forking tasks that are max-interactive. The parent
-	 * (current) is done further down, under its lock.
-	 */
-	p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
-		CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
-
-	p->prio = effective_prio(p);
+	array = rq->active;
+	if (unlikely(p->used_slice == -1)) {
+		array = rq->expired;
+		p->used_slice = 0;
+	}
 
+	cur = current;
 	if (likely(cpu == this_cpu)) {
-		if (!(clone_flags & CLONE_VM)) {
+		if (!rt_task(p) && !rt_task(cur) &&
+			!(clone_flags & CLONE_VM) && (array == rq->active)) {
 			/*
 			 * The VM isn't cloned, so we're in a good position to
 			 * do child-runs-first in anticipation of an exec. This
 			 * usually avoids a lot of COW overhead.
 			 */
-			if (unlikely(!current->array))
-				__activate_task(p, rq);
-			else {
-				p->prio = current->prio;
-				p->normal_prio = current->normal_prio;
-				list_add_tail(&p->run_list, &current->run_list);
-				p->array = current->array;
-				p->array->nr_active++;
-				inc_nr_running(p, rq);
-			}
 			set_need_resched();
-		} else
-			/* Run child last */
-			__activate_task(p, rq);
-		/*
-		 * We skip the following code due to cpu == this_cpu
-	 	 *
-		 *   task_rq_unlock(rq, &flags);
-		 *   this_rq = task_rq_lock(current, &flags);
-		 */
-		this_rq = rq;
-	} else {
-		this_rq = cpu_rq(this_cpu);
+			p->prio = cur->prio;
+			list_add_tail(&p->run_list, &cur->run_list);
+			p->array = cur->array;
+			p->array->nr_active++;
+			inc_nr_running(p, rq);
+			goto child_queued;
+		}
+	}
+	__activate_task(p, rq, array);
 
-		/*
-		 * Not the local CPU - must adjust timestamp. This should
-		 * get optimised away in the !CONFIG_SMP case.
-		 */
-		p->timestamp = (p->timestamp - this_rq->most_recent_timestamp)
-					+ rq->most_recent_timestamp;
-		__activate_task(p, rq);
-		if (TASK_PREEMPTS_CURR(p, rq))
-			resched_task(rq->curr);
+	/* This catches RT tasks and remote SMP tasks */
+	if (TASK_PREEMPTS_CURR(p, rq))
+		resched_task(rq->curr);
 
-		/*
-		 * Parent and child are on different CPUs, now get the
-		 * parent runqueue to update the parent's ->sleep_avg:
-		 */
-		task_rq_unlock(rq, &flags);
-		this_rq = task_rq_lock(current, &flags);
-	}
-	current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
-		PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
-	task_rq_unlock(this_rq, &flags);
+child_queued:
+	task_rq_unlock(rq, &flags);
 }
 
-/*
- * Potentially available exiting-child timeslices are
- * retrieved here - this way the parent does not get
- * penalized for creating too many threads.
- *
- * (this cannot be used to 'generate' timeslices
- * artificially, because any timeslice recovered here
- * was given away by the parent in the first place.)
- */
 void fastcall sched_exit(struct task_struct *p)
 {
-	unsigned long flags;
-	struct rq *rq;
-
-	/*
-	 * If the child was a (relative-) CPU hog then decrease
-	 * the sleep_avg of the parent as well.
-	 */
-	rq = task_rq_lock(p->parent, &flags);
-	if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
-		p->parent->time_slice += p->time_slice;
-		if (unlikely(p->parent->time_slice > task_timeslice(p)))
-			p->parent->time_slice = task_timeslice(p);
-	}
-	if (p->sleep_avg < p->parent->sleep_avg)
-		p->parent->sleep_avg = p->parent->sleep_avg /
-		(EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
-		(EXIT_WEIGHT + 1);
-	task_rq_unlock(rq, &flags);
 }
 
 /**
@@ -1831,6 +1677,7 @@ static inline void finish_task_switch(st
 asmlinkage void schedule_tail(struct task_struct *prev)
 	__releases(rq->lock)
 {
+	struct task_struct *cur;
 	struct rq *rq = this_rq();
 
 	finish_task_switch(rq, prev);
@@ -1838,8 +1685,9 @@ asmlinkage void schedule_tail(struct tas
 	/* In this case, finish_task_switch does not reenable preemption */
 	preempt_enable();
 #endif
-	if (current->set_child_tid)
-		put_user(current->pid, current->set_child_tid);
+	cur = current;
+	if (cur->set_child_tid)
+		put_user(cur->pid, cur->set_child_tid);
 }
 
 /*
@@ -1979,19 +1827,19 @@ static void double_rq_lock(struct rq *rq
 	__acquires(rq1->lock)
 	__acquires(rq2->lock)
 {
+	spinlock_t *l1, *l2;
+
 	BUG_ON(!irqs_disabled());
-	if (rq1 == rq2) {
-		spin_lock(&rq1->lock);
-		__acquire(rq2->lock);	/* Fake it out ;) */
-	} else {
-		if (rq1 < rq2) {
-			spin_lock(&rq1->lock);
-			spin_lock(&rq2->lock);
-		} else {
-			spin_lock(&rq2->lock);
-			spin_lock(&rq1->lock);
-		}
+
+	l1 = &rq1->lock;
+	l2 = &rq2->lock;
+	if (rq1 > rq2) {
+		l1 = l2;
+		l2 = &rq1->lock;
 	}
+
+	spin_lock(l1);
+	spin_lock(l2);
 }
 
 /*
@@ -2005,10 +1853,7 @@ static void double_rq_unlock(struct rq *
 	__releases(rq2->lock)
 {
 	spin_unlock(&rq1->lock);
-	if (rq1 != rq2)
-		spin_unlock(&rq2->lock);
-	else
-		__release(rq2->lock);
+	spin_unlock(&rq2->lock);
 }
 
 /*
@@ -2045,9 +1890,12 @@ static void sched_migrate_task(struct ta
 	struct migration_req req;
 	unsigned long flags;
 	struct rq *rq;
+	int cpu;
 
 	rq = task_rq_lock(p, &flags);
+	cpu = task_cpu(p);
 	if (!cpu_isset(dest_cpu, p->cpus_allowed)
+	    || cpu == dest_cpu
 	    || unlikely(cpu_is_offline(dest_cpu)))
 		goto out;
 
@@ -2094,8 +1942,7 @@ static void pull_task(struct rq *src_rq,
 	set_task_cpu(p, this_cpu);
 	inc_nr_running(p, this_rq);
 	enqueue_task(p, this_array);
-	p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
-				+ this_rq->most_recent_timestamp;
+
 	/*
 	 * Note that idle threads have a prio of MAX_PRIO, for this test
 	 * to be always true for them.
@@ -2110,7 +1957,7 @@ static void pull_task(struct rq *src_rq,
 static
 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 		     struct sched_domain *sd, enum idle_type idle,
-		     int *all_pinned)
+		     int *all_pinned, unsigned long long now)
 {
 	/*
 	 * We do not migrate tasks that are:
@@ -2133,18 +1980,20 @@ int can_migrate_task(struct task_struct 
 
 	if (sd->nr_balance_failed > sd->cache_nice_tries) {
 #ifdef CONFIG_SCHEDSTATS
-		if (task_hot(p, rq->most_recent_timestamp, sd))
+		if (task_hot(p, now, sd))
 			schedstat_inc(sd, lb_hot_gained[idle]);
 #endif
 		return 1;
 	}
 
-	if (task_hot(p, rq->most_recent_timestamp, sd))
+	if (task_hot(p, now, sd))
 		return 0;
 	return 1;
 }
 
-#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
+#define rq_best_prio(rq) min(min((rq)->active->min_static_prio,		\
+				(rq)->expired->min_static_prio),	\
+			(rq)->curr->prio);
 
 /*
  * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
@@ -2164,12 +2013,14 @@ static int move_tasks(struct rq *this_rq
 	struct list_head *head, *curr;
 	struct task_struct *tmp;
 	long rem_load_move;
+	unsigned long long now;
 
 	if (max_nr_move == 0 || max_load_move == 0)
 		goto out;
 
 	rem_load_move = max_load_move;
 	pinned = 1;
+	now = sched_clock_us();
 	this_best_prio = rq_best_prio(this_rq);
 	best_prio = rq_best_prio(busiest);
 	/*
@@ -2228,7 +2079,7 @@ skip_queue:
 	if (skip_for_load && idx < this_best_prio)
 		skip_for_load = !best_prio_seen && idx == best_prio;
 	if (skip_for_load ||
-	    !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
+	    !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned, now)) {
 
 		best_prio_seen |= idx == best_prio;
 		if (curr != head)
@@ -2289,6 +2140,8 @@ find_busiest_group(struct sched_domain *
 	struct sched_group *group_min = NULL, *group_leader = NULL;
 #endif
 
+	prefetch(group);
+
 	max_load = this_load = total_load = total_pwr = 0;
 	busiest_load_per_task = busiest_nr_running = 0;
 	this_load_per_task = this_nr_running = 0;
@@ -2306,6 +2159,8 @@ find_busiest_group(struct sched_domain *
 		unsigned int balance_cpu = -1, first_idle_cpu = 0;
 		unsigned long sum_nr_running, sum_weighted_load;
 
+		prefetch(group->next);
+
 		local_group = cpu_isset(this_cpu, group->cpumask);
 
 		if (local_group)
@@ -3018,7 +2873,7 @@ static inline void
 update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
 {
 	p->sched_time += now - p->last_ran;
-	p->last_ran = rq->most_recent_timestamp = now;
+	p->last_ran = now;
 }
 
 /*
@@ -3031,34 +2886,13 @@ unsigned long long current_sched_time(co
 	unsigned long flags;
 
 	local_irq_save(flags);
-	ns = p->sched_time + sched_clock() - p->last_ran;
+	ns = p->sched_time + sched_clock_us() - p->last_ran;
 	local_irq_restore(flags);
 
 	return ns;
 }
 
 /*
- * We place interactive tasks back into the active array, if possible.
- *
- * To guarantee that this does not starve expired tasks we ignore the
- * interactivity of a task if the first expired task had to wait more
- * than a 'reasonable' amount of time. This deadline timeout is
- * load-dependent, as the frequency of array switched decreases with
- * increasing number of running tasks. We also ignore the interactivity
- * if a better static_prio task has expired:
- */
-static inline int expired_starving(struct rq *rq)
-{
-	if (rq->curr->static_prio > rq->best_expired_prio)
-		return 1;
-	if (!STARVATION_LIMIT || !rq->expired_timestamp)
-		return 0;
-	if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
-		return 1;
-	return 0;
-}
-
-/*
  * Account user cpu time to a process.
  * @p: the process that the cpu time gets accounted to
  * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3133,77 +2967,26 @@ void account_steal_time(struct task_stru
 
 static void task_running_tick(struct rq *rq, struct task_struct *p)
 {
-	if (p->array != rq->active) {
-		/* Task has expired but was not scheduled yet */
-		set_tsk_need_resched(p);
+	int allowed;
+	int ts;
+
+	/* Task might have expired already, but not scheduled off yet */
+	if (unlikely(p->used_slice == -1))
 		return;
-	}
-	spin_lock(&rq->lock);
-	/*
-	 * The task was running during this tick - update the
-	 * time slice counter. Note: we do not update a thread's
-	 * priority until it either goes to sleep or uses up its
-	 * timeslice. This makes it possible for interactive tasks
-	 * to use up their timeslices at their highest priority levels.
-	 */
-	if (rt_task(p)) {
-		/*
-		 * RR tasks need a special form of timeslice management.
-		 * FIFO tasks have no timeslices.
-		 */
-		if ((p->policy == SCHED_RR) && !--p->time_slice) {
-			p->time_slice = task_timeslice(p);
-			p->first_time_slice = 0;
-			set_tsk_need_resched(p);
 
-			/* put it at the end of the queue: */
-			requeue_task(p, rq->active);
-		}
-		goto out_unlock;
-	}
-	if (!--p->time_slice) {
-		dequeue_task(p, rq->active);
-		set_tsk_need_resched(p);
-		p->prio = effective_prio(p);
-		p->time_slice = task_timeslice(p);
-		p->first_time_slice = 0;
-
-		if (!rq->expired_timestamp)
-			rq->expired_timestamp = jiffies;
-		if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
-			enqueue_task(p, rq->expired);
-			if (p->static_prio < rq->best_expired_prio)
-				rq->best_expired_prio = p->static_prio;
-		} else
-			enqueue_task(p, rq->active);
-	} else {
-		/*
-		 * Prevent a too long timeslice allowing a task to monopolize
-		 * the CPU. We do this by splitting up the timeslice into
-		 * smaller pieces.
-		 *
-		 * Note: this does not mean the task's timeslices expire or
-		 * get lost in any way, they just might be preempted by
-		 * another task of equal priority. (one with higher
-		 * priority would have preempted this task already.) We
-		 * requeue this task to the end of the list on this priority
-		 * level, which is in essence a round-robin of tasks with
-		 * equal priority.
-		 *
-		 * This only applies to tasks in the interactive
-		 * delta range with at least TIMESLICE_GRANULARITY to requeue.
-		 */
-		if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
-			p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
-			(p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
-			(p->array == rq->active)) {
+	/* SCHED_FIFO tasks have no timeslice */
+	if (unlikely(p->policy == SCHED_FIFO))
+		return;
 
-			requeue_task(p, rq->active);
-			set_tsk_need_resched(p);
-		}
+	/* p was running during this tick. Update its time slice counter. */
+	p->used_slice++;
+	ts = task_timeslice(p, rq);
+	allowed = ts - min(p->over_slice, ts >> 1);
+	if (unlikely(p->used_slice >= allowed)) {
+		p->over_slice = allowed - p->used_slice;
+		p->used_slice = -1;
+		set_tsk_need_resched(p);
 	}
-out_unlock:
-	spin_unlock(&rq->lock);
 }
 
 /*
@@ -3215,7 +2998,7 @@ out_unlock:
  */
 void scheduler_tick(void)
 {
-	unsigned long long now = sched_clock();
+	unsigned long long now = sched_clock_us();
 	struct task_struct *p = current;
 	int cpu = smp_processor_id();
 	struct rq *rq = cpu_rq(cpu);
@@ -3269,12 +3052,6 @@ EXPORT_SYMBOL(sub_preempt_count);
 
 #endif
 
-static inline int interactive_sleep(enum sleep_type sleep_type)
-{
-	return (sleep_type == SLEEP_INTERACTIVE ||
-		sleep_type == SLEEP_INTERRUPTED);
-}
-
 /*
  * schedule() is the main scheduler function.
  */
@@ -3285,56 +3062,46 @@ asmlinkage void __sched schedule(void)
 	struct list_head *queue;
 	unsigned long long now;
 	unsigned long run_time;
-	int cpu, idx, new_prio;
+	int cpu, idx;
 	long *switch_count;
 	struct rq *rq;
 
+	prev = current;
+	prefetch(prev);
+
+	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
+
 	/*
 	 * Test if we are atomic.  Since do_exit() needs to call into
 	 * schedule() atomically, we ignore that path for now.
 	 * Otherwise, whine if we are scheduling when we should not be.
 	 */
-	if (unlikely(in_atomic() && !current->exit_state)) {
-		printk(KERN_ERR "BUG: scheduling while atomic: "
-			"%s/0x%08x/%d\n",
-			current->comm, preempt_count(), current->pid);
-		debug_show_held_locks(current);
-		if (irqs_disabled())
-			print_irqtrace_events(current);
-		dump_stack();
+	if (unlikely(in_atomic())) {
+		if (unlikely(!current->exit_state)) {
+			printk(KERN_ERR "BUG: scheduling while atomic: "
+				"%s/0x%08x/%d\n",
+				current->comm, preempt_count(), current->pid);
+			debug_show_held_locks(current);
+			if (irqs_disabled())
+				print_irqtrace_events(current);
+			dump_stack();
+		}
 	}
 	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
 
-need_resched:
 	preempt_disable();
-	prev = current;
-	release_kernel_lock(prev);
-need_resched_nonpreemptible:
 	rq = this_rq();
+	prefetchw(rq);
+need_resched:
+	cpu = smp_processor_id();
+	release_kernel_lock(prev);
 
-	/*
-	 * The idle thread is not allowed to schedule!
-	 * Remove this check after it has been exercised a bit.
-	 */
-	if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
-		printk(KERN_ERR "bad: scheduling from the idle thread!\n");
-		dump_stack();
-	}
-
+need_resched_nobkl:
 	schedstat_inc(rq, sched_cnt);
-	now = sched_clock();
-	if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
-		run_time = now - prev->timestamp;
-		if (unlikely((long long)(now - prev->timestamp) < 0))
-			run_time = 0;
-	} else
-		run_time = NS_MAX_SLEEP_AVG;
-
-	/*
-	 * Tasks charged proportionately less run_time at high sleep_avg to
-	 * delay them losing their interactive status
-	 */
-	run_time /= (CURRENT_BONUS(prev) ? : 1);
+	now = sched_clock_us();
+	run_time = now - prev->timestamp;
+	prev->timestamp = prev->last_ran = now;
+	add_task_time(prev, run_time, STIME_RUN);
 
 	spin_lock_irq(&rq->lock);
 
@@ -3342,24 +3109,39 @@ need_resched_nonpreemptible:
 	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
 		switch_count = &prev->nvcsw;
 		if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
-				unlikely(signal_pending(prev))))
+				unlikely(signal_pending(prev)))) {
 			prev->state = TASK_RUNNING;
-		else {
+		} else {
 			if (prev->state == TASK_UNINTERRUPTIBLE)
 				rq->nr_uninterruptible++;
 			deactivate_task(prev, rq);
+			goto no_check_expired;
 		}
 	}
 
-	cpu = smp_processor_id();
-	if (unlikely(!rq->nr_running)) {
+	if (unlikely(prev->used_slice == -1)) {
+		dequeue_task(prev, prev->array);
+		/* SCHED_FIFO can come in here too, from sched_yield */
+		array = rq->active;
+		if (!rt_task(prev))
+			array = rq->expired;
+		enqueue_task(prev, array);
+		prev->used_slice = 0;
+		goto no_check_idle;
+	}
+no_check_expired:
+
+	if (!rq->nr_running) {
+		rq->active->min_static_prio = MAX_PRIO;
+		rq->array_sequence++;
 		idle_balance(cpu, rq);
 		if (!rq->nr_running) {
+			schedstat_inc(rq, sched_goidle);
 			next = rq->idle;
-			rq->expired_timestamp = 0;
 			goto switch_tasks;
 		}
 	}
+no_check_idle:
 
 	array = rq->active;
 	if (unlikely(!array->nr_active)) {
@@ -3370,72 +3152,55 @@ need_resched_nonpreemptible:
 		rq->active = rq->expired;
 		rq->expired = array;
 		array = rq->active;
-		rq->expired_timestamp = 0;
-		rq->best_expired_prio = MAX_PRIO;
+		rq->array_sequence++;
+		rq->expired->min_static_prio = MAX_PRIO;
 	}
 
 	idx = sched_find_first_bit(array->bitmap);
 	queue = array->queue + idx;
 	next = list_entry(queue->next, struct task_struct, run_list);
 
-	if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
-		unsigned long long delta = now - next->timestamp;
-		if (unlikely((long long)(now - next->timestamp) < 0))
-			delta = 0;
-
-		if (next->sleep_type == SLEEP_INTERACTIVE)
-			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
-
-		array = next->array;
-		new_prio = recalc_task_prio(next, next->timestamp + delta);
-
-		if (unlikely(next->prio != new_prio)) {
-			dequeue_task(next, array);
-			next->prio = new_prio;
-			enqueue_task(next, array);
-		}
-	}
-	next->sleep_type = SLEEP_NORMAL;
 switch_tasks:
-	if (next == rq->idle)
-		schedstat_inc(rq, sched_goidle);
+	prefetch(&next->mm);
 	prefetch(next);
 	prefetch_stack(next);
 	clear_tsk_need_resched(prev);
-	rcu_qsctr_inc(task_cpu(prev));
+	rcu_qsctr_inc(cpu);
 
 	update_cpu_clock(prev, rq, now);
 
-	prev->sleep_avg -= run_time;
-	if ((long)prev->sleep_avg <= 0)
-		prev->sleep_avg = 0;
-	prev->timestamp = prev->last_ran = now;
-
 	sched_info_switch(prev, next);
 	if (likely(prev != next)) {
+		struct task_struct *from;
+
+		prefetch(next->mm);
+		prefetch(next->thread_info);
+
 		next->timestamp = next->last_ran = now;
 		rq->nr_switches++;
 		rq->curr = next;
 		++*switch_count;
 
 		prepare_task_switch(rq, next);
-		prev = context_switch(rq, prev, next);
-		barrier();
+		from = context_switch(rq, prev, next);
 		/*
 		 * this_rq must be evaluated again because prev may have moved
 		 * CPUs since it called schedule(), thus the 'rq' on its stack
 		 * frame will be invalid.
 		 */
-		finish_task_switch(this_rq(), prev);
+		rq = this_rq();
+		finish_task_switch(rq, from);
 	} else
 		spin_unlock_irq(&rq->lock);
 
-	prev = current;
 	if (unlikely(reacquire_kernel_lock(prev) < 0))
-		goto need_resched_nonpreemptible;
+		goto need_resched_nobkl;
 	preempt_enable_no_resched();
-	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+	if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) {
+		preempt_disable();
+		rq = this_rq();
 		goto need_resched;
+	}
 }
 EXPORT_SYMBOL(schedule);
 
@@ -3916,7 +3681,7 @@ void set_user_nice(struct task_struct *p
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
 	old_prio = p->prio;
-	p->prio = effective_prio(p);
+	p->prio = task_priority(p);
 	delta = p->prio - old_prio;
 
 	if (array) {
@@ -4047,14 +3812,9 @@ static void __setscheduler(struct task_s
 
 	p->policy = policy;
 	p->rt_priority = prio;
-	p->normal_prio = normal_prio(p);
+	p->normal_prio = task_priority(p);
 	/* we are holding p->pi_lock already */
 	p->prio = rt_mutex_getprio(p);
-	/*
-	 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
-	 */
-	if (policy == SCHED_BATCH)
-		p->sleep_avg = 0;
 	set_load_weight(p);
 }
 
@@ -4149,8 +3909,11 @@ recheck:
 		deactivate_task(p, rq);
 	oldprio = p->prio;
 	__setscheduler(p, policy, param->sched_priority);
+	if (policy == SCHED_FIFO || policy == SCHED_RR)
+		p->used_slice = 0;
+
 	if (array) {
-		__activate_task(p, rq);
+		__activate_task(p, rq, rq->active);
 		/*
 		 * Reschedule if we are currently running on this runqueue and
 		 * our priority decreased, or if we are not currently running on
@@ -4438,46 +4201,13 @@ asmlinkage long sys_sched_getaffinity(pi
  */
 asmlinkage long sys_sched_yield(void)
 {
-	struct rq *rq = this_rq_lock();
-	struct prio_array *array = current->array, *target = rq->expired;
-
-	schedstat_inc(rq, yld_cnt);
-	/*
-	 * We implement yielding by moving the task into the expired
-	 * queue.
-	 *
-	 * (special rule: RT tasks will just roundrobin in the active
-	 *  array.)
-	 */
-	if (rt_task(current))
-		target = rq->active;
-
-	if (array->nr_active == 1) {
-		schedstat_inc(rq, yld_act_empty);
-		if (!rq->expired->nr_active)
-			schedstat_inc(rq, yld_both_empty);
-	} else if (!rq->expired->nr_active)
-		schedstat_inc(rq, yld_exp_empty);
-
-	if (array != target) {
-		dequeue_task(current, array);
-		enqueue_task(current, target);
-	} else
-		/*
-		 * requeue_task is cheaper so perform that if possible.
-		 */
-		requeue_task(current, array);
-
-	/*
-	 * Since we are going to call schedule() anyway, there's
-	 * no need to preempt or enable interrupts:
-	 */
-	__release(rq->lock);
-	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
-	_raw_spin_unlock(&rq->lock);
-	preempt_enable_no_resched();
-
-	schedule();
+	local_irq_disable();
+#ifdef CONFIG_SCHEDSTATS
+	schedstat_inc(this_rq(), yld_cnt);
+#endif
+	current->used_slice = -1;
+	set_need_resched();
+	local_irq_enable();
 
 	return 0;
 }
@@ -4566,6 +4296,15 @@ void __sched yield(void)
 {
 	set_current_state(TASK_RUNNING);
 	sys_sched_yield();
+	/*
+	 * Kernel-space yield won't follow the schedule upon
+	 * return from syscall path. Unless preempt is enabled,
+	 * however in that case, preempt_schedule doesn't drop
+	 * the bkl, which is needed in some paths.
+	 *
+	 * Must call schedule() here.
+	 */
+	schedule();
 }
 EXPORT_SYMBOL(yield);
 
@@ -4662,6 +4401,8 @@ long sys_sched_rr_get_interval(pid_t pid
 	struct task_struct *p;
 	int retval = -EINVAL;
 	struct timespec t;
+	unsigned long flags;
+	struct rq *rq;
 
 	if (pid < 0)
 		goto out_nounlock;
@@ -4676,8 +4417,10 @@ long sys_sched_rr_get_interval(pid_t pid
 	if (retval)
 		goto out_unlock;
 
+	rq = task_rq_lock(p, &flags);
 	jiffies_to_timespec(p->policy == SCHED_FIFO ?
-				0 : task_timeslice(p), &t);
+				0 : task_timeslice(p, rq), &t);
+	task_rq_unlock(rq, &flags);
 	read_unlock(&tasklist_lock);
 	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
 out_nounlock:
@@ -4805,11 +4548,12 @@ void __cpuinit init_idle(struct task_str
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 
-	idle->timestamp = sched_clock();
-	idle->sleep_avg = 0;
+	idle->timestamp = sched_clock_us();
 	idle->array = NULL;
 	idle->prio = idle->normal_prio = MAX_PRIO;
 	idle->state = TASK_RUNNING;
+	idle->used_slice = 0;
+	idle->over_slice = 0;
 	idle->cpus_allowed = cpumask_of_cpu(cpu);
 	set_task_cpu(idle, cpu);
 
@@ -4928,16 +4672,8 @@ static int __migrate_task(struct task_st
 
 	set_task_cpu(p, dest_cpu);
 	if (p->array) {
-		/*
-		 * Sync timestamp with rq_dest's before activating.
-		 * The same thing could be achieved by doing this step
-		 * afterwards, and pretending it was a local activate.
-		 * This way is cleaner and logically correct.
-		 */
-		p->timestamp = p->timestamp - rq_src->most_recent_timestamp
-				+ rq_dest->most_recent_timestamp;
 		deactivate_task(p, rq_src);
-		__activate_task(p, rq_dest);
+		activate_task(p, rq_dest, 0);
 		if (TASK_PREEMPTS_CURR(p, rq_dest))
 			resched_task(rq_dest->curr);
 	}
@@ -6769,9 +6505,9 @@ void __init sched_init(void)
 		spin_lock_init(&rq->lock);
 		lockdep_set_class(&rq->lock, &rq->rq_lock_key);
 		rq->nr_running = 0;
+		rq->raw_weighted_load = 0;
 		rq->active = rq->arrays;
 		rq->expired = rq->arrays + 1;
-		rq->best_expired_prio = MAX_PRIO;
 
 #ifdef CONFIG_SMP
 		rq->sd = NULL;
@@ -6787,11 +6523,12 @@ void __init sched_init(void)
 
 		for (j = 0; j < 2; j++) {
 			array = rq->arrays + j;
+			array->min_static_prio = MAX_PRIO;
 			for (k = 0; k < MAX_PRIO; k++) {
 				INIT_LIST_HEAD(array->queue + k);
 				__clear_bit(k, array->bitmap);
 			}
-			// delimiter for bitsearch
+			/* delimiter for bitsearch */
 			__set_bit(MAX_PRIO, array->bitmap);
 		}
 	}
@@ -6864,10 +6601,10 @@ void normalize_rt_tasks(void)
 
 		array = p->array;
 		if (array)
-			deactivate_task(p, task_rq(p));
+			deactivate_task(p, rq);
 		__setscheduler(p, SCHED_NORMAL, 0);
 		if (array) {
-			__activate_task(p, task_rq(p));
+			__activate_task(p, rq, array);
 			resched_task(rq->curr);
 		}
 
Index: linux-2.6/mm/oom_kill.c
===================================================================
--- linux-2.6.orig/mm/oom_kill.c	2007-03-29 19:46:00.000000000 +1000
+++ linux-2.6/mm/oom_kill.c	2007-03-29 19:46:05.000000000 +1000
@@ -287,11 +287,10 @@ static void __oom_kill_task(struct task_
 		printk(KERN_ERR "Killed process %d (%s)\n", p->pid, p->comm);
 
 	/*
-	 * We give our sacrificial lamb high priority and access to
-	 * all the memory it needs. That way it should be able to
-	 * exit() and clear out its resources quickly...
+	 * We give our sacrificial lamb access to all the memory it needs.
+	 * That way it should be able to exit() and clear out its resources
+	 * quickly...
 	 */
-	p->time_slice = HZ;
 	set_tsk_thread_flag(p, TIF_MEMDIE);
 
 	force_sig(SIGKILL, p);
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c	2007-03-29 19:45:08.000000000 +1000
+++ linux-2.6/kernel/sysctl.c	2007-03-29 19:46:05.000000000 +1000
@@ -76,6 +76,9 @@ extern int pid_max_min, pid_max_max;
 extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
 extern int compat_log;
+extern int sched_base_timeslice;
+extern int sched_min_base;
+extern int sched_max_base;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
@@ -603,7 +606,17 @@ static ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
-
+	{
+		.ctl_name	= KERN_SCHED_TIMESLICE,
+		.procname	= "base_timeslice",
+		.data		= &sched_base_timeslice,
+		.maxlen		= sizeof (sched_base_timeslice),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &sched_min_base,
+		.extra2		= &sched_max_base,
+	},
 	{ .ctl_name = 0 }
 };
 
@@ -859,6 +872,7 @@ static ctl_table vm_table[] = {
 		.extra1		= &zero,
 	},
 #endif
+
 	{ .ctl_name = 0 }
 };
 
Index: linux-2.6/include/linux/init_task.h
===================================================================
--- linux-2.6.orig/include/linux/init_task.h	2007-03-29 19:45:08.000000000 +1000
+++ linux-2.6/include/linux/init_task.h	2007-03-29 19:46:05.000000000 +1000
@@ -99,16 +99,15 @@ extern struct group_info init_groups;
 	.usage		= ATOMIC_INIT(2),				\
 	.flags		= 0,						\
 	.lock_depth	= -1,						\
-	.prio		= MAX_PRIO-20,					\
-	.static_prio	= MAX_PRIO-20,					\
-	.normal_prio	= MAX_PRIO-20,					\
+	.prio		= MAX_PRIO-29,					\
+	.static_prio	= MAX_PRIO-29,					\
+	.normal_prio	= MAX_PRIO-29,					\
 	.policy		= SCHED_NORMAL,					\
 	.cpus_allowed	= CPU_MASK_ALL,					\
 	.mm		= NULL,						\
 	.active_mm	= &init_mm,					\
 	.run_list	= LIST_HEAD_INIT(tsk.run_list),			\
 	.ioprio		= 0,						\
-	.time_slice	= HZ,						\
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
 	.ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children),		\
 	.ptrace_list	= LIST_HEAD_INIT(tsk.ptrace_list),		\
Index: linux-2.6/include/linux/sysctl.h
===================================================================
--- linux-2.6.orig/include/linux/sysctl.h	2007-03-29 19:45:08.000000000 +1000
+++ linux-2.6/include/linux/sysctl.h	2007-03-29 19:46:05.000000000 +1000
@@ -165,6 +165,7 @@ enum
 	KERN_MAX_LOCK_DEPTH=74,
 	KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */
 	KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */
+	KERN_SCHED_TIMESLICE=77, /* int: base timeslice for scheduler */
 };
 
 

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2007-03-29 11:59 UTC | newest]

Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-03-29 11:58 [patch 2.6.21-rc5] nicksched v33 Nick Piggin

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.