* [PATCH] [RSDL 5/6] sched: implement rsdl cpu scheduler
@ 2007-03-04 7:06 Con Kolivas
2007-03-05 20:38 ` Matt Mackall
0 siblings, 1 reply; 4+ messages in thread
From: Con Kolivas @ 2007-03-04 7:06 UTC (permalink / raw)
To: linux kernel mailing list, ck list
Implement the "Rotating Staircase DeadLine" RSDL cpu scheduler policy.
Signed-off-by: Con Kolivas <kernel@kolivas.org>
---
include/linux/init_task.h | 2
include/linux/sched.h | 25
kernel/sched.c | 1199 ++++++++++++++++++++++------------------------
3 files changed, 597 insertions(+), 629 deletions(-)
Index: linux-2.6.20-rsdl/kernel/sched.c
===================================================================
--- linux-2.6.20-rsdl.orig/kernel/sched.c 2007-03-04 17:30:25.000000000 +1100
+++ linux-2.6.20-rsdl/kernel/sched.c 2007-03-04 17:30:31.000000000 +1100
@@ -16,6 +16,8 @@
* by Davide Libenzi, preemptible kernel bits by Robert Love.
* 2003-09-03 Interactivity tuning by Con Kolivas.
* 2004-04-02 Scheduler domains code by Nick Piggin
+ * 2007-03-02 Rotating Staircase deadline scheduling policy by Con Kolivas
+ * RSDL v0.26
*/
#include <linux/mm.h>
@@ -73,126 +75,29 @@
#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
+#define SCHED_PRIO(p) ((p)+MAX_RT_PRIO)
+#define MAX_DYN_PRIO (MAX_PRIO + PRIO_RANGE)
/*
- * Some helpers for converting nanosecond timing to jiffy resolution
+ * Preemption needs to take into account that a low priority task can be
+ * at a higher prio due to list merging. Its priority is artificially
+ * elevated and it should be preempted if anything higher priority wakes up.
*/
-#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
-#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
+#define TASK_PREEMPTS_CURR(p, curr) \
+ (((p)->prio < (curr)->prio) || (((p)->prio == (curr)->prio) && \
+ ((p)->static_prio < (curr)->static_prio && \
+ ((curr)->static_prio > (curr)->prio))))
/*
- * These are the 'tuning knobs' of the scheduler:
- *
- * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
- * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
- * Timeslices get refilled after they expire.
- */
-#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
-#define DEF_TIMESLICE (100 * HZ / 1000)
-#define ON_RUNQUEUE_WEIGHT 30
-#define CHILD_PENALTY 95
-#define PARENT_PENALTY 100
-#define EXIT_WEIGHT 3
-#define PRIO_BONUS_RATIO 25
-#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
-#define INTERACTIVE_DELTA 2
-#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
-#define STARVATION_LIMIT (MAX_SLEEP_AVG)
-#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
-
-/*
- * If a task is 'interactive' then we reinsert it in the active
- * array after it has expired its current timeslice. (it will not
- * continue to run immediately, it will still roundrobin with
- * other interactive tasks.)
- *
- * This part scales the interactivity limit depending on niceness.
- *
- * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
- * Here are a few examples of different nice levels:
- *
- * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
- * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
- * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
- * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
- * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
- *
- * (the X axis represents the possible -5 ... 0 ... +5 dynamic
- * priority range a task can explore, a value of '1' means the
- * task is rated interactive.)
- *
- * Ie. nice +19 tasks can never get 'interactive' enough to be
- * reinserted into the active array. And only heavily CPU-hog nice -20
- * tasks will be expired. Default nice 0 tasks are somewhere between,
- * it takes some effort for them to get interactive, but it's not
- * too hard.
- */
-
-#define CURRENT_BONUS(p) \
- (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
- MAX_SLEEP_AVG)
-
-#define GRANULARITY (10 * HZ / 1000 ? : 1)
-
-#ifdef CONFIG_SMP
-#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
- (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
- num_online_cpus())
-#else
-#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
- (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
-#endif
-
-#define SCALE(v1,v1_max,v2_max) \
- (v1) * (v2_max) / (v1_max)
-
-#define DELTA(p) \
- (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
- INTERACTIVE_DELTA)
-
-#define TASK_INTERACTIVE(p) \
- ((p)->prio <= (p)->static_prio - DELTA(p))
-
-#define INTERACTIVE_SLEEP(p) \
- (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
- (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
-
-#define TASK_PREEMPTS_CURR(p, rq) \
- ((p)->prio < (rq)->curr->prio)
-
-#define SCALE_PRIO(x, prio) \
- max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
-
-static unsigned int static_prio_timeslice(int static_prio)
-{
- if (static_prio < NICE_TO_PRIO(0))
- return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
- else
- return SCALE_PRIO(DEF_TIMESLICE, static_prio);
-}
-
-/*
- * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
- * to time slice values: [800ms ... 100ms ... 5ms]
- *
- * The higher a thread's priority, the bigger timeslices
- * it gets during one round of execution. But even the lowest
- * priority thread gets MIN_TIMESLICE worth of execution time.
- */
-
-static inline unsigned int task_timeslice(struct task_struct *p)
-{
- return static_prio_timeslice(p->static_prio);
-}
-
-/*
- * These are the runqueue data structures:
+ * This is the time all tasks within the same priority round robin.
+ * Set to a minimum of 6ms.
*/
+#define RR_INTERVAL ((6 * HZ / 1001) + 1)
+#define DEF_TIMESLICE (RR_INTERVAL * 19)
struct prio_array {
- unsigned int nr_active;
- DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
struct list_head queue[MAX_PRIO];
+ /* Tasks queued at each priority */
};
/*
@@ -224,14 +129,40 @@ struct rq {
*/
unsigned long nr_uninterruptible;
- unsigned long expired_timestamp;
/* Cached timestamp set by update_cpu_clock() */
unsigned long long most_recent_timestamp;
struct task_struct *curr, *idle;
unsigned long next_balance;
struct mm_struct *prev_mm;
+
+ DECLARE_BITMAP(dyn_bitmap, MAX_DYN_PRIO + 1);
+ /*
+ * The bitmap of priorities queued; The extra PRIO_RANGE at the end
+ * is for a bitmap of expired tasks queued. This minimises the number
+ * of bit lookups over prio_array swaps. The dynamic bits can have
+ * false positives. Include 1 bit for delimiter.
+ */
+
+ DECLARE_BITMAP(static_bitmap, MAX_PRIO);
+ /* The bitmap of all static priorities queued */
+
+ unsigned long prio_queued[MAX_PRIO];
+ /* The number of tasks at each static priority */
+
+ long prio_quota[PRIO_RANGE];
+ /*
+ * The quota of ticks the runqueue runs at each dynamic priority
+ * before cycling to the next priority.
+ */
+
struct prio_array *active, *expired, arrays[2];
- int best_expired_prio;
+
+ int prio_level;
+ /* The current dynamic priority level this runqueue is at */
+
+ unsigned long prio_rotation;
+ /* How many times we have rotated the priority queue */
+
atomic_t nr_iowait;
#ifdef CONFIG_SMP
@@ -569,12 +500,9 @@ static inline struct rq *this_rq_lock(vo
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
/*
* Called when a process is dequeued from the active array and given
- * the cpu. We should note that with the exception of interactive
- * tasks, the expired queue will become the active queue after the active
- * queue is empty, without explicitly dequeuing and requeuing tasks in the
- * expired queue. (Interactive tasks may be requeued directly to the
- * active queue, thus delaying tasks in the expired queue from running;
- * see scheduler_tick()).
+ * the cpu. We should note that the expired queue will become the active
+ * queue after the active queue is empty, without explicitly dequeuing and
+ * requeuing tasks in the expired queue.
*
* This function is only called from sched_info_arrive(), rather than
* dequeue_task(). Even though a task may be queued and dequeued multiple
@@ -672,71 +600,167 @@ sched_info_switch(struct task_struct *pr
#define sched_info_switch(t, next) do { } while (0)
#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
+static inline int task_queued(struct task_struct *task)
+{
+ return !list_empty(&task->run_list);
+}
+
+static inline void set_task_entitlement(struct task_struct *p)
+{
+ __set_bit(USER_PRIO(p->prio), p->bitmap);
+
+ /*
+ * In the case this task has been part of a merged list that has
+ * made it to higher priority than it should be, we remove the
+ * quota from its own priority since it will get a quota at this
+ * priority.
+ */
+ if (p->normal_prio < p->static_prio)
+ __set_bit(USER_PRIO(p->static_prio), p->bitmap);
+ p->time_slice = p->quota;
+}
+
/*
- * Adding/removing a task to/from a priority array:
+ * Only the static_bitmap has hard accounting. The dynamic bits can have
+ * false positives. rt_tasks can only be on the active queue.
*/
-static void dequeue_task(struct task_struct *p, struct prio_array *array)
+static inline void set_dynamic_bit(struct task_struct *p, struct rq *rq)
{
- array->nr_active--;
- list_del(&p->run_list);
- if (list_empty(array->queue + p->prio))
- __clear_bit(p->prio, array->bitmap);
+ if (p->array == rq->active)
+ __set_bit(p->prio, rq->dyn_bitmap);
+ else
+ __set_bit(p->prio + PRIO_RANGE, rq->dyn_bitmap);
}
-static void enqueue_task(struct task_struct *p, struct prio_array *array)
+static inline void set_queue_bits(struct rq *rq, struct task_struct *p)
{
- sched_info_queued(p);
- list_add_tail(&p->run_list, array->queue + p->prio);
- __set_bit(p->prio, array->bitmap);
- array->nr_active++;
- p->array = array;
+ __set_bit(p->static_prio, rq->static_bitmap);
+ set_dynamic_bit(p, rq);
}
/*
- * Put task to the end of the run list without the overhead of dequeue
- * followed by enqueue.
+ * Removing from a runqueue. While we don't know with absolute certainty
+ * where this task really is, the p->array and p->prio are very likely
+ * so we check that queue to see if we can clear that bit to take some
+ * load off finding false positives in next_dynamic_task().
*/
-static void requeue_task(struct task_struct *p, struct prio_array *array)
+static void dequeue_task(struct task_struct *p, struct rq *rq)
{
- list_move_tail(&p->run_list, array->queue + p->prio);
+ list_del_init(&p->run_list);
+ if (!--rq->prio_queued[p->static_prio])
+ __clear_bit(p->static_prio, rq->static_bitmap);
+ if (list_empty(p->array->queue + p->prio)) {
+ int bitmap_prio = p->prio;
+
+ if (p->array == rq->expired)
+ bitmap_prio += PRIO_RANGE;
+ __clear_bit(bitmap_prio, rq->dyn_bitmap);
+ }
}
-static inline void
-enqueue_task_head(struct task_struct *p, struct prio_array *array)
+/*
+ * The task is being queued on a fresh array so it has its entitlement
+ * bitmap cleared.
+ */
+static inline void task_new_array(struct task_struct *p, struct rq *rq)
+{
+ bitmap_zero(p->bitmap, PRIO_RANGE);
+ p->rotation = rq->prio_rotation;
+}
+
+#define rq_quota(rq, prio) ((rq)->prio_quota[USER_PRIO(prio)])
+/*
+ * recalc_task_prio determines what prio a non rt_task will be
+ * queued at. If the task has already been running during this runqueue's
+ * major rotation (rq->prio_rotation) then it continues at the same
+ * priority if it has tick entitlement left. If it does not have entitlement
+ * left, it finds the next priority slot according to its nice value that it
+ * has not extracted quota from. If it has not run during this major
+ * rotation, it starts at its static priority and has its bitmap quota
+ * cleared. If it does not have any slots left it has all its slots reset and
+ * is queued on the expired at its static priority.
+ */
+static void recalc_task_prio(struct task_struct *p, struct rq *rq)
{
- list_add(&p->run_list, array->queue + p->prio);
- __set_bit(p->prio, array->bitmap);
- array->nr_active++;
+ struct prio_array *array = rq->active;
+ int queue_prio, search_prio;
+
+ if (p->rotation == rq->prio_rotation && p->array == array) {
+ if (p->time_slice && rq_quota(rq, p->prio))
+ return;
+ } else
+ task_new_array(p, rq);
+ search_prio = p->static_prio;
+
+ /*
+ * SCHED_BATCH tasks never start at better priority than any other
+ * task that is already running since they are flagged as latency
+ * insensitive. This means they never cause greater latencies in other
+ * non SCHED_BATCH tasks of the same nice level.
+ */
+ if (unlikely(p->policy == SCHED_BATCH))
+ search_prio = max(p->static_prio, rq->prio_level);
+ queue_prio = SCHED_PRIO(find_next_zero_bit(p->bitmap, PRIO_RANGE,
+ USER_PRIO(search_prio)));
+ if (queue_prio == MAX_PRIO) {
+ queue_prio = p->static_prio;
+ array = rq->expired;
+ bitmap_zero(p->bitmap, PRIO_RANGE);
+ } else
+ rq_quota(rq, queue_prio) += p->quota;
+ p->prio = p->normal_prio = queue_prio;
p->array = array;
+ set_task_entitlement(p);
}
/*
- * __normal_prio - return the priority that is based on the static
- * priority but is modified by bonuses/penalties.
- *
- * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
- * into the -5 ... 0 ... +5 bonus/penalty range.
- *
- * We use 25% of the full 0...39 priority range so that:
- *
- * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
- * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
- *
- * Both properties are important to certain workloads.
+ * Adding to a runqueue. The dynamic priority queue that it is added to is
+ * determined by the priority rotation of the runqueue it is being added to
+ * and the quota still available in the task in p->bitmap and p->time_slice
+ * (see recalc_task_prio above). The rq static_bitmap stores a list of
+ * the static priorities, and prio_queued the number of tasks stored at each
+ * p->static_prio level.
*/
+static inline void __enqueue_task(struct task_struct *p, struct rq *rq)
+{
+ if (rt_task(p))
+ p->array = rq->active;
+ else
+ recalc_task_prio(p, rq);
+ rq->prio_queued[p->static_prio]++;
-static inline int __normal_prio(struct task_struct *p)
+ sched_info_queued(p);
+ set_queue_bits(rq, p);
+}
+
+static void enqueue_task(struct task_struct *p, struct rq *rq)
{
- int bonus, prio;
+ __enqueue_task(p, rq);
+ list_add_tail(&p->run_list, p->array->queue + p->prio);
+}
- bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
+static inline void enqueue_task_head(struct task_struct *p, struct rq *rq)
+{
+ __enqueue_task(p, rq);
+ list_add(&p->run_list, p->array->queue + p->prio);
+}
- prio = p->static_prio - bonus;
- if (prio < MAX_RT_PRIO)
- prio = MAX_RT_PRIO;
- if (prio > MAX_PRIO-1)
- prio = MAX_PRIO-1;
- return prio;
+/*
+ * requeue_task is only called when p->static_prio does not change. p->prio
+ * can change with dynamic tasks.
+ */
+static void requeue_task(struct task_struct *p, struct rq *rq,
+ struct prio_array *old_array, int old_prio)
+{
+ list_move_tail(&p->run_list, p->array->queue + p->prio);
+ if (!rt_task(p)) {
+ if (list_empty(old_array->queue + old_prio)) {
+ if (old_array == rq->expired)
+ old_prio += PRIO_RANGE;
+ __clear_bit(old_prio, rq->dyn_bitmap);
+ }
+ set_dynamic_bit(p, rq);
+ }
}
/*
@@ -749,6 +773,20 @@ static inline int __normal_prio(struct t
*/
/*
+ * task_timeslice - the total duration a task can run during one major
+ * rotation.
+ */
+static inline unsigned int task_timeslice(struct task_struct *p)
+{
+ unsigned int slice, rr;
+
+ slice = rr = p->quota;
+ if (likely(!rt_task(p)))
+ slice += (PRIO_RANGE - 1 - TASK_USER_PRIO(p)) * rr;
+ return slice;
+}
+
+/*
* Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
* If static_prio_timeslice() is ever changed to break this assumption then
* this code will need modification
@@ -756,10 +794,9 @@ static inline int __normal_prio(struct t
#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
#define LOAD_WEIGHT(lp) \
(((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
-#define PRIO_TO_LOAD_WEIGHT(prio) \
- LOAD_WEIGHT(static_prio_timeslice(prio))
-#define RTPRIO_TO_LOAD_WEIGHT(rp) \
- (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
+#define TASK_LOAD_WEIGHT(p) LOAD_WEIGHT(task_timeslice(p))
+#define RTPRIO_TO_LOAD_WEIGHT(rp) \
+ (LOAD_WEIGHT((RR_INTERVAL + 20 + (rp))))
static void set_load_weight(struct task_struct *p)
{
@@ -776,7 +813,7 @@ static void set_load_weight(struct task_
#endif
p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
} else
- p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
+ p->load_weight = TASK_LOAD_WEIGHT(p);
}
static inline void
@@ -804,28 +841,35 @@ static inline void dec_nr_running(struct
}
/*
- * Calculate the expected normal priority: i.e. priority
- * without taking RT-inheritance into account. Might be
- * boosted by interactivity modifiers. Changes upon fork,
- * setprio syscalls, and whenever the interactivity
- * estimator recalculates.
+ * __activate_task - move a task to the runqueue.
*/
-static inline int normal_prio(struct task_struct *p)
+static inline void __activate_task(struct task_struct *p, struct rq *rq)
+{
+ enqueue_task(p, rq);
+ inc_nr_running(p, rq);
+}
+
+/*
+ * __activate_idle_task - move idle task to the _front_ of runqueue.
+ */
+static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
{
- int prio;
+ enqueue_task_head(p, rq);
+ inc_nr_running(p, rq);
+}
+static inline int normal_prio(struct task_struct *p)
+{
if (has_rt_policy(p))
- prio = MAX_RT_PRIO-1 - p->rt_priority;
- else
- prio = __normal_prio(p);
- return prio;
+ return MAX_RT_PRIO-1 - p->rt_priority;
+ /* Other tasks all have normal_prio set in recalc_task_prio */
+ return p->static_prio;
}
/*
* Calculate the current priority, i.e. the priority
* taken into account by the scheduler. This value might
- * be boosted by RT tasks, or might be boosted by
- * interactivity modifiers. Will be RT if the task got
+ * be boosted by RT tasks as it will be RT if the task got
* RT-boosted. If not then it returns p->normal_prio.
*/
static int effective_prio(struct task_struct *p)
@@ -842,111 +886,26 @@ static int effective_prio(struct task_st
}
/*
- * __activate_task - move a task to the runqueue.
- */
-static void __activate_task(struct task_struct *p, struct rq *rq)
-{
- struct prio_array *target = rq->active;
-
- if (batch_task(p))
- target = rq->expired;
- enqueue_task(p, target);
- inc_nr_running(p, rq);
-}
-
-/*
- * __activate_idle_task - move idle task to the _front_ of runqueue.
- */
-static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
-{
- enqueue_task_head(p, rq->active);
- inc_nr_running(p, rq);
-}
-
-/*
- * Recalculate p->normal_prio and p->prio after having slept,
- * updating the sleep-average too:
+ * All tasks have quotas based on RR_INTERVAL. From nice 0 to 19 they are
+ * all equal to it and below zero they get progressively larger making their
+ * effective quota significantly larger. rt tasks all get RR_INTERVAL.
*/
-static int recalc_task_prio(struct task_struct *p, unsigned long long now)
+static unsigned int rr_interval(struct task_struct *p)
{
- /* Caller must always ensure 'now >= p->timestamp' */
- unsigned long sleep_time = now - p->timestamp;
+ int nice = TASK_NICE(p);
- if (batch_task(p))
- sleep_time = 0;
-
- if (likely(sleep_time > 0)) {
- /*
- * This ceiling is set to the lowest priority that would allow
- * a task to be reinserted into the active array on timeslice
- * completion.
- */
- unsigned long ceiling = INTERACTIVE_SLEEP(p);
-
- if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
- /*
- * Prevents user tasks from achieving best priority
- * with one single large enough sleep.
- */
- p->sleep_avg = ceiling;
- /*
- * Using INTERACTIVE_SLEEP() as a ceiling places a
- * nice(0) task 1ms sleep away from promotion, and
- * gives it 700ms to round-robin with no chance of
- * being demoted. This is more than generous, so
- * mark this sleep as non-interactive to prevent the
- * on-runqueue bonus logic from intervening should
- * this task not receive cpu immediately.
- */
- p->sleep_type = SLEEP_NONINTERACTIVE;
- } else {
- /*
- * Tasks waking from uninterruptible sleep are
- * limited in their sleep_avg rise as they
- * are likely to be waiting on I/O
- */
- if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
- if (p->sleep_avg >= ceiling)
- sleep_time = 0;
- else if (p->sleep_avg + sleep_time >=
- ceiling) {
- p->sleep_avg = ceiling;
- sleep_time = 0;
- }
- }
-
- /*
- * This code gives a bonus to interactive tasks.
- *
- * The boost works by updating the 'average sleep time'
- * value here, based on ->timestamp. The more time a
- * task spends sleeping, the higher the average gets -
- * and the higher the priority boost gets as well.
- */
- p->sleep_avg += sleep_time;
-
- }
- if (p->sleep_avg > NS_MAX_SLEEP_AVG)
- p->sleep_avg = NS_MAX_SLEEP_AVG;
- }
-
- return effective_prio(p);
+ if (nice < 0 && !rt_task(p))
+ return RR_INTERVAL * (20 - nice) / 20;
+ return RR_INTERVAL;
}
/*
* activate_task - move a task to the runqueue and do priority recalculation
- *
- * Update all the scheduling statistics stuff. (sleep average
- * calculation, priority modifiers, etc.)
*/
static void activate_task(struct task_struct *p, struct rq *rq, int local)
{
- unsigned long long now;
-
- if (rt_task(p))
- goto out;
+ unsigned long long now = sched_clock();
- now = sched_clock();
#ifdef CONFIG_SMP
if (!local) {
/* Compensate for drifting sched_clock */
@@ -967,32 +926,9 @@ static void activate_task(struct task_st
(now - p->timestamp) >> 20);
}
- p->prio = recalc_task_prio(p, now);
-
- /*
- * This checks to make sure it's not an uninterruptible task
- * that is now waking up.
- */
- if (p->sleep_type == SLEEP_NORMAL) {
- /*
- * Tasks which were woken up by interrupts (ie. hw events)
- * are most likely of interactive nature. So we give them
- * the credit of extending their sleep time to the period
- * of time they spend on the runqueue, waiting for execution
- * on a CPU, first time around:
- */
- if (in_interrupt())
- p->sleep_type = SLEEP_INTERRUPTED;
- else {
- /*
- * Normal first-time wakeups get a credit too for
- * on-runqueue time, but it will be weighted down:
- */
- p->sleep_type = SLEEP_INTERACTIVE;
- }
- }
+ p->quota = rr_interval(p);
+ p->prio = effective_prio(p);
p->timestamp = now;
-out:
__activate_task(p, rq);
}
@@ -1002,8 +938,7 @@ out:
static void deactivate_task(struct task_struct *p, struct rq *rq)
{
dec_nr_running(p, rq);
- dequeue_task(p, p->array);
- p->array = NULL;
+ dequeue_task(p, rq);
}
/*
@@ -1085,7 +1020,7 @@ migrate_task(struct task_struct *p, int
* If the task is not on a runqueue (and not running), then
* it is sufficient to simply update the task's cpu field.
*/
- if (!p->array && !task_running(rq, p)) {
+ if (!task_queued(p) && !task_running(rq, p)) {
set_task_cpu(p, dest_cpu);
return 0;
}
@@ -1116,7 +1051,7 @@ void wait_task_inactive(struct task_stru
repeat:
rq = task_rq_lock(p, &flags);
/* Must be off runqueue entirely, not preempted. */
- if (unlikely(p->array || task_running(rq, p))) {
+ if (unlikely(task_queued(p) || task_running(rq, p))) {
/* If it's preempted, we yield. It could be a while. */
preempted = !task_running(rq, p);
task_rq_unlock(rq, &flags);
@@ -1381,6 +1316,20 @@ static inline int wake_idle(int cpu, str
}
#endif
+static inline int task_preempts_curr(struct task_struct *p, struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+
+ return ((p->array == task_rq(p)->active &&
+ TASK_PREEMPTS_CURR(p, curr)) || curr == rq->idle);
+}
+
+static inline void try_preempt(struct task_struct *p, struct rq *rq)
+{
+ if (task_preempts_curr(p, rq))
+ resched_task(rq->curr);
+}
+
/***
* try_to_wake_up - wake up a thread
* @p: the to-be-woken-up thread
@@ -1412,7 +1361,7 @@ static int try_to_wake_up(struct task_st
if (!(old_state & state))
goto out;
- if (p->array)
+ if (task_queued(p))
goto out_running;
cpu = task_cpu(p);
@@ -1505,7 +1454,7 @@ out_set_cpu:
old_state = p->state;
if (!(old_state & state))
goto out;
- if (p->array)
+ if (task_queued(p))
goto out_running;
this_cpu = smp_processor_id();
@@ -1514,26 +1463,10 @@ out_set_cpu:
out_activate:
#endif /* CONFIG_SMP */
- if (old_state == TASK_UNINTERRUPTIBLE) {
+ if (old_state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible--;
- /*
- * Tasks on involuntary sleep don't earn
- * sleep_avg beyond just interactive state.
- */
- p->sleep_type = SLEEP_NONINTERACTIVE;
- } else
/*
- * Tasks that have marked their sleep as noninteractive get
- * woken up with their sleep average not weighted in an
- * interactive way.
- */
- if (old_state & TASK_NONINTERACTIVE)
- p->sleep_type = SLEEP_NONINTERACTIVE;
-
-
- activate_task(p, rq, cpu == this_cpu);
- /*
* Sync wakeups (i.e. those types of wakeups where the waker
* has indicated that it will leave the CPU in short order)
* don't trigger a preemption, if the woken up task will run on
@@ -1541,10 +1474,9 @@ out_activate:
* the waker guarantees that the freshly woken up task is going
* to be considered on this CPU.)
*/
- if (!sync || cpu != this_cpu) {
- if (TASK_PREEMPTS_CURR(p, rq))
- resched_task(rq->curr);
- }
+ activate_task(p, rq, cpu == this_cpu);
+ if (!sync || cpu != this_cpu)
+ try_preempt(p, rq);
success = 1;
out_running:
@@ -1567,7 +1499,7 @@ int fastcall wake_up_state(struct task_s
return try_to_wake_up(p, state, 0);
}
-static void task_running_tick(struct rq *rq, struct task_struct *p);
+static void task_expired_entitlement(struct rq *rq, struct task_struct *p);
/*
* Perform scheduler related setup for a newly forked process p.
* p is forked by current.
@@ -1595,7 +1527,6 @@ void fastcall sched_fork(struct task_str
p->prio = current->normal_prio;
INIT_LIST_HEAD(&p->run_list);
- p->array = NULL;
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
if (unlikely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -1607,6 +1538,8 @@ void fastcall sched_fork(struct task_str
/* Want to start with kernel preemption disabled. */
task_thread_info(p)->preempt_count = 1;
#endif
+ if (unlikely(p->policy == SCHED_FIFO))
+ goto out;
/*
* Share the timeslice between parent and child, thus the
* total amount of pending timeslices in the system doesn't change,
@@ -1621,16 +1554,19 @@ void fastcall sched_fork(struct task_str
p->first_time_slice = 1;
current->time_slice >>= 1;
p->timestamp = sched_clock();
- if (unlikely(!current->time_slice)) {
+ if (!current->time_slice) {
/*
- * This case is rare, it happens when the parent has only
- * a single jiffy left from its timeslice. Taking the
- * runqueue lock is not a problem.
+ * This case happens when the parent has only a single jiffy
+ * left from its timeslice. Taking the runqueue lock is not
+ * a problem.
*/
- current->time_slice = 1;
- task_running_tick(cpu_rq(cpu), current);
+ struct rq *rq = __task_rq_lock(current);
+
+ task_expired_entitlement(rq, current);
+ __task_rq_unlock(rq);
}
local_irq_enable();
+out:
put_cpu();
}
@@ -1652,38 +1588,16 @@ void fastcall wake_up_new_task(struct ta
this_cpu = smp_processor_id();
cpu = task_cpu(p);
- /*
- * We decrease the sleep average of forking parents
- * and children as well, to keep max-interactive tasks
- * from forking tasks that are max-interactive. The parent
- * (current) is done further down, under its lock.
- */
- p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
- CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
-
- p->prio = effective_prio(p);
-
if (likely(cpu == this_cpu)) {
+ activate_task(p, rq, 1);
if (!(clone_flags & CLONE_VM)) {
/*
* The VM isn't cloned, so we're in a good position to
* do child-runs-first in anticipation of an exec. This
* usually avoids a lot of COW overhead.
*/
- if (unlikely(!current->array))
- __activate_task(p, rq);
- else {
- p->prio = current->prio;
- p->normal_prio = current->normal_prio;
- list_add_tail(&p->run_list, ¤t->run_list);
- p->array = current->array;
- p->array->nr_active++;
- inc_nr_running(p, rq);
- }
set_need_resched();
- } else
- /* Run child last */
- __activate_task(p, rq);
+ }
/*
* We skip the following code due to cpu == this_cpu
*
@@ -1700,19 +1614,16 @@ void fastcall wake_up_new_task(struct ta
*/
p->timestamp = (p->timestamp - this_rq->most_recent_timestamp)
+ rq->most_recent_timestamp;
- __activate_task(p, rq);
- if (TASK_PREEMPTS_CURR(p, rq))
- resched_task(rq->curr);
+ activate_task(p, rq, 0);
+ try_preempt(p, rq);
/*
* Parent and child are on different CPUs, now get the
- * parent runqueue to update the parent's ->sleep_avg:
+ * parent runqueue to update the parent's ->flags:
*/
task_rq_unlock(rq, &flags);
this_rq = task_rq_lock(current, &flags);
}
- current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
- PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
task_rq_unlock(this_rq, &flags);
}
@@ -1730,20 +1641,12 @@ void fastcall sched_exit(struct task_str
unsigned long flags;
struct rq *rq;
- /*
- * If the child was a (relative-) CPU hog then decrease
- * the sleep_avg of the parent as well.
- */
rq = task_rq_lock(p->parent, &flags);
if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
p->parent->time_slice += p->time_slice;
- if (unlikely(p->parent->time_slice > task_timeslice(p)))
- p->parent->time_slice = task_timeslice(p);
+ if (unlikely(p->parent->time_slice > p->quota))
+ p->parent->time_slice = p->quota;
}
- if (p->sleep_avg < p->parent->sleep_avg)
- p->parent->sleep_avg = p->parent->sleep_avg /
- (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
- (EXIT_WEIGHT + 1);
task_rq_unlock(rq, &flags);
}
@@ -2070,21 +1973,27 @@ void sched_exec(void)
*/
static void pull_task(struct rq *src_rq, struct prio_array *src_array,
struct task_struct *p, struct rq *this_rq,
- struct prio_array *this_array, int this_cpu)
+ int this_cpu)
{
- dequeue_task(p, src_array);
+ dequeue_task(p, src_rq);
dec_nr_running(p, src_rq);
set_task_cpu(p, this_cpu);
inc_nr_running(p, this_rq);
- enqueue_task(p, this_array);
+
+ /*
+ * If this task has already been running on src_rq this priority
+ * cycle, make the new runqueue think it has been on its cycle
+ */
+ if (p->rotation == src_rq->prio_rotation)
+ p->rotation = this_rq->prio_rotation;
+ enqueue_task(p, this_rq);
p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
+ this_rq->most_recent_timestamp;
/*
* Note that idle threads have a prio of MAX_PRIO, for this test
* to be always true for them.
*/
- if (TASK_PREEMPTS_CURR(p, this_rq))
- resched_task(this_rq->curr);
+ try_preempt(p, this_rq);
}
/*
@@ -2127,8 +2036,6 @@ int can_migrate_task(struct task_struct
return 1;
}
-#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
-
/*
* move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
* load from busiest to this_rq, as part of a balancing operation within
@@ -2141,9 +2048,9 @@ static int move_tasks(struct rq *this_rq
struct sched_domain *sd, enum idle_type idle,
int *all_pinned)
{
- int idx, pulled = 0, pinned = 0, this_best_prio, best_prio,
+ int idx, test_idx, pulled = 0, pinned = 0, this_best_prio, best_prio,
best_prio_seen, skip_for_load;
- struct prio_array *array, *dst_array;
+ struct prio_array *array;
struct list_head *head, *curr;
struct task_struct *tmp;
long rem_load_move;
@@ -2153,8 +2060,8 @@ static int move_tasks(struct rq *this_rq
rem_load_move = max_load_move;
pinned = 1;
- this_best_prio = rq_best_prio(this_rq);
- best_prio = rq_best_prio(busiest);
+ this_best_prio = this_rq->curr->prio;
+ best_prio = busiest->curr->prio;
/*
* Enable handling of the case where there is more than one task
* with the best priority. If the current running task is one
@@ -2168,32 +2075,35 @@ static int move_tasks(struct rq *this_rq
* We first consider expired tasks. Those will likely not be
* executed in the near future, and they are most likely to
* be cache-cold, thus switching CPUs has the least effect
- * on them.
+ * on them. This is done by starting the search at priority
+ * MAX_PRIO since expired bits are MAX_PRIO...MAX_DYN_PRIO-1
*/
- if (busiest->expired->nr_active) {
- array = busiest->expired;
- dst_array = this_rq->expired;
- } else {
- array = busiest->active;
- dst_array = this_rq->active;
- }
-
-new_array:
- /* Start searching at priority 0: */
- idx = 0;
+ array = busiest->expired;
+ test_idx = MAX_PRIO;
skip_bitmap:
- if (!idx)
- idx = sched_find_first_bit(array->bitmap);
+ if (!test_idx)
+ idx = sched_find_first_bit(busiest->dyn_bitmap);
else
- idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
- if (idx >= MAX_PRIO) {
- if (array == busiest->expired && busiest->active->nr_active) {
+ idx = find_next_bit(busiest->dyn_bitmap, MAX_DYN_PRIO,
+ test_idx);
+ if (idx >= MAX_DYN_PRIO) {
+ if (array == busiest->expired) {
array = busiest->active;
- dst_array = this_rq->active;
- goto new_array;
+ test_idx = 0;
+ goto skip_bitmap;
}
goto out;
}
+ test_idx = idx;
+ if (idx >= MAX_PRIO) {
+ if (array == busiest->active)
+ goto out;
+ idx -= PRIO_RANGE;
+ }
+ if (list_empty(array->queue + idx)) {
+ __clear_bit(test_idx, busiest->dyn_bitmap);
+ goto skip_bitmap;
+ }
head = array->queue + idx;
curr = head->prev;
@@ -2216,11 +2126,11 @@ skip_queue:
best_prio_seen |= idx == best_prio;
if (curr != head)
goto skip_queue;
- idx++;
+ test_idx++;
goto skip_bitmap;
}
- pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
+ pull_task(busiest, array, tmp, this_rq, this_cpu);
pulled++;
rem_load_move -= tmp->load_weight;
@@ -2233,7 +2143,7 @@ skip_queue:
this_best_prio = idx;
if (curr != head)
goto skip_queue;
- idx++;
+ test_idx++;
goto skip_bitmap;
}
out:
@@ -3036,27 +2946,6 @@ unsigned long long current_sched_time(co
}
/*
- * We place interactive tasks back into the active array, if possible.
- *
- * To guarantee that this does not starve expired tasks we ignore the
- * interactivity of a task if the first expired task had to wait more
- * than a 'reasonable' amount of time. This deadline timeout is
- * load-dependent, as the frequency of array switched decreases with
- * increasing number of running tasks. We also ignore the interactivity
- * if a better static_prio task has expired:
- */
-static inline int expired_starving(struct rq *rq)
-{
- if (rq->curr->static_prio > rq->best_expired_prio)
- return 1;
- if (!STARVATION_LIMIT || !rq->expired_timestamp)
- return 0;
- if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
- return 1;
- return 0;
-}
-
-/*
* Account user cpu time to a process.
* @p: the process that the cpu time gets accounted to
* @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3129,87 +3018,137 @@ void account_steal_time(struct task_stru
cpustat->steal = cputime64_add(cpustat->steal, tmp);
}
+/*
+ * The task has used up its quota of running in this prio_level so it must be
+ * dropped a priority level, all managed by recalc_task_prio().
+ */
+static void task_expired_entitlement(struct rq *rq, struct task_struct *p)
+{
+ struct prio_array *old_array;
+ int old_prio;
+
+ set_tsk_need_resched(p);
+ if (unlikely(p->first_time_slice))
+ p->first_time_slice = 0;
+ if (rt_task(p)) {
+ p->time_slice = p->quota;
+ return;
+ }
+ old_array = p->array;
+ old_prio = p->prio;
+ /* p->prio and p->array will be updated in recalc_task_prio */
+ recalc_task_prio(p, rq);
+ requeue_task(p, rq, old_array, old_prio);
+}
+
+/*
+ * A major priority rotation occurs when all priority quotas for this array
+ * have been exhausted.
+ */
+static inline void major_prio_rotation(struct rq *rq)
+{
+ struct prio_array *new_array = rq->expired;
+
+ rq->expired = rq->active;
+ rq->active = new_array;
+ rq->prio_rotation++;
+ bitmap_zero(rq->dyn_bitmap, MAX_DYN_PRIO);
+ bitmap_copy(rq->dyn_bitmap, rq->static_bitmap, MAX_PRIO);
+ __set_bit(MAX_DYN_PRIO, rq->dyn_bitmap);
+}
+
+/*
+ * This is the heart of the virtual deadline priority management.
+ *
+ * We have used up the quota allocated to this priority level so we rotate
+ * the prio_level of the runqueue to the next lowest priority. We merge any
+ * remaining tasks at this level current_queue with the next priority and
+ * reset this level's queue. MAX_PRIO - 1 is a special case where we perform
+ * a major rotation.
+ */
+static inline void rotate_runqueue_priority(struct rq *rq)
+{
+ int new_prio_level, remaining_quota = rq_quota(rq, rq->prio_level);
+ struct prio_array *array = rq->active;
+
+ if (rq->prio_level > MAX_PRIO - 2) {
+ /* Major rotation required */
+ struct prio_array *new_queue = rq->expired;
+
+ /*
+ * The static_bitmap gives us the highest p->static prio task
+ * that is queued. This value is used as the prio after
+ * the major rotation and all tasks remaining on this
+ * active queue are moved there. This means tasks can end
+ * up a p->prio better than their p->static_prio.
+ */
+ new_prio_level = find_next_bit(rq->static_bitmap, MAX_PRIO,
+ MAX_RT_PRIO);
+ if (!list_empty(array->queue + rq->prio_level)) {
+ list_splice_tail_init(array->queue + rq->prio_level,
+ new_queue->queue + new_prio_level);
+ }
+ memset(rq->prio_quota, 0, ARRAY_SIZE(rq->prio_quota));
+ major_prio_rotation(rq);
+ } else {
+ /* Minor rotation */
+ new_prio_level = rq->prio_level + 1;
+ __clear_bit(rq->prio_level, rq->dyn_bitmap);
+ if (!list_empty(array->queue + rq->prio_level)) {
+ list_splice_tail_init(array->queue + rq->prio_level,
+ array->queue + new_prio_level);
+ __set_bit(new_prio_level, rq->dyn_bitmap);
+ }
+ rq_quota(rq, rq->prio_level) = 0;
+ }
+ rq->prio_level = new_prio_level;
+ /*
+ * While we usually rotate with the rq quota being 0, it is possible
+ * to be negative so we subtract any deficit from the new level.
+ */
+ rq_quota(rq, new_prio_level) += remaining_quota;
+}
+
static void task_running_tick(struct rq *rq, struct task_struct *p)
{
- if (p->array != rq->active) {
+ if (unlikely(!task_queued(p))) {
/* Task has expired but was not scheduled yet */
set_tsk_need_resched(p);
return;
}
+ /* SCHED_FIFO tasks never run out of timeslice. */
+ if (unlikely(p->policy == SCHED_FIFO))
+ return;
+
spin_lock(&rq->lock);
/*
- * The task was running during this tick - update the
- * time slice counter. Note: we do not update a thread's
- * priority until it either goes to sleep or uses up its
- * timeslice. This makes it possible for interactive tasks
- * to use up their timeslices at their highest priority levels.
+ * Accounting is performed by both the task and the runqueue. This
+ * allows frequently sleeping tasks to get their proper quota of
+ * cpu as the runqueue will have their quota still available at
+ * the appropriate priority level. It also means frequently waking
+ * tasks that might miss the scheduler_tick() will get forced down
+ * priority regardless.
+ */
+ if (!--p->time_slice)
+ task_expired_entitlement(rq, p);
+ /*
+ * The rq quota can become negative due to a task being queued in
+ * scheduler without any quota left at that priority level. It is
+ * cheaper to allow it to run till this scheduler tick and then
+ * subtract it from the quota of the merged queues.
*/
- if (rt_task(p)) {
- /*
- * RR tasks need a special form of timeslice management.
- * FIFO tasks have no timeslices.
- */
- if ((p->policy == SCHED_RR) && !--p->time_slice) {
- p->time_slice = task_timeslice(p);
+ if (!rt_task(p) && --rq_quota(rq, rq->prio_level) <= 0) {
+ if (unlikely(p->first_time_slice))
p->first_time_slice = 0;
- set_tsk_need_resched(p);
-
- /* put it at the end of the queue: */
- requeue_task(p, rq->active);
- }
- goto out_unlock;
- }
- if (!--p->time_slice) {
- dequeue_task(p, rq->active);
+ rotate_runqueue_priority(rq);
set_tsk_need_resched(p);
- p->prio = effective_prio(p);
- p->time_slice = task_timeslice(p);
- p->first_time_slice = 0;
-
- if (!rq->expired_timestamp)
- rq->expired_timestamp = jiffies;
- if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
- enqueue_task(p, rq->expired);
- if (p->static_prio < rq->best_expired_prio)
- rq->best_expired_prio = p->static_prio;
- } else
- enqueue_task(p, rq->active);
- } else {
- /*
- * Prevent a too long timeslice allowing a task to monopolize
- * the CPU. We do this by splitting up the timeslice into
- * smaller pieces.
- *
- * Note: this does not mean the task's timeslices expire or
- * get lost in any way, they just might be preempted by
- * another task of equal priority. (one with higher
- * priority would have preempted this task already.) We
- * requeue this task to the end of the list on this priority
- * level, which is in essence a round-robin of tasks with
- * equal priority.
- *
- * This only applies to tasks in the interactive
- * delta range with at least TIMESLICE_GRANULARITY to requeue.
- */
- if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
- p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
- (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
- (p->array == rq->active)) {
-
- requeue_task(p, rq->active);
- set_tsk_need_resched(p);
- }
}
-out_unlock:
spin_unlock(&rq->lock);
}
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
- *
- * It also gets called by the fork code, when changing the parent's
- * timeslices.
*/
void scheduler_tick(void)
{
@@ -3271,6 +3210,12 @@ static void wake_sleeping_dependent(int
}
}
+static inline unsigned long remaining_slice(struct task_struct *p)
+{
+ return p->quota * (MAX_PRIO - 1 - p->prio) +
+ p->time_slice;
+}
+
/*
* number of 'lost' timeslices this task wont be able to fully
* utilize, if another task runs on a sibling. This models the
@@ -3279,7 +3224,7 @@ static void wake_sleeping_dependent(int
static inline unsigned long
smt_slice(struct task_struct *p, struct sched_domain *sd)
{
- return p->time_slice * (100 - sd->per_cpu_gain) / 100;
+ return remaining_slice(p) * (100 - sd->per_cpu_gain) / 100;
}
/*
@@ -3342,7 +3287,7 @@ dependent_sleeper(int this_cpu, struct r
ret = 1;
} else {
if (smt_curr->static_prio < p->static_prio &&
- !TASK_PREEMPTS_CURR(p, smt_rq) &&
+ !task_preempts_curr(p, smt_rq) &&
smt_slice(smt_curr, sd) > task_timeslice(p))
ret = 1;
}
@@ -3400,10 +3345,98 @@ EXPORT_SYMBOL(sub_preempt_count);
#endif
-static inline int interactive_sleep(enum sleep_type sleep_type)
+/*
+ * Leave this debugging in until we are certain all bitmap manipulations are
+ * working as desired since we can safely get out of this situation.
+ */
+static noinline int rq_bitmap_error(struct rq *rq)
+{
+ static int bitmap_error = 0;
+ struct prio_array *array;
+ struct list_head *queue;
+ int idx, test_idx;
+
+ if (!bitmap_error++)
+ printk(KERN_ERR "Scheduler bitmap error - bitmap being reconstructed..\n");
+ for (test_idx = MAX_RT_PRIO ; test_idx < MAX_DYN_PRIO ; test_idx++) {
+ if (test_idx < MAX_PRIO) {
+ idx = test_idx;
+ array = rq->active;
+ } else {
+ idx = test_idx - PRIO_RANGE;
+ array = rq->expired;
+ }
+ queue = array->queue + idx;
+ if (!list_empty(queue)) {
+ if (!test_bit(test_idx, rq->dyn_bitmap)) {
+ __set_bit(test_idx, rq->dyn_bitmap);
+ }
+ }
+ }
+ idx = find_next_bit(rq->dyn_bitmap, MAX_DYN_PRIO, MAX_RT_PRIO);
+ BUG_ON(idx == MAX_DYN_PRIO);
+ return idx;
+}
+
+/*
+ * next_dynamic_task finds the next suitable dynamic task. As the dyn_bitmap
+ * contains all the active and expired dynamic tasks sequentially we only
+ * need to do one bitmap lookup.
+ */
+static inline struct task_struct *next_dynamic_task(struct rq *rq, int idx)
{
- return (sleep_type == SLEEP_INTERACTIVE ||
- sleep_type == SLEEP_INTERRUPTED);
+ struct task_struct *next;
+ struct list_head *queue;
+ struct prio_array *array = rq->active;
+
+retry:
+ if (unlikely(idx == MAX_DYN_PRIO))
+ idx = rq_bitmap_error(rq);
+ if (idx >= MAX_PRIO) {
+ /*
+ * We have selected a bit from the expired range so there are
+ * no more tasks in the active array.
+ */
+ major_prio_rotation(rq);
+ array = rq->active;
+ idx -= PRIO_RANGE;
+ }
+ if (unlikely(list_empty(array->queue + idx))) {
+ /*
+ * This can happen because they are not always cleared on
+ * dequeue_task since they may have been dequeued while
+ * waiting on a runqueue and a rotation has occurred in the
+ * interim. A very rare occurrence.
+ */
+ __clear_bit(idx, rq->dyn_bitmap);
+ idx = find_next_bit(rq->dyn_bitmap, MAX_DYN_PRIO, idx + 1);
+ goto retry;
+ }
+ queue = array->queue + idx;
+ next = list_entry(queue->next, struct task_struct, run_list);
+ /*
+ * When the task is chosen it is checked to see if its quota has been
+ * added to this runqueue level which is only performed once per
+ * level per major rotation for each running task.
+ */
+ if (next->rotation != rq->prio_rotation) {
+ /* Task has moved during major rotation */
+ task_new_array(next, rq);
+ set_task_entitlement(next);
+ rq_quota(rq, idx) += next->quota;
+ } else if (!test_bit(USER_PRIO(idx), next->bitmap)) {
+ /* Task has moved during minor rotation */
+ set_task_entitlement(next);
+ rq_quota(rq, idx) += next->quota;
+ }
+ rq->prio_level = idx;
+ /*
+ * next needs to have its prio and array reset here in case the
+ * values are wrong due to priority rotation.
+ */
+ next->prio = idx;
+ next->array = array;
+ return next;
}
/*
@@ -3412,13 +3445,11 @@ static inline int interactive_sleep(enum
asmlinkage void __sched schedule(void)
{
struct task_struct *prev, *next;
- struct prio_array *array;
struct list_head *queue;
unsigned long long now;
- unsigned long run_time;
- int cpu, idx, new_prio;
long *switch_count;
struct rq *rq;
+ int cpu, idx;
/*
* Test if we are atomic. Since do_exit() needs to call into
@@ -3454,18 +3485,6 @@ need_resched_nonpreemptible:
schedstat_inc(rq, sched_cnt);
now = sched_clock();
- if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
- run_time = now - prev->timestamp;
- if (unlikely((long long)(now - prev->timestamp) < 0))
- run_time = 0;
- } else
- run_time = NS_MAX_SLEEP_AVG;
-
- /*
- * Tasks charged proportionately less run_time at high sleep_avg to
- * delay them losing their interactive status
- */
- run_time /= (CURRENT_BONUS(prev) ? : 1);
spin_lock_irq(&rq->lock);
@@ -3487,47 +3506,19 @@ need_resched_nonpreemptible:
idle_balance(cpu, rq);
if (!rq->nr_running) {
next = rq->idle;
- rq->expired_timestamp = 0;
wake_sleeping_dependent(cpu);
goto switch_tasks;
}
}
- array = rq->active;
- if (unlikely(!array->nr_active)) {
- /*
- * Switch the active and expired arrays.
- */
- schedstat_inc(rq, sched_switch);
- rq->active = rq->expired;
- rq->expired = array;
- array = rq->active;
- rq->expired_timestamp = 0;
- rq->best_expired_prio = MAX_PRIO;
+ idx = sched_find_first_bit(rq->dyn_bitmap);
+ if (!rt_prio(idx))
+ next = next_dynamic_task(rq, idx);
+ else {
+ queue = rq->active->queue + idx;
+ next = list_entry(queue->next, struct task_struct, run_list);
}
- idx = sched_find_first_bit(array->bitmap);
- queue = array->queue + idx;
- next = list_entry(queue->next, struct task_struct, run_list);
-
- if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
- unsigned long long delta = now - next->timestamp;
- if (unlikely((long long)(now - next->timestamp) < 0))
- delta = 0;
-
- if (next->sleep_type == SLEEP_INTERACTIVE)
- delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
-
- array = next->array;
- new_prio = recalc_task_prio(next, next->timestamp + delta);
-
- if (unlikely(next->prio != new_prio)) {
- dequeue_task(next, array);
- next->prio = new_prio;
- enqueue_task(next, array);
- }
- }
- next->sleep_type = SLEEP_NORMAL;
if (dependent_sleeper(cpu, rq, next))
next = rq->idle;
switch_tasks:
@@ -3539,10 +3530,6 @@ switch_tasks:
rcu_qsctr_inc(task_cpu(prev));
update_cpu_clock(prev, rq, now);
-
- prev->sleep_avg -= run_time;
- if ((long)prev->sleep_avg <= 0)
- prev->sleep_avg = 0;
prev->timestamp = prev->last_ran = now;
sched_info_switch(prev, next);
@@ -3978,29 +3965,21 @@ EXPORT_SYMBOL(sleep_on_timeout);
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
- struct prio_array *array;
unsigned long flags;
+ int queued, oldprio;
struct rq *rq;
- int oldprio;
BUG_ON(prio < 0 || prio > MAX_PRIO);
rq = task_rq_lock(p, &flags);
oldprio = p->prio;
- array = p->array;
- if (array)
- dequeue_task(p, array);
+ if ((queued = task_queued(p)))
+ dequeue_task(p, rq);
p->prio = prio;
- if (array) {
- /*
- * If changing to an RT priority then queue it
- * in the active array!
- */
- if (rt_task(p))
- array = rq->active;
- enqueue_task(p, array);
+ if (queued) {
+ enqueue_task(p, rq);
/*
* Reschedule if we are currently running on this runqueue and
* our priority decreased, or if we are not currently running on
@@ -4009,8 +3988,8 @@ void rt_mutex_setprio(struct task_struct
if (task_running(rq, p)) {
if (p->prio > oldprio)
resched_task(rq->curr);
- } else if (TASK_PREEMPTS_CURR(p, rq))
- resched_task(rq->curr);
+ } else
+ try_preempt(p, rq);
}
task_rq_unlock(rq, &flags);
}
@@ -4019,8 +3998,7 @@ void rt_mutex_setprio(struct task_struct
void set_user_nice(struct task_struct *p, long nice)
{
- struct prio_array *array;
- int old_prio, delta;
+ int queued, old_prio,delta;
unsigned long flags;
struct rq *rq;
@@ -4041,9 +4019,8 @@ void set_user_nice(struct task_struct *p
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
}
- array = p->array;
- if (array) {
- dequeue_task(p, array);
+ if ((queued = task_queued(p))) {
+ dequeue_task(p, rq);
dec_raw_weighted_load(rq, p);
}
@@ -4053,8 +4030,8 @@ void set_user_nice(struct task_struct *p
p->prio = effective_prio(p);
delta = p->prio - old_prio;
- if (array) {
- enqueue_task(p, array);
+ if (queued) {
+ enqueue_task(p, rq);
inc_raw_weighted_load(rq, p);
/*
* If the task increased its priority or is running and
@@ -4130,7 +4107,7 @@ asmlinkage long sys_nice(int increment)
*
* This is the priority value as seen by users in /proc.
* RT tasks are offset by -200. Normal tasks are centered
- * around 0, value goes from -16 to +15.
+ * around 0, value goes from 0 to +19.
*/
int task_prio(const struct task_struct *p)
{
@@ -4177,18 +4154,13 @@ static inline struct task_struct *find_p
/* Actually do priority change: must hold rq lock. */
static void __setscheduler(struct task_struct *p, int policy, int prio)
{
- BUG_ON(p->array);
+ BUG_ON(task_queued(p));
p->policy = policy;
p->rt_priority = prio;
p->normal_prio = normal_prio(p);
/* we are holding p->pi_lock already */
p->prio = rt_mutex_getprio(p);
- /*
- * SCHED_BATCH tasks are treated as perpetual CPU hogs:
- */
- if (policy == SCHED_BATCH)
- p->sleep_avg = 0;
set_load_weight(p);
}
@@ -4204,8 +4176,7 @@ static void __setscheduler(struct task_s
int sched_setscheduler(struct task_struct *p, int policy,
struct sched_param *param)
{
- int retval, oldprio, oldpolicy = -1;
- struct prio_array *array;
+ int queued, retval, oldprio, oldpolicy = -1;
unsigned long flags;
struct rq *rq;
@@ -4279,12 +4250,11 @@ recheck:
spin_unlock_irqrestore(&p->pi_lock, flags);
goto recheck;
}
- array = p->array;
- if (array)
+ if ((queued = task_queued(p)))
deactivate_task(p, rq);
oldprio = p->prio;
__setscheduler(p, policy, param->sched_priority);
- if (array) {
+ if (queued) {
__activate_task(p, rq);
/*
* Reschedule if we are currently running on this runqueue and
@@ -4294,8 +4264,8 @@ recheck:
if (task_running(rq, p)) {
if (p->prio > oldprio)
resched_task(rq->curr);
- } else if (TASK_PREEMPTS_CURR(p, rq))
- resched_task(rq->curr);
+ } else
+ try_preempt(p, rq);
}
__task_rq_unlock(rq);
spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -4568,13 +4538,14 @@ asmlinkage long sys_sched_getaffinity(pi
* sys_sched_yield - yield the current processor to other threads.
*
* this function yields the current CPU by moving the calling thread
- * to the expired array. If there are no other threads running on this
- * CPU then this function will return.
+ * to the expired array.
*/
asmlinkage long sys_sched_yield(void)
{
struct rq *rq = this_rq_lock();
- struct prio_array *array = current->array, *target = rq->expired;
+ struct task_struct *p = current;
+ struct prio_array *old_array = p->array;
+ int old_prio = p->prio;
schedstat_inc(rq, yld_cnt);
/*
@@ -4584,24 +4555,12 @@ asmlinkage long sys_sched_yield(void)
* (special rule: RT tasks will just roundrobin in the active
* array.)
*/
- if (rt_task(current))
- target = rq->active;
-
- if (array->nr_active == 1) {
- schedstat_inc(rq, yld_act_empty);
- if (!rq->expired->nr_active)
- schedstat_inc(rq, yld_both_empty);
- } else if (!rq->expired->nr_active)
- schedstat_inc(rq, yld_exp_empty);
-
- if (array != target) {
- dequeue_task(current, array);
- enqueue_task(current, target);
- } else
- /*
- * requeue_task is cheaper so perform that if possible.
- */
- requeue_task(current, array);
+ if (!rt_task(p)) {
+ p->array = rq->expired;
+ p->prio = p->static_prio;
+ bitmap_zero(p->bitmap, PRIO_RANGE);
+ }
+ requeue_task(p, rq, old_array, old_prio);
/*
* Since we are going to call schedule() anyway, there's
@@ -4940,8 +4899,8 @@ void __cpuinit init_idle(struct task_str
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
+ bitmap_zero(idle->bitmap, PRIO_RANGE + 1);
idle->timestamp = sched_clock();
- idle->sleep_avg = 0;
idle->array = NULL;
idle->prio = idle->normal_prio = MAX_PRIO;
idle->state = TASK_RUNNING;
@@ -5062,7 +5021,7 @@ static int __migrate_task(struct task_st
goto out;
set_task_cpu(p, dest_cpu);
- if (p->array) {
+ if (task_queued(p)) {
/*
* Sync timestamp with rq_dest's before activating.
* The same thing could be achieved by doing this step
@@ -5073,8 +5032,7 @@ static int __migrate_task(struct task_st
+ rq_dest->most_recent_timestamp;
deactivate_task(p, rq_src);
__activate_task(p, rq_dest);
- if (TASK_PREEMPTS_CURR(p, rq_dest))
- resched_task(rq_dest->curr);
+ try_preempt(p, rq_dest);
}
ret = 1;
out:
@@ -6904,9 +6862,10 @@ void __init sched_init(void)
spin_lock_init(&rq->lock);
lockdep_set_class(&rq->lock, &rq->rq_lock_key);
rq->nr_running = 0;
+ rq->prio_rotation = 0;
+ rq->prio_level = MAX_RT_PRIO;
rq->active = rq->arrays;
rq->expired = rq->arrays + 1;
- rq->best_expired_prio = MAX_PRIO;
#ifdef CONFIG_SMP
rq->sd = NULL;
@@ -6921,14 +6880,17 @@ void __init sched_init(void)
atomic_set(&rq->nr_iowait, 0);
for (j = 0; j < 2; j++) {
+
array = rq->arrays + j;
- for (k = 0; k < MAX_PRIO; k++) {
+ for (k = 0; k < MAX_PRIO; k++)
INIT_LIST_HEAD(array->queue + k);
- __clear_bit(k, array->bitmap);
- }
- // delimiter for bitsearch
- __set_bit(MAX_PRIO, array->bitmap);
}
+ for (k = 0; k < PRIO_RANGE; k++)
+ rq->prio_quota[k] = 0;
+ bitmap_zero(rq->dyn_bitmap, MAX_DYN_PRIO);
+ bitmap_zero(rq->static_bitmap, MAX_PRIO);
+ /* delimiter for bitsearch */
+ __set_bit(MAX_DYN_PRIO, rq->dyn_bitmap);
}
set_load_weight(&init_task);
@@ -6984,10 +6946,10 @@ EXPORT_SYMBOL(__might_sleep);
#ifdef CONFIG_MAGIC_SYSRQ
void normalize_rt_tasks(void)
{
- struct prio_array *array;
struct task_struct *p;
unsigned long flags;
struct rq *rq;
+ int queued;
read_lock_irq(&tasklist_lock);
for_each_process(p) {
@@ -6997,11 +6959,10 @@ void normalize_rt_tasks(void)
spin_lock_irqsave(&p->pi_lock, flags);
rq = __task_rq_lock(p);
- array = p->array;
- if (array)
+ if ((queued = task_queued(p)))
deactivate_task(p, task_rq(p));
__setscheduler(p, SCHED_NORMAL, 0);
- if (array) {
+ if (queued) {
__activate_task(p, task_rq(p));
resched_task(rq->curr);
}
Index: linux-2.6.20-rsdl/include/linux/init_task.h
===================================================================
--- linux-2.6.20-rsdl.orig/include/linux/init_task.h 2007-03-04 17:30:25.000000000 +1100
+++ linux-2.6.20-rsdl/include/linux/init_task.h 2007-03-04 17:30:31.000000000 +1100
@@ -102,6 +102,7 @@ extern struct group_info init_groups;
.prio = MAX_PRIO-20, \
.static_prio = MAX_PRIO-20, \
.normal_prio = MAX_PRIO-20, \
+ .rotation = 0, \
.policy = SCHED_NORMAL, \
.cpus_allowed = CPU_MASK_ALL, \
.mm = NULL, \
@@ -109,6 +110,7 @@ extern struct group_info init_groups;
.run_list = LIST_HEAD_INIT(tsk.run_list), \
.ioprio = 0, \
.time_slice = HZ, \
+ .quota = HZ, \
.tasks = LIST_HEAD_INIT(tsk.tasks), \
.ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \
.ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \
Index: linux-2.6.20-rsdl/include/linux/sched.h
===================================================================
--- linux-2.6.20-rsdl.orig/include/linux/sched.h 2007-03-04 17:30:30.000000000 +1100
+++ linux-2.6.20-rsdl/include/linux/sched.h 2007-03-04 17:30:31.000000000 +1100
@@ -521,8 +521,9 @@ struct signal_struct {
#define MAX_USER_RT_PRIO 100
#define MAX_RT_PRIO MAX_USER_RT_PRIO
+#define PRIO_RANGE (40)
-#define MAX_PRIO (MAX_RT_PRIO + 40)
+#define MAX_PRIO (MAX_RT_PRIO + PRIO_RANGE)
#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO)
#define rt_task(p) rt_prio((p)->prio)
@@ -788,13 +789,6 @@ struct mempolicy;
struct pipe_inode_info;
struct uts_namespace;
-enum sleep_type {
- SLEEP_NORMAL,
- SLEEP_NONINTERACTIVE,
- SLEEP_INTERACTIVE,
- SLEEP_INTERRUPTED,
-};
-
struct prio_array;
struct task_struct {
@@ -814,20 +808,31 @@ struct task_struct {
int load_weight; /* for niceness load balancing purposes */
int prio, static_prio, normal_prio;
struct list_head run_list;
+ DECLARE_BITMAP(bitmap, PRIO_RANGE + 1);
struct prio_array *array;
+ unsigned long rotation;
+ /* Which major runqueue rotation did this task run */
unsigned short ioprio;
#ifdef CONFIG_BLK_DEV_IO_TRACE
unsigned int btrace_seq;
#endif
- unsigned long sleep_avg;
unsigned long long timestamp, last_ran;
unsigned long long sched_time; /* sched_clock time spent running */
- enum sleep_type sleep_type;
unsigned long policy;
cpumask_t cpus_allowed;
unsigned int time_slice, first_time_slice;
+ /*
+ * How much this task is entitled to run at the current priority
+ * before being requeued at a lower priority, and is this the very
+ * first time_slice this task has ever run.
+ */
+ unsigned int quota;
+ /*
+ * How much this task contributes to the current priority queue
+ * length
+ */
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info sched_info;
--
-ck
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] [RSDL 5/6] sched: implement rsdl cpu scheduler
2007-03-04 7:06 [PATCH] [RSDL 5/6] sched: implement rsdl cpu scheduler Con Kolivas
@ 2007-03-05 20:38 ` Matt Mackall
2007-03-05 21:05 ` Gene Heskett
2007-03-05 21:11 ` Con Kolivas
0 siblings, 2 replies; 4+ messages in thread
From: Matt Mackall @ 2007-03-05 20:38 UTC (permalink / raw)
To: Con Kolivas; +Cc: linux kernel mailing list, ck list
On Sun, Mar 04, 2007 at 06:06:22PM +1100, Con Kolivas wrote:
> + * This is the time all tasks within the same priority round robin.
> + * Set to a minimum of 6ms.
> */
> +#define RR_INTERVAL ((6 * HZ / 1001) + 1)
What happens with small HZ? Like 100? I suppose 10ms is a reasonable
number here. But in general, how well does this scheduler do without a
time source other than jiffies?
--
Mathematics is the supreme nostalgia of our time.
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] [RSDL 5/6] sched: implement rsdl cpu scheduler
2007-03-05 20:38 ` Matt Mackall
@ 2007-03-05 21:05 ` Gene Heskett
2007-03-05 21:11 ` Con Kolivas
1 sibling, 0 replies; 4+ messages in thread
From: Gene Heskett @ 2007-03-05 21:05 UTC (permalink / raw)
To: linux-kernel; +Cc: Matt Mackall, Con Kolivas, ck list
On Monday 05 March 2007, Matt Mackall wrote:
>On Sun, Mar 04, 2007 at 06:06:22PM +1100, Con Kolivas wrote:
>> + * This is the time all tasks within the same priority round robin.
>> + * Set to a minimum of 6ms.
>> */
>> +#define RR_INTERVAL ((6 * HZ / 1001) + 1)
>
>What happens with small HZ? Like 100? I suppose 10ms is a reasonable
>number here. But in general, how well does this scheduler do without a
>time source other than jiffies?
I don't personally know, but another poster I recall did say he was using
a slow clock and it worked great so you might backtrack in this thread.
I'm using the 1k clock here.
--
Cheers, Gene
"There are four boxes to be used in defense of liberty:
soap, ballot, jury, and ammo. Please use in that order."
-Ed Howdershelt (Author)
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] [RSDL 5/6] sched: implement rsdl cpu scheduler
2007-03-05 20:38 ` Matt Mackall
2007-03-05 21:05 ` Gene Heskett
@ 2007-03-05 21:11 ` Con Kolivas
1 sibling, 0 replies; 4+ messages in thread
From: Con Kolivas @ 2007-03-05 21:11 UTC (permalink / raw)
To: Matt Mackall; +Cc: linux kernel mailing list, ck list
On Tuesday 06 March 2007 07:38, Matt Mackall wrote:
> On Sun, Mar 04, 2007 at 06:06:22PM +1100, Con Kolivas wrote:
> > + * This is the time all tasks within the same priority round robin.
> > + * Set to a minimum of 6ms.
> > */
> > +#define RR_INTERVAL ((6 * HZ / 1001) + 1)
>
> What happens with small HZ? Like 100? I suppose 10ms is a reasonable
> number here. But in general, how well does this scheduler do without a
> time source other than jiffies?
Perfectly fine at 100. It only needs to do accounting during a busy tick, and
on dynticks we always service a timer interrupt during that busy tick so it
works fine there as well. In fact, choosing a lower HZ you are indirectly
making a decision that you don't value latency as much as at the higher HZ
and 10ms is fine for the scheduling there. It would be possible to service
the cpu accounting with any timer that fired at at least 100HZ even if there
was some way to turn ticks off fully when the cpu is busy (which there isn't
currently a way to do).
--
-ck
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2007-03-05 21:11 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-03-04 7:06 [PATCH] [RSDL 5/6] sched: implement rsdl cpu scheduler Con Kolivas
2007-03-05 20:38 ` Matt Mackall
2007-03-05 21:05 ` Gene Heskett
2007-03-05 21:11 ` Con Kolivas
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox