From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1753309AbXDQLcO (ORCPT ); Tue, 17 Apr 2007 07:32:14 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1753362AbXDQLcO (ORCPT ); Tue, 17 Apr 2007 07:32:14 -0400 Received: from holomorphy.com ([66.93.40.71]:41124 "EHLO holomorphy.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753309AbXDQLcK (ORCPT ); Tue, 17 Apr 2007 07:32:10 -0400 Date: Tue, 17 Apr 2007 04:31:34 -0700 From: William Lee Irwin III To: Ingo Molnar Cc: Davide Libenzi , Nick Piggin , Peter Williams , Mike Galbraith , Con Kolivas , ck list , Bill Huey , Linux Kernel Mailing List , Linus Torvalds , Andrew Morton , Arjan van de Ven , Thomas Gleixner Subject: Re: [Announce] [patch] Modular Scheduler Core and Completely Fair Scheduler [CFS] Message-ID: <20070417113134.GW8915@holomorphy.com> References: <46244A52.4000403@bigpond.net.au> <20070417042954.GG25513@wotan.suse.de> <20070417060955.GO8915@holomorphy.com> <20070417061503.GC1057@wotan.suse.de> <20070417070949.GR8915@holomorphy.com> <20070417073308.GB30559@elte.hu> <20070417090538.GU8915@holomorphy.com> <20070417092422.GA19414@elte.hu> <20070417095749.GN2986@holomorphy.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20070417095749.GN2986@holomorphy.com> Organization: The Domain of Holomorphy User-Agent: Mutt/1.5.13 (2006-08-11) Sender: linux-kernel-owner@vger.kernel.org X-Mailing-List: linux-kernel@vger.kernel.org On Tue, Apr 17, 2007 at 02:57:49AM -0700, William Lee Irwin III wrote: > Interesting! That's what vdls did, except its fundamental data structure > was more like a circular buffer data structure (resembling Davide > Libenzi's timer ring in concept, but with all the details different). > I'm not entirely sure how that would've turned out performancewise if > I'd done any tuning at all. I was mostly interested in doing something > like what I heard Bob Mullens did in 1976 for basic pedagogical value > about schedulers to prepare for writing patches for gang scheduling as > opposed to creating a viable replacement for the mainline scheduler. Con helped me dredge up the vdls bits, so here is the last version I before I got tired of toying with the idea. It's not all that clean, with a fair amount of debug code floating around and a number of idiocies (it seems there was a plot to use a heap somewhere I forgot about entirely, never mind other cruft), but I thought I should at least say something more provable than "there was a patch I never posted." Enjoy! -- wli diff -prauN linux-2.6.0-test11/fs/proc/array.c sched-2.6.0-test11-5/fs/proc/array.c --- linux-2.6.0-test11/fs/proc/array.c 2003-11-26 12:44:26.000000000 -0800 +++ sched-2.6.0-test11-5/fs/proc/array.c 2003-12-17 07:37:11.000000000 -0800 @@ -162,7 +162,7 @@ static inline char * task_state(struct t "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), - (p->sleep_avg/1024)*100/(1000000000/1024), + 0UL, /* was ->sleep_avg */ p->tgid, p->pid, p->pid ? p->real_parent->pid : 0, p->pid && p->ptrace ? p->parent->pid : 0, @@ -345,7 +345,7 @@ int proc_pid_stat(struct task_struct *ta read_unlock(&tasklist_lock); res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d %ld %llu %lu %ld %lu %lu %lu %lu %lu \ -%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu\n", +%lu %lu %lu %lu %lu %lu %lu %lu %d %d %d %d\n", task->pid, task->comm, state, @@ -390,8 +390,8 @@ int proc_pid_stat(struct task_struct *ta task->cnswap, task->exit_signal, task_cpu(task), - task->rt_priority, - task->policy); + task_prio(task), + task_sched_policy(task)); if(mm) mmput(mm); return res; diff -prauN linux-2.6.0-test11/include/asm-i386/thread_info.h sched-2.6.0-test11-5/include/asm-i386/thread_info.h --- linux-2.6.0-test11/include/asm-i386/thread_info.h 2003-11-26 12:43:06.000000000 -0800 +++ sched-2.6.0-test11-5/include/asm-i386/thread_info.h 2003-12-17 04:55:22.000000000 -0800 @@ -114,6 +114,8 @@ static inline struct thread_info *curren #define TIF_SINGLESTEP 4 /* restore singlestep on return to user mode */ #define TIF_IRET 5 /* return with iret */ #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ +#define TIF_QUEUED 17 +#define TIF_PREEMPT 18 #define _TIF_SYSCALL_TRACE (1<prio < MAX_RT_PRIO) +#define NICE_QLEN 128 +#define MIN_TS_PRIO MAX_RT_PRIO +#define MAX_TS_PRIO (40*NICE_QLEN) +#define MIN_BATCH_PRIO (MAX_RT_PRIO + MAX_TS_PRIO) +#define MAX_BATCH_PRIO 100 +#define MAX_PRIO (MIN_BATCH_PRIO + MAX_BATCH_PRIO) +#define USER_PRIO(prio) ((prio) - MAX_RT_PRIO) +#define MAX_USER_PRIO USER_PRIO(MAX_PRIO) /* * Some day this will be a full-fledged user tracking system.. @@ -330,6 +336,36 @@ struct k_itimer { struct io_context; /* See blkdev.h */ void exit_io_context(void); +struct rt_data { + int prio, rt_policy; + unsigned long quantum, ticks; +}; + +/* XXX: do %cpu estimation for ts wakeup levels */ +struct ts_data { + int nice; + unsigned long ticks, frac_cpu; + unsigned long sample_start, sample_ticks; +}; + +struct bt_data { + int prio; + unsigned long ticks; +}; + +union class_data { + struct rt_data rt; + struct ts_data ts; + struct bt_data bt; +}; + +struct sched_info { + int idx; /* queue index, used by all classes */ + unsigned long policy; /* scheduling policy */ + struct list_head run_list; /* list links for priority queues */ + union class_data cl_data; /* class-specific data */ +}; + struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ struct thread_info *thread_info; @@ -339,18 +375,9 @@ struct task_struct { int lock_depth; /* Lock depth */ - int prio, static_prio; - struct list_head run_list; - prio_array_t *array; - - unsigned long sleep_avg; - long interactive_credit; - unsigned long long timestamp; - int activated; + struct sched_info sched_info; - unsigned long policy; cpumask_t cpus_allowed; - unsigned int time_slice, first_time_slice; struct list_head tasks; struct list_head ptrace_children; @@ -391,7 +418,6 @@ struct task_struct { int __user *set_child_tid; /* CLONE_CHILD_SETTID */ int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ - unsigned long rt_priority; unsigned long it_real_value, it_prof_value, it_virt_value; unsigned long it_real_incr, it_prof_incr, it_virt_incr; struct timer_list real_timer; @@ -520,12 +546,14 @@ extern void node_nr_running_init(void); #define node_nr_running_init() {} #endif -extern void set_user_nice(task_t *p, long nice); -extern int task_prio(task_t *p); -extern int task_nice(task_t *p); -extern int task_curr(task_t *p); -extern int idle_cpu(int cpu); - +void set_user_nice(task_t *task, long nice); +int task_prio(task_t *task); +int task_nice(task_t *task); +int task_sched_policy(task_t *task); +void set_task_sched_policy(task_t *task, int policy); +int rt_task(task_t *task); +int task_curr(task_t *task); +int idle_cpu(int cpu); void yield(void); /* @@ -844,6 +872,21 @@ static inline int need_resched(void) return unlikely(test_thread_flag(TIF_NEED_RESCHED)); } +static inline void set_task_queued(task_t *task) +{ + set_tsk_thread_flag(task, TIF_QUEUED); +} + +static inline void clear_task_queued(task_t *task) +{ + clear_tsk_thread_flag(task, TIF_QUEUED); +} + +static inline int task_queued(task_t *task) +{ + return test_tsk_thread_flag(task, TIF_QUEUED); +} + extern void __cond_resched(void); static inline void cond_resched(void) { diff -prauN linux-2.6.0-test11/kernel/Makefile sched-2.6.0-test11-5/kernel/Makefile --- linux-2.6.0-test11/kernel/Makefile 2003-11-26 12:43:24.000000000 -0800 +++ sched-2.6.0-test11-5/kernel/Makefile 2003-12-17 03:30:08.000000000 -0800 @@ -6,7 +6,7 @@ obj-y = sched.o fork.o exec_domain.o exit.o itimer.o time.o softirq.o resource.o \ sysctl.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ - rcupdate.o intermodule.o extable.o params.o posix-timers.o + rcupdate.o intermodule.o extable.o params.o posix-timers.o sched/ obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o diff -prauN linux-2.6.0-test11/kernel/exit.c sched-2.6.0-test11-5/kernel/exit.c --- linux-2.6.0-test11/kernel/exit.c 2003-11-26 12:45:29.000000000 -0800 +++ sched-2.6.0-test11-5/kernel/exit.c 2003-12-17 07:04:02.000000000 -0800 @@ -225,7 +225,7 @@ void reparent_to_init(void) /* Set the exit signal to SIGCHLD so we signal init on exit */ current->exit_signal = SIGCHLD; - if ((current->policy == SCHED_NORMAL) && (task_nice(current) < 0)) + if (task_nice(current) < 0) set_user_nice(current, 0); /* cpus_allowed? */ /* rt_priority? */ diff -prauN linux-2.6.0-test11/kernel/fork.c sched-2.6.0-test11-5/kernel/fork.c --- linux-2.6.0-test11/kernel/fork.c 2003-11-26 12:42:58.000000000 -0800 +++ sched-2.6.0-test11-5/kernel/fork.c 2003-12-23 06:22:59.000000000 -0800 @@ -836,6 +836,9 @@ struct task_struct *copy_process(unsigne atomic_inc(&p->user->__count); atomic_inc(&p->user->processes); + clear_tsk_thread_flag(p, TIF_SIGPENDING); + clear_tsk_thread_flag(p, TIF_QUEUED); + /* * If multiple threads are within copy_process(), then this check * triggers too late. This doesn't hurt, the check is only there @@ -861,13 +864,21 @@ struct task_struct *copy_process(unsigne p->state = TASK_UNINTERRUPTIBLE; copy_flags(clone_flags, p); - if (clone_flags & CLONE_IDLETASK) + if (clone_flags & CLONE_IDLETASK) { p->pid = 0; - else { + set_task_sched_policy(p, SCHED_IDLE); + } else { + if (task_sched_policy(p) == SCHED_IDLE) { + memset(&p->sched_info, 0, sizeof(struct sched_info)); + set_task_sched_policy(p, SCHED_NORMAL); + set_user_nice(p, 0); + } p->pid = alloc_pidmap(); if (p->pid == -1) goto bad_fork_cleanup; } + if (p->pid == 1) + BUG_ON(task_nice(p)); retval = -EFAULT; if (clone_flags & CLONE_PARENT_SETTID) if (put_user(p->pid, parent_tidptr)) @@ -875,8 +886,7 @@ struct task_struct *copy_process(unsigne p->proc_dentry = NULL; - INIT_LIST_HEAD(&p->run_list); - + INIT_LIST_HEAD(&p->sched_info.run_list); INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); INIT_LIST_HEAD(&p->posix_timers); @@ -885,8 +895,6 @@ struct task_struct *copy_process(unsigne spin_lock_init(&p->alloc_lock); spin_lock_init(&p->switch_lock); spin_lock_init(&p->proc_lock); - - clear_tsk_thread_flag(p, TIF_SIGPENDING); init_sigpending(&p->pending); p->it_real_value = p->it_virt_value = p->it_prof_value = 0; @@ -898,7 +906,6 @@ struct task_struct *copy_process(unsigne p->tty_old_pgrp = 0; p->utime = p->stime = 0; p->cutime = p->cstime = 0; - p->array = NULL; p->lock_depth = -1; /* -1 = no lock */ p->start_time = get_jiffies_64(); p->security = NULL; @@ -948,33 +955,6 @@ struct task_struct *copy_process(unsigne p->pdeath_signal = 0; /* - * Share the timeslice between parent and child, thus the - * total amount of pending timeslices in the system doesn't change, - * resulting in more scheduling fairness. - */ - local_irq_disable(); - p->time_slice = (current->time_slice + 1) >> 1; - /* - * The remainder of the first timeslice might be recovered by - * the parent if the child exits early enough. - */ - p->first_time_slice = 1; - current->time_slice >>= 1; - p->timestamp = sched_clock(); - if (!current->time_slice) { - /* - * This case is rare, it happens when the parent has only - * a single jiffy left from its timeslice. Taking the - * runqueue lock is not a problem. - */ - current->time_slice = 1; - preempt_disable(); - scheduler_tick(0, 0); - local_irq_enable(); - preempt_enable(); - } else - local_irq_enable(); - /* * Ok, add it to the run-queues and make it * visible to the rest of the system. * diff -prauN linux-2.6.0-test11/kernel/sched/Makefile sched-2.6.0-test11-5/kernel/sched/Makefile --- linux-2.6.0-test11/kernel/sched/Makefile 1969-12-31 16:00:00.000000000 -0800 +++ sched-2.6.0-test11-5/kernel/sched/Makefile 2003-12-17 03:32:21.000000000 -0800 @@ -0,0 +1 @@ +obj-y = util.o ts.o idle.o rt.o batch.o diff -prauN linux-2.6.0-test11/kernel/sched/batch.c sched-2.6.0-test11-5/kernel/sched/batch.c --- linux-2.6.0-test11/kernel/sched/batch.c 1969-12-31 16:00:00.000000000 -0800 +++ sched-2.6.0-test11-5/kernel/sched/batch.c 2003-12-19 21:32:49.000000000 -0800 @@ -0,0 +1,190 @@ +#include +#include +#include +#include +#include +#include "queue.h" + +struct batch_queue { + int base, tasks; + task_t *curr; + unsigned long bitmap[BITS_TO_LONGS(MAX_BATCH_PRIO)]; + struct list_head queue[MAX_BATCH_PRIO]; +}; + +static int batch_quantum = 1024; +static DEFINE_PER_CPU(struct batch_queue, batch_queues); + +static int batch_init(struct policy *policy, int cpu) +{ + int k; + struct batch_queue *queue = &per_cpu(batch_queues, cpu); + + policy->queue = (struct queue *)queue; + for (k = 0; k < MAX_BATCH_PRIO; ++k) + INIT_LIST_HEAD(&queue->queue[k]); + return 0; +} + +static int batch_tick(struct queue *__queue, task_t *task, int user_ticks, int sys_ticks) +{ + struct batch_queue *queue = (struct batch_queue *)__queue; + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + + cpustat->nice += user_ticks; + cpustat->system += sys_ticks; + + task->sched_info.cl_data.bt.ticks--; + if (!task->sched_info.cl_data.bt.ticks) { + int new_idx; + + task->sched_info.cl_data.bt.ticks = batch_quantum; + new_idx = (task->sched_info.idx + task->sched_info.cl_data.bt.prio) + % MAX_BATCH_PRIO; + if (!test_bit(new_idx, queue->bitmap)) + __set_bit(new_idx, queue->bitmap); + list_move_tail(&task->sched_info.run_list, + &queue->queue[new_idx]); + if (list_empty(&queue->queue[task->sched_info.idx])) + __clear_bit(task->sched_info.idx, queue->bitmap); + task->sched_info.idx = new_idx; + queue->base = find_first_circular_bit(queue->bitmap, + queue->base, + MAX_BATCH_PRIO); + set_need_resched(); + } + return 0; +} + +static void batch_yield(struct queue *__queue, task_t *task) +{ + int new_idx; + struct batch_queue *queue = (struct batch_queue *)__queue; + + new_idx = (queue->base + MAX_BATCH_PRIO - 1) % MAX_BATCH_PRIO; + if (!test_bit(new_idx, queue->bitmap)) + __set_bit(new_idx, queue->bitmap); + list_move_tail(&task->sched_info.run_list, &queue->queue[new_idx]); + if (list_empty(&queue->queue[task->sched_info.idx])) + __clear_bit(task->sched_info.idx, queue->bitmap); + task->sched_info.idx = new_idx; + queue->base = find_first_circular_bit(queue->bitmap, + queue->base, + MAX_BATCH_PRIO); + set_need_resched(); +} + +static task_t *batch_curr(struct queue *__queue) +{ + struct batch_queue *queue = (struct batch_queue *)__queue; + return queue->curr; +} + +static void batch_set_curr(struct queue *__queue, task_t *task) +{ + struct batch_queue *queue = (struct batch_queue *)__queue; + queue->curr = task; +} + +static task_t *batch_best(struct queue *__queue) +{ + int idx; + struct batch_queue *queue = (struct batch_queue *)__queue; + + idx = find_first_circular_bit(queue->bitmap, + queue->base, + MAX_BATCH_PRIO); + BUG_ON(idx >= MAX_BATCH_PRIO); + BUG_ON(list_empty(&queue->queue[idx])); + return list_entry(queue->queue[idx].next, task_t, sched_info.run_list); +} + +static void batch_enqueue(struct queue *__queue, task_t *task) +{ + int idx; + struct batch_queue *queue = (struct batch_queue *)__queue; + + idx = (queue->base + task->sched_info.cl_data.bt.prio) % MAX_BATCH_PRIO; + if (!test_bit(idx, queue->bitmap)) + __set_bit(idx, queue->bitmap); + list_add_tail(&task->sched_info.run_list, &queue->queue[idx]); + task->sched_info.idx = idx; + task->sched_info.cl_data.bt.ticks = batch_quantum; + queue->tasks++; + if (!queue->curr) + queue->curr = task; +} + +static void batch_dequeue(struct queue *__queue, task_t *task) +{ + struct batch_queue *queue = (struct batch_queue *)__queue; + list_del(&task->sched_info.run_list); + if (list_empty(&queue->queue[task->sched_info.idx])) + __clear_bit(task->sched_info.idx, queue->bitmap); + queue->tasks--; + if (!queue->tasks) + queue->curr = NULL; + else if (task == queue->curr) + queue->curr = batch_best(__queue); +} + +static int batch_preempt(struct queue *__queue, task_t *task) +{ + struct batch_queue *queue = (struct batch_queue *)__queue; + if (!queue->curr) + return 1; + else + return task->sched_info.cl_data.bt.prio + < queue->curr->sched_info.cl_data.bt.prio; +} + +static int batch_tasks(struct queue *__queue) +{ + struct batch_queue *queue = (struct batch_queue *)__queue; + return queue->tasks; +} + +static int batch_nice(struct queue *queue, task_t *task) +{ + return 20; +} + +static int batch_prio(task_t *task) +{ + return USER_PRIO(task->sched_info.cl_data.bt.prio + MIN_BATCH_PRIO); +} + +static void batch_setprio(task_t *task, int prio) +{ + BUG_ON(prio < 0); + BUG_ON(prio >= MAX_BATCH_PRIO); + task->sched_info.cl_data.bt.prio = prio; +} + +struct queue_ops batch_ops = { + .init = batch_init, + .fini = nop_fini, + .tick = batch_tick, + .yield = batch_yield, + .curr = batch_curr, + .set_curr = batch_set_curr, + .tasks = batch_tasks, + .best = batch_best, + .enqueue = batch_enqueue, + .dequeue = batch_dequeue, + .start_wait = queue_nop, + .stop_wait = queue_nop, + .sleep = queue_nop, + .wake = queue_nop, + .preempt = batch_preempt, + .nice = batch_nice, + .renice = nop_renice, + .prio = batch_prio, + .setprio = batch_setprio, + .timeslice = nop_timeslice, + .set_timeslice = nop_set_timeslice, +}; + +struct policy batch_policy = { + .ops = &batch_ops, +}; diff -prauN linux-2.6.0-test11/kernel/sched/idle.c sched-2.6.0-test11-5/kernel/sched/idle.c --- linux-2.6.0-test11/kernel/sched/idle.c 1969-12-31 16:00:00.000000000 -0800 +++ sched-2.6.0-test11-5/kernel/sched/idle.c 2003-12-19 17:31:39.000000000 -0800 @@ -0,0 +1,99 @@ +#include +#include +#include +#include +#include +#include "queue.h" + +static DEFINE_PER_CPU(task_t *, idle_tasks) = NULL; + +static int idle_nice(struct queue *queue, task_t *task) +{ + return 20; +} + +static int idle_tasks(struct queue *queue) +{ + task_t **idle = (task_t **)queue; + return !!(*idle); +} + +static task_t *idle_task(struct queue *queue) +{ + return *((task_t **)queue); +} + +static void idle_yield(struct queue *queue, task_t *task) +{ + set_need_resched(); +} + +static void idle_enqueue(struct queue *queue, task_t *task) +{ + task_t **idle = (task_t **)queue; + *idle = task; +} + +static void idle_dequeue(struct queue *queue, task_t *task) +{ +} + +static int idle_preempt(struct queue *queue, task_t *task) +{ + return 0; +} + +static int idle_tick(struct queue *queue, task_t *task, int user_ticks, int sys_ticks) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + runqueue_t *rq = &per_cpu(runqueues, smp_processor_id()); + + if (atomic_read(&rq->nr_iowait) > 0) + cpustat->iowait += sys_ticks; + else + cpustat->idle += sys_ticks; + return 1; +} + +static int idle_init(struct policy *policy, int cpu) +{ + policy->queue = (struct queue *)&per_cpu(idle_tasks, cpu); + return 0; +} + +static int idle_prio(task_t *task) +{ + return MAX_USER_PRIO; +} + +static void idle_setprio(task_t *task, int prio) +{ +} + +static struct queue_ops idle_ops = { + .init = idle_init, + .fini = nop_fini, + .tick = idle_tick, + .yield = idle_yield, + .curr = idle_task, + .set_curr = queue_nop, + .tasks = idle_tasks, + .best = idle_task, + .enqueue = idle_enqueue, + .dequeue = idle_dequeue, + .start_wait = queue_nop, + .stop_wait = queue_nop, + .sleep = queue_nop, + .wake = queue_nop, + .preempt = idle_preempt, + .nice = idle_nice, + .renice = nop_renice, + .prio = idle_prio, + .setprio = idle_setprio, + .timeslice = nop_timeslice, + .set_timeslice = nop_set_timeslice, +}; + +struct policy idle_policy = { + .ops = &idle_ops, +}; diff -prauN linux-2.6.0-test11/kernel/sched/queue.h sched-2.6.0-test11-5/kernel/sched/queue.h --- linux-2.6.0-test11/kernel/sched/queue.h 1969-12-31 16:00:00.000000000 -0800 +++ sched-2.6.0-test11-5/kernel/sched/queue.h 2003-12-23 03:58:02.000000000 -0800 @@ -0,0 +1,104 @@ +#define SCHED_POLICY_RT 0 +#define SCHED_POLICY_TS 1 +#define SCHED_POLICY_BATCH 2 +#define SCHED_POLICY_IDLE 3 + +#define RT_POLICY_FIFO 0 +#define RT_POLICY_RR 1 + +#define NODE_THRESHOLD 125 + +struct queue; +struct queue_ops; + +struct policy { + struct queue *queue; + struct queue_ops *ops; +}; + +extern struct policy rt_policy, ts_policy, batch_policy, idle_policy; + +struct runqueue { + spinlock_t lock; + int curr; + task_t *__curr; + unsigned long policy_bitmap; + struct policy *policies[BITS_PER_LONG]; + unsigned long nr_running, nr_switches, nr_uninterruptible; + struct mm_struct *prev_mm; + int prev_cpu_load[NR_CPUS]; +#ifdef CONFIG_NUMA + atomic_t *node_nr_running; + int prev_node_load[MAX_NUMNODES]; +#endif + task_t *migration_thread; + struct list_head migration_queue; + + atomic_t nr_iowait; +}; + +typedef struct runqueue runqueue_t; + +struct queue_ops { + int (*init)(struct policy *, int); + void (*fini)(struct policy *, int); + task_t *(*curr)(struct queue *); + void (*set_curr)(struct queue *, task_t *); + task_t *(*best)(struct queue *); + int (*tick)(struct queue *, task_t *, int, int); + int (*tasks)(struct queue *); + void (*enqueue)(struct queue *, task_t *); + void (*dequeue)(struct queue *, task_t *); + void (*start_wait)(struct queue *, task_t *); + void (*stop_wait)(struct queue *, task_t *); + void (*sleep)(struct queue *, task_t *); + void (*wake)(struct queue *, task_t *); + int (*preempt)(struct queue *, task_t *); + void (*yield)(struct queue *, task_t *); + int (*prio)(task_t *); + void (*setprio)(task_t *, int); + int (*nice)(struct queue *, task_t *); + void (*renice)(struct queue *, task_t *, int); + unsigned long (*timeslice)(struct queue *, task_t *); + void (*set_timeslice)(struct queue *, task_t *, unsigned long); +}; + +DECLARE_PER_CPU(runqueue_t, runqueues); + +int find_first_circular_bit(unsigned long *, int, int); +void queue_nop(struct queue *, task_t *); +void nop_renice(struct queue *, task_t *, int); +void nop_fini(struct policy *, int); +unsigned long nop_timeslice(struct queue *, task_t *); +void nop_set_timeslice(struct queue *, task_t *, unsigned long); + +/* #define DEBUG_SCHED */ + +#ifdef DEBUG_SCHED +#define __check_task_policy(idx) \ +do { \ + unsigned long __idx__ = (idx); \ + if (__idx__ > SCHED_POLICY_IDLE) { \ + printk("invalid policy 0x%lx\n", __idx__); \ + BUG(); \ + } \ +} while (0) + +#define check_task_policy(task) \ +do { \ + __check_task_policy((task)->sched_info.policy); \ +} while (0) + +#define check_policy(policy) \ +do { \ + BUG_ON((policy) != &rt_policy && \ + (policy) != &ts_policy && \ + (policy) != &batch_policy && \ + (policy) != &idle_policy); \ +} while (0) + +#else /* !DEBUG_SCHED */ +#define __check_task_policy(idx) do { } while (0) +#define check_task_policy(task) do { } while (0) +#define check_policy(policy) do { } while (0) +#endif /* !DEBUG_SCHED */ diff -prauN linux-2.6.0-test11/kernel/sched/rt.c sched-2.6.0-test11-5/kernel/sched/rt.c --- linux-2.6.0-test11/kernel/sched/rt.c 1969-12-31 16:00:00.000000000 -0800 +++ sched-2.6.0-test11-5/kernel/sched/rt.c 2003-12-19 18:16:07.000000000 -0800 @@ -0,0 +1,208 @@ +#include +#include +#include +#include +#include +#include "queue.h" + +#ifdef DEBUG_SCHED +#define check_rt_policy(task) \ +do { \ + BUG_ON((task)->sched_info.policy != SCHED_POLICY_RT); \ + BUG_ON((task)->sched_info.cl_data.rt.rt_policy != RT_POLICY_RR \ + && \ + (task)->sched_info.cl_data.rt.rt_policy!=RT_POLICY_FIFO); \ + BUG_ON((task)->sched_info.cl_data.rt.prio < 0); \ + BUG_ON((task)->sched_info.cl_data.rt.prio >= MAX_RT_PRIO); \ +} while (0) +#else +#define check_rt_policy(task) do { } while (0) +#endif + +struct rt_queue { + unsigned long bitmap[BITS_TO_LONGS(MAX_RT_PRIO)]; + struct list_head queue[MAX_RT_PRIO]; + task_t *curr; + int tasks; +}; + +static DEFINE_PER_CPU(struct rt_queue, rt_queues); + +static int rt_init(struct policy *policy, int cpu) +{ + int k; + struct rt_queue *queue = &per_cpu(rt_queues, cpu); + + policy->queue = (struct queue *)queue; + for (k = 0; k < MAX_RT_PRIO; ++k) + INIT_LIST_HEAD(&queue->queue[k]); + return 0; +} + +static void rt_yield(struct queue *__queue, task_t *task) +{ + struct rt_queue *queue = (struct rt_queue *)__queue; + check_rt_policy(task); + list_del(&task->sched_info.run_list); + if (list_empty(&queue->queue[task->sched_info.cl_data.rt.prio])) + set_need_resched(); + list_add_tail(&task->sched_info.run_list, + &queue->queue[task->sched_info.cl_data.rt.prio]); + check_rt_policy(task); +} + +static int rt_tick(struct queue *queue, task_t *task, int user_ticks, int sys_ticks) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + check_rt_policy(task); + cpustat->user += user_ticks; + cpustat->system += sys_ticks; + if (task->sched_info.cl_data.rt.rt_policy == RT_POLICY_RR) { + task->sched_info.cl_data.rt.ticks--; + if (!task->sched_info.cl_data.rt.ticks) { + task->sched_info.cl_data.rt.ticks = + task->sched_info.cl_data.rt.quantum; + rt_yield(queue, task); + } + } + check_rt_policy(task); + return 0; +} + +static task_t *rt_curr(struct queue *__queue) +{ + struct rt_queue *queue = (struct rt_queue *)__queue; + task_t *task = queue->curr; + check_rt_policy(task); + return task; +} + +static void rt_set_curr(struct queue *__queue, task_t *task) +{ + struct rt_queue *queue = (struct rt_queue *)__queue; + queue->curr = task; + check_rt_policy(task); +} + +static task_t *rt_best(struct queue *__queue) +{ + struct rt_queue *queue = (struct rt_queue *)__queue; + task_t *task; + int idx; + idx = find_first_bit(queue->bitmap, MAX_RT_PRIO); + BUG_ON(idx >= MAX_RT_PRIO); + task = list_entry(queue->queue[idx].next, task_t, sched_info.run_list); + check_rt_policy(task); + return task; +} + +static void rt_enqueue(struct queue *__queue, task_t *task) +{ + struct rt_queue *queue = (struct rt_queue *)__queue; + check_rt_policy(task); + if (!test_bit(task->sched_info.cl_data.rt.prio, queue->bitmap)) + __set_bit(task->sched_info.cl_data.rt.prio, queue->bitmap); + list_add_tail(&task->sched_info.run_list, + &queue->queue[task->sched_info.cl_data.rt.prio]); + check_rt_policy(task); + queue->tasks++; + if (!queue->curr) + queue->curr = task; +} + +static void rt_dequeue(struct queue *__queue, task_t *task) +{ + struct rt_queue *queue = (struct rt_queue *)__queue; + check_rt_policy(task); + list_del(&task->sched_info.run_list); + if (list_empty(&queue->queue[task->sched_info.cl_data.rt.prio])) + __clear_bit(task->sched_info.cl_data.rt.prio, queue->bitmap); + queue->tasks--; + check_rt_policy(task); + if (!queue->tasks) + queue->curr = NULL; + else if (task == queue->curr) + queue->curr = rt_best(__queue); +} + +static int rt_preempt(struct queue *__queue, task_t *task) +{ + struct rt_queue *queue = (struct rt_queue *)__queue; + check_rt_policy(task); + if (!queue->curr) + return 1; + check_rt_policy(queue->curr); + return task->sched_info.cl_data.rt.prio + < queue->curr->sched_info.cl_data.rt.prio; +} + +static int rt_tasks(struct queue *__queue) +{ + struct rt_queue *queue = (struct rt_queue *)__queue; + return queue->tasks; +} + +static int rt_nice(struct queue *queue, task_t *task) +{ + check_rt_policy(task); + return -20; +} + +static unsigned long rt_timeslice(struct queue *queue, task_t *task) +{ + check_rt_policy(task); + if (task->sched_info.cl_data.rt.rt_policy != RT_POLICY_RR) + return 0; + else + return task->sched_info.cl_data.rt.quantum; +} + +static void rt_set_timeslice(struct queue *queue, task_t *task, unsigned long n) +{ + check_rt_policy(task); + if (task->sched_info.cl_data.rt.rt_policy == RT_POLICY_RR) + task->sched_info.cl_data.rt.quantum = n; + check_rt_policy(task); +} + +static void rt_setprio(task_t *task, int prio) +{ + check_rt_policy(task); + BUG_ON(prio < 0); + BUG_ON(prio >= MAX_RT_PRIO); + task->sched_info.cl_data.rt.prio = prio; +} + +static int rt_prio(task_t *task) +{ + check_rt_policy(task); + return USER_PRIO(task->sched_info.cl_data.rt.prio); +} + +static struct queue_ops rt_ops = { + .init = rt_init, + .fini = nop_fini, + .tick = rt_tick, + .yield = rt_yield, + .curr = rt_curr, + .set_curr = rt_set_curr, + .tasks = rt_tasks, + .best = rt_best, + .enqueue = rt_enqueue, + .dequeue = rt_dequeue, + .start_wait = queue_nop, + .stop_wait = queue_nop, + .sleep = queue_nop, + .wake = queue_nop, + .preempt = rt_preempt, + .nice = rt_nice, + .renice = nop_renice, + .prio = rt_prio, + .setprio = rt_setprio, + .timeslice = rt_timeslice, + .set_timeslice = rt_set_timeslice, +}; + +struct policy rt_policy = { + .ops = &rt_ops, +}; diff -prauN linux-2.6.0-test11/kernel/sched/ts.c sched-2.6.0-test11-5/kernel/sched/ts.c --- linux-2.6.0-test11/kernel/sched/ts.c 1969-12-31 16:00:00.000000000 -0800 +++ sched-2.6.0-test11-5/kernel/sched/ts.c 2003-12-23 08:24:55.000000000 -0800 @@ -0,0 +1,841 @@ +#include +#include +#include +#include +#include +#include "queue.h" + +#ifdef DEBUG_SCHED +#define check_ts_policy(task) \ +do { \ + BUG_ON((task)->sched_info.policy != SCHED_POLICY_TS); \ +} while (0) + +#define check_nice(__queue__) \ +({ \ + int __k__, __count__ = 0; \ + if ((__queue__)->tasks < 0) { \ + printk("negative nice task count %d\n", \ + (__queue__)->tasks); \ + BUG(); \ + } \ + for (__k__ = 0; __k__ < NICE_QLEN; ++__k__) { \ + task_t *__task__; \ + if (list_empty(&(__queue__)->queue[__k__])) { \ + if (test_bit(__k__, (__queue__)->bitmap)) { \ + printk("wrong nice bit set\n"); \ + BUG(); \ + } \ + } else { \ + if (!test_bit(__k__, (__queue__)->bitmap)) { \ + printk("wrong nice bit clear\n"); \ + BUG(); \ + } \ + } \ + list_for_each_entry(__task__, \ + &(__queue__)->queue[__k__], \ + sched_info.run_list) { \ + check_ts_policy(__task__); \ + if (__task__->sched_info.idx != __k__) { \ + printk("nice index mismatch\n"); \ + BUG(); \ + } \ + ++__count__; \ + } \ + } \ + if ((__queue__)->tasks != __count__) { \ + printk("wrong nice task count\n"); \ + printk("expected %d, got %d\n", \ + (__queue__)->tasks, \ + __count__); \ + BUG(); \ + } \ + __count__; \ +}) + +#define check_queue(__queue) \ +do { \ + int __k, __count = 0; \ + if ((__queue)->tasks < 0) { \ + printk("negative queue task count %d\n", \ + (__queue)->tasks); \ + BUG(); \ + } \ + for (__k = 0; __k < 40; ++__k) { \ + struct nice_queue *__nice; \ + if (list_empty(&(__queue)->nices[__k])) { \ + if (test_bit(__k, (__queue)->bitmap)) { \ + printk("wrong queue bit set\n"); \ + BUG(); \ + } \ + } else { \ + if (!test_bit(__k, (__queue)->bitmap)) { \ + printk("wrong queue bit clear\n"); \ + BUG(); \ + } \ + } \ + list_for_each_entry(__nice, \ + &(__queue)->nices[__k], \ + list) { \ + __count += check_nice(__nice); \ + if (__nice->idx != __k) { \ + printk("queue index mismatch\n"); \ + BUG(); \ + } \ + } \ + } \ + if ((__queue)->tasks != __count) { \ + printk("wrong queue task count\n"); \ + printk("expected %d, got %d\n", \ + (__queue)->tasks, \ + __count); \ + BUG(); \ + } \ +} while (0) + +#else /* !DEBUG_SCHED */ +#define check_ts_policy(task) do { } while (0) +#define check_nice(nice) do { } while (0) +#define check_queue(queue) do { } while (0) +#endif + +/* + * Hybrid deadline/multilevel scheduling. Cpu utilization + * -dependent deadlines at wake. Queue rotation every 50ms or when + * demotions empty the highest level, setting demoted deadlines + * relative to the new highest level. Intra-level RR quantum at 10ms. + */ +struct nice_queue { + int idx, nice, base, tasks, level_quantum, expired; + unsigned long bitmap[BITS_TO_LONGS(NICE_QLEN)]; + struct list_head list, queue[NICE_QLEN]; + task_t *curr; +}; + +/* + * Deadline schedule nice levels with priority-dependent deadlines, + * default quantum of 100ms. Queue rotates at demotions emptying the + * highest level, setting the demoted deadline relative to the new + * highest level. + */ +struct ts_queue { + struct nice_queue nice_levels[40]; + struct list_head nices[40]; + int base, quantum, tasks; + unsigned long bitmap[BITS_TO_LONGS(40)]; + struct nice_queue *curr; +}; + +/* + * Make these sysctl-tunable. + */ +static int nice_quantum = 100; +static int rr_quantum = 10; +static int level_quantum = 50; +static int sample_interval = HZ; + +static DEFINE_PER_CPU(struct ts_queue, ts_queues); + +static task_t *nice_best(struct nice_queue *); +static struct nice_queue *ts_best_nice(struct ts_queue *); + +static void nice_init(struct nice_queue *queue) +{ + int k; + + INIT_LIST_HEAD(&queue->list); + for (k = 0; k < NICE_QLEN; ++k) { + INIT_LIST_HEAD(&queue->queue[k]); + } +} + +static int ts_init(struct policy *policy, int cpu) +{ + int k; + struct ts_queue *queue = &per_cpu(ts_queues, cpu); + + policy->queue = (struct queue *)queue; + queue->quantum = nice_quantum; + + for (k = 0; k < 40; ++k) { + nice_init(&queue->nice_levels[k]); + queue->nice_levels[k].nice = k; + INIT_LIST_HEAD(&queue->nices[k]); + } + return 0; +} + +static int task_deadline(task_t *task) +{ + u64 frac_cpu = task->sched_info.cl_data.ts.frac_cpu; + frac_cpu *= (u64)NICE_QLEN; + frac_cpu >>= 32; + return (int)min((u32)(NICE_QLEN - 1), (u32)frac_cpu); +} + +static void nice_rotate_queue(struct nice_queue *queue) +{ + int idx, new_idx, deadline, idxdiff; + task_t *task = queue->curr; + + check_nice(queue); + + /* shit what if idxdiff == NICE_QLEN - 1?? */ + idx = queue->curr->sched_info.idx; + idxdiff = (idx - queue->base + NICE_QLEN) % NICE_QLEN; + deadline = min(1 + task_deadline(task), NICE_QLEN - idxdiff - 1); + new_idx = (idx + deadline) % NICE_QLEN; +#if 0 + if (idx == new_idx) { + /* + * buggy; it sets queue->base = idx because in this case + * we have task_deadline(task) == 0 + */ + new_idx = (idx - task_deadline(task) + NICE_QLEN) % NICE_QLEN; + if (queue->base != new_idx) + queue->base = new_idx; + return; + } + BUG_ON(!deadline); + BUG_ON(queue->base <= new_idx && new_idx <= idx); + BUG_ON(idx < queue->base && queue->base <= new_idx); + BUG_ON(new_idx <= idx && idx < queue->base); + if (0 && idx == new_idx) { + printk("FUCKUP: pid = %d, tdl = %d, dl = %d, idx = %d, " + "base = %d, diff = %d, fcpu = 0x%lx\n", + queue->curr->pid, + task_deadline(queue->curr), + deadline, + idx, + queue->base, + idxdiff, + task->sched_info.cl_data.ts.frac_cpu); + BUG(); + } +#else + /* + * RR in the last deadline + * special-cased so as not to trip BUG_ON()'s below + */ + if (idx == new_idx) { + /* if we got here these two things must hold */ + BUG_ON(idxdiff != NICE_QLEN - 1); + BUG_ON(deadline); + list_move_tail(&task->sched_info.run_list, &queue->queue[idx]); + if (queue->expired) { + queue->level_quantum = level_quantum; + queue->expired = 0; + } + return; + } +#endif + task->sched_info.idx = new_idx; + if (!test_bit(new_idx, queue->bitmap)) { + BUG_ON(!list_empty(&queue->queue[new_idx])); + __set_bit(new_idx, queue->bitmap); + } + list_move_tail(&task->sched_info.run_list, + &queue->queue[new_idx]); + + /* expired until list drains */ + if (!list_empty(&queue->queue[idx])) + queue->expired = 1; + else { + int k, w, m = NICE_QLEN % BITS_PER_LONG; + BUG_ON(!test_bit(idx, queue->bitmap)); + __clear_bit(idx, queue->bitmap); + + for (w = 0, k = 0; k < NICE_QLEN/BITS_PER_LONG; ++k) + w += hweight_long(queue->bitmap[k]); + if (NICE_QLEN % BITS_PER_LONG) + w += hweight_long(queue->bitmap[k] & ((1UL << m) - 1)); + if (w > 1) + queue->base = (queue->base + 1) % NICE_QLEN; + queue->level_quantum = level_quantum; + queue->expired = 0; + } + check_nice(queue); +} + +static void nice_tick(struct nice_queue *queue, task_t *task) +{ + int idx = task->sched_info.idx; + BUG_ON(!task_queued(task)); + BUG_ON(task != queue->curr); + BUG_ON(!test_bit(idx, queue->bitmap)); + BUG_ON(list_empty(&queue->queue[idx])); + check_ts_policy(task); + check_nice(queue); + + if (task->sched_info.cl_data.ts.ticks) + task->sched_info.cl_data.ts.ticks--; + + if (queue->level_quantum > level_quantum) { + WARN_ON(1); + queue->level_quantum = 1; + } + + if (!queue->expired) { + if (queue->level_quantum) + queue->level_quantum--; + } else if (0 && queue->queue[idx].prev != &task->sched_info.run_list) { + int queued = 0, new_idx = (queue->base + 1) % NICE_QLEN; + task_t *curr, *sav; + task_t *victim = list_entry(queue->queue[idx].prev, + task_t, + sched_info.run_list); + victim->sched_info.idx = new_idx; + if (!test_bit(new_idx, queue->bitmap)) + __set_bit(new_idx, queue->bitmap); +#if 1 + list_for_each_entry_safe(curr, sav, &queue->queue[new_idx], sched_info.run_list) { + if (victim->sched_info.cl_data.ts.frac_cpu + < curr->sched_info.cl_data.ts.frac_cpu) { + queued = 1; + list_move(&victim->sched_info.run_list, + curr->sched_info.run_list.prev); + break; + } + } + if (!queued) + list_move_tail(&victim->sched_info.run_list, + &queue->queue[new_idx]); +#else + list_move(&victim->sched_info.run_list, &queue->queue[new_idx]); +#endif + BUG_ON(list_empty(&queue->queue[idx])); + } + + if (!queue->level_quantum && !queue->expired) { + check_nice(queue); + nice_rotate_queue(queue); + check_nice(queue); + set_need_resched(); + } else if (!task->sched_info.cl_data.ts.ticks) { + int idxdiff = (idx - queue->base + NICE_QLEN) % NICE_QLEN; + check_nice(queue); + task->sched_info.cl_data.ts.ticks = rr_quantum; + BUG_ON(!test_bit(idx, queue->bitmap)); + BUG_ON(list_empty(&queue->queue[idx])); + if (queue->expired) + nice_rotate_queue(queue); + else if (idxdiff == NICE_QLEN - 1) + list_move_tail(&task->sched_info.run_list, + &queue->queue[idx]); + else { + int new_idx = (idx + 1) % NICE_QLEN; + list_del(&task->sched_info.run_list); + if (list_empty(&queue->queue[idx])) { + BUG_ON(!test_bit(idx, queue->bitmap)); + __clear_bit(idx, queue->bitmap); + } + if (!test_bit(new_idx, queue->bitmap)) { + BUG_ON(!list_empty(&queue->queue[new_idx])); + __set_bit(new_idx, queue->bitmap); + } + task->sched_info.idx = new_idx; + list_add(&task->sched_info.run_list, + &queue->queue[new_idx]); + } + check_nice(queue); + set_need_resched(); + } + check_nice(queue); + check_ts_policy(task); +} + +static void ts_rotate_queue(struct ts_queue *queue) +{ + int idx, new_idx, idxdiff, off, deadline; + + queue->base = find_first_circular_bit(queue->bitmap, queue->base, 40); + + /* shit what if idxdiff == 39?? */ + check_queue(queue); + idx = queue->curr->idx; + idxdiff = (idx - queue->base + 40) % 40; + off = (int)(queue->curr - queue->nice_levels); + deadline = min(1 + off, 40 - idxdiff - 1); + new_idx = (idx + deadline) % 40; + if (idx == new_idx) { + new_idx = (idx - off + 40) % 40; + if (queue->base != new_idx) + queue->base = new_idx; + return; + } + BUG_ON(!deadline); + BUG_ON(queue->base <= new_idx && new_idx <= idx); + BUG_ON(idx < queue->base && queue->base <= new_idx); + BUG_ON(new_idx <= idx && idx < queue->base); + if (!test_bit(new_idx, queue->bitmap)) { + BUG_ON(!list_empty(&queue->nices[new_idx])); + __set_bit(new_idx, queue->bitmap); + } + list_move_tail(&queue->curr->list, &queue->nices[new_idx]); + queue->curr->idx = new_idx; + + if (list_empty(&queue->nices[idx])) { + BUG_ON(!test_bit(idx, queue->bitmap)); + __clear_bit(idx, queue->bitmap); + queue->base = (queue->base + 1) % 40; + } + check_queue(queue); +} + +static int ts_tick(struct queue *__queue, task_t *task, int user_ticks, int sys_ticks) +{ + struct ts_queue *queue = (struct ts_queue *)__queue; + struct nice_queue *nice = queue->curr; + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + int nice_idx = (int)(queue->curr - queue->nice_levels); + unsigned long sample_end, delta; + + check_queue(queue); + check_ts_policy(task); + BUG_ON(!nice); + BUG_ON(nice_idx != task->sched_info.cl_data.ts.nice); + BUG_ON(!test_bit(nice->idx, queue->bitmap)); + BUG_ON(list_empty(&queue->nices[nice->idx])); + + sample_end = jiffies; + delta = sample_end - task->sched_info.cl_data.ts.sample_start; + if (delta) + task->sched_info.cl_data.ts.sample_ticks++; + else { + task->sched_info.cl_data.ts.sample_start = jiffies; + task->sched_info.cl_data.ts.sample_ticks = 1; + } + + if (delta >= sample_interval) { + u64 frac_cpu; + frac_cpu = (u64)task->sched_info.cl_data.ts.sample_ticks << 32; + do_div(frac_cpu, delta); + frac_cpu = 2*frac_cpu + task->sched_info.cl_data.ts.frac_cpu; + do_div(frac_cpu, 3); + frac_cpu = min(frac_cpu, (1ULL << 32) - 1); + task->sched_info.cl_data.ts.frac_cpu = (unsigned long)frac_cpu; + task->sched_info.cl_data.ts.sample_start = sample_end; + task->sched_info.cl_data.ts.sample_ticks = 0; + } + + cpustat->user += user_ticks; + cpustat->system += sys_ticks; + nice_tick(nice, task); + if (queue->quantum > nice_quantum) { + queue->quantum = 0; + WARN_ON(1); + } else if (queue->quantum) + queue->quantum--; + if (!queue->quantum) { + queue->quantum = nice_quantum; + ts_rotate_queue(queue); + set_need_resched(); + } + check_queue(queue); + check_ts_policy(task); + return 0; +} + +static void nice_yield(struct nice_queue *queue, task_t *task) +{ + int idx, new_idx = (queue->base + NICE_QLEN - 1) % NICE_QLEN; + + check_nice(queue); + check_ts_policy(task); + if (!test_bit(new_idx, queue->bitmap)) { + BUG_ON(!list_empty(&queue->queue[new_idx])); + __set_bit(new_idx, queue->bitmap); + } + list_move_tail(&task->sched_info.run_list, &queue->queue[new_idx]); + idx = task->sched_info.idx; + task->sched_info.idx = new_idx; + set_need_resched(); + + if (list_empty(&queue->queue[idx])) { + BUG_ON(!test_bit(idx, queue->bitmap)); + __clear_bit(idx, queue->bitmap); + } + queue->curr = nice_best(queue); +#if 0 + if (queue->curr->sched_info.idx != queue->base) + queue->base = queue->curr->sched_info.idx; +#endif + check_nice(queue); + check_ts_policy(task); +} + +/* + * This is somewhat problematic; nice_yield() only parks tasks on + * the end of their current nice levels. + */ +static void ts_yield(struct queue *__queue, task_t *task) +{ + struct ts_queue *queue = (struct ts_queue *)__queue; + struct nice_queue *nice = queue->curr; + + check_queue(queue); + check_ts_policy(task); + nice_yield(nice, task); + + /* + * If there's no one to yield to, move the whole nice level. + * If this is problematic, setting nice-dependent deadlines + * on a single unified queue may be in order. + */ + if (nice->tasks == 1) { + int idx, new_idx = (queue->base + 40 - 1) % 40; + idx = nice->idx; + if (!test_bit(new_idx, queue->bitmap)) { + BUG_ON(!list_empty(&queue->nices[new_idx])); + __set_bit(new_idx, queue->bitmap); + } + list_move_tail(&nice->list, &queue->nices[new_idx]); + if (list_empty(&queue->nices[idx])) { + BUG_ON(!test_bit(idx, queue->bitmap)); + __clear_bit(idx, queue->bitmap); + } + nice->idx = new_idx; + queue->base = find_first_circular_bit(queue->bitmap, + queue->base, + 40); + BUG_ON(queue->base >= 40); + BUG_ON(!test_bit(queue->base, queue->bitmap)); + queue->curr = ts_best_nice(queue); + } + check_queue(queue); + check_ts_policy(task); +} + +static task_t *ts_curr(struct queue *__queue) +{ + struct ts_queue *queue = (struct ts_queue *)__queue; + task_t *task = queue->curr->curr; + check_queue(queue); + if (task) + check_ts_policy(task); + return task; +} + +static void ts_set_curr(struct queue *__queue, task_t *task) +{ + struct ts_queue *queue = (struct ts_queue *)__queue; + struct nice_queue *nice; + check_queue(queue); + check_ts_policy(task); + nice = &queue->nice_levels[task->sched_info.cl_data.ts.nice]; + queue->curr = nice; + nice->curr = task; + check_queue(queue); + check_ts_policy(task); +} + +static task_t *nice_best(struct nice_queue *queue) +{ + task_t *task; + int idx = find_first_circular_bit(queue->bitmap, + queue->base, + NICE_QLEN); + check_nice(queue); + if (idx >= NICE_QLEN) + return NULL; + BUG_ON(list_empty(&queue->queue[idx])); + BUG_ON(!test_bit(idx, queue->bitmap)); + task = list_entry(queue->queue[idx].next, task_t, sched_info.run_list); + check_nice(queue); + check_ts_policy(task); + return task; +} + +static struct nice_queue *ts_best_nice(struct ts_queue *queue) +{ + int idx = find_first_circular_bit(queue->bitmap, queue->base, 40); + check_queue(queue); + if (idx >= 40) + return NULL; + BUG_ON(list_empty(&queue->nices[idx])); + BUG_ON(!test_bit(idx, queue->bitmap)); + return list_entry(queue->nices[idx].next, struct nice_queue, list); +} + +static task_t *ts_best(struct queue *__queue) +{ + struct ts_queue *queue = (struct ts_queue *)__queue; + struct nice_queue *nice = ts_best_nice(queue); + return nice ? nice_best(nice) : NULL; +} + +static void nice_enqueue(struct nice_queue *queue, task_t *task) +{ + task_t *curr, *sav; + int queued = 0, idx, deadline, base, idxdiff; + check_nice(queue); + check_ts_policy(task); + + /* don't livelock when queue->expired */ + deadline = min(!!queue->expired + task_deadline(task), NICE_QLEN - 1); + idx = (queue->base + deadline) % NICE_QLEN; + + if (!test_bit(idx, queue->bitmap)) { + BUG_ON(!list_empty(&queue->queue[idx])); + __set_bit(idx, queue->bitmap); + } + +#if 1 + /* keep nice level's queue sorted -- use binomial heaps here soon */ + list_for_each_entry_safe(curr, sav, &queue->queue[idx], sched_info.run_list) { + if (task->sched_info.cl_data.ts.frac_cpu + >= curr->sched_info.cl_data.ts.frac_cpu) { + list_add(&task->sched_info.run_list, + curr->sched_info.run_list.prev); + queued = 1; + break; + } + } + if (!queued) + list_add_tail(&task->sched_info.run_list, &queue->queue[idx]); +#else + list_add_tail(&task->sched_info.run_list, &queue->queue[idx]); +#endif + task->sched_info.idx = idx; + /* if (!task->sched_info.cl_data.ts.ticks) */ + task->sched_info.cl_data.ts.ticks = rr_quantum; + + if (queue->tasks) + BUG_ON(!queue->curr); + else { + BUG_ON(queue->curr); + queue->curr = task; + } + queue->tasks++; + check_nice(queue); + check_ts_policy(task); +} + +static void ts_enqueue(struct queue *__queue, task_t *task) +{ + struct ts_queue *queue = (struct ts_queue *)__queue; + struct nice_queue *nice; + + check_queue(queue); + check_ts_policy(task); + nice = &queue->nice_levels[task->sched_info.cl_data.ts.nice]; + if (!nice->tasks) { + int idx = (queue->base + task->sched_info.cl_data.ts.nice) % 40; + if (!test_bit(idx, queue->bitmap)) { + BUG_ON(!list_empty(&queue->nices[idx])); + __set_bit(idx, queue->bitmap); + } + list_add_tail(&nice->list, &queue->nices[idx]); + nice->idx = idx; + if (!queue->curr) + queue->curr = nice; + } + nice_enqueue(nice, task); + queue->tasks++; + queue->base = find_first_circular_bit(queue->bitmap, queue->base, 40); + check_queue(queue); + check_ts_policy(task); +} + +static void nice_dequeue(struct nice_queue *queue, task_t *task) +{ + check_nice(queue); + check_ts_policy(task); + list_del(&task->sched_info.run_list); + if (list_empty(&queue->queue[task->sched_info.idx])) { + BUG_ON(!test_bit(task->sched_info.idx, queue->bitmap)); + __clear_bit(task->sched_info.idx, queue->bitmap); + } + queue->tasks--; + if (task == queue->curr) { + queue->curr = nice_best(queue); +#if 0 + if (queue->curr) + queue->base = queue->curr->sched_info.idx; +#endif + } + check_nice(queue); + check_ts_policy(task); +} + +static void ts_dequeue(struct queue *__queue, task_t *task) +{ + struct ts_queue *queue = (struct ts_queue *)__queue; + struct nice_queue *nice; + + BUG_ON(!queue->tasks); + check_queue(queue); + check_ts_policy(task); + nice = &queue->nice_levels[task->sched_info.cl_data.ts.nice]; + + nice_dequeue(nice, task); + queue->tasks--; + if (!nice->tasks) { + list_del_init(&nice->list); + if (list_empty(&queue->nices[nice->idx])) { + BUG_ON(!test_bit(nice->idx, queue->bitmap)); + __clear_bit(nice->idx, queue->bitmap); + } + if (nice == queue->curr) + queue->curr = ts_best_nice(queue); + } + queue->base = find_first_circular_bit(queue->bitmap, queue->base, 40); + if (queue->base >= 40) + queue->base = 0; + check_queue(queue); + check_ts_policy(task); +} + +static int ts_tasks(struct queue *__queue) +{ + struct ts_queue *queue = (struct ts_queue *)__queue; + check_queue(queue); + return queue->tasks; +} + +static int ts_nice(struct queue *__queue, task_t *task) +{ + int nice = task->sched_info.cl_data.ts.nice - 20; + check_ts_policy(task); + BUG_ON(nice < -20); + BUG_ON(nice >= 20); + return nice; +} + +static void ts_renice(struct queue *queue, task_t *task, int nice) +{ + check_queue((struct ts_queue *)queue); + check_ts_policy(task); + BUG_ON(nice < -20); + BUG_ON(nice >= 20); + task->sched_info.cl_data.ts.nice = nice + 20; + check_queue((struct ts_queue *)queue); +} + +static int nice_task_prio(struct nice_queue *nice, task_t *task) +{ + if (!task_queued(task)) + return task_deadline(task); + else { + int prio = task->sched_info.idx - nice->base; + return prio < 0 ? prio + NICE_QLEN : prio; + } +} + +static int ts_nice_prio(struct ts_queue *ts, struct nice_queue *nice) +{ + if (list_empty(&nice->list)) + return (int)(nice - ts->nice_levels); + else { + int prio = nice->idx - ts->base; + return prio < 0 ? prio + 40 : prio; + } +} + +/* 100% fake priority to report heuristics and the like */ +static int ts_prio(task_t *task) +{ + int policy_idx; + struct policy *policy; + struct ts_queue *ts; + struct nice_queue *nice; + + policy_idx = task->sched_info.policy; + policy = per_cpu(runqueues, task_cpu(task)).policies[policy_idx]; + ts = (struct ts_queue *)policy->queue; + nice = &ts->nice_levels[task->sched_info.cl_data.ts.nice]; + return 40*ts_nice_prio(ts, nice) + nice_task_prio(nice, task); +} + +static void ts_setprio(task_t *task, int prio) +{ +} + +static void ts_start_wait(struct queue *__queue, task_t *task) +{ +} + +static void ts_stop_wait(struct queue *__queue, task_t *task) +{ +} + +static void ts_sleep(struct queue *__queue, task_t *task) +{ +} + +static void ts_wake(struct queue *__queue, task_t *task) +{ +} + +static int nice_preempt(struct nice_queue *queue, task_t *task) +{ + check_nice(queue); + check_ts_policy(task); + /* assume FB style preemption at wakeup */ + if (!task_queued(task) || !queue->curr) + return 1; + else { + int delta_t, delta_q; + delta_t = (task->sched_info.idx - queue->base + NICE_QLEN) + % NICE_QLEN; + delta_q = (queue->curr->sched_info.idx - queue->base + + NICE_QLEN) + % NICE_QLEN; + if (delta_t < delta_q) + return 1; + else if (task->sched_info.cl_data.ts.frac_cpu + < queue->curr->sched_info.cl_data.ts.frac_cpu) + return 1; + else + return 0; + } + check_nice(queue); +} + +static int ts_preempt(struct queue *__queue, task_t *task) +{ + int curr_nice; + struct ts_queue *queue = (struct ts_queue *)__queue; + struct nice_queue *nice = queue->curr; + + check_queue(queue); + check_ts_policy(task); + if (!queue->curr) + return 1; + + curr_nice = (int)(nice - queue->nice_levels); + + /* preempt when nice number is lower, or the above for matches */ + if (task->sched_info.cl_data.ts.nice != curr_nice) + return task->sched_info.cl_data.ts.nice < curr_nice; + else + return nice_preempt(nice, task); +} + +static struct queue_ops ts_ops = { + .init = ts_init, + .fini = nop_fini, + .tick = ts_tick, + .yield = ts_yield, + .curr = ts_curr, + .set_curr = ts_set_curr, + .tasks = ts_tasks, + .best = ts_best, + .enqueue = ts_enqueue, + .dequeue = ts_dequeue, + .start_wait = ts_start_wait, + .stop_wait = ts_stop_wait, + .sleep = ts_sleep, + .wake = ts_wake, + .preempt = ts_preempt, + .nice = ts_nice, + .renice = ts_renice, + .prio = ts_prio, + .setprio = ts_setprio, + .timeslice = nop_timeslice, + .set_timeslice = nop_set_timeslice, +}; + +struct policy ts_policy = { + .ops = &ts_ops, +}; diff -prauN linux-2.6.0-test11/kernel/sched/util.c sched-2.6.0-test11-5/kernel/sched/util.c --- linux-2.6.0-test11/kernel/sched/util.c 1969-12-31 16:00:00.000000000 -0800 +++ sched-2.6.0-test11-5/kernel/sched/util.c 2003-12-19 08:43:20.000000000 -0800 @@ -0,0 +1,37 @@ +#include +#include +#include +#include +#include "queue.h" + +int find_first_circular_bit(unsigned long *addr, int start, int end) +{ + int bit = find_next_bit(addr, end, start); + if (bit < end) + return bit; + bit = find_first_bit(addr, start); + if (bit < start) + return bit; + return end; +} + +void queue_nop(struct queue *queue, task_t *task) +{ +} + +void nop_renice(struct queue *queue, task_t *task, int nice) +{ +} + +void nop_fini(struct policy *policy, int cpu) +{ +} + +unsigned long nop_timeslice(struct queue *queue, task_t *task) +{ + return 0; +} + +void nop_set_timeslice(struct queue *queue, task_t *task, unsigned long n) +{ +} diff -prauN linux-2.6.0-test11/kernel/sched.c sched-2.6.0-test11-5/kernel/sched.c --- linux-2.6.0-test11/kernel/sched.c 2003-11-26 12:45:17.000000000 -0800 +++ sched-2.6.0-test11-5/kernel/sched.c 2003-12-21 06:06:32.000000000 -0800 @@ -15,6 +15,8 @@ * and per-CPU runqueues. Cleanups and useful suggestions * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. + * 2003-12-17 Total rewrite and generalized scheduler policies + * by William Irwin. */ #include @@ -38,6 +40,8 @@ #include #include +#include "sched/queue.h" + #ifdef CONFIG_NUMA #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) #else @@ -45,181 +49,79 @@ #endif /* - * Convert user-nice values [ -20 ... 0 ... 19 ] - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], - * and back. - */ -#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) -#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) -#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) - -/* - * 'User priority' is the nice value converted to something we - * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 39 ] range. - */ -#define USER_PRIO(p) ((p)-MAX_RT_PRIO) -#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) -#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) -#define AVG_TIMESLICE (MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) *\ - (MAX_PRIO-1-NICE_TO_PRIO(0))/(MAX_USER_PRIO - 1))) - -/* - * Some helpers for converting nanosecond timing to jiffy resolution - */ -#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) - -/* - * These are the 'tuning knobs' of the scheduler: - * - * Minimum timeslice is 10 msecs, default timeslice is 100 msecs, - * maximum timeslice is 200 msecs. Timeslices get refilled after - * they expire. - */ -#define MIN_TIMESLICE ( 10 * HZ / 1000) -#define MAX_TIMESLICE (200 * HZ / 1000) -#define ON_RUNQUEUE_WEIGHT 30 -#define CHILD_PENALTY 95 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) -#define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (AVG_TIMESLICE * MAX_BONUS) -#define STARVATION_LIMIT (MAX_SLEEP_AVG) -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) -#define NODE_THRESHOLD 125 -#define CREDIT_LIMIT 100 - -/* - * If a task is 'interactive' then we reinsert it in the active - * array after it has expired its current timeslice. (it will not - * continue to run immediately, it will still roundrobin with - * other interactive tasks.) - * - * This part scales the interactivity limit depending on niceness. - * - * We scale it linearly, offset by the INTERACTIVE_DELTA delta. - * Here are a few examples of different nice levels: - * - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] - * - * (the X axis represents the possible -5 ... 0 ... +5 dynamic - * priority range a task can explore, a value of '1' means the - * task is rated interactive.) - * - * Ie. nice +19 tasks can never get 'interactive' enough to be - * reinserted into the active array. And only heavily CPU-hog nice -20 - * tasks will be expired. Default nice 0 tasks are somewhere between, - * it takes some effort for them to get interactive, but it's not - * too hard. - */ - -#define CURRENT_BONUS(p) \ - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ - MAX_SLEEP_AVG) - -#ifdef CONFIG_SMP -#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ - num_online_cpus()) -#else -#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) -#endif - -#define SCALE(v1,v1_max,v2_max) \ - (v1) * (v2_max) / (v1_max) - -#define DELTA(p) \ - (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \ - INTERACTIVE_DELTA) - -#define TASK_INTERACTIVE(p) \ - ((p)->prio <= (p)->static_prio - DELTA(p)) - -#define JUST_INTERACTIVE_SLEEP(p) \ - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) - -#define HIGH_CREDIT(p) \ - ((p)->interactive_credit > CREDIT_LIMIT) - -#define LOW_CREDIT(p) \ - ((p)->interactive_credit < -CREDIT_LIMIT) - -#define TASK_PREEMPTS_CURR(p, rq) \ - ((p)->prio < (rq)->curr->prio) - -/* - * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ] - * to time slice values. - * - * The higher a thread's priority, the bigger timeslices - * it gets during one round of execution. But even the lowest - * priority thread gets MIN_TIMESLICE worth of execution time. - * - * task_timeslice() is the interface that is used by the scheduler. - */ - -#define BASE_TIMESLICE(p) (MIN_TIMESLICE + \ - ((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/(MAX_USER_PRIO - 1))) - -static inline unsigned int task_timeslice(task_t *p) -{ - return BASE_TIMESLICE(p); -} - -/* - * These are the runqueue data structures: - */ - -#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) - -typedef struct runqueue runqueue_t; - -struct prio_array { - int nr_active; - unsigned long bitmap[BITMAP_SIZE]; - struct list_head queue[MAX_PRIO]; -}; - -/* * This is the main, per-CPU runqueue data structure. * * Locking rule: those places that want to lock multiple runqueues * (such as the load balancing or the thread migration code), lock * acquire operations must be ordered by ascending &runqueue. */ -struct runqueue { - spinlock_t lock; - unsigned long nr_running, nr_switches, expired_timestamp, - nr_uninterruptible; - task_t *curr, *idle; - struct mm_struct *prev_mm; - prio_array_t *active, *expired, arrays[2]; - int prev_cpu_load[NR_CPUS]; -#ifdef CONFIG_NUMA - atomic_t *node_nr_running; - int prev_node_load[MAX_NUMNODES]; -#endif - task_t *migration_thread; - struct list_head migration_queue; +DEFINE_PER_CPU(struct runqueue, runqueues); - atomic_t nr_iowait; +struct policy *policies[] = { + &rt_policy, + &ts_policy, + &batch_policy, + &idle_policy, + NULL, }; -static DEFINE_PER_CPU(struct runqueue, runqueues); - #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() (&__get_cpu_var(runqueues)) #define task_rq(p) cpu_rq(task_cpu(p)) -#define cpu_curr(cpu) (cpu_rq(cpu)->curr) +#define rq_curr(rq) (rq)->__curr +#define cpu_curr(cpu) rq_curr(cpu_rq(cpu)) + +static inline struct policy *task_policy(task_t *task) +{ + unsigned long idx; + struct policy *policy; + idx = task->sched_info.policy; + __check_task_policy(idx); + policy = task_rq(task)->policies[idx]; + check_policy(policy); + return policy; +} + +static inline struct policy *rq_policy(runqueue_t *rq) +{ + unsigned long idx; + task_t *task; + struct policy *policy; + + task = rq_curr(rq); + BUG_ON(!task); + BUG_ON((unsigned long)task < PAGE_OFFSET); + idx = task->sched_info.policy; + __check_task_policy(idx); + policy = rq->policies[idx]; + check_policy(policy); + return policy; +} + +static int __task_nice(task_t *task) +{ + struct policy *policy = task_policy(task); + return policy->ops->nice(policy->queue, task); +} + +static inline void set_rq_curr(runqueue_t *rq, task_t *task) +{ + rq->curr = task->sched_info.policy; + __check_task_policy(rq->curr); + rq->__curr = task; +} + +static inline int task_preempts_curr(task_t *task, runqueue_t *rq) +{ + check_task_policy(rq_curr(rq)); + check_task_policy(task); + if (rq_curr(rq)->sched_info.policy != task->sched_info.policy) + return task->sched_info.policy < rq_curr(rq)->sched_info.policy; + else { + struct policy *policy = rq_policy(rq); + return policy->ops->preempt(policy->queue, task); + } +} /* * Default context-switch locking: @@ -227,7 +129,7 @@ static DEFINE_PER_CPU(struct runqueue, r #ifndef prepare_arch_switch # define prepare_arch_switch(rq, next) do { } while(0) # define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) -# define task_running(rq, p) ((rq)->curr == (p)) +# define task_running(rq, p) (rq_curr(rq) == (p)) #endif #ifdef CONFIG_NUMA @@ -320,53 +222,32 @@ static inline void rq_unlock(runqueue_t } /* - * Adding/removing a task to/from a priority array: + * Adding/removing a task to/from a policy's queue. + * We dare not BUG_ON() a wrong task_queued() as boot-time + * calls may trip it. */ -static inline void dequeue_task(struct task_struct *p, prio_array_t *array) +static inline void dequeue_task(task_t *task, runqueue_t *rq) { - array->nr_active--; - list_del(&p->run_list); - if (list_empty(array->queue + p->prio)) - __clear_bit(p->prio, array->bitmap); + struct policy *policy = task_policy(task); + BUG_ON(!task_queued(task)); + policy->ops->dequeue(policy->queue, task); + if (!policy->ops->tasks(policy->queue)) { + BUG_ON(!test_bit(task->sched_info.policy, &rq->policy_bitmap)); + __clear_bit(task->sched_info.policy, &rq->policy_bitmap); + } + clear_task_queued(task); } -static inline void enqueue_task(struct task_struct *p, prio_array_t *array) +static inline void enqueue_task(task_t *task, runqueue_t *rq) { - list_add_tail(&p->run_list, array->queue + p->prio); - __set_bit(p->prio, array->bitmap); - array->nr_active++; - p->array = array; -} - -/* - * effective_prio - return the priority that is based on the static - * priority but is modified by bonuses/penalties. - * - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] - * into the -5 ... 0 ... +5 bonus/penalty range. - * - * We use 25% of the full 0...39 priority range so that: - * - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. - * - * Both properties are important to certain workloads. - */ -static int effective_prio(task_t *p) -{ - int bonus, prio; - - if (rt_task(p)) - return p->prio; - - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; - - prio = p->static_prio - bonus; - if (prio < MAX_RT_PRIO) - prio = MAX_RT_PRIO; - if (prio > MAX_PRIO-1) - prio = MAX_PRIO-1; - return prio; + struct policy *policy = task_policy(task); + BUG_ON(task_queued(task)); + if (!policy->ops->tasks(policy->queue)) { + BUG_ON(test_bit(task->sched_info.policy, &rq->policy_bitmap)); + __set_bit(task->sched_info.policy, &rq->policy_bitmap); + } + policy->ops->enqueue(policy->queue, task); + set_task_queued(task); } /* @@ -374,134 +255,34 @@ static int effective_prio(task_t *p) */ static inline void __activate_task(task_t *p, runqueue_t *rq) { - enqueue_task(p, rq->active); + enqueue_task(p, rq); nr_running_inc(rq); } -static void recalc_task_prio(task_t *p, unsigned long long now) -{ - unsigned long long __sleep_time = now - p->timestamp; - unsigned long sleep_time; - - if (__sleep_time > NS_MAX_SLEEP_AVG) - sleep_time = NS_MAX_SLEEP_AVG; - else - sleep_time = (unsigned long)__sleep_time; - - if (likely(sleep_time > 0)) { - /* - * User tasks that sleep a long time are categorised as - * idle and will get just interactive status to stay active & - * prevent them suddenly becoming cpu hogs and starving - * other processes. - */ - if (p->mm && p->activated != -1 && - sleep_time > JUST_INTERACTIVE_SLEEP(p)){ - p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - - AVG_TIMESLICE); - if (!HIGH_CREDIT(p)) - p->interactive_credit++; - } else { - /* - * The lower the sleep avg a task has the more - * rapidly it will rise with sleep time. - */ - sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; - - /* - * Tasks with low interactive_credit are limited to - * one timeslice worth of sleep avg bonus. - */ - if (LOW_CREDIT(p) && - sleep_time > JIFFIES_TO_NS(task_timeslice(p))) - sleep_time = - JIFFIES_TO_NS(task_timeslice(p)); - - /* - * Non high_credit tasks waking from uninterruptible - * sleep are limited in their sleep_avg rise as they - * are likely to be cpu hogs waiting on I/O - */ - if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm){ - if (p->sleep_avg >= JUST_INTERACTIVE_SLEEP(p)) - sleep_time = 0; - else if (p->sleep_avg + sleep_time >= - JUST_INTERACTIVE_SLEEP(p)){ - p->sleep_avg = - JUST_INTERACTIVE_SLEEP(p); - sleep_time = 0; - } - } - - /* - * This code gives a bonus to interactive tasks. - * - * The boost works by updating the 'average sleep time' - * value here, based on ->timestamp. The more time a task - * spends sleeping, the higher the average gets - and the - * higher the priority boost gets as well. - */ - p->sleep_avg += sleep_time; - - if (p->sleep_avg > NS_MAX_SLEEP_AVG){ - p->sleep_avg = NS_MAX_SLEEP_AVG; - if (!HIGH_CREDIT(p)) - p->interactive_credit++; - } - } - } - - p->prio = effective_prio(p); -} - /* * activate_task - move a task to the runqueue and do priority recalculation * * Update all the scheduling statistics stuff. (sleep average * calculation, priority modifiers, etc.) */ -static inline void activate_task(task_t *p, runqueue_t *rq) +static inline void activate_task(task_t *task, runqueue_t *rq) { - unsigned long long now = sched_clock(); - - recalc_task_prio(p, now); - - /* - * This checks to make sure it's not an uninterruptible task - * that is now waking up. - */ - if (!p->activated){ - /* - * Tasks which were woken up by interrupts (ie. hw events) - * are most likely of interactive nature. So we give them - * the credit of extending their sleep time to the period - * of time they spend on the runqueue, waiting for execution - * on a CPU, first time around: - */ - if (in_interrupt()) - p->activated = 2; - else - /* - * Normal first-time wakeups get a credit too for on-runqueue - * time, but it will be weighted down: - */ - p->activated = 1; - } - p->timestamp = now; - - __activate_task(p, rq); + struct policy *policy = task_policy(task); + policy->ops->wake(policy->queue, task); + __activate_task(task, rq); } /* * deactivate_task - remove a task from the runqueue. */ -static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) +static inline void deactivate_task(task_t *task, runqueue_t *rq) { + struct policy *policy = task_policy(task); nr_running_dec(rq); - if (p->state == TASK_UNINTERRUPTIBLE) + if (task->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible++; - dequeue_task(p, p->array); - p->array = NULL; + policy->ops->sleep(policy->queue, task); + dequeue_task(task, rq); } /* @@ -625,7 +406,7 @@ repeat_lock_task: rq = task_rq_lock(p, &flags); old_state = p->state; if (old_state & state) { - if (!p->array) { + if (!task_queued(p)) { /* * Fast-migrate the task if it's not running or runnable * currently. Do not violate hard affinity. @@ -644,14 +425,13 @@ repeat_lock_task: * Tasks on involuntary sleep don't earn * sleep_avg beyond just interactive state. */ - p->activated = -1; } if (sync) __activate_task(p, rq); else { activate_task(p, rq); - if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); + if (task_preempts_curr(p, rq)) + resched_task(rq_curr(rq)); } success = 1; } @@ -679,68 +459,26 @@ int wake_up_state(task_t *p, unsigned in * This function will do some initial scheduler statistics housekeeping * that must be done for every newly created process. */ -void wake_up_forked_process(task_t * p) +void wake_up_forked_process(task_t *task) { unsigned long flags; runqueue_t *rq = task_rq_lock(current, &flags); - p->state = TASK_RUNNING; - /* - * We decrease the sleep average of forking parents - * and children as well, to keep max-interactive tasks - * from forking tasks that are max-interactive. - */ - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - - p->interactive_credit = 0; - - p->prio = effective_prio(p); - set_task_cpu(p, smp_processor_id()); - - if (unlikely(!current->array)) - __activate_task(p, rq); - else { - p->prio = current->prio; - list_add_tail(&p->run_list, ¤t->run_list); - p->array = current->array; - p->array->nr_active++; - nr_running_inc(rq); - } + task->state = TASK_RUNNING; + set_task_cpu(task, smp_processor_id()); + if (unlikely(!task_queued(current))) + __activate_task(task, rq); + else + activate_task(task, rq); task_rq_unlock(rq, &flags); } /* - * Potentially available exiting-child timeslices are - * retrieved here - this way the parent does not get - * penalized for creating too many threads. - * - * (this cannot be used to 'generate' timeslices - * artificially, because any timeslice recovered here - * was given away by the parent in the first place.) + * Policies that depend on trapping fork() and exit() may need to + * put a hook here. */ -void sched_exit(task_t * p) +void sched_exit(task_t *task) { - unsigned long flags; - - local_irq_save(flags); - if (p->first_time_slice) { - p->parent->time_slice += p->time_slice; - if (unlikely(p->parent->time_slice > MAX_TIMESLICE)) - p->parent->time_slice = MAX_TIMESLICE; - } - local_irq_restore(flags); - /* - * If the child was a (relative-) CPU hog then decrease - * the sleep_avg of the parent as well. - */ - if (p->sleep_avg < p->parent->sleep_avg) - p->parent->sleep_avg = p->parent->sleep_avg / - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / - (EXIT_WEIGHT + 1); } /** @@ -1128,18 +866,18 @@ out: * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. */ -static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu) +static inline void pull_task(runqueue_t *src_rq, task_t *p, runqueue_t *this_rq, int this_cpu) { - dequeue_task(p, src_array); + dequeue_task(p, src_rq); nr_running_dec(src_rq); set_task_cpu(p, this_cpu); nr_running_inc(this_rq); - enqueue_task(p, this_rq->active); + enqueue_task(p, this_rq); /* * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. */ - if (TASK_PREEMPTS_CURR(p, this_rq)) + if (task_preempts_curr(p, this_rq)) set_need_resched(); } @@ -1150,14 +888,14 @@ static inline void pull_task(runqueue_t * ((!idle || (NS_TO_JIFFIES(now - (p)->timestamp) > \ * cache_decay_ticks)) && !task_running(rq, p) && \ * cpu_isset(this_cpu, (p)->cpus_allowed)) + * + * Since there isn't a timestamp anymore, this needs adjustment. */ static inline int can_migrate_task(task_t *tsk, runqueue_t *rq, int this_cpu, int idle) { - unsigned long delta = sched_clock() - tsk->timestamp; - - if (!idle && (delta <= JIFFIES_TO_NS(cache_decay_ticks))) + if (!idle) return 0; if (task_running(rq, tsk)) return 0; @@ -1176,11 +914,8 @@ can_migrate_task(task_t *tsk, runqueue_t */ static void load_balance(runqueue_t *this_rq, int idle, cpumask_t cpumask) { - int imbalance, idx, this_cpu = smp_processor_id(); + int imbalance, this_cpu = smp_processor_id(); runqueue_t *busiest; - prio_array_t *array; - struct list_head *head, *curr; - task_t *tmp; busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask); if (!busiest) @@ -1192,37 +927,6 @@ static void load_balance(runqueue_t *thi */ imbalance /= 2; - /* - * We first consider expired tasks. Those will likely not be - * executed in the near future, and they are most likely to - * be cache-cold, thus switching CPUs has the least effect - * on them. - */ - if (busiest->expired->nr_active) - array = busiest->expired; - else - array = busiest->active; - -new_array: - /* Start searching at priority 0: */ - idx = 0; -skip_bitmap: - if (!idx) - idx = sched_find_first_bit(array->bitmap); - else - idx = find_next_bit(array->bitmap, MAX_PRIO, idx); - if (idx >= MAX_PRIO) { - if (array == busiest->expired) { - array = busiest->active; - goto new_array; - } - goto out_unlock; - } - - head = array->queue + idx; - curr = head->prev; -skip_queue: - tmp = list_entry(curr, task_t, run_list); /* * We do not migrate tasks that are: @@ -1231,21 +935,19 @@ skip_queue: * 3) are cache-hot on their current CPU. */ - curr = curr->prev; + do { + struct policy *policy; + task_t *task; + + policy = rq_migrate_policy(busiest); + if (!policy) + break; + task = policy->migrate(policy->queue); + if (!task) + break; + pull_task(busiest, task, this_rq, this_cpu); + } while (!idle && --imbalance); - if (!can_migrate_task(tmp, busiest, this_cpu, idle)) { - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - } - pull_task(busiest, array, tmp, this_rq, this_cpu); - if (!idle && --imbalance) { - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - } out_unlock: spin_unlock(&busiest->lock); out: @@ -1356,10 +1058,10 @@ EXPORT_PER_CPU_SYMBOL(kstat); */ void scheduler_tick(int user_ticks, int sys_ticks) { - int cpu = smp_processor_id(); + int idle, cpu = smp_processor_id(); struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + struct policy *policy; runqueue_t *rq = this_rq(); - task_t *p = current; if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_ticks); @@ -1373,98 +1075,28 @@ void scheduler_tick(int user_ticks, int sys_ticks = 0; } - if (p == rq->idle) { - if (atomic_read(&rq->nr_iowait) > 0) - cpustat->iowait += sys_ticks; - else - cpustat->idle += sys_ticks; - rebalance_tick(rq, 1); - return; - } - if (TASK_NICE(p) > 0) - cpustat->nice += user_ticks; - else - cpustat->user += user_ticks; - cpustat->system += sys_ticks; - - /* Task might have expired already, but not scheduled off yet */ - if (p->array != rq->active) { - set_tsk_need_resched(p); - goto out; - } spin_lock(&rq->lock); - /* - * The task was running during this tick - update the - * time slice counter. Note: we do not update a thread's - * priority until it either goes to sleep or uses up its - * timeslice. This makes it possible for interactive tasks - * to use up their timeslices at their highest priority levels. - */ - if (unlikely(rt_task(p))) { - /* - * RR tasks need a special form of timeslice management. - * FIFO tasks have no timeslices. - */ - if ((p->policy == SCHED_RR) && !--p->time_slice) { - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - set_tsk_need_resched(p); - - /* put it at the end of the queue: */ - dequeue_task(p, rq->active); - enqueue_task(p, rq->active); - } - goto out_unlock; - } - if (!--p->time_slice) { - dequeue_task(p, rq->active); - set_tsk_need_resched(p); - p->prio = effective_prio(p); - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - - if (!rq->expired_timestamp) - rq->expired_timestamp = jiffies; - if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { - enqueue_task(p, rq->expired); - } else - enqueue_task(p, rq->active); - } else { - /* - * Prevent a too long timeslice allowing a task to monopolize - * the CPU. We do this by splitting up the timeslice into - * smaller pieces. - * - * Note: this does not mean the task's timeslices expire or - * get lost in any way, they just might be preempted by - * another task of equal priority. (one with higher - * priority would have preempted this task already.) We - * requeue this task to the end of the list on this priority - * level, which is in essence a round-robin of tasks with - * equal priority. - * - * This only applies to tasks in the interactive - * delta range with at least TIMESLICE_GRANULARITY to requeue. - */ - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - - p->time_slice) % TIMESLICE_GRANULARITY(p)) && - (p->time_slice >= TIMESLICE_GRANULARITY(p)) && - (p->array == rq->active)) { - - dequeue_task(p, rq->active); - set_tsk_need_resched(p); - p->prio = effective_prio(p); - enqueue_task(p, rq->active); - } - } -out_unlock: + policy = rq_policy(rq); + idle = policy->ops->tick(policy->queue, current, user_ticks, sys_ticks); spin_unlock(&rq->lock); -out: - rebalance_tick(rq, 0); + rebalance_tick(rq, idle); } void scheduling_functions_start_here(void) { } +static inline task_t *find_best_task(runqueue_t *rq) +{ + int idx; + struct policy *policy; + + BUG_ON(!rq->policy_bitmap); + idx = __ffs(rq->policy_bitmap); + __check_task_policy(idx); + policy = rq->policies[idx]; + check_policy(policy); + return policy->ops->best(policy->queue); +} + /* * schedule() is the main scheduler function. */ @@ -1472,11 +1104,7 @@ asmlinkage void schedule(void) { task_t *prev, *next; runqueue_t *rq; - prio_array_t *array; - struct list_head *queue; - unsigned long long now; - unsigned long run_time; - int idx; + struct policy *policy; /* * Test if we are atomic. Since do_exit() needs to call into @@ -1494,22 +1122,9 @@ need_resched: preempt_disable(); prev = current; rq = this_rq(); + policy = rq_policy(rq); release_kernel_lock(prev); - now = sched_clock(); - if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG)) - run_time = now - prev->timestamp; - else - run_time = NS_MAX_SLEEP_AVG; - - /* - * Tasks with interactive credits get charged less run_time - * at high sleep_avg to delay them losing their interactive - * status - */ - if (HIGH_CREDIT(prev)) - run_time /= (CURRENT_BONUS(prev) ? : 1); - spin_lock_irq(&rq->lock); /* @@ -1530,66 +1145,27 @@ need_resched: prev->nvcsw++; break; case TASK_RUNNING: + policy->ops->start_wait(policy->queue, prev); prev->nivcsw++; } + pick_next_task: - if (unlikely(!rq->nr_running)) { #ifdef CONFIG_SMP + if (unlikely(!rq->nr_running)) load_balance(rq, 1, cpu_to_node_mask(smp_processor_id())); - if (rq->nr_running) - goto pick_next_task; #endif - next = rq->idle; - rq->expired_timestamp = 0; - goto switch_tasks; - } - - array = rq->active; - if (unlikely(!array->nr_active)) { - /* - * Switch the active and expired arrays. - */ - rq->active = rq->expired; - rq->expired = array; - array = rq->active; - rq->expired_timestamp = 0; - } - - idx = sched_find_first_bit(array->bitmap); - queue = array->queue + idx; - next = list_entry(queue->next, task_t, run_list); - - if (next->activated > 0) { - unsigned long long delta = now - next->timestamp; - - if (next->activated == 1) - delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; - - array = next->array; - dequeue_task(next, array); - recalc_task_prio(next, next->timestamp + delta); - enqueue_task(next, array); - } - next->activated = 0; -switch_tasks: + next = find_best_task(rq); + BUG_ON(!next); prefetch(next); clear_tsk_need_resched(prev); RCU_qsctr(task_cpu(prev))++; - prev->sleep_avg -= run_time; - if ((long)prev->sleep_avg <= 0){ - prev->sleep_avg = 0; - if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev))) - prev->interactive_credit--; - } - prev->timestamp = now; - if (likely(prev != next)) { - next->timestamp = now; rq->nr_switches++; - rq->curr = next; - prepare_arch_switch(rq, next); + policy = task_policy(next); + policy->ops->set_curr(policy->queue, next); + set_rq_curr(rq, next); prev = context_switch(rq, prev, next); barrier(); @@ -1845,45 +1421,46 @@ void scheduling_functions_end_here(void) void set_user_nice(task_t *p, long nice) { unsigned long flags; - prio_array_t *array; runqueue_t *rq; - int old_prio, new_prio, delta; + struct policy *policy; + int delta, queued; - if (TASK_NICE(p) == nice || nice < -20 || nice > 19) + if (nice < -20 || nice > 19) return; /* * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. */ rq = task_rq_lock(p, &flags); + delta = nice - __task_nice(p); + if (!delta) { + if (p->pid == 0 || p->pid == 1) + printk("no change in nice, set_user_nice() nops!\n"); + goto out_unlock; + } + + policy = task_policy(p); + /* * The RT priorities are set via setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected * it wont have any effect on scheduling until the task is * not SCHED_NORMAL: */ - if (rt_task(p)) { - p->static_prio = NICE_TO_PRIO(nice); - goto out_unlock; - } - array = p->array; - if (array) - dequeue_task(p, array); - - old_prio = p->prio; - new_prio = NICE_TO_PRIO(nice); - delta = new_prio - old_prio; - p->static_prio = NICE_TO_PRIO(nice); - p->prio += delta; + queued = task_queued(p); + if (queued) + dequeue_task(p, rq); + + policy->ops->renice(policy->queue, p, nice); - if (array) { - enqueue_task(p, array); + if (queued) { + enqueue_task(p, rq); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: */ if (delta < 0 || (delta > 0 && task_running(rq, p))) - resched_task(rq->curr); + resched_task(rq_curr(rq)); } out_unlock: task_rq_unlock(rq, &flags); @@ -1919,7 +1496,7 @@ asmlinkage long sys_nice(int increment) if (increment > 40) increment = 40; - nice = PRIO_TO_NICE(current->static_prio) + increment; + nice = task_nice(current) + increment; if (nice < -20) nice = -20; if (nice > 19) @@ -1935,6 +1512,12 @@ asmlinkage long sys_nice(int increment) #endif +static int __task_prio(task_t *task) +{ + struct policy *policy = task_policy(task); + return policy->ops->prio(task); +} + /** * task_prio - return the priority value of a given task. * @p: the task in question. @@ -1943,29 +1526,111 @@ asmlinkage long sys_nice(int increment) * RT tasks are offset by -200. Normal tasks are centered * around 0, value goes from -16 to +15. */ -int task_prio(task_t *p) +int task_prio(task_t *task) { - return p->prio - MAX_RT_PRIO; + int prio; + unsigned long flags; + runqueue_t *rq; + + rq = task_rq_lock(task, &flags); + prio = __task_prio(task); + task_rq_unlock(rq, &flags); + return prio; } /** * task_nice - return the nice value of a given task. * @p: the task in question. */ -int task_nice(task_t *p) +int task_nice(task_t *task) { - return TASK_NICE(p); + int nice; + unsigned long flags; + runqueue_t *rq; + + + rq = task_rq_lock(task, &flags); + nice = __task_nice(task); + task_rq_unlock(rq, &flags); + return nice; } EXPORT_SYMBOL(task_nice); +int task_sched_policy(task_t *task) +{ + check_task_policy(task); + switch (task->sched_info.policy) { + case SCHED_POLICY_RT: + if (task->sched_info.cl_data.rt.rt_policy + == RT_POLICY_RR) + return SCHED_RR; + else + return SCHED_FIFO; + case SCHED_POLICY_TS: + return SCHED_NORMAL; + case SCHED_POLICY_BATCH: + return SCHED_BATCH; + case SCHED_POLICY_IDLE: + return SCHED_IDLE; + default: + BUG(); + return -1; + } +} +EXPORT_SYMBOL(task_sched_policy); + +void set_task_sched_policy(task_t *task, int policy) +{ + check_task_policy(task); + BUG_ON(task_queued(task)); + switch (policy) { + case SCHED_FIFO: + task->sched_info.policy = SCHED_POLICY_RT; + task->sched_info.cl_data.rt.rt_policy = RT_POLICY_FIFO; + break; + case SCHED_RR: + task->sched_info.policy = SCHED_POLICY_RT; + task->sched_info.cl_data.rt.rt_policy = RT_POLICY_RR; + break; + case SCHED_NORMAL: + task->sched_info.policy = SCHED_POLICY_TS; + break; + case SCHED_BATCH: + task->sched_info.policy = SCHED_POLICY_BATCH; + break; + case SCHED_IDLE: + task->sched_info.policy = SCHED_POLICY_IDLE; + break; + default: + BUG(); + break; + } + check_task_policy(task); +} +EXPORT_SYMBOL(set_task_sched_policy); + +int rt_task(task_t *task) +{ + check_task_policy(task); + return !!(task->sched_info.policy == SCHED_POLICY_RT); +} +EXPORT_SYMBOL(rt_task); + /** * idle_cpu - is a given cpu idle currently? * @cpu: the processor in question. */ int idle_cpu(int cpu) { - return cpu_curr(cpu) == cpu_rq(cpu)->idle; + int idle; + unsigned long flags; + runqueue_t *rq = cpu_rq(cpu); + + spin_lock_irqsave(&rq->lock, flags); + idle = !!(rq->curr == SCHED_POLICY_IDLE); + spin_unlock_irqrestore(&rq->lock, flags); + return idle; } EXPORT_SYMBOL_GPL(idle_cpu); @@ -1985,11 +1650,10 @@ static inline task_t *find_process_by_pi static int setscheduler(pid_t pid, int policy, struct sched_param __user *param) { struct sched_param lp; - int retval = -EINVAL; - int oldprio; - prio_array_t *array; + int queued, retval = -EINVAL; unsigned long flags; runqueue_t *rq; + struct policy *rq_policy; task_t *p; if (!param || pid < 0) @@ -2017,7 +1681,7 @@ static int setscheduler(pid_t pid, int p rq = task_rq_lock(p, &flags); if (policy < 0) - policy = p->policy; + policy = task_sched_policy(p); else { retval = -EINVAL; if (policy != SCHED_FIFO && policy != SCHED_RR && @@ -2047,29 +1711,23 @@ static int setscheduler(pid_t pid, int p if (retval) goto out_unlock; - array = p->array; - if (array) + queued = task_queued(p); + if (queued) deactivate_task(p, task_rq(p)); retval = 0; - p->policy = policy; - p->rt_priority = lp.sched_priority; - oldprio = p->prio; - if (policy != SCHED_NORMAL) - p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority; - else - p->prio = p->static_prio; - if (array) { + set_task_sched_policy(p, policy); + check_task_policy(p); + rq_policy = rq->policies[p->sched_info.policy]; + check_policy(rq_policy); + rq_policy->ops->setprio(p, lp.sched_priority); + if (queued) { __activate_task(p, task_rq(p)); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (rq->curr == p) { - if (p->prio > oldprio) - resched_task(rq->curr); - } else if (p->prio < rq->curr->prio) - resched_task(rq->curr); + resched_task(rq_curr(rq)); } out_unlock: @@ -2121,7 +1779,7 @@ asmlinkage long sys_sched_getscheduler(p if (p) { retval = security_task_getscheduler(p); if (!retval) - retval = p->policy; + retval = task_sched_policy(p); } read_unlock(&tasklist_lock); @@ -2153,7 +1811,7 @@ asmlinkage long sys_sched_getparam(pid_t if (retval) goto out_unlock; - lp.sched_priority = p->rt_priority; + lp.sched_priority = task_prio(p); read_unlock(&tasklist_lock); /* @@ -2262,32 +1920,13 @@ out_unlock: */ asmlinkage long sys_sched_yield(void) { + struct policy *policy; runqueue_t *rq = this_rq_lock(); - prio_array_t *array = current->array; - - /* - * We implement yielding by moving the task into the expired - * queue. - * - * (special rule: RT tasks will just roundrobin in the active - * array.) - */ - if (likely(!rt_task(current))) { - dequeue_task(current, array); - enqueue_task(current, rq->expired); - } else { - list_del(¤t->run_list); - list_add_tail(¤t->run_list, array->queue + current->prio); - } - /* - * Since we are going to call schedule() anyway, there's - * no need to preempt: - */ + policy = rq_policy(rq); + policy->ops->yield(policy->queue, current); _raw_spin_unlock(&rq->lock); preempt_enable_no_resched(); - schedule(); - return 0; } @@ -2387,6 +2026,19 @@ asmlinkage long sys_sched_get_priority_m return ret; } +static inline unsigned long task_timeslice(task_t *task) +{ + unsigned long flags, timeslice; + struct policy *policy; + runqueue_t *rq; + + rq = task_rq_lock(task, &flags); + policy = task_policy(task); + timeslice = policy->ops->timeslice(policy->queue, task); + task_rq_unlock(rq, &flags); + return timeslice; +} + /** * sys_sched_rr_get_interval - return the default timeslice of a process. * @pid: pid of the process. @@ -2414,8 +2066,7 @@ asmlinkage long sys_sched_rr_get_interva if (retval) goto out_unlock; - jiffies_to_timespec(p->policy & SCHED_FIFO ? - 0 : task_timeslice(p), &t); + jiffies_to_timespec(task_timeslice(p), &t); read_unlock(&tasklist_lock); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; out_nounlock: @@ -2523,17 +2174,22 @@ void show_state(void) void __init init_idle(task_t *idle, int cpu) { runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle)); + struct policy *policy; unsigned long flags; local_irq_save(flags); double_rq_lock(idle_rq, rq); - - idle_rq->curr = idle_rq->idle = idle; + policy = rq_policy(rq); + BUG_ON(policy != task_policy(idle)); + printk("deactivating, have %d tasks\n", + policy->ops->tasks(policy->queue)); deactivate_task(idle, rq); - idle->array = NULL; - idle->prio = MAX_PRIO; + set_task_sched_policy(idle, SCHED_IDLE); idle->state = TASK_RUNNING; set_task_cpu(idle, cpu); + activate_task(idle, rq); + nr_running_dec(rq); + set_rq_curr(rq, idle); double_rq_unlock(idle_rq, rq); set_tsk_need_resched(idle); local_irq_restore(flags); @@ -2804,38 +2460,27 @@ __init static void init_kstat(void) { void __init sched_init(void) { runqueue_t *rq; - int i, j, k; + int i, j; /* Init the kstat counters */ init_kstat(); for (i = 0; i < NR_CPUS; i++) { - prio_array_t *array; - rq = cpu_rq(i); - rq->active = rq->arrays; - rq->expired = rq->arrays + 1; spin_lock_init(&rq->lock); INIT_LIST_HEAD(&rq->migration_queue); atomic_set(&rq->nr_iowait, 0); nr_running_init(rq); - - for (j = 0; j < 2; j++) { - array = rq->arrays + j; - for (k = 0; k < MAX_PRIO; k++) { - INIT_LIST_HEAD(array->queue + k); - __clear_bit(k, array->bitmap); - } - // delimiter for bitsearch - __set_bit(MAX_PRIO, array->bitmap); - } + memcpy(rq->policies, policies, sizeof(policies)); + for (j = 0; j < BITS_PER_LONG && rq->policies[j]; ++j) + rq->policies[j]->ops->init(rq->policies[j], i); } /* * We have to do a little magic to get the first * thread right in SMP mode. */ rq = this_rq(); - rq->curr = current; - rq->idle = current; + set_task_sched_policy(current, SCHED_NORMAL); + set_rq_curr(rq, current); set_task_cpu(current, smp_processor_id()); wake_up_forked_process(current); diff -prauN linux-2.6.0-test11/lib/Makefile sched-2.6.0-test11-5/lib/Makefile --- linux-2.6.0-test11/lib/Makefile 2003-11-26 12:42:55.000000000 -0800 +++ sched-2.6.0-test11-5/lib/Makefile 2003-12-20 15:09:16.000000000 -0800 @@ -5,7 +5,7 @@ lib-y := errno.o ctype.o string.o vsprintf.o cmdline.o \ bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \ - kobject.o idr.o div64.o parser.o + kobject.o idr.o div64.o parser.o binomial.o lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o diff -prauN linux-2.6.0-test11/lib/binomial.c sched-2.6.0-test11-5/lib/binomial.c --- linux-2.6.0-test11/lib/binomial.c 1969-12-31 16:00:00.000000000 -0800 +++ sched-2.6.0-test11-5/lib/binomial.c 2003-12-20 17:32:09.000000000 -0800 @@ -0,0 +1,138 @@ +#include +#include + +struct binomial *binomial_minimum(struct binomial **heap) +{ + struct binomial *minimum, *tmp; + + for (minimum = NULL, tmp = *heap; tmp; tmp = tmp->sibling) { + if (!minimum || minimum->priority > tmp->priority) + minimum = tmp; + } + return minimum; +} + +static void binomial_link(struct binomial *left, struct binomial *right) +{ + left->parent = right; + left->sibling = right->child; + right->child = left; + right->degree++; +} + +static void binomial_merge(struct binomial **both, struct binomial **left, + struct binomial **right) +{ + while (*left && *right) { + if ((*left)->degree < (*right)->degree) { + *both = *left; + left = &(*left)->sibling; + } else { + *both = *right; + right = &(*right)->sibling; + } + both = &(*both)->sibling; + } + /* + * for more safety: + * *left = *right = NULL; + */ +} + +void binomial_union(struct binomial **both, struct binomial **left, + struct binomial **right) +{ + struct binomial *prev, *tmp, *next; + + binomial_merge(both, left, right); + if (!(tmp = *both)) + return; + + for (prev = NULL, next = tmp->sibling; next; next = tmp->sibling) { + if ((next->sibling && next->sibling->degree == tmp->degree) + || tmp->degree != next->degree) { + prev = tmp; + tmp = next; + } else if (tmp->priority <= next->priority) { + tmp->sibling = next->sibling; + binomial_link(next, tmp); + } else { + if (!prev) + *both = next; + else + prev->sibling = next; + binomial_link(tmp, next); + tmp = next; + } + } +} + +void binomial_insert(struct binomial **heap, struct binomial *element) +{ + element->parent = NULL; + element->child = NULL; + element->sibling = NULL; + element->degree = 0; + binomial_union(heap, heap, &element); +} + +static void binomial_reverse(struct binomial **in, struct binomial **out) +{ + while (*in) { + struct binomial *tmp = *in; + *in = (*in)->sibling; + tmp->sibling = *out; + *out = tmp; + } +} + +struct binomial *binomial_extract_min(struct binomial **heap) +{ + struct binomial *tmp, *minimum, *last, *min_last, *new_heap; + + minimum = last = min_last = new_heap = NULL; + for (tmp = *heap; tmp; last = tmp, tmp = tmp->sibling) { + if (!minimum || tmp->priority < minimum->priority) { + minimum = tmp; + min_last = last; + } + } + if (min_last && minimum) + min_last->sibling = minimum->sibling; + else if (minimum) + (*heap)->sibling = minimum->sibling; + else + return NULL; + binomial_reverse(&minimum->child, &new_heap); + binomial_union(heap, heap, &new_heap); + return minimum; +} + +void binomial_decrease(struct binomial **heap, struct binomial *element, + unsigned increment) +{ + struct binomial *tmp, *last = NULL; + + element->priority -= min(element->priority, increment); + last = element; + tmp = last->parent; + while (tmp && last->priority < tmp->priority) { + unsigned tmp_prio = tmp->priority; + tmp->priority = last->priority; + last->priority = tmp_prio; + last = tmp; + tmp = tmp->parent; + } +} + +void binomial_delete(struct binomial **heap, struct binomial *element) +{ + struct binomial *tmp, *last = element; + for (tmp = last->parent; tmp; last = tmp, tmp = tmp->parent) { + unsigned tmp_prio = tmp->priority; + tmp->priority = last->priority; + last->priority = tmp_prio; + } + binomial_reverse(&last->child, &tmp); + binomial_union(heap, heap, &tmp); +} diff -prauN linux-2.6.0-test11/mm/oom_kill.c sched-2.6.0-test11-5/mm/oom_kill.c --- linux-2.6.0-test11/mm/oom_kill.c 2003-11-26 12:44:16.000000000 -0800 +++ sched-2.6.0-test11-5/mm/oom_kill.c 2003-12-17 07:07:53.000000000 -0800 @@ -158,7 +158,6 @@ static void __oom_kill_task(task_t *p) * all the memory it needs. That way it should be able to * exit() and clear out its resources quickly... */ - p->time_slice = HZ; p->flags |= PF_MEMALLOC | PF_MEMDIE; /* This process has hardware access, be more careful. */